Diffstat (limited to 'arch/s390')
448 files changed, 36368 insertions, 22315 deletions
diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild
index e63940bb57cd..a5d3503b353c 100644
--- a/arch/s390/Kbuild
+++ b/arch/s390/Kbuild
@@ -3,9 +3,11 @@ obj-y += kernel/
 obj-y += mm/
 obj-$(CONFIG_KVM) += kvm/
 obj-y += crypto/
-obj-$(CONFIG_S390_HYPFS_FS) += hypfs/
+obj-$(CONFIG_S390_HYPFS) += hypfs/
 obj-$(CONFIG_APPLDATA_BASE) += appldata/
 obj-y += net/
 obj-$(CONFIG_PCI) += pci/
-obj-$(CONFIG_NUMA) += numa/
-obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += purgatory/
+obj-$(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY) += purgatory/
+
+# for cleaning
+subdir- += boot tools
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index d4051e88e625..fe565f3a3a91 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -2,9 +2,6 @@
 config MMU
 	def_bool y
 
-config ZONE_DMA
-	def_bool y
-
 config CPU_BIG_ENDIAN
 	def_bool y
 
@@ -30,14 +27,11 @@ config GENERIC_BUG_RELATIVE_POINTERS
 	def_bool y
 
 config GENERIC_LOCKBREAK
-	def_bool y if PREEMPT
+	def_bool y if PREEMPTION
 
 config PGSTE
 	def_bool y if KVM
 
-config ARCH_SUPPORTS_DEBUG_PAGEALLOC
-	def_bool y
-
 config AUDIT_ARCH
 	def_bool y
 
@@ -53,25 +47,43 @@ config ARCH_SUPPORTS_UPROBES
 config KASAN_SHADOW_OFFSET
 	hex
 	depends on KASAN
-	default 0x18000000000000 if KASAN_S390_4_LEVEL_PAGING
-	default 0x30000000000
+	default 0x1C000000000000
 
 config S390
 	def_bool y
+	#
+	# Note: keep this list sorted alphabetically
+	#
+	imply IMA_SECURE_AND_OR_TRUSTED_BOOT
+	select ALTERNATE_USER_ADDRESS_SPACE
+	select ARCH_32BIT_USTAT_F_TINODE
 	select ARCH_BINFMT_ELF_STATE
+	select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
+	select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM
+	select ARCH_ENABLE_MEMORY_HOTREMOVE
+	select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
+	select ARCH_HAS_CURRENT_STACK_POINTER
+	select ARCH_HAS_DEBUG_VM_PGTABLE
+	select ARCH_HAS_DEBUG_WX
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
 	select ARCH_HAS_ELF_RANDOMIZE
+	select ARCH_HAS_FORCE_DMA_UNENCRYPTED
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_GIGANTIC_PAGE
 	select ARCH_HAS_KCOV
+	select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_MEM_ENCRYPT
+	select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
 	select ARCH_HAS_PTE_SPECIAL
+	select ARCH_HAS_SCALED_CPUTIME
+	select ARCH_HAS_SET_DIRECT_MAP
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_HAS_STRICT_MODULE_RWX
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
+	select ARCH_HAS_VDSO_DATA
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_INLINE_READ_LOCK
 	select ARCH_INLINE_READ_LOCK_BH
@@ -101,52 +113,75 @@ config S390
 	select ARCH_INLINE_WRITE_UNLOCK_BH
 	select ARCH_INLINE_WRITE_UNLOCK_IRQ
 	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
-	select ARCH_KEEP_MEMBLOCK
-	select ARCH_SAVE_PAGE_KEYS if HIBERNATION
 	select ARCH_STACKWALK
 	select ARCH_SUPPORTS_ATOMIC_RMW
+	select ARCH_SUPPORTS_DEBUG_PAGEALLOC
+	select ARCH_SUPPORTS_HUGETLBFS
+	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && CC_IS_CLANG
 	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_SUPPORTS_PER_VMA_LOCK
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
-	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
+	select ARCH_USE_SYM_ANNOTATIONS
+	select ARCH_WANTS_NO_INSTR
+	select ARCH_WANT_DEFAULT_BPF_JIT
 	select ARCH_WANT_IPC_PARSE_VERSION
-	select BUILDTIME_EXTABLE_SORT
+	select ARCH_WANT_KERNEL_PMD_MKWRITE
+	select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
+	select BUILDTIME_TABLE_SORT
 	select CLONE_BACKWARDS2
+	select DCACHE_WORD_ACCESS if !KMSAN
+	select DMA_OPS if PCI
 	select DYNAMIC_FTRACE if FUNCTION_TRACER
-	select GENERIC_CLOCKEVENTS
+	select FUNCTION_ALIGNMENT_8B if CC_IS_GCC
+	select FUNCTION_ALIGNMENT_16B if !CC_IS_GCC
+	select GENERIC_ALLOCATOR
 	select GENERIC_CPU_AUTOPROBE
 	select GENERIC_CPU_VULNERABILITIES
-	select GENERIC_FIND_FIRST_BIT
+	select GENERIC_ENTRY
+	select GENERIC_GETTIMEOFDAY
+	select GENERIC_PTDUMP
 	select GENERIC_SMP_IDLE_THREAD
 	select GENERIC_TIME_VSYSCALL
-	select HAVE_ALIGNED_STRUCT_PAGE if SLUB
+	select GENERIC_VDSO_TIME_NS
+	select GENERIC_IOREMAP if PCI
+	select HAVE_ALIGNED_STRUCT_PAGE
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE
 	select HAVE_ARCH_KASAN
-	select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
+	select HAVE_ARCH_KASAN_VMALLOC
+	select HAVE_ARCH_KCSAN
+	select HAVE_ARCH_KFENCE
+	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_SOFT_DIRTY
+	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_ARCH_VMAP_STACK
 	select HAVE_ASM_MODVERSIONS
-	select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES
 	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_CMPXCHG_LOCAL
-	select HAVE_COPY_THREAD_TLS
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
+	select HAVE_DYNAMIC_FTRACE_WITH_ARGS
+	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
-	select HAVE_FAST_GUP
+	select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
+	select HAVE_FAST_GUP
 	select HAVE_FENTRY
 	select HAVE_FTRACE_MCOUNT_RECORD
+	select HAVE_FUNCTION_ARG_ACCESS_API
+	select HAVE_FUNCTION_ERROR_INJECTION
+	select HAVE_FUNCTION_GRAPH_RETVAL
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
-	select HAVE_FUTEX_CMPXCHG if FUTEX
 	select HAVE_GCC_PLUGINS
+	select HAVE_GENERIC_VDSO
+	select HAVE_IOREMAP_PROT if PCI
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZ4
@@ -154,46 +189,55 @@ config S390
 	select HAVE_KERNEL_LZO
 	select HAVE_KERNEL_UNCOMPRESSED
 	select HAVE_KERNEL_XZ
+	select HAVE_KERNEL_ZSTD
 	select HAVE_KPROBES
+	select HAVE_KPROBES_ON_FTRACE
 	select HAVE_KRETPROBES
 	select HAVE_KVM
 	select HAVE_LIVEPATCH
-	select HAVE_PERF_REGS
-	select HAVE_PERF_USER_STACK_DUMP
-	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_MEMBLOCK_PHYS_MAP
-	select HAVE_MMU_GATHER_NO_GATHER
 	select HAVE_MOD_ARCH_SPECIFIC
+	select HAVE_NMI
 	select HAVE_NOP_MCOUNT
-	select HAVE_OPROFILE
 	select HAVE_PCI
 	select HAVE_PERF_EVENTS
-	select HAVE_RCU_TABLE_FREE
+	select HAVE_PERF_REGS
+	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE
+	select HAVE_RETHOOK
 	select HAVE_RSEQ
+	select HAVE_SAMPLE_FTRACE_DIRECT
+	select HAVE_SAMPLE_FTRACE_DIRECT_MULTI
+	select HAVE_SETUP_PER_CPU_AREA
+	select HAVE_SOFTIRQ_ON_OWN_STACK
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_VIRT_CPU_ACCOUNTING
+	select HAVE_VIRT_CPU_ACCOUNTING_IDLE
 	select IOMMU_HELPER if PCI
 	select IOMMU_SUPPORT if PCI
+	select MMU_GATHER_MERGE_VMAS
+	select MMU_GATHER_NO_GATHER
+	select MMU_GATHER_RCU_TABLE_FREE
 	select MODULES_USE_ELF_RELA
 	select NEED_DMA_MAP_STATE if PCI
+	select NEED_PER_CPU_EMBED_FIRST_CHUNK
 	select NEED_SG_DMA_LENGTH if PCI
 	select OLD_SIGACTION
 	select OLD_SIGSUSPEND3
 	select PCI_DOMAINS if PCI
 	select PCI_MSI if PCI
+	select PCI_MSI_ARCH_FALLBACKS if PCI_MSI
 	select SPARSE_IRQ
+	select SWIOTLB
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
+	select TRACE_IRQFLAGS_SUPPORT
 	select TTY
+	select USER_STACKTRACE_SUPPORT
 	select VIRT_CPU_ACCOUNTING
-	select ARCH_HAS_SCALED_CPUTIME
-	select HAVE_NMI
-	select ARCH_HAS_FORCE_DMA_UNENCRYPTED
-	select SWIOTLB
-	select GENERIC_ALLOCATOR
-
+	select ZONE_DMA
+	# Note: keep the above list sorted alphabetically
 
 config SCHED_OMIT_FRAME_POINTER
 	def_bool y
@@ -204,22 +248,29 @@ config PGTABLE_LEVELS
 
 source "kernel/livepatch/Kconfig"
 
-menu "Processor type and features"
+config ARCH_SUPPORTS_KEXEC
+	def_bool y
 
-config HAVE_MARCH_Z900_FEATURES
-	def_bool n
+config ARCH_SUPPORTS_KEXEC_FILE
+	def_bool y
 
-config HAVE_MARCH_Z990_FEATURES
-	def_bool n
-	select HAVE_MARCH_Z900_FEATURES
+config ARCH_SUPPORTS_KEXEC_SIG
+	def_bool MODULE_SIG_FORMAT
 
-config HAVE_MARCH_Z9_109_FEATURES
-	def_bool n
-	select HAVE_MARCH_Z990_FEATURES
+config ARCH_SUPPORTS_KEXEC_PURGATORY
+	def_bool y
+
+config ARCH_SUPPORTS_CRASH_DUMP
+	def_bool y
+	help
+	  Refer to <file:Documentation/arch/s390/zfcpdump.rst> for more details on this.
+	  This option also enables s390 zfcpdump.
+	  See also <file:Documentation/arch/s390/zfcpdump.rst>
+
+menu "Processor type and features"
 
 config HAVE_MARCH_Z10_FEATURES
 	def_bool n
-	select HAVE_MARCH_Z9_109_FEATURES
 
 config HAVE_MARCH_Z196_FEATURES
 	def_bool n
@@ -241,45 +292,21 @@ config HAVE_MARCH_Z15_FEATURES
 	def_bool n
 	select HAVE_MARCH_Z14_FEATURES
 
+config HAVE_MARCH_Z16_FEATURES
+	def_bool n
+	select HAVE_MARCH_Z15_FEATURES
+
 choice
 	prompt "Processor type"
 	default MARCH_Z196
 
-config MARCH_Z900
-	bool "IBM zSeries model z800 and z900"
-	select HAVE_MARCH_Z900_FEATURES
-	depends on $(cc-option,-march=z900)
-	help
-	  Select this to enable optimizations for model z800/z900 (2064 and
-	  2066 series). This will enable some optimizations that are not
-	  available on older ESA/390 (31 Bit) only CPUs.
-
-config MARCH_Z990
-	bool "IBM zSeries model z890 and z990"
-	select HAVE_MARCH_Z990_FEATURES
-	depends on $(cc-option,-march=z990)
-	help
-	  Select this to enable optimizations for model z890/z990 (2084 and
-	  2086 series). The kernel will be slightly faster but will not work
-	  on older machines.
-
-config MARCH_Z9_109
-	bool "IBM System z9"
-	select HAVE_MARCH_Z9_109_FEATURES
-	depends on $(cc-option,-march=z9-109)
-	help
-	  Select this to enable optimizations for IBM System z9 (2094 and
-	  2096 series). The kernel will be slightly faster but will not work
-	  on older machines.
-
 config MARCH_Z10
 	bool "IBM System z10"
 	select HAVE_MARCH_Z10_FEATURES
 	depends on $(cc-option,-march=z10)
 	help
-	  Select this to enable optimizations for IBM System z10 (2097 and
-	  2098 series). The kernel will be slightly faster but will not work
-	  on older machines.
+	  Select this to enable optimizations for IBM System z10 (2097 and 2098
+	  series). This is the oldest machine generation currently supported.
 
 config MARCH_Z196
 	bool "IBM zEnterprise 114 and 196"
@@ -326,16 +353,15 @@ config MARCH_Z15
 	  and 8561 series). The kernel will be slightly faster but will not
 	  work on older machines.
 
-endchoice
-
-config MARCH_Z900_TUNE
-	def_bool TUNE_Z900 || MARCH_Z900 && TUNE_DEFAULT
-
-config MARCH_Z990_TUNE
-	def_bool TUNE_Z990 || MARCH_Z990 && TUNE_DEFAULT
+config MARCH_Z16
+	bool "IBM z16"
+	select HAVE_MARCH_Z16_FEATURES
+	depends on $(cc-option,-march=z16)
+	help
+	  Select this to enable optimizations for IBM z16 (3931 and
+	  3932 series).
 
-config MARCH_Z9_109_TUNE
-	def_bool TUNE_Z9_109 || MARCH_Z9_109 && TUNE_DEFAULT
+endchoice
 
 config MARCH_Z10_TUNE
 	def_bool TUNE_Z10 || MARCH_Z10 && TUNE_DEFAULT
@@ -355,6 +381,9 @@ config MARCH_Z14_TUNE
 config MARCH_Z15_TUNE
 	def_bool TUNE_Z15 || MARCH_Z15 && TUNE_DEFAULT
 
+config MARCH_Z16_TUNE
+	def_bool TUNE_Z16 || MARCH_Z16 && TUNE_DEFAULT
+
 choice
 	prompt "Tune code generation"
 	default TUNE_DEFAULT
@@ -372,21 +401,8 @@ config TUNE_DEFAULT
 	  Tune the generated code for the target processor for which the
 	  kernel will be compiled.
 
-config TUNE_Z900
-	bool "IBM zSeries model z800 and z900"
-	depends on $(cc-option,-mtune=z900)
-
-config TUNE_Z990
-	bool "IBM zSeries model z890 and z990"
-	depends on $(cc-option,-mtune=z990)
-
-config TUNE_Z9_109
-	bool "IBM System z9"
-	depends on $(cc-option,-mtune=z9-109)
-
 config TUNE_Z10
 	bool "IBM System z10"
-	depends on $(cc-option,-mtune=z10)
 
 config TUNE_Z196
 	bool "IBM zEnterprise 114 and 196"
@@ -408,27 +424,38 @@ config TUNE_Z15
 	bool "IBM z15"
 	depends on $(cc-option,-mtune=z15)
 
+config TUNE_Z16
+	bool "IBM z16"
+	depends on $(cc-option,-mtune=z16)
+
 endchoice
 
 config 64BIT
 	def_bool y
 
+config COMMAND_LINE_SIZE
+	int "Maximum size of kernel command line"
+	default 4096
+	range 896 1048576
+	help
+	  This allows you to specify the maximum length of the kernel command
+	  line.
+
 config COMPAT
-	def_bool y
+	def_bool n
 	prompt "Kernel support for 31 bit emulation"
-	select COMPAT_BINFMT_ELF if BINFMT_ELF
 	select ARCH_WANT_OLD_COMPAT_IPC
 	select COMPAT_OLD_SIGACTION
 	select HAVE_UID16
 	depends on MULTIUSER
+	depends on !CC_IS_CLANG
 	help
 	  Select this option if you want to enable your system kernel to
 	  handle system-calls from ELF binaries for 31 bit ESA.  This option
 	  (and some other stuff like libraries and such) is needed for
-	  executing 31 bit applications.  It is safe to say "Y".
+	  executing 31 bit applications.
 
-config SYSVIPC_COMPAT
-	def_bool y if COMPAT && SYSVIPC
+	  If unsure say N.
 
 config SMP
 	def_bool y
@@ -448,14 +475,6 @@ config NR_CPUS
 config HOTPLUG_CPU
 	def_bool y
 
-# Some NUMA nodes have memory ranges that span
-# other nodes.  Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node.  See memmap_init_zone()
-# for details. <- They meant memory holes!
-config NODES_SPAN_OTHER_NODES
-	def_bool NUMA
-
 config NUMA
 	bool "NUMA support"
 	depends on SCHED_TOPOLOGY
@@ -465,58 +484,10 @@ config NUMA
 
 	  This option adds NUMA support to the kernel.
 
-	  An operation mode can be selected by appending
-	  numa=<method> to the kernel command line.
-
-	  The default behaviour is identical to appending numa=plain to
-	  the command line. This will create just one node with all
-	  available memory and all CPUs in it.
-
 config NODES_SHIFT
-	int "Maximum NUMA nodes (as a power of 2)"
-	range 1 10
-	depends on NUMA
-	default "4"
-	help
-	  Specify the maximum number of NUMA nodes available on the target
-	  system. Increases memory reserved to accommodate various tables.
-
-menu "Select NUMA modes"
+	int
 	depends on NUMA
-
-config NUMA_EMU
-	bool "NUMA emulation"
-	default y
-	help
-	  Numa emulation mode will split the available system memory into
-	  equal chunks which then are distributed over the configured number
-	  of nodes in a round-robin manner.
-
-	  The number of fake nodes is limited by the number of available memory
-	  chunks (i.e. memory size / fake size) and the number of supported
-	  nodes in the kernel.
-
-	  The CPUs are assigned to the nodes in a way that partially respects
-	  the original machine topology (if supported by the machine).
-	  Fair distribution of the CPUs is not guaranteed.
-
-config EMU_SIZE
-	hex "NUMA emulation memory chunk size"
-	default 0x10000000
-	range 0x400000 0x100000000
-	depends on NUMA_EMU
-	help
-	  Select the default size by which the memory is chopped and then
-	  assigned to emulated NUMA nodes.
-
-	  This can be overridden by specifying
-
-	  emu_size=<n>
-
-	  on the kernel command line where also suffixes K, M, G, and T are
-	  supported.
-
-endmenu
+	default "1"
 
 config SCHED_SMT
 	def_bool n
@@ -524,19 +495,11 @@ config SCHED_SMT
 config SCHED_MC
 	def_bool n
 
-config SCHED_BOOK
-	def_bool n
-
-config SCHED_DRAWER
-	def_bool n
-
 config SCHED_TOPOLOGY
 	def_bool y
 	prompt "Topology scheduler support"
 	select SCHED_SMT
 	select SCHED_MC
-	select SCHED_BOOK
-	select SCHED_DRAWER
 	help
 	  Topology scheduler support improves the CPU scheduler's decision
 	  making when dealing with machines that have multi-threading,
@@ -544,51 +507,16 @@ config SCHED_TOPOLOGY
 
 source "kernel/Kconfig.hz"
 
-config KEXEC
-	def_bool y
-	select KEXEC_CORE
-
-config KEXEC_FILE
-	bool "kexec file based system call"
-	select KEXEC_CORE
-	select BUILD_BIN2C
-	depends on CRYPTO
-	depends on CRYPTO_SHA256
-	depends on CRYPTO_SHA256_S390
-	help
-	  Enable the kexec file based system call. In contrast to the normal
-	  kexec system call this system call takes file descriptors for the
-	  kernel and initramfs as arguments.
-
-config ARCH_HAS_KEXEC_PURGATORY
-	def_bool y
-	depends on KEXEC_FILE
-
-config KEXEC_SIG
-	bool "Verify kernel signature during kexec_file_load() syscall"
-	depends on KEXEC_FILE && MODULE_SIG_FORMAT
+config CERT_STORE
+	bool "Get user certificates via DIAG320"
+	depends on KEYS
+	select CRYPTO_LIB_SHA256
 	help
-	  This option makes kernel signature verification mandatory for
-	  the kexec_file_load() syscall.
-
-	  In addition to that option, you need to enable signature
-	  verification for the corresponding kernel image type being
-	  loaded in order for this to work.
+	  Enable this option if you want to access user-provided secure boot
	  certificates via DIAG 0x320.
 
-config ARCH_RANDOM
-	def_bool y
-	prompt "s390 architectural random number generation API"
-	help
-	  Enable the s390 architectural random number generation API
-	  to provide random data for all consumers within the Linux
-	  kernel.
-
-	  When enabled the arch_random_* functions declared in linux/random.h
-	  are implemented. The implementation is based on the s390 CPACF
-	  instruction subfunction TRNG which provides a real true random
-	  number generator.
-
-	  If unsure, say Y.
+	  These certificates will be made available via the keyring named
+	  'cert_store'.
 
 config KERNEL_NOBP
 	def_bool n
@@ -609,6 +537,7 @@ config KERNEL_NOBP
 
 config EXPOLINE
 	def_bool n
+	depends on $(cc-option,-mindirect-branch=thunk)
 	prompt "Avoid speculative indirect branches in the kernel"
 	help
 	  Compile the kernel with the expoline compiler options to guard
@@ -619,6 +548,19 @@ config EXPOLINE
 
 	  If unsure, say N.
 
+config EXPOLINE_EXTERN
+	def_bool n
+	depends on EXPOLINE
+	depends on CC_IS_GCC && GCC_VERSION >= 110200
+	depends on $(success,$(srctree)/arch/s390/tools/gcc-thunk-extern.sh $(CC))
+	prompt "Generate expolines as extern functions."
+	help
+	  This option is required for some tooling like kpatch. The kernel is
+	  compiled with -mindirect-branch=thunk-extern and requires a newer
+	  compiler.
+
+	  If unsure, say N.
+
 choice
 	prompt "Expoline default"
 	depends on EXPOLINE
@@ -636,9 +578,7 @@ config EXPOLINE_FULL
 endchoice
 
 config RELOCATABLE
-	bool "Build a relocatable kernel"
-	select MODULE_REL_CRCS if MODVERSIONS
-	default y
+	def_bool y
 	help
 	  This builds a kernel image that retains relocation information
 	  so it can be loaded at an arbitrary address.
@@ -647,10 +587,11 @@ config RELOCATABLE
 	  bootup process.
 	  The relocations make the kernel image about 15% larger (compressed
 	  10%), but are discarded at runtime.
+	  Note: this option exists only for documentation purposes, please do
+	  not remove it.
 
 config RANDOMIZE_BASE
 	bool "Randomize the address of the kernel image (KASLR)"
-	depends on RELOCATABLE
 	default y
 	help
 	  In support of Kernel Address Space Layout Randomization (KASLR),
@@ -670,19 +611,6 @@ config ARCH_SPARSEMEM_ENABLE
 config ARCH_SPARSEMEM_DEFAULT
 	def_bool y
 
-config ARCH_ENABLE_MEMORY_HOTPLUG
-	def_bool y if SPARSEMEM
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
-	def_bool y
-
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
-	def_bool y
-
-config FORCE_MAX_ZONEORDER
-	int
-	default "9"
-
 config MAX_PHYSMEM_BITS
 	int "Maximum size of supported physical memory in bits (42-53)"
 	range 42 53
@@ -693,20 +621,6 @@ config MAX_PHYSMEM_BITS
 	  Increasing the number of bits also increases the kernel image size.
 	  By default 46 bits (64TB) are supported.
 
-config PACK_STACK
-	def_bool y
-	prompt "Pack kernel stack"
-	help
-	  This option enables the compiler option -mkernel-backchain if it
-	  is available. If the option is available the compiler supports
-	  the new stack layout which dramatically reduces the minimum stack
-	  frame size. With an old compiler a non-leaf function needs a
-	  minimum of 96 bytes on 31 bit and 160 bytes on 64 bit. With
-	  -mkernel-backchain the minimum size drops to 16 byte on 31 bit
-	  and 24 byte on 64 bit.
-
-	  Say Y if you are unsure.
-
 config CHECK_STACK
 	def_bool y
 	depends on !VMAP_STACK
@@ -733,16 +647,6 @@ config STACK_GUARD
 	  The minimum size for the stack guard should be 256 for 31 bit and
 	  512 for 64 bit.
 
-config WARN_DYNAMIC_STACK
-	def_bool n
-	prompt "Emit compiler warnings for function with dynamic stack usage"
-	help
-	  This option enables the compiler option -mwarn-dynamicstack. If the
-	  compiler supports this options generates warnings for functions
-	  that dynamically allocate stack space using alloca.
-
-	  Say N if you are unsure.
-
 endmenu
 
 menu "I/O subsystem"
@@ -750,7 +654,7 @@ menu "I/O subsystem"
 config QDIO
 	def_tristate y
 	prompt "QDIO support"
-	---help---
+	help
 	  This driver provides the Queued Direct I/O base support for
 	  IBM System z.
 
@@ -764,7 +668,7 @@ if PCI
 config PCI_NR_FUNCTIONS
 	int "Maximum number of PCI functions (1-4096)"
 	range 1 4096
-	default "128"
+	default "512"
 	help
 	  This allows you to specify the maximum number of PCI functions which
 	  this kernel will support.
@@ -811,7 +715,8 @@ config EADM_SCH
 config VFIO_CCW
 	def_tristate n
 	prompt "Support for VFIO-CCW subchannels"
-	depends on S390_CCW_IOMMU && VFIO_MDEV
+	depends on VFIO
+	select VFIO_MDEV
 	help
 	  This driver allows usage of I/O subchannels via VFIO-CCW.
 
@@ -821,55 +726,16 @@ config VFIO_CCW
 config VFIO_AP
 	def_tristate n
 	prompt "VFIO support for AP devices"
-	depends on S390_AP_IOMMU && VFIO_MDEV_DEVICE && KVM
-	help
-	  This driver grants access to Adjunct Processor (AP) devices
-	  via the VFIO mediated device interface.
-
-	  To compile this driver as a module, choose M here: the module
-	  will be called vfio_ap.
-
-endmenu
-
-menu "Dump support"
-
-config CRASH_DUMP
-	bool "kernel crash dumps"
-	select KEXEC
-	help
-	  Generate crash dump after being started by kexec.
-	  Crash dump kernels are loaded in the main kernel with kexec-tools
-	  into a specially reserved region and then later executed after
-	  a crash by kdump/kexec.
-	  Refer to <file:Documentation/s390/zfcpdump.rst> for more details on this.
-	  This option also enables s390 zfcpdump.
-	  See also <file:Documentation/s390/zfcpdump.rst>
-
-endmenu
-
-config SECCOMP
-	def_bool y
-	prompt "Enable seccomp to safely compute untrusted bytecode"
-	depends on PROC_FS
+	depends on KVM
+	depends on VFIO
+	depends on ZCRYPT
+	select VFIO_MDEV
 	help
-	  This kernel feature is useful for number crunching applications
-	  that may need to compute untrusted bytecode during their
-	  execution. By using pipes or other transports made available to
-	  the process as file descriptors supporting the read/write
-	  syscalls, it's possible to isolate those applications in
-	  their own address space using seccomp. Once seccomp is
-	  enabled via /proc/<pid>/seccomp, it cannot be disabled
-	  and the task is only allowed to execute a few safe syscalls
-	  defined by each seccomp mode.
-
-	  If unsure, say Y.
-
-menu "Power Management"
-
-config ARCH_HIBERNATION_POSSIBLE
-	def_bool y
+	  This driver grants access to Adjunct Processor (AP) devices
+	  via the VFIO mediated device interface.
 
-source "kernel/power/Kconfig"
+	  To compile this driver as a module, choose M here: the module
+	  will be called vfio_ap.
 
 endmenu
 
@@ -930,7 +796,7 @@ config CMM_IUCV
 config APPLDATA_BASE
 	def_bool n
 	prompt "Linux - VM Monitor Stream, base infrastructure"
-	depends on PROC_FS
+	depends on PROC_SYSCTL
 	help
 	  This provides a kernel interface for creating and updating z/VM
 	  APPLDATA monitor records. The monitor records are updated at certain time
@@ -991,13 +857,24 @@ config APPLDATA_NET_SUM
 	  This can also be compiled as a module, which will be called
 	  appldata_net_sum.o.
 
-config S390_HYPFS_FS
+config S390_HYPFS
 	def_bool y
+	prompt "s390 hypervisor information"
+	help
+	  This provides several binary files at (debugfs)/s390_hypfs/ to
+	  provide accounting information in an s390 hypervisor environment.
+
+config S390_HYPFS_FS
+	def_bool n
 	prompt "s390 hypervisor file system support"
 	select SYS_HYPERVISOR
+	depends on S390_HYPFS
 	help
 	  This is a virtual file system intended to provide accounting
-	  information in an s390 hypervisor environment.
+	  information in an s390 hypervisor environment. This file system
+	  is deprecated and should not be used.
+
+	  Say N if you are unsure.
 
 source "arch/s390/kvm/Kconfig"
 
@@ -1007,7 +884,6 @@ config S390_GUEST
 	select TTY
 	select VIRTUALIZATION
 	select VIRTIO
-	select VIRTIO_CONSOLE
 	help
 	  Enabling this option adds support for virtio based paravirtual device
 	  drivers on s390.
@@ -1017,10 +893,15 @@ config S390_GUEST
 
 endmenu
 
+config S390_MODULES_SANITY_TEST_HELPERS
+	def_bool n
+
 menu "Selftests"
 
 config S390_UNWIND_SELFTEST
 	def_tristate n
+	depends on KUNIT
+	default KUNIT_ALL_TESTS
 	prompt "Test unwind functions"
 	help
 	  This option enables s390 specific stack unwinder testing kernel
@@ -1029,4 +910,28 @@ config S390_UNWIND_SELFTEST
 
 	  Say N if you are unsure.
 
+config S390_KPROBES_SANITY_TEST
+	def_tristate n
+	prompt "Enable s390 specific kprobes tests"
+	depends on KPROBES
+	depends on KUNIT
+	help
+	  This option enables an s390 specific kprobes test module. This option
+	  is not useful for distributions or general kernels, but only for kernel
+	  developers working on architecture code.
+
+	  Say N if you are unsure.
+
+config S390_MODULES_SANITY_TEST
+	def_tristate n
+	depends on KUNIT
+	default KUNIT_ALL_TESTS
+	prompt "Enable s390 specific modules tests"
+	select S390_MODULES_SANITY_TEST_HELPERS
+	help
+	  This option enables an s390 specific modules test. This option is
+	  not useful for distributions or general kernels, but only for
+	  kernel developers working on architecture code.
+
+	  Say N if you are unsure.
 endmenu
diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug
index 190527560b2c..c4300ea4abf8 100644
--- a/arch/s390/Kconfig.debug
+++ b/arch/s390/Kconfig.debug
@@ -1,19 +1,22 @@
 # SPDX-License-Identifier: GPL-2.0
-config TRACE_IRQFLAGS_SUPPORT
+config EARLY_PRINTK
 	def_bool y
 
-config S390_PTDUMP
-	bool "Export kernel pagetable layout to userspace via debugfs"
+config DEBUG_ENTRY
+	bool "Debug low-level entry code"
 	depends on DEBUG_KERNEL
-	select DEBUG_FS
-	---help---
-	  Say Y here if you want to show the kernel pagetable layout in a
-	  debugfs file. This information is only useful for kernel developers
-	  who are working in architecture specific areas of the kernel.
-	  It is probably not a good idea to enable this feature in a production
-	  kernel.
-	  If in doubt, say "N"
+	help
+	  This option enables sanity checks in s390 low-level entry code.
+	  Some of these sanity checks may slow down kernel entries and
+	  exits or otherwise impact performance.
 
-config EARLY_PRINTK
-	def_bool y
+	  If unsure, say N.
+
+config CIO_INJECT
+	bool "CIO Inject interfaces"
+	depends on DEBUG_KERNEL && DEBUG_FS
+	help
+	  This option provides a debugging facility to inject certain artificial events
+	  and instruction responses to the CIO layer of Linux kernel. The newly created
+	  debugfs user-interfaces will be at /sys/kernel/debug/s390/cio/*
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index ba8556bb0fb1..73873e451686 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -3,9 +3,7 @@
 # s390/Makefile
 #
 # This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" and "archdep" for cleaning up and making dependencies for
-# this architecture
+# architecture-specific flags and dependencies.
 #
 # Copyright (C) 1994 by Linus Torvalds
 #
@@ -16,51 +14,51 @@ KBUILD_AFLAGS_MODULE += -fPIC
 KBUILD_CFLAGS_MODULE += -fPIC
 KBUILD_AFLAGS	+= -m64
 KBUILD_CFLAGS	+= -m64
-ifeq ($(CONFIG_RELOCATABLE),y)
 KBUILD_CFLAGS	+= -fPIE
 LDFLAGS_vmlinux	:= -pie
-endif
 aflags_dwarf	:= -Wa,-gdwarf-2
 KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__
+ifndef CONFIG_AS_IS_LLVM
 KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf))
-KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2
+endif
+KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack
 KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY
-KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float
+KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float -mbackchain
 KBUILD_CFLAGS_DECOMPRESSOR += -fno-asynchronous-unwind-tables
-KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-option,-ffreestanding)
+KBUILD_CFLAGS_DECOMPRESSOR += -ffreestanding
+KBUILD_CFLAGS_DECOMPRESSOR += -fno-stack-protector
+KBUILD_CFLAGS_DECOMPRESSOR += -fPIE
 KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-member)
 KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g)
 KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,))
+KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds)
+
 UTS_MACHINE	:= s390x
 STACK_SIZE	:= $(if $(CONFIG_KASAN),65536,16384)
 CHECKFLAGS	+= -D__s390__ -D__s390x__
 
 export LD_BFD
 
-mflags-$(CONFIG_MARCH_Z900)   := -march=z900
-mflags-$(CONFIG_MARCH_Z990)   := -march=z990
-mflags-$(CONFIG_MARCH_Z9_109) := -march=z9-109
 mflags-$(CONFIG_MARCH_Z10)    := -march=z10
 mflags-$(CONFIG_MARCH_Z196)   := -march=z196
 mflags-$(CONFIG_MARCH_ZEC12)  := -march=zEC12
 mflags-$(CONFIG_MARCH_Z13)    := -march=z13
 mflags-$(CONFIG_MARCH_Z14)    := -march=z14
 mflags-$(CONFIG_MARCH_Z15)    := -march=z15
+mflags-$(CONFIG_MARCH_Z16)    := -march=z16
 
 export CC_FLAGS_MARCH := $(mflags-y)
 
 aflags-y += $(mflags-y)
 cflags-y += $(mflags-y)
 
-cflags-$(CONFIG_MARCH_Z900_TUNE)   += -mtune=z900
-cflags-$(CONFIG_MARCH_Z990_TUNE)   += -mtune=z990
-cflags-$(CONFIG_MARCH_Z9_109_TUNE) += -mtune=z9-109
 cflags-$(CONFIG_MARCH_Z10_TUNE)    += -mtune=z10
 cflags-$(CONFIG_MARCH_Z196_TUNE)   += -mtune=z196
 cflags-$(CONFIG_MARCH_ZEC12_TUNE)  += -mtune=zEC12
 cflags-$(CONFIG_MARCH_Z13_TUNE)    += -mtune=z13
 cflags-$(CONFIG_MARCH_Z14_TUNE)    += -mtune=z14
 cflags-$(CONFIG_MARCH_Z15_TUNE)    += -mtune=z15
+cflags-$(CONFIG_MARCH_Z16_TUNE)    += -mtune=z16
 
 cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include
 
@@ -69,44 +67,38 @@ cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include
 #
 cflags-$(CONFIG_FRAME_POINTER) += -fno-optimize-sibling-calls
 
-ifeq ($(call cc-option-yn,-mpacked-stack),y)
-cflags-$(CONFIG_PACK_STACK)  += -mpacked-stack -D__PACK_STACK
-aflags-$(CONFIG_PACK_STACK)  += -D__PACK_STACK
-endif
-
 KBUILD_AFLAGS_DECOMPRESSOR += $(aflags-y)
 KBUILD_CFLAGS_DECOMPRESSOR += $(cflags-y)
 
-ifeq ($(call cc-option-yn,-mstack-size=8192 -mstack-guard=128),y)
-cflags-$(CONFIG_CHECK_STACK) += -mstack-size=$(STACK_SIZE)
-ifneq ($(call cc-option-yn,-mstack-size=8192),y)
-cflags-$(CONFIG_CHECK_STACK) += -mstack-guard=$(CONFIG_STACK_GUARD)
-endif
-endif
-
-ifdef CONFIG_WARN_DYNAMIC_STACK
-  ifeq ($(call cc-option-yn,-mwarn-dynamicstack),y)
-    KBUILD_CFLAGS += -mwarn-dynamicstack
-    KBUILD_CFLAGS_DECOMPRESSOR += -mwarn-dynamicstack
+ifneq ($(call cc-option,-mstack-size=8192 -mstack-guard=128),)
+  CC_FLAGS_CHECK_STACK := -mstack-size=$(STACK_SIZE)
+  ifeq ($(call cc-option,-mstack-size=8192),)
+    CC_FLAGS_CHECK_STACK += -mstack-guard=$(CONFIG_STACK_GUARD)
   endif
+  export CC_FLAGS_CHECK_STACK
+  cflags-$(CONFIG_CHECK_STACK) += $(CC_FLAGS_CHECK_STACK)
 endif
 
 ifdef CONFIG_EXPOLINE
-  ifeq ($(call cc-option-yn,$(CC_FLAGS_MARCH) -mindirect-branch=thunk),y)
+  ifdef CONFIG_EXPOLINE_EXTERN
+    KBUILD_LDFLAGS_MODULE += arch/s390/lib/expoline/expoline.o
+    CC_FLAGS_EXPOLINE := -mindirect-branch=thunk-extern
+    CC_FLAGS_EXPOLINE += -mfunction-return=thunk-extern
+  else
     CC_FLAGS_EXPOLINE := -mindirect-branch=thunk
     CC_FLAGS_EXPOLINE += -mfunction-return=thunk
-    CC_FLAGS_EXPOLINE += -mindirect-branch-table
-    export CC_FLAGS_EXPOLINE
-    cflags-y += $(CC_FLAGS_EXPOLINE) -DCC_USING_EXPOLINE
-    aflags-y += -DCC_USING_EXPOLINE
   endif
+  CC_FLAGS_EXPOLINE += -mindirect-branch-table
+  export CC_FLAGS_EXPOLINE
+  cflags-y += $(CC_FLAGS_EXPOLINE) -DCC_USING_EXPOLINE
+  aflags-y += -DCC_USING_EXPOLINE
 endif
 
 ifdef CONFIG_FUNCTION_TRACER
-  ifeq ($(call cc-option-yn,-mfentry -mnop-mcount),n)
+  ifeq ($(call cc-option,-mfentry -mnop-mcount),)
    # make use of hotpatch feature if the compiler supports it
    cc_hotpatch	:= -mhotpatch=0,3
-    ifeq ($(call cc-option-yn,$(cc_hotpatch)),y)
+    ifneq ($(call cc-option,$(cc_hotpatch)),)
      CC_FLAGS_FTRACE := $(cc_hotpatch)
      KBUILD_AFLAGS	+= -DCC_USING_HOTPATCH
      KBUILD_CFLAGS	+= -DCC_USING_HOTPATCH
@@ -117,7 +109,7 @@ endif
 # Test CFI features of binutils
 cfi := $(call as-instr,.cfi_startproc\n.cfi_val_offset 15$(comma)-160\n.cfi_endproc,-DCONFIG_AS_CFI_VAL_OFFSET=1)
 
-KBUILD_CFLAGS	+= -mbackchain -msoft-float $(cflags-y)
+KBUILD_CFLAGS	+= -mpacked-stack -mbackchain -msoft-float $(cflags-y)
 KBUILD_CFLAGS	+= -pipe -Wno-sign-compare
 KBUILD_CFLAGS	+= -fno-asynchronous-unwind-tables $(cfi)
 KBUILD_AFLAGS	+= $(aflags-y) $(cfi)
@@ -126,16 +118,7 @@ export KBUILD_CFLAGS_DECOMPRESSOR
 
 OBJCOPYFLAGS	:= -O binary
 
-head-y		:= arch/s390/kernel/head64.o
-
-# See arch/s390/Kbuild for content of core part of the kernel
-core-y		+= arch/s390/
-
 libs-y		+= arch/s390/lib/
-drivers-y	+= drivers/s390/
-
-# must be linked after kernel
-drivers-$(CONFIG_OPROFILE)	+= arch/s390/oprofile/
 
 boot		:= arch/s390/boot
 syscalls	:= arch/s390/kernel/syscalls
@@ -146,8 +129,8 @@ all: bzImage
 #KBUILD_IMAGE is necessary for packaging targets like rpm-pkg, deb-pkg...
 KBUILD_IMAGE	:= $(boot)/bzImage
 
-install: vmlinux
-	$(Q)$(MAKE) $(build)=$(boot) $@
+install:
+	$(call cmd,install)
 
 bzImage: vmlinux
	$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
@@ -155,19 +138,34 @@ bzImage: vmlinux
 zfcpdump:
	$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
 
-vdso_install:
-	$(Q)$(MAKE) $(build)=arch/$(ARCH)/kernel/vdso64 $@
-
-archclean:
-	$(Q)$(MAKE) $(clean)=$(boot)
-	$(Q)$(MAKE) $(clean)=$(tools)
-
 archheaders:
	$(Q)$(MAKE) $(build)=$(syscalls) uapi
 
 archprepare:
	$(Q)$(MAKE) $(build)=$(syscalls) kapi
	$(Q)$(MAKE) $(build)=$(tools) kapi
 
+ifeq ($(KBUILD_EXTMOD),)
+# We need to generate vdso-offsets.h before compiling certain files in kernel/.
+# In order to do that, we should use the archprepare target, but we can't since
+# asm-offsets.h is included in some files used to generate vdso-offsets.h, and
+# asm-offsets.h is built in prepare0, for which archprepare is a dependency.
+# Therefore we need to generate the header after prepare0 has been made, hence
+# this hack.
+prepare: vdso_prepare
+vdso_prepare: prepare0
+	$(Q)$(MAKE) $(build)=arch/s390/kernel/vdso64 include/generated/vdso64-offsets.h
+	$(if $(CONFIG_COMPAT),$(Q)$(MAKE) \
+		$(build)=arch/s390/kernel/vdso32 include/generated/vdso32-offsets.h)
+
+vdso-install-y += arch/s390/kernel/vdso64/vdso64.so.dbg
+vdso-install-$(CONFIG_COMPAT) += arch/s390/kernel/vdso32/vdso32.so.dbg
+
+ifdef CONFIG_EXPOLINE_EXTERN
+modules_prepare: expoline_prepare
+expoline_prepare: scripts
+	$(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o
+endif
+endif
 
 # Don't use tabs in echo arguments
 define archhelp
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index aa738cad1338..c2978cb03b36 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -26,12 +26,10 @@
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/workqueue.h>
-#include <linux/suspend.h>
-#include <linux/platform_device.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
 #include <asm/appldata.h>
 #include <asm/vtimer.h>
-#include <linux/uaccess.h>
-#include <asm/io.h>
 #include <asm/smp.h>
 
 #include "appldata.h"
@@ -44,17 +42,14 @@
 
 #define TOD_MICRO	0x01000		/* nr. of TOD clock units for 1 microsecond */
 
-static struct platform_device *appldata_pdev;
-
 /*
  * /proc entries (sysctl)
  */
 static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata";
 static int appldata_timer_handler(struct ctl_table *ctl, int write,
-				  void __user *buffer, size_t *lenp, loff_t *ppos);
+				  void *buffer, size_t *lenp, loff_t *ppos);
 static int appldata_interval_handler(struct ctl_table *ctl, int write,
-				     void __user *buffer,
-				     size_t *lenp, loff_t *ppos);
+				     void *buffer, size_t *lenp, loff_t *ppos);
 
 static struct ctl_table_header *appldata_sysctl_header;
 static struct ctl_table appldata_table[] = {
@@ -68,17 +63,6 @@ static struct ctl_table appldata_table[] = {
 		.mode		= S_IRUGO | S_IWUSR,
 		.proc_handler	= appldata_interval_handler,
 	},
-	{ },
-};
-
-static struct ctl_table appldata_dir_table[] = {
-	{
-		.procname	= appldata_proc_name,
-		.maxlen		= 0,
-		.mode		= S_IRUGO | S_IXUGO,
-		.child		= appldata_table,
-	},
-	{ },
 };
 
 /*
@@ -89,7 +73,6 @@ static struct vtimer_list appldata_timer;
 static DEFINE_SPINLOCK(appldata_timer_lock);
 static int appldata_interval = APPLDATA_CPU_INTERVAL;
 static int appldata_timer_active;
-static int appldata_timer_suspended = 0;
 
 /*
  * Work queue
@@ -217,7 +200,7 @@ static void __appldata_vtimer_setup(int cmd)
  */
 static int
 appldata_timer_handler(struct ctl_table *ctl, int write,
-		       void __user *buffer, size_t *lenp, loff_t *ppos)
+		       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int timer_active = appldata_timer_active;
 	int rc;
@@ -250,7 +233,7 @@ appldata_timer_handler(struct ctl_table *ctl, int write,
  */
 static int
 appldata_interval_handler(struct ctl_table *ctl, int write,
-			  void __user *buffer, size_t *lenp, loff_t *ppos)
+			  void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int interval = appldata_interval;
 	int rc;
@@ -280,7 +263,7 @@ appldata_interval_handler(struct ctl_table *ctl, int write,
  */
 static int
 appldata_generic_handler(struct ctl_table *ctl, int write,
-			 void __user *buffer, size_t *lenp, loff_t *ppos)
+			 void *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct appldata_ops *ops = NULL, *tmp_ops;
 	struct list_head *lh;
@@ -297,7 +280,7 @@ appldata_generic_handler(struct ctl_table *ctl, int write,
 	mutex_lock(&appldata_ops_mutex);
 	list_for_each(lh, &appldata_ops_list) {
 		tmp_ops = list_entry(lh, struct appldata_ops, list);
-		if (&tmp_ops->ctl_table[2] == ctl) {
+		if (&tmp_ops->ctl_table[0] == ctl) {
 			found = 1;
 		}
 	}
@@ -367,7 +350,7 @@ int appldata_register_ops(struct appldata_ops *ops)
 	if (ops->size > APPLDATA_MAX_REC_SIZE)
 		return -EINVAL;
 
-	ops->ctl_table = kcalloc(4, sizeof(struct ctl_table), GFP_KERNEL);
+	ops->ctl_table = kcalloc(1, sizeof(struct ctl_table), GFP_KERNEL);
 	if (!ops->ctl_table)
 		return -ENOMEM;
 
@@ -375,17 +358,12 @@ int appldata_register_ops(struct appldata_ops *ops)
 	list_add(&ops->list, &appldata_ops_list);
 	mutex_unlock(&appldata_ops_mutex);
 
-	ops->ctl_table[0].procname = appldata_proc_name;
-	ops->ctl_table[0].maxlen   = 0;
-	ops->ctl_table[0].mode     = S_IRUGO | S_IXUGO;
-	ops->ctl_table[0].child    = &ops->ctl_table[2];
-
-	ops->ctl_table[2].procname = ops->name;
-	ops->ctl_table[2].mode     = S_IRUGO | S_IWUSR;
-	ops->ctl_table[2].proc_handler = appldata_generic_handler;
-	ops->ctl_table[2].data = ops;
+	ops->ctl_table[0].procname = ops->name;
+	ops->ctl_table[0].mode     = S_IRUGO | S_IWUSR;
+	ops->ctl_table[0].proc_handler = appldata_generic_handler;
+	ops->ctl_table[0].data = ops;
 
-	ops->sysctl_header = register_sysctl_table(ops->ctl_table);
+	ops->sysctl_header = register_sysctl_sz(appldata_proc_name, ops->ctl_table, 1);
 	if (!ops->sysctl_header)
 		goto out;
 	return 0;
@@ -413,88 +391,6 @@ void appldata_unregister_ops(struct appldata_ops *ops)
 /********************** module-ops management <END> **************************/
 
 
-/**************************** suspend / resume *******************************/
-static int appldata_freeze(struct device *dev)
-{
-	struct appldata_ops *ops;
-	int rc;
-	struct list_head *lh;
-
-	spin_lock(&appldata_timer_lock);
-	if (appldata_timer_active) {
-		__appldata_vtimer_setup(APPLDATA_DEL_TIMER);
-		appldata_timer_suspended = 1;
-	}
-	spin_unlock(&appldata_timer_lock);
-
-	mutex_lock(&appldata_ops_mutex);
-	list_for_each(lh, &appldata_ops_list) {
-		ops = list_entry(lh, struct appldata_ops, list);
-		if (ops->active == 1) {
-			rc = appldata_diag(ops->record_nr, APPLDATA_STOP_REC,
-					(unsigned long) ops->data, ops->size,
-					ops->mod_lvl);
-			if (rc != 0)
-				pr_err("Stopping the data collection for %s "
-				       "failed with rc=%d\n", ops->name, rc);
-		}
-	}
-	mutex_unlock(&appldata_ops_mutex);
-	return 0;
-}
-
-static int appldata_restore(struct device *dev)
-{
-	struct appldata_ops *ops;
-	int rc;
-	struct list_head *lh;
-
-	spin_lock(&appldata_timer_lock);
-	if (appldata_timer_suspended) {
-		__appldata_vtimer_setup(APPLDATA_ADD_TIMER);
-		appldata_timer_suspended = 0;
-	}
-	spin_unlock(&appldata_timer_lock);
-
-	mutex_lock(&appldata_ops_mutex);
-	list_for_each(lh, &appldata_ops_list) {
-		ops = list_entry(lh, struct appldata_ops, list);
-		if (ops->active == 1) {
-			ops->callback(ops->data);	// init record
-			rc = appldata_diag(ops->record_nr,
-					APPLDATA_START_INTERVAL_REC,
-					(unsigned long) ops->data, ops->size,
-					ops->mod_lvl);
-			if (rc != 0) {
-				pr_err("Starting the data collection for %s "
-				       "failed with rc=%d\n", ops->name, rc);
-			}
-		}
-	}
-	mutex_unlock(&appldata_ops_mutex);
-	return 0;
-}
-
-static int appldata_thaw(struct device *dev)
-{
-	return appldata_restore(dev);
-}
-
-static const struct dev_pm_ops appldata_pm_ops = {
-	.freeze		= appldata_freeze,
-	.thaw		= appldata_thaw,
-	.restore	= appldata_restore,
-};
-
-static struct platform_driver appldata_pdrv = {
-	.driver = {
-		.name	= "appldata",
-		.pm	= &appldata_pm_ops,
-	},
-};
-/************************* suspend / resume <END> ****************************/
-
-
 /******************************* init / exit *********************************/
 
 /*
@@ -504,36 +400,14 @@ static struct platform_driver appldata_pdrv = {
  */
 static int __init appldata_init(void)
 {
-	int rc;
-
 	init_virt_timer(&appldata_timer);
 	appldata_timer.function = appldata_timer_function;
 	appldata_timer.data = (unsigned long) &appldata_work;
-
-	rc = platform_driver_register(&appldata_pdrv);
-	if (rc)
-		return rc;
-
-	appldata_pdev = platform_device_register_simple("appldata", -1, NULL,
-							0);
-	if (IS_ERR(appldata_pdev)) {
-		rc = PTR_ERR(appldata_pdev);
-		goto out_driver;
-	}
 	appldata_wq = alloc_ordered_workqueue("appldata", 0);
-	if (!appldata_wq) {
-		rc = -ENOMEM;
-		goto out_device;
-	}
-
-	appldata_sysctl_header = register_sysctl_table(appldata_dir_table);
+	if (!appldata_wq)
+		return -ENOMEM;
+	appldata_sysctl_header = register_sysctl(appldata_proc_name, appldata_table);
 	return 0;
-
-out_device:
-	platform_device_unregister(appldata_pdev);
-out_driver:
-	platform_driver_unregister(&appldata_pdrv);
-	return rc;
 }
 __initcall(appldata_init);
 
diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index e68136c3c23a..fc608f9b79ab 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c
@@ -15,7 +15,7 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 #include "appldata.h"
 
@@ -29,10 +29,6 @@
 * the structure version (product ID, see appldata_base.c) needs to be changed
 * as well and all documentation and z/VM applications using it must be
 * updated.
- *
- * The record layout is documented in the Linux for zSeries Device Drivers
- * book:
- * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml
 */
 struct appldata_mem_data {
	u64 timestamp;
diff --git a/arch/s390/appldata/appldata_net_sum.c b/arch/s390/appldata/appldata_net_sum.c
index 8bc14b0d1def..59c282ca002f 100644
--- a/arch/s390/appldata/appldata_net_sum.c
+++ b/arch/s390/appldata/appldata_net_sum.c
@@ -25,10 +25,6 @@
 * This is accessed as binary data by z/VM. If changes to it can't be avoided,
 * the structure version (product ID, see appldata_base.c) needs to be changed
 * as well and all documentation and z/VM applications using it must be updated.
- *
- * The record layout is documented in the Linux for zSeries Device Drivers
- * book:
- * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml
 */
 struct appldata_net_sum_data {
	u64 timestamp;
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index 54f375627532..a363d30ce739 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -32,10 +32,6 @@
 * the structure version (product ID, see appldata_base.c) needs to be changed
 * as well and all documentation and z/VM applications using it must be
 * updated.
- *
- * The record layout is documented in the Linux for zSeries Device Drivers
- * book:
- * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml
 */
 struct appldata_os_per_cpu {
	u32 per_cpu_user;	/* timer ticks spent in user mode */
@@ -75,7 +71,7 @@ struct appldata_os_data {
					     (waiting for I/O)       */
 
	/* per cpu data */
-	struct appldata_os_per_cpu os_cpu[0];
+	struct appldata_os_per_cpu os_cpu[];
 } __attribute__((packed));
 
 static struct appldata_os_data *appldata_os_data;
@@ -133,8 +129,7 @@ static void appldata_get_os_data(void *data)
 
	os_data->nr_cpus = j;
 
-	new_size = sizeof(struct appldata_os_data) +
-		   (os_data->nr_cpus * sizeof(struct appldata_os_per_cpu));
+	new_size = struct_size(os_data, os_cpu, os_data->nr_cpus);
	if (ops.size != new_size) {
		if (ops.active) {
			rc = appldata_diag(APPLDATA_RECORD_OS_ID,
@@ -169,8 +164,7 @@ static int __init appldata_os_init(void)
 {
	int rc, max_size;
 
-	max_size = sizeof(struct appldata_os_data) +
-		   (num_possible_cpus() * sizeof(struct appldata_os_per_cpu));
+	max_size = struct_size(appldata_os_data, os_cpu, num_possible_cpus());
	if (max_size > APPLDATA_MAX_REC_SIZE) {
		pr_err("Maximum OS record size %i exceeds the maximum "
		       "record size %i\n", max_size, APPLDATA_MAX_REC_SIZE);
diff --git a/arch/s390/boot/.gitignore b/arch/s390/boot/.gitignore
index 16ff906e4610..f56591bc0897 100644
--- a/arch/s390/boot/.gitignore
+++ b/arch/s390/boot/.gitignore
@@ -1,3 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
 image
 bzImage
 section_cmp.*
+vmlinux
+vmlinux.lds
+vmlinux.syms
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index e2c47d3a1c89..c7c81e5f9218 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -7,6 +7,7 @@ KCOV_INSTRUMENT := n
 GCOV_PROFILE := n
 UBSAN_SANITIZE := n
 KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
 
 KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR)
 KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR)
@@ -34,16 +35,24 @@ endif
 
 CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
 
-obj-y	:= head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o
+obj-y	:= head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o
 obj-y	+= string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
-obj-y	+= version.o pgm_check_info.o ctype.o text_dma.o
-obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST)	+= uv.o
-obj-$(CONFIG_RELOCATABLE)	+= machine_kexec_reloc.o
+obj-y	+= version.o pgm_check_info.o ctype.o ipl_data.o machine_kexec_reloc.o
+obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE))	+= uv.o
 obj-$(CONFIG_RANDOMIZE_BASE)	+= kaslr.o
-targets	:= bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y)
-subdir-	:= compressed
+obj-y	+= $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o
+obj-$(CONFIG_KERNEL_ZSTD)	+= clz_ctz.o
+obj-all	:= $(obj-y) piggy.o syms.o
+
+targets	:= bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y)
+targets	+= vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2
+targets	+= vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4
+targets	+= vmlinux.bin.zst info.bin syms.bin vmlinux.syms $(obj-all)
 
 OBJECTS := $(addprefix $(obj)/,$(obj-y))
+OBJECTS_ALL := $(addprefix $(obj)/,$(obj-all))
+
+clean-files += vmlinux.map
 
 quiet_cmd_section_cmp = SECTCMP $*
 define cmd_section_cmp
@@ -58,22 +67,67 @@ define cmd_section_cmp
	touch $@
 endef
 
-$(obj)/bzImage: $(obj)/compressed/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE
+$(obj)/bzImage: $(obj)/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE
	$(call if_changed,objcopy)
 
-$(obj)/section_cmp%: vmlinux $(obj)/compressed/vmlinux FORCE
+$(obj)/section_cmp%: vmlinux $(obj)/vmlinux FORCE
	$(call if_changed,section_cmp)
 
-$(obj)/compressed/vmlinux: $(obj)/startup.a FORCE
-	$(Q)$(MAKE) $(build)=$(obj)/compressed $@
+LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup $(if $(CONFIG_VMLINUX_MAP),-Map=$(obj)/vmlinux.map) --build-id=sha1 -T
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS_ALL) FORCE
+	$(call if_changed,ld)
+
+LDFLAGS_vmlinux.syms := --oformat $(LD_BFD) -e startup -T
+$(obj)/vmlinux.syms: $(obj)/vmlinux.lds $(OBJECTS) FORCE
+	$(call if_changed,ld)
+
+quiet_cmd_dumpsyms = DUMPSYMS $<
+define cmd_dumpsyms
+	$(NM) -n -S --format=bsd "$<" | sed -nE 's/^0*([0-9a-fA-F]+) 0*([0-9a-fA-F]+) [tT] ([^ ]*)$$/\1 \2 \3/p' | tr '\n' '\0' > "$@"
+endef
+
+$(obj)/syms.bin: $(obj)/vmlinux.syms FORCE
+	$(call if_changed,dumpsyms)
 
-$(obj)/startup.a: $(OBJECTS) FORCE
-	$(call if_changed,ar)
+OBJCOPYFLAGS_syms.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.decompressor.syms
+$(obj)/syms.o: $(obj)/syms.bin FORCE
+	$(call if_changed,objcopy)
 
-install: $(CONFIGURE) $(obj)/bzImage
-	sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
-	      System.map "$(INSTALL_PATH)"
+OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=load
+$(obj)/info.bin: vmlinux FORCE
+	$(call if_changed,objcopy)
 
-chkbss := $(obj-y)
-chkbss-target := startup.a
-include $(srctree)/arch/s390/scripts/Makefile.chkbss
+OBJCOPYFLAGS_info.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.info
+$(obj)/info.o: $(obj)/info.bin FORCE
+	$(call if_changed,objcopy)
+
+OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section=.vmlinux.info -S
+$(obj)/vmlinux.bin: vmlinux FORCE
+	$(call if_changed,objcopy)
+
+suffix-$(CONFIG_KERNEL_GZIP)  := .gz
+suffix-$(CONFIG_KERNEL_BZIP2) := .bz2
+suffix-$(CONFIG_KERNEL_LZ4)   := .lz4
+suffix-$(CONFIG_KERNEL_LZMA)  := .lzma
+suffix-$(CONFIG_KERNEL_LZO)   := .lzo
+suffix-$(CONFIG_KERNEL_XZ)    := .xz
+suffix-$(CONFIG_KERNEL_ZSTD)  := .zst
+
+$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,gzip)
+$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,bzip2_with_size)
+$(obj)/vmlinux.bin.lz4: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,lz4_with_size)
+$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,lzma_with_size)
+$(obj)/vmlinux.bin.lzo: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,lzo_with_size)
+$(obj)/vmlinux.bin.xz: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,xzkern_with_size)
+$(obj)/vmlinux.bin.zst: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,zstd22_with_size)
+
+OBJCOPYFLAGS_piggy.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.bin.compressed
+$(obj)/piggy.o: $(obj)/vmlinux.bin$(suffix-y) FORCE
+	$(call if_changed,objcopy)
diff --git a/arch/s390/boot/als.c b/arch/s390/boot/als.c
index ff6801d401c4..47c48fbfb563 100644
--- a/arch/s390/boot/als.c
+++ b/arch/s390/boot/als.c
@@ -68,7 +68,7 @@ void print_missing_facilities(void)
 
	first = 1;
	for (i = 0; i < ARRAY_SIZE(als); i++) {
-		val = ~S390_lowcore.stfle_fac_list[i] & als[i];
+		val = ~stfle_fac_list[i] & als[i];
		for (j = 0; j < BITS_PER_LONG; j++) {
			if (!(val & (1UL << (BITS_PER_LONG - 1 - j))))
				continue;
@@ -106,9 +106,9 @@ void verify_facilities(void)
 {
	int i;
 
-	__stfle(S390_lowcore.stfle_fac_list, ARRAY_SIZE(S390_lowcore.stfle_fac_list));
+	__stfle(stfle_fac_list, ARRAY_SIZE(stfle_fac_list));
	for (i = 0; i < ARRAY_SIZE(als); i++) {
-		if ((S390_lowcore.stfle_fac_list[i] & als[i]) != als[i])
+		if ((stfle_fac_list[i] & als[i]) != als[i])
			facility_mismatch();
	}
 }
diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h
index 2ea603f70c3b..222c6886acf6 100644
--- a/arch/s390/boot/boot.h
+++ b/arch/s390/boot/boot.h
@@ -2,20 +2,101 @@
 #ifndef BOOT_BOOT_H
 #define BOOT_BOOT_H
 
+#include <linux/types.h>
+
+#define IPL_START	0x200
+
+#ifndef __ASSEMBLY__
+
+#include <asm/physmem_info.h>
+
+struct machine_info {
+	unsigned char has_edat1 : 1;
+	unsigned char has_edat2 : 1;
+	unsigned char has_nx : 1;
+};
+
+struct vmlinux_info {
+	unsigned long default_lma;
+	unsigned long entry;
+	unsigned long image_size;	/* does not include .bss */
+	unsigned long bss_size;		/* uncompressed image .bss size */
+	unsigned long bootdata_off;
+	unsigned long bootdata_size;
+	unsigned long bootdata_preserved_off;
+	unsigned long bootdata_preserved_size;
+	unsigned long dynsym_start;
+	unsigned long rela_dyn_start;
+	unsigned long rela_dyn_end;
+	unsigned long amode31_size;
+	unsigned long init_mm_off;
+	unsigned long swapper_pg_dir_off;
+	unsigned long invalid_pg_dir_off;
+#ifdef CONFIG_KASAN
+	unsigned long kasan_early_shadow_page_off;
+	unsigned long kasan_early_shadow_pte_off;
+	unsigned long kasan_early_shadow_pmd_off;
+	unsigned long kasan_early_shadow_pud_off;
+	unsigned long kasan_early_shadow_p4d_off;
+#endif
+};
+
 void startup_kernel(void);
-void detect_memory(void);
+unsigned long detect_max_physmem_end(void);
+void detect_physmem_online_ranges(unsigned long max_physmem_end);
+void physmem_set_usable_limit(unsigned long limit);
+void physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size);
+void physmem_free(enum reserved_range_type type);
+/* for continuous/multiple allocations per type */
+unsigned long physmem_alloc_top_down(enum reserved_range_type type, unsigned long size,
+				     unsigned long align);
+/* for single allocations, 1 per type */
+unsigned long physmem_alloc_range(enum reserved_range_type type, unsigned long size,
+				  unsigned long align, unsigned long min, unsigned long max,
+				  bool die_on_oom);
+unsigned long get_physmem_alloc_pos(void);
+bool ipl_report_certs_intersects(unsigned long addr, unsigned long size,
+				 unsigned long *intersection_start);
+bool is_ipl_block_dump(void);
 void store_ipl_parmblock(void);
+int read_ipl_report(void);
+void save_ipl_cert_comp_list(void);
 void setup_boot_command_line(void);
 void parse_boot_command_line(void);
-void setup_memory_end(void);
 void verify_facilities(void);
 void print_missing_facilities(void);
+void sclp_early_setup_buffer(void);
 void print_pgm_check_info(void);
-unsigned long get_random_base(unsigned long safe_addr);
+unsigned long randomize_within_range(unsigned long size, unsigned long align,
+				     unsigned long min, unsigned long max);
+void setup_vmem(unsigned long asce_limit);
+void __printf(1, 2) decompressor_printk(const char *fmt, ...);
+void print_stacktrace(unsigned long sp);
+void error(char *m);
+
+extern struct machine_info machine;
 
-extern int kaslr_enabled;
+/* Symbols defined by linker scripts */
 extern const char kernel_version[];
+extern unsigned long memory_limit;
+extern unsigned long vmalloc_size;
+extern int vmalloc_size_set;
+extern char __boot_data_start[], __boot_data_end[];
+extern char __boot_data_preserved_start[], __boot_data_preserved_end[];
+extern char _decompressor_syms_start[], _decompressor_syms_end[];
+extern char _stack_start[], _stack_end[];
+extern char _end[], _decompressor_end[];
+extern unsigned char _compressed_start[];
+extern unsigned char _compressed_end[];
+extern struct vmlinux_info _vmlinux_info;
+#define vmlinux _vmlinux_info
 
-unsigned long read_ipl_report(unsigned long safe_offset);
+#define __abs_lowcore_pa(x)	(((unsigned long)(x) - __abs_lowcore) % sizeof(struct lowcore))
+
+static inline bool intersects(unsigned long addr0, unsigned long size0,
+			      unsigned long addr1, unsigned long size1)
+{
+	return addr0 + size0 > addr1 && addr1 + size1 > addr0;
+}
+#endif /* __ASSEMBLY__ */
 #endif /* BOOT_BOOT_H */
diff --git a/arch/s390/boot/clz_ctz.c b/arch/s390/boot/clz_ctz.c
new file mode 100644
index 000000000000..c3ebf248596b
--- /dev/null
+++ b/arch/s390/boot/clz_ctz.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../../../../lib/clz_ctz.c"
diff --git a/arch/s390/boot/compressed/.gitignore b/arch/s390/boot/compressed/.gitignore
deleted file mode 100644
index e72fcd7ecebb..000000000000
--- a/arch/s390/boot/compressed/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-vmlinux
-vmlinux.lds
diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile
deleted file mode 100644
index fa529c5b4486..000000000000
--- a/arch/s390/boot/compressed/Makefile
+++ /dev/null
@@ -1,68 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# linux/arch/s390/boot/compressed/Makefile
-#
-# create a compressed vmlinux image from the original vmlinux
-#
-
-KCOV_INSTRUMENT := n
-GCOV_PROFILE := n
-UBSAN_SANITIZE := n
-KASAN_SANITIZE := n
-
-obj-y := $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) piggy.o info.o
-targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2
-targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4
-targets += info.bin $(obj-y)
-
-KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR)
-KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR)
-OBJCOPYFLAGS :=
-
-OBJECTS := $(addprefix $(obj)/,$(obj-y))
-
-LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup -T
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(objtree)/arch/s390/boot/startup.a $(OBJECTS) FORCE
-	$(call if_changed,ld)
-
-OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=load
-$(obj)/info.bin: vmlinux FORCE
-	$(call if_changed,objcopy)
-
-OBJCOPYFLAGS_info.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.info
-$(obj)/info.o: $(obj)/info.bin FORCE
-	$(call if_changed,objcopy)
-
-OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section=.vmlinux.info -S
-$(obj)/vmlinux.bin: vmlinux FORCE
-	$(call if_changed,objcopy)
-
-vmlinux.bin.all-y := $(obj)/vmlinux.bin
-
-suffix-$(CONFIG_KERNEL_GZIP)  := .gz
-suffix-$(CONFIG_KERNEL_BZIP2) := .bz2
-suffix-$(CONFIG_KERNEL_LZ4)   := .lz4
-suffix-$(CONFIG_KERNEL_LZMA)  := .lzma
-suffix-$(CONFIG_KERNEL_LZO)   := .lzo
-suffix-$(CONFIG_KERNEL_XZ)    := .xz
-
-$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
-	$(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
-	$(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
-	$(call if_changed,lz4)
-$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
-	$(call if_changed,lzma)
-$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
-	$(call if_changed,lzo)
-$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
-	$(call if_changed,xzkern)
-
-OBJCOPYFLAGS_piggy.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.bin.compressed
-$(obj)/piggy.o: $(obj)/vmlinux.bin$(suffix-y) FORCE - $(call if_changed,objcopy) - -chkbss := $(filter-out piggy.o info.o, $(obj-y)) -chkbss-target := vmlinux.bin -include $(srctree)/arch/s390/scripts/Makefile.chkbss diff --git a/arch/s390/boot/compressed/decompressor.h b/arch/s390/boot/compressed/decompressor.h deleted file mode 100644 index c15eb7114d83..000000000000 --- a/arch/s390/boot/compressed/decompressor.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef BOOT_COMPRESSED_DECOMPRESSOR_H -#define BOOT_COMPRESSED_DECOMPRESSOR_H - -#ifdef CONFIG_KERNEL_UNCOMPRESSED -static inline void *decompress_kernel(void) {} -#else -void *decompress_kernel(void); -#endif -unsigned long mem_safe_offset(void); -void error(char *m); - -struct vmlinux_info { - unsigned long default_lma; - void (*entry)(void); - unsigned long image_size; /* does not include .bss */ - unsigned long bss_size; /* uncompressed image .bss size */ - unsigned long bootdata_off; - unsigned long bootdata_size; - unsigned long bootdata_preserved_off; - unsigned long bootdata_preserved_size; - unsigned long dynsym_start; - unsigned long rela_dyn_start; - unsigned long rela_dyn_end; -}; - -extern char _vmlinux_info[]; -#define vmlinux (*(struct vmlinux_info *)_vmlinux_info) - -#endif /* BOOT_COMPRESSED_DECOMPRESSOR_H */ diff --git a/arch/s390/boot/compressed/decompressor.c b/arch/s390/boot/decompressor.c index 45046630c56a..d762733a0753 100644 --- a/arch/s390/boot/compressed/decompressor.c +++ b/arch/s390/boot/decompressor.c @@ -11,12 +11,12 @@ #include <linux/string.h> #include <asm/page.h> #include "decompressor.h" +#include "boot.h" /* * gzip declarations */ #define STATIC static -#define STATIC_RW_DATA static __section(.data) #undef memset #undef memcpy @@ -24,19 +24,16 @@ #define memmove memmove #define memzero(s, n) memset((s), 0, (n)) -/* Symbols defined by linker scripts */ -extern char _end[]; -extern unsigned char _compressed_start[]; -extern unsigned char _compressed_end[]; - -#ifdef CONFIG_HAVE_KERNEL_BZIP2 -#define HEAP_SIZE 0x400000 +#if defined(CONFIG_KERNEL_BZIP2) +#define BOOT_HEAP_SIZE 0x400000 +#elif defined(CONFIG_KERNEL_ZSTD) +#define BOOT_HEAP_SIZE 0x30000 #else -#define HEAP_SIZE 0x10000 +#define BOOT_HEAP_SIZE 0x10000 #endif static unsigned long free_mem_ptr = (unsigned long) _end; -static unsigned long free_mem_end_ptr = (unsigned long) _end + HEAP_SIZE; +static unsigned long free_mem_end_ptr = (unsigned long) _end + BOOT_HEAP_SIZE; #ifdef CONFIG_KERNEL_GZIP #include "../../../../lib/decompress_inflate.c" @@ -62,7 +59,11 @@ static unsigned long free_mem_end_ptr = (unsigned long) _end + HEAP_SIZE; #include "../../../../lib/decompress_unxz.c" #endif -#define decompress_offset ALIGN((unsigned long)_end + HEAP_SIZE, PAGE_SIZE) +#ifdef CONFIG_KERNEL_ZSTD +#include "../../../../lib/decompress_unzstd.c" +#endif + +#define decompress_offset ALIGN((unsigned long)_end + BOOT_HEAP_SIZE, PAGE_SIZE) unsigned long mem_safe_offset(void) { @@ -80,6 +81,6 @@ void *decompress_kernel(void) void *output = (void *)decompress_offset; __decompress(_compressed_start, _compressed_end - _compressed_start, - NULL, NULL, output, 0, NULL, error); + NULL, NULL, output, vmlinux.image_size, NULL, error); return output; } diff --git a/arch/s390/boot/decompressor.h b/arch/s390/boot/decompressor.h new file mode 100644 index 000000000000..92b81d2ea35d --- /dev/null +++ b/arch/s390/boot/decompressor.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_COMPRESSED_DECOMPRESSOR_H +#define 
BOOT_COMPRESSED_DECOMPRESSOR_H + +#ifdef CONFIG_KERNEL_UNCOMPRESSED +static inline void *decompress_kernel(void) { return NULL; } +#else +void *decompress_kernel(void); +#endif +unsigned long mem_safe_offset(void); + +#endif /* BOOT_COMPRESSED_DECOMPRESSOR_H */ diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index 4b86a8d3c121..637c29c3f6e3 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -5,7 +5,6 @@ * Author(s): Hartmut Penner <hp@de.ibm.com> * Martin Schwidefsky <schwidefsky@de.ibm.com> * Rob van der Heij <rvdhei@iae.nl> - * Heiko Carstens <heiko.carstens@de.ibm.com> * * There are 5 different IPL methods * 1) load the image directly into ram at address 0 and do an PSW restart @@ -25,259 +24,201 @@ #include <linux/init.h> #include <linux/linkage.h> #include <asm/asm-offsets.h> -#include <asm/thread_info.h> #include <asm/page.h> #include <asm/ptrace.h> +#include <asm/sclp.h> +#include "boot.h" -#define ARCH_OFFSET 4 +#define EP_OFFSET 0x10008 +#define EP_STRING "S390EP" +#define IPL_BS 0x730 __HEAD - -#define IPL_BS 0x730 - .org 0 - .long 0x00080000,0x80000000+iplstart # The first 24 bytes are loaded - .long 0x02000018,0x60000050 # by ipl to addresses 0-23. - .long 0x02000068,0x60000050 # (a PSW and two CCWs). - .fill 80-24,1,0x40 # bytes 24-79 are discarded !! - .long 0x020000f0,0x60000050 # The next 160 byte are loaded - .long 0x02000140,0x60000050 # to addresses 0x18-0xb7 - .long 0x02000190,0x60000050 # They form the continuation - .long 0x020001e0,0x60000050 # of the CCW program started - .long 0x02000230,0x60000050 # by ipl and load the range - .long 0x02000280,0x60000050 # 0x0f0-0x730 from the image - .long 0x020002d0,0x60000050 # to the range 0x0f0-0x730 - .long 0x02000320,0x60000050 # in memory. At the end of - .long 0x02000370,0x60000050 # the channel program the PSW - .long 0x020003c0,0x60000050 # at location 0 is loaded. - .long 0x02000410,0x60000050 # Initial processing starts - .long 0x02000460,0x60000050 # at 0x200 = iplstart. - .long 0x020004b0,0x60000050 - .long 0x02000500,0x60000050 - .long 0x02000550,0x60000050 - .long 0x020005a0,0x60000050 - .long 0x020005f0,0x60000050 - .long 0x02000640,0x60000050 - .long 0x02000690,0x60000050 - .long 0x020006e0,0x20000050 - - .org __LC_RST_NEW_PSW # 0x1a0 - .quad 0,iplstart - .org __LC_PGM_NEW_PSW # 0x1d0 - .quad 0x0000000180000000,startup_pgm_check_handler - - .org 0x200 - +ipl_start: + mvi __LC_AR_MODE_ID,1 # set esame flag + slr %r0,%r0 # set cpuid to zero + lhi %r1,2 # mode 2 = esame (dump) + sigp %r1,%r0,0x12 # switch to esame mode + sam64 # switch to 64 bit addressing mode + lgh %r1,__LC_SUBCHANNEL_ID # test if subchannel number + brctg %r1,.Lnoload # is valid + llgf %r1,__LC_SUBCHANNEL_ID # load ipl subchannel number + lghi %r2,IPL_BS # load start address + bras %r14,.Lloader # load rest of ipl image + larl %r12,parmarea # pointer to parameter area + stg %r1,IPL_DEVICE-PARMAREA(%r12) # save ipl device number +# +# load parameter file from ipl device +# +.Lagain1: + larl %r2,_end # ramdisk loc. is temp + bras %r14,.Lloader # load parameter file + ltgr %r2,%r2 # got anything ? 
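
The parameter-file path continued just below clamps the loaded length against the max_command_line_size field of the parameter area (minus one, so a terminating zero always fits) before copying the text into the command-line buffer. A hedged C rendering of that clamp; the function and variable names here are illustrative, not from the patch:

#include <stdio.h>
#include <string.h>

/* Illustrative C equivalent of the truncation done in ipl_start:
 * never copy more than max_command_line_size - 1 bytes, so the
 * command line always has room for a trailing NUL byte. */
static void copy_boot_command_line(char *dst, unsigned long max_command_line_size,
				   const char *parm_file, unsigned long len)
{
	if (len > max_command_line_size - 1)
		len = max_command_line_size - 1;
	memcpy(dst, parm_file, len);
	dst[len] = '\0';
}

int main(void)
{
	const char *parm = "root=/dev/ram0 ro quiet";
	char cmdline[16];

	copy_boot_command_line(cmdline, sizeof(cmdline), parm, strlen(parm));
	printf("%s\n", cmdline);	/* truncated to 15 characters plus NUL */
	return 0;
}
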
+ jz .Lnopf + lg %r3,MAX_COMMAND_LINE_SIZE-PARMAREA(%r12) + aghi %r3,-1 + clgr %r2,%r3 + jl .Lnotrunc + lgr %r2,%r3 +.Lnotrunc: + larl %r4,_end + larl %r13,.L_hdr + clc 0(3,%r4),0(%r13) # if it is HDRx + jz .Lagain1 # skip dataset header + larl %r13,.L_eof + clc 0(3,%r4),0(%r13) # if it is EOFx + jz .Lagain1 # skip data set trailer + lgr %r5,%r2 + la %r6,COMMAND_LINE-PARMAREA(%r12) + lgr %r7,%r2 + aghi %r7,1 + mvcl %r6,%r4 +.Lnopf: +# +# load ramdisk from ipl device +# +.Lagain2: + larl %r2,_end # addr of ramdisk + stg %r2,INITRD_START-PARMAREA(%r12) + bras %r14,.Lloader # load ramdisk + stg %r2,INITRD_SIZE-PARMAREA(%r12) # store size of rd + ltgr %r2,%r2 + jnz .Lrdcont + stg %r2,INITRD_START-PARMAREA(%r12) # no ramdisk found +.Lrdcont: + larl %r2,_end + larl %r13,.L_hdr # skip HDRx and EOFx + clc 0(3,%r2),0(%r13) + jz .Lagain2 + larl %r13,.L_eof + clc 0(3,%r2),0(%r13) + jz .Lagain2 +# +# reset files in VM reader +# + larl %r13,.Lcpuid + stidp 0(%r13) # store cpuid + tm 0(%r13),0xff # running VM ? + jno .Lnoreset + larl %r2,.Lreset + lghi %r3,26 + diag %r2,%r3,8 + larl %r5,.Lirb + stsch 0(%r5) # check if irq is pending + tm 30(%r5),0x0f # by verifying if any of the + jnz .Lwaitforirq # activity or status control + tm 31(%r5),0xff # bits is set in the schib + jz .Lnoreset +.Lwaitforirq: + bras %r14,.Lirqwait # wait for IO interrupt + c %r1,__LC_SUBCHANNEL_ID # compare subchannel number + jne .Lwaitforirq + larl %r5,.Lirb + tsch 0(%r5) +.Lnoreset: + j .Lnoload +# +# everything loaded, go for it +# +.Lnoload: + jg startup # # subroutine to wait for end I/O # .Lirqwait: - mvc __LC_IO_NEW_PSW(16),.Lnewpsw # set up IO interrupt psw - lpsw .Lwaitpsw + larl %r13,.Lnewpswmask # set up IO interrupt psw + mvc __LC_IO_NEW_PSW(8),0(%r13) + stg %r14,__LC_IO_NEW_PSW+8 + larl %r13,.Lwaitpsw + lpswe 0(%r13) .Lioint: - br %r14 - .align 8 -.Lnewpsw: - .quad 0x0000000080000000,.Lioint -.Lwaitpsw: - .long 0x020a0000,0x80000000+.Lioint - # # subroutine for loading cards from the reader # .Lloader: - la %r4,0(%r14) - la %r3,.Lorb # r2 = address of orb into r2 - la %r5,.Lirb # r4 = address of irb - la %r6,.Lccws - la %r7,20 + lgr %r4,%r14 + larl %r3,.Lorb # r2 = address of orb into r2 + larl %r5,.Lirb # r4 = address of irb + larl %r6,.Lccws + lghi %r7,20 .Linit: st %r2,4(%r6) # initialize CCW data addresses la %r2,0x50(%r2) la %r6,8(%r6) - bct 7,.Linit - - lctl %c6,%c6,.Lcr6 # set IO subclass mask - slr %r2,%r2 + brctg %r7,.Linit + larl %r13,.Lcr6 + lctlg %c6,%c6,0(%r13) + xgr %r2,%r2 .Lldlp: ssch 0(%r3) # load chunk of 1600 bytes - bnz .Llderr + jnz .Llderr .Lwait4irq: - bas %r14,.Lirqwait + bras %r14,.Lirqwait c %r1,__LC_SUBCHANNEL_ID # compare subchannel number - bne .Lwait4irq + jne .Lwait4irq tsch 0(%r5) - - slr %r0,%r0 + xgr %r0,%r0 ic %r0,8(%r5) # get device status - chi %r0,8 # channel end ? - be .Lcont - chi %r0,12 # channel end + device end ? - be .Lcont - - l %r0,4(%r5) - s %r0,8(%r3) # r0/8 = number of ccws executed - mhi %r0,10 # *10 = number of bytes in ccws - lh %r3,10(%r5) # get residual count - sr %r0,%r3 # #ccws*80-residual=#bytes read - ar %r2,%r0 - + cghi %r0,8 # channel end ? + je .Lcont + cghi %r0,12 # channel end + device end ? 
+ je .Lcont + llgf %r0,4(%r5) + sgf %r0,8(%r3) # r0/8 = number of ccws executed + mghi %r0,10 # *10 = number of bytes in ccws + llgh %r3,10(%r5) # get residual count + sgr %r0,%r3 # #ccws*80-residual=#bytes read + agr %r2,%r0 br %r4 # r2 contains the total size - .Lcont: - ahi %r2,0x640 # add 0x640 to total size - la %r6,.Lccws - la %r7,20 + aghi %r2,0x640 # add 0x640 to total size + larl %r6,.Lccws + lghi %r7,20 .Lincr: l %r0,4(%r6) # update CCW data addresses - ahi %r0,0x640 + aghi %r0,0x640 st %r0,4(%r6) - ahi %r6,8 - bct 7,.Lincr - - b .Lldlp + aghi %r6,8 + brctg %r7,.Lincr + j .Lldlp .Llderr: - lpsw .Lcrash + larl %r13,.Lcrash + lpsw 0(%r13) - .align 8 + .balign 8 +.Lwaitpsw: + .quad 0x0202000180000000,.Lioint +.Lnewpswmask: + .quad 0x0000000180000000 + .balign 8 .Lorb: .long 0x00000000,0x0080ff00,.Lccws .Lirb: .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.Lcr6: .long 0xff000000 -.Lloadp:.long 0,0 - .align 8 + .balign 8 +.Lcr6: .quad 0x00000000ff000000 + .balign 8 .Lcrash:.long 0x000a0000,0x00000000 - - .align 8 + .balign 8 .Lccws: .rept 19 .long 0x02600050,0x00000000 .endr .long 0x02200050,0x00000000 - -iplstart: - mvi __LC_AR_MODE_ID,1 # set esame flag - slr %r0,%r0 # set cpuid to zero - lhi %r1,2 # mode 2 = esame (dump) - sigp %r1,%r0,0x12 # switch to esame mode - bras %r13,0f - .fill 16,4,0x0 -0: lmh %r0,%r15,0(%r13) # clear high-order half of gprs - sam31 # switch to 31 bit addressing mode - lh %r1,__LC_SUBCHANNEL_ID # test if subchannel number - bct %r1,.Lnoload # is valid - l %r1,__LC_SUBCHANNEL_ID # load ipl subchannel number - la %r2,IPL_BS # load start address - bas %r14,.Lloader # load rest of ipl image - l %r12,.Lparm # pointer to parameter area - st %r1,IPL_DEVICE+ARCH_OFFSET-PARMAREA(%r12) # save ipl device number - -# -# load parameter file from ipl device -# -.Lagain1: - l %r2,.Linitrd # ramdisk loc. is temp - bas %r14,.Lloader # load parameter file - ltr %r2,%r2 # got anything ? - bz .Lnopf - chi %r2,895 - bnh .Lnotrunc - la %r2,895 -.Lnotrunc: - l %r4,.Linitrd - clc 0(3,%r4),.L_hdr # if it is HDRx - bz .Lagain1 # skip dataset header - clc 0(3,%r4),.L_eof # if it is EOFx - bz .Lagain1 # skip dateset trailer - la %r5,0(%r4,%r2) - lr %r3,%r2 - la %r3,COMMAND_LINE-PARMAREA(%r12) # load adr. of command line - mvc 0(256,%r3),0(%r4) - mvc 256(256,%r3),256(%r4) - mvc 512(256,%r3),512(%r4) - mvc 768(122,%r3),768(%r4) - slr %r0,%r0 - b .Lcntlp -.Ldelspc: - ic %r0,0(%r2,%r3) - chi %r0,0x20 # is it a space ? - be .Lcntlp - ahi %r2,1 - b .Leolp -.Lcntlp: - brct %r2,.Ldelspc -.Leolp: - slr %r0,%r0 - stc %r0,0(%r2,%r3) # terminate buffer -.Lnopf: - -# -# load ramdisk from ipl device -# -.Lagain2: - l %r2,.Linitrd # addr of ramdisk - st %r2,INITRD_START+ARCH_OFFSET-PARMAREA(%r12) - bas %r14,.Lloader # load ramdisk - st %r2,INITRD_SIZE+ARCH_OFFSET-PARMAREA(%r12) # store size of rd - ltr %r2,%r2 - bnz .Lrdcont - st %r2,INITRD_START+ARCH_OFFSET-PARMAREA(%r12) # no ramdisk found -.Lrdcont: - l %r2,.Linitrd - - clc 0(3,%r2),.L_hdr # skip HDRx and EOFx - bz .Lagain2 - clc 0(3,%r2),.L_eof - bz .Lagain2 - -# -# reset files in VM reader -# - stidp .Lcpuid # store cpuid - tm .Lcpuid,0xff # running VM ? 
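
Both the rewritten reader-reset path above and the old code removed below gate on the same check: STIDP stores the 8-byte CPU ID, and the "tm ...,0xff / jno" pair takes the z/VM branch only when every bit of the first byte (the CPU-ID version code) is set, i.e. when it reads 0xff, which is the case for a VM guest. A hedged C sketch of the check; the struct layout here is a plain stand-in, not the kernel's definition:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative layout of the 8 bytes stored by STIDP; only the
 * version code matters for this test. Under z/VM it reads 0xff. */
struct cpuid {
	unsigned char version;	/* 0xff when running as a VM guest */
	unsigned char ident[3];	/* CPU identification number */
	unsigned short machine;	/* machine-type number */
	unsigned short unused;
};

static bool running_under_vm(const struct cpuid *id)
{
	/* mirrors "tm .Lcpuid,0xff; jno .Lnoreset": take the VM reset
	 * path only when all bits of the version code are set */
	return id->version == 0xff;
}

int main(void)
{
	struct cpuid lpar = { .version = 0x00 };
	struct cpuid vm   = { .version = 0xff };

	printf("LPAR: %d, VM: %d\n", running_under_vm(&lpar), running_under_vm(&vm));
	return 0;
}
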
- bno .Lnoreset - la %r2,.Lreset - lhi %r3,26 - diag %r2,%r3,8 - la %r5,.Lirb - stsch 0(%r5) # check if irq is pending - tm 30(%r5),0x0f # by verifying if any of the - bnz .Lwaitforirq # activity or status control - tm 31(%r5),0xff # bits is set in the schib - bz .Lnoreset -.Lwaitforirq: - bas %r14,.Lirqwait # wait for IO interrupt - c %r1,__LC_SUBCHANNEL_ID # compare subchannel number - bne .Lwaitforirq - la %r5,.Lirb - tsch 0(%r5) -.Lnoreset: - b .Lnoload - -# -# everything loaded, go for it -# -.Lnoload: - l %r1,.Lstartup - br %r1 - -.Linitrd:.long _end # default address of initrd -.Lparm: .long PARMAREA -.Lstartup: .long startup .Lreset:.byte 0xc3,0xc8,0xc1,0xd5,0xc7,0xc5,0x40,0xd9,0xc4,0xd9,0x40 .byte 0xc1,0xd3,0xd3,0x40,0xd2,0xc5,0xc5,0xd7,0x40,0xd5,0xd6 .byte 0xc8,0xd6,0xd3,0xc4 # "change rdr all keep nohold" .L_eof: .long 0xc5d6c600 /* C'EOF' */ .L_hdr: .long 0xc8c4d900 /* C'HDR' */ - .align 8 + .balign 8 .Lcpuid:.fill 8,1,0 # -# startup-code at 0x10000, running in absolute addressing mode +# normal startup-code, running in absolute addressing mode # this is called either by the ipl loader or directly by PSW restart # or linload or SALIPL # - .org 0x10000 -ENTRY(startup) - j .Lep_startup_normal - .org EP_OFFSET + .org STARTUP_NORMAL_OFFSET - IPL_START +SYM_CODE_START(startup) + j startup_normal + .org EP_OFFSET - IPL_START # # This is a list of s390 kernel entry points. At address 0x1000f the number of # valid entry points is stored. @@ -287,12 +228,12 @@ ENTRY(startup) .ascii EP_STRING .byte 0x00,0x01 # -# kdump startup-code at 0x10010, running in 64 bit absolute addressing mode +# kdump startup-code, running in 64 bit absolute addressing mode # - .org 0x10010 -ENTRY(startup_kdump) - j .Lep_startup_kdump -.Lep_startup_normal: + .org STARTUP_KDUMP_OFFSET - IPL_START + j startup_kdump +SYM_CODE_END(startup) +SYM_CODE_START_LOCAL(startup_normal) mvi __LC_AR_MODE_ID,1 # set esame flag slr %r0,%r0 # set cpuid to zero lhi %r1,2 # mode 2 = esame (dump) @@ -301,55 +242,53 @@ ENTRY(startup_kdump) .fill 16,4,0x0 0: lmh %r0,%r15,0(%r13) # clear high-order half of gprs sam64 # switch to 64 bit addressing mode - basr %r13,0 # get base -.LPG0: + larl %r13,.Lext_new_psw + mvc __LC_EXT_NEW_PSW(16),0(%r13) + larl %r13,.Lpgm_new_psw + mvc __LC_PGM_NEW_PSW(16),0(%r13) + larl %r13,.Lio_new_psw + mvc __LC_IO_NEW_PSW(16),0(%r13) xc 0x200(256),0x200 # partially clear lowcore xc 0x300(256),0x300 xc 0xe00(256),0xe00 xc 0xf00(256),0xf00 - lctlg %c0,%c15,.Lctl-.LPG0(%r13) # load control registers + larl %r13,.Lctl + lctlg %c0,%c15,0(%r13) # load control registers stcke __LC_BOOT_CLOCK mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 - spt 6f-.LPG0(%r13) - mvc __LC_LAST_UPDATE_TIMER(8),6f-.LPG0(%r13) - l %r15,.Lstack-.LPG0(%r13) + larl %r13,6f + spt 0(%r13) + mvc __LC_LAST_UPDATE_TIMER(8),0(%r13) + larl %r15,_stack_end-STACK_FRAME_OVERHEAD + brasl %r14,sclp_early_setup_buffer brasl %r14,verify_facilities brasl %r14,startup_kernel +SYM_CODE_END(startup_normal) -.Lstack: - .long 0x8000 + (1<<(PAGE_SHIFT+BOOT_STACK_ORDER)) - STACK_FRAME_OVERHEAD - .align 8 + .balign 8 6: .long 0x7fffffff,0xffffffff - +.Lext_new_psw: + .quad 0x0002000180000000,0x1b0 # disabled wait +.Lpgm_new_psw: + .quad 0x0000000180000000,startup_pgm_check_handler +.Lio_new_psw: + .quad 0x0002000180000000,0x1f0 # disabled wait .Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space .quad 0 # cr1: primary space segment table - .quad .Lduct # cr2: dispatchable unit control table + .quad 0 # cr2: dispatchable unit control table .quad 0 # 
cr3: instruction authorization .quad 0xffff # cr4: instruction authorization - .quad .Lduct # cr5: primary-aste origin + .quad 0 # cr5: primary-aste origin .quad 0 # cr6: I/O interrupts .quad 0 # cr7: secondary space segment table - .quad 0 # cr8: access registers translation + .quad 0x0000000000008000 # cr8: access registers translation .quad 0 # cr9: tracing off .quad 0 # cr10: tracing off .quad 0 # cr11: tracing off .quad 0 # cr12: tracing off .quad 0 # cr13: home space segment table .quad 0xc0000000 # cr14: machine check handling off - .quad .Llinkage_stack # cr15: linkage stack operations - - .section .dma.data,"aw",@progbits -.Lduct: .long 0,.Laste,.Laste,0,.Lduald,0,0,0 - .long 0,0,0,0,0,0,0,0 -.Llinkage_stack: - .long 0,0,0x89000000,0,0,0,0x8a000000,0 - .align 64 -.Laste: .quad 0,0xffffffffffffffff,0,0,0,0,0,0 - .align 128 -.Lduald:.rept 8 - .long 0x80000000,0,0,0 # invalid access-list entries - .endr - .previous + .quad 0 # cr15: linkage stack operations #include "head_kdump.S" @@ -359,45 +298,23 @@ ENTRY(startup_kdump) # It simply saves general/control registers and psw in # the save area and does disabled wait with a faulty address. # -ENTRY(startup_pgm_check_handler) - stmg %r0,%r15,__LC_SAVE_AREA_SYNC - la %r1,4095 - stctg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r1) - mvc __LC_GPREGS_SAVE_AREA-4095(128,%r1),__LC_SAVE_AREA_SYNC - mvc __LC_PSW_SAVE_AREA-4095(16,%r1),__LC_PGM_OLD_PSW +SYM_CODE_START_LOCAL(startup_pgm_check_handler) + stmg %r8,%r15,__LC_SAVE_AREA_SYNC + la %r8,4095 + stctg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r8) + stmg %r0,%r7,__LC_GPREGS_SAVE_AREA-4095(%r8) + mvc __LC_GPREGS_SAVE_AREA-4095+64(64,%r8),__LC_SAVE_AREA_SYNC + mvc __LC_PSW_SAVE_AREA-4095(16,%r8),__LC_PGM_OLD_PSW mvc __LC_RETURN_PSW(16),__LC_PGM_OLD_PSW ni __LC_RETURN_PSW,0xfc # remove IO and EX bits ni __LC_RETURN_PSW+1,0xfb # remove MCHK bit oi __LC_RETURN_PSW+1,0x2 # set wait state bit - larl %r2,.Lold_psw_disabled_wait - stg %r2,__LC_PGM_NEW_PSW+8 - l %r15,.Ldump_info_stack-.Lold_psw_disabled_wait(%r2) + larl %r9,.Lold_psw_disabled_wait + stg %r9,__LC_PGM_NEW_PSW+8 + larl %r15,_dump_info_stack_end-STACK_FRAME_OVERHEAD brasl %r14,print_pgm_check_info .Lold_psw_disabled_wait: - la %r1,4095 - lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1) + la %r8,4095 + lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r8) lpswe __LC_RETURN_PSW # disabled wait -.Ldump_info_stack: - .long 0x5000 + PAGE_SIZE - STACK_FRAME_OVERHEAD -ENDPROC(startup_pgm_check_handler) - -# -# params at 10400 (setup.h) -# Must be keept in sync with struct parmarea in setup.h -# - .org PARMAREA - .quad 0 # IPL_DEVICE - .quad 0 # INITRD_START - .quad 0 # INITRD_SIZE - .quad 0 # OLDMEM_BASE - .quad 0 # OLDMEM_SIZE - .quad kernel_version # points to kernel version string - - .org COMMAND_LINE - .byte "root=/dev/ram0 ro" - .byte 0 - - .org EARLY_SCCB_OFFSET - .fill 4096 - - .org HEAD_END +SYM_CODE_END(startup_pgm_check_handler) diff --git a/arch/s390/boot/head_kdump.S b/arch/s390/boot/head_kdump.S index 174d6959bf5b..f7107c76258c 100644 --- a/arch/s390/boot/head_kdump.S +++ b/arch/s390/boot/head_kdump.S @@ -19,8 +19,7 @@ # Note: This code has to be position independent # -.align 2 -.Lep_startup_kdump: +SYM_CODE_START_LOCAL(startup_kdump) lhi %r1,2 # mode 2 = esame (dump) sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to esame mode sam64 # Switch to 64 bit addressing @@ -83,19 +82,20 @@ # # Startup of kdump (relocated new kernel) # -.align 2 + .balign 2 startup_kdump_relocated: basr %r13,0 0: lpswe .Lrestart_psw-0b(%r13) # Start new kernel... 
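
startup_pgm_check_handler above turns the old program PSW into a disabled-wait PSW by clearing the I/O and external mask bits in the first PSW byte, clearing the machine-check bit in the second byte, and setting the wait bit, exactly as the ni/oi comments in the hunk describe. A z/Architecture PSW is 16 bytes, an 8-byte mask followed by an 8-byte address; the sketch below redoes the same bit surgery on a 64-bit mask word (illustrative only, bit positions derived from the byte offsets in the hunk):

#include <stdint.h>
#include <stdio.h>

struct psw {
	uint64_t mask;
	uint64_t addr;
};

static struct psw make_disabled_wait(struct psw old)
{
	old.mask &= ~0x0300000000000000ULL;	/* ni ...,0xfc: clear IO and EXT bits */
	old.mask &= ~0x0004000000000000ULL;	/* ni ...+1,0xfb: clear MCHK bit */
	old.mask |=  0x0002000000000000ULL;	/* oi ...+1,0x02: set the wait-state bit */
	return old;
}

int main(void)
{
	struct psw pgm_old = { .mask = 0x0704000180000000ULL, .addr = 0x10332 };
	struct psw wait = make_disabled_wait(pgm_old);

	printf("disabled wait PSW: %016llx %016llx\n",
	       (unsigned long long)wait.mask, (unsigned long long)wait.addr);
	return 0;
}
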
-.align 8 +SYM_CODE_END(startup_kdump) + .balign 8 .Lrestart_psw: .quad 0x0000000080000000,0x0000000000000000 + startup #else -.align 2 -.Lep_startup_kdump: +SYM_CODE_START_LOCAL(startup_kdump) larl %r13,startup_kdump_crash lpswe 0(%r13) -.align 8 +SYM_CODE_END(startup_kdump) + .balign 8 startup_kdump_crash: .quad 0x0002000080000000,0x0000000000000000 + startup_kdump_crash #endif /* CONFIG_CRASH_DUMP */ diff --git a/arch/s390/boot/install.sh b/arch/s390/boot/install.sh index bed227f267ae..a13dd2f2aa1c 100644..100755 --- a/arch/s390/boot/install.sh +++ b/arch/s390/boot/install.sh @@ -14,22 +14,11 @@ # $2 - kernel image file # $3 - kernel map file # $4 - default install path (blank if root directory) -# - -# User may have a custom install script - -if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi -if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi - -# Default install - same as make zlilo - -if [ -f $4/vmlinuz ]; then - mv $4/vmlinuz $4/vmlinuz.old -fi -if [ -f $4/System.map ]; then - mv $4/System.map $4/System.old -fi +echo "Warning: '${INSTALLKERNEL}' command not available - additional " \ + "bootloader config required" >&2 +if [ -f "$4/vmlinuz-$1" ]; then mv -- "$4/vmlinuz-$1" "$4/vmlinuz-$1.old"; fi +if [ -f "$4/System.map-$1" ]; then mv -- "$4/System.map-$1" "$4/System.map-$1.old"; fi -cat $2 > $4/vmlinuz -cp $3 $4/System.map +cat -- "$2" > "$4/vmlinuz-$1" +cp -- "$3" "$4/System.map-$1" diff --git a/arch/s390/boot/ipl_data.c b/arch/s390/boot/ipl_data.c new file mode 100644 index 000000000000..0846e2b249c6 --- /dev/null +++ b/arch/s390/boot/ipl_data.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/compat.h> +#include <linux/ptrace.h> +#include <asm/cio.h> +#include <asm/asm-offsets.h> +#include "boot.h" + +#define CCW0(cmd, addr, cnt, flg) \ + { .cmd_code = cmd, .cda = addr, .count = cnt, .flags = flg, } + +#define PSW_MASK_DISABLED (PSW_MASK_WAIT | PSW_MASK_EA | PSW_MASK_BA) + +struct ipl_lowcore { + psw_t32 ipl_psw; /* 0x0000 */ + struct ccw0 ccwpgm[2]; /* 0x0008 */ + u8 fill[56]; /* 0x0018 */ + struct ccw0 ccwpgmcc[20]; /* 0x0050 */ + u8 pad_0xf0[0x01a0-0x00f0]; /* 0x00f0 */ + psw_t restart_psw; /* 0x01a0 */ + psw_t external_new_psw; /* 0x01b0 */ + psw_t svc_new_psw; /* 0x01c0 */ + psw_t program_new_psw; /* 0x01d0 */ + psw_t mcck_new_psw; /* 0x01e0 */ + psw_t io_new_psw; /* 0x01f0 */ +}; + +/* + * Initial lowcore for IPL: the first 24 bytes are loaded by IPL to + * addresses 0-23 (a PSW and two CCWs). Bytes 24-79 are discarded. + * The next 160 bytes are loaded to addresses 0x18-0xb7. They form + * the continuation of the CCW program started by IPL and load the + * range 0x0f0-0x730 from the image to the range 0x0f0-0x730 in + * memory. At the end of the channel program the PSW at location 0 is + * loaded. + * Initial processing starts at 0x200 = iplstart. + * + * The restart psw points to iplstart which allows to load a kernel + * image into memory and starting it by a psw restart on any cpu. All + * other default psw new locations contain a disabled wait psw where + * the address indicates which psw was loaded. + * + * Note that the 'file' utility can detect s390 kernel images. For + * that to succeed the two initial CCWs, and the 0x40 fill bytes must + * be present. 
+ */ +static struct ipl_lowcore ipl_lowcore __used __section(".ipldata") = { + .ipl_psw = { .mask = PSW32_MASK_BASE, .addr = PSW32_ADDR_AMODE | IPL_START }, + .ccwpgm = { + [ 0] = CCW0(CCW_CMD_READ_IPL, 0x018, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 1] = CCW0(CCW_CMD_READ_IPL, 0x068, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + }, + .fill = { + [ 0 ... 55] = 0x40, + }, + .ccwpgmcc = { + [ 0] = CCW0(CCW_CMD_READ_IPL, 0x0f0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 1] = CCW0(CCW_CMD_READ_IPL, 0x140, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 2] = CCW0(CCW_CMD_READ_IPL, 0x190, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 3] = CCW0(CCW_CMD_READ_IPL, 0x1e0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 4] = CCW0(CCW_CMD_READ_IPL, 0x230, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 5] = CCW0(CCW_CMD_READ_IPL, 0x280, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 6] = CCW0(CCW_CMD_READ_IPL, 0x2d0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 7] = CCW0(CCW_CMD_READ_IPL, 0x320, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 8] = CCW0(CCW_CMD_READ_IPL, 0x370, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 9] = CCW0(CCW_CMD_READ_IPL, 0x3c0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [10] = CCW0(CCW_CMD_READ_IPL, 0x410, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [11] = CCW0(CCW_CMD_READ_IPL, 0x460, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [12] = CCW0(CCW_CMD_READ_IPL, 0x4b0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [13] = CCW0(CCW_CMD_READ_IPL, 0x500, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [14] = CCW0(CCW_CMD_READ_IPL, 0x550, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [15] = CCW0(CCW_CMD_READ_IPL, 0x5a0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [16] = CCW0(CCW_CMD_READ_IPL, 0x5f0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [17] = CCW0(CCW_CMD_READ_IPL, 0x640, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [18] = CCW0(CCW_CMD_READ_IPL, 0x690, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [19] = CCW0(CCW_CMD_READ_IPL, 0x6e0, 0x50, CCW_FLAG_SLI), + }, + .restart_psw = { .mask = 0, .addr = IPL_START, }, + .external_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_EXT_NEW_PSW, }, + .svc_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_SVC_NEW_PSW, }, + .program_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_PGM_NEW_PSW, }, + .mcck_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_MCK_NEW_PSW, }, + .io_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_IO_NEW_PSW, }, +}; diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 24ef67eb1cef..b24de9aabf7d 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -2,48 +2,62 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/ctype.h> +#include <linux/pgtable.h> +#include <asm/page-states.h> #include <asm/ebcdic.h> #include <asm/sclp.h> #include <asm/sections.h> #include <asm/boot_data.h> #include <asm/facility.h> -#include <asm/pgtable.h> +#include <asm/setup.h> #include <asm/uv.h> #include "boot.h" +struct parmarea parmarea __section(".parmarea") = { + .kernel_version = (unsigned long)kernel_version, + .max_command_line_size = COMMAND_LINE_SIZE, + .command_line = "root=/dev/ram0 ro", +}; + char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; + +unsigned int __bootdata_preserved(zlib_dfltcc_support) = ZLIB_DFLTCC_FULL; struct ipl_parameter_block __bootdata_preserved(ipl_block); int __bootdata_preserved(ipl_block_valid); +int __bootdata_preserved(__kaslr_enabled); +int __bootdata_preserved(cmma_flag) = 1; -unsigned long __bootdata(vmalloc_size) = VMALLOC_DEFAULT_SIZE; -unsigned long __bootdata(memory_end); -int __bootdata(memory_end_set); -int __bootdata(noexec_disabled); - -int kaslr_enabled __section(.data); 
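
A note on the ipl_lowcore initializer above: the ccwpgmcc[] table is fully regular. It is twenty 0x50-byte READ IPL CCWs whose data addresses step from 0x0f0 to 0x6e0 in 0x50 increments, all command-chained (CCW_FLAG_CC) except the last entry, which ends the channel program. The sketch below generates the same chain; the struct is a simplified stand-in for struct ccw0 from <asm/cio.h>, with command and flag values copied from the hunk:

#include <stdint.h>
#include <stdio.h>

#define CCW_CMD_READ_IPL	0x02
#define CCW_FLAG_CC		0x40
#define CCW_FLAG_SLI		0x20

struct ccw {		/* simplified stand-in for struct ccw0 */
	uint8_t	 cmd_code;
	uint8_t	 flags;
	uint16_t count;
	uint32_t cda;	/* data address */
};

int main(void)
{
	struct ccw chain[20];
	int i;

	/* same pattern as ccwpgmcc[]: 20 reads of 0x50 bytes each,
	 * starting at 0x0f0 and 0x50 apart, chained except the last */
	for (i = 0; i < 20; i++) {
		chain[i].cmd_code = CCW_CMD_READ_IPL;
		chain[i].cda = 0x0f0 + i * 0x50;
		chain[i].count = 0x50;
		chain[i].flags = CCW_FLAG_SLI | (i < 19 ? CCW_FLAG_CC : 0);
	}
	for (i = 0; i < 20; i++)
		printf("[%2d] read 0x%02x bytes to 0x%03x flags 0x%02x\n",
		       i, (unsigned)chain[i].count, (unsigned)chain[i].cda,
		       (unsigned)chain[i].flags);
	return 0;
}
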
+unsigned long vmalloc_size = VMALLOC_DEFAULT_SIZE; +unsigned long memory_limit; +int vmalloc_size_set; static inline int __diag308(unsigned long subcode, void *addr) { - register unsigned long _addr asm("0") = (unsigned long)addr; - register unsigned long _rc asm("1") = 0; unsigned long reg1, reg2; - psw_t old = S390_lowcore.program_new_psw; + union register_pair r1; + psw_t old; + r1.even = (unsigned long) addr; + r1.odd = 0; asm volatile( - " epsw %0,%1\n" - " st %0,%[psw_pgm]\n" - " st %1,%[psw_pgm]+4\n" - " larl %0,1f\n" - " stg %0,%[psw_pgm]+8\n" - " diag %[addr],%[subcode],0x308\n" - "1: nopr %%r7\n" - : "=&d" (reg1), "=&a" (reg2), - [psw_pgm] "=Q" (S390_lowcore.program_new_psw), - [addr] "+d" (_addr), "+d" (_rc) - : [subcode] "d" (subcode) + " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" + " epsw %[reg1],%[reg2]\n" + " st %[reg1],0(%[psw_pgm])\n" + " st %[reg2],4(%[psw_pgm])\n" + " larl %[reg1],1f\n" + " stg %[reg1],8(%[psw_pgm])\n" + " diag %[r1],%[subcode],0x308\n" + "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" + : [r1] "+&d" (r1.pair), + [reg1] "=&d" (reg1), + [reg2] "=&a" (reg2), + "+Q" (S390_lowcore.program_new_psw), + "=Q" (old) + : [subcode] "d" (subcode), + [psw_old] "a" (&old), + [psw_pgm] "a" (&S390_lowcore.program_new_psw) : "cc", "memory"); - S390_lowcore.program_new_psw = old; - return _rc; + return r1.odd; } void store_ipl_parmblock(void) @@ -56,6 +70,20 @@ void store_ipl_parmblock(void) ipl_block_valid = 1; } +bool is_ipl_block_dump(void) +{ + if (ipl_block.pb0_hdr.pbt == IPL_PBT_FCP && + ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) + return true; + if (ipl_block.pb0_hdr.pbt == IPL_PBT_NVME && + ipl_block.nvme.opt == IPL_PB0_NVME_OPT_DUMP) + return true; + if (ipl_block.pb0_hdr.pbt == IPL_PBT_ECKD && + ipl_block.eckd.opt == IPL_PB0_ECKD_OPT_DUMP) + return true; + return false; +} + static size_t scpdata_length(const u8 *buf, size_t count) { while (count) { @@ -69,30 +97,49 @@ static size_t scpdata_length(const u8 *buf, size_t count) static size_t ipl_block_get_ascii_scpdata(char *dest, size_t size, const struct ipl_parameter_block *ipb) { - size_t count; - size_t i; + const __u8 *scp_data; + __u32 scp_data_len; int has_lowercase; + size_t count = 0; + size_t i; + + switch (ipb->pb0_hdr.pbt) { + case IPL_PBT_FCP: + scp_data_len = ipb->fcp.scp_data_len; + scp_data = ipb->fcp.scp_data; + break; + case IPL_PBT_NVME: + scp_data_len = ipb->nvme.scp_data_len; + scp_data = ipb->nvme.scp_data; + break; + case IPL_PBT_ECKD: + scp_data_len = ipb->eckd.scp_data_len; + scp_data = ipb->eckd.scp_data; + break; + + default: + goto out; + } - count = min(size - 1, scpdata_length(ipb->fcp.scp_data, - ipb->fcp.scp_data_len)); + count = min(size - 1, scpdata_length(scp_data, scp_data_len)); if (!count) goto out; has_lowercase = 0; for (i = 0; i < count; i++) { - if (!isascii(ipb->fcp.scp_data[i])) { + if (!isascii(scp_data[i])) { count = 0; goto out; } - if (!has_lowercase && islower(ipb->fcp.scp_data[i])) + if (!has_lowercase && islower(scp_data[i])) has_lowercase = 1; } if (has_lowercase) - memcpy(dest, ipb->fcp.scp_data, count); + memcpy(dest, scp_data, count); else for (i = 0; i < count; i++) - dest[i] = tolower(ipb->fcp.scp_data[i]); + dest[i] = tolower(scp_data[i]); out: dest[count] = '\0'; return count; @@ -114,6 +161,8 @@ static void append_ipl_block_parm(void) parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; case IPL_PBT_FCP: + case IPL_PBT_NVME: + case IPL_PBT_ECKD: rc = ipl_block_get_ascii_scpdata( parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; @@ -138,12 +187,12 @@ static 
inline int has_ebcdic_char(const char *str) void setup_boot_command_line(void) { - COMMAND_LINE[ARCH_COMMAND_LINE_SIZE - 1] = 0; + parmarea.command_line[COMMAND_LINE_SIZE - 1] = 0; /* convert arch command line to ascii if necessary */ - if (has_ebcdic_char(COMMAND_LINE)) - EBCASC(COMMAND_LINE, ARCH_COMMAND_LINE_SIZE); + if (has_ebcdic_char(parmarea.command_line)) + EBCASC(parmarea.command_line, COMMAND_LINE_SIZE); /* copy arch command line */ - strcpy(early_command_line, strim(COMMAND_LINE)); + strcpy(early_command_line, strim(parmarea.command_line)); /* append IPL PARM data to the boot command line */ if (!is_prot_virt_guest() && ipl_block_valid) @@ -153,9 +202,9 @@ void setup_boot_command_line(void) static void modify_facility(unsigned long nr, bool clear) { if (clear) - __clear_facility(nr, S390_lowcore.stfle_fac_list); + __clear_facility(nr, stfle_fac_list); else - __set_facility(nr, S390_lowcore.stfle_fac_list); + __set_facility(nr, stfle_fac_list); } static void check_cleared_facilities(void) @@ -164,7 +213,7 @@ static void check_cleared_facilities(void) int i; for (i = 0; i < ARRAY_SIZE(als); i++) { - if ((S390_lowcore.stfle_fac_list[i] & als[i]) != als[i]) { + if ((stfle_fac_list[i] & als[i]) != als[i]) { sclp_early_printk("Warning: The Linux kernel requires facilities cleared via command line option\n"); print_missing_facilities(); break; @@ -208,7 +257,7 @@ static void modify_fac_list(char *str) check_cleared_facilities(); } -static char command_line_buf[COMMAND_LINE_SIZE] __section(.data); +static char command_line_buf[COMMAND_LINE_SIZE]; void parse_boot_command_line(void) { char *param, *val; @@ -216,44 +265,50 @@ void parse_boot_command_line(void) char *args; int rc; - kaslr_enabled = IS_ENABLED(CONFIG_RANDOMIZE_BASE); + __kaslr_enabled = IS_ENABLED(CONFIG_RANDOMIZE_BASE); args = strcpy(command_line_buf, early_command_line); while (*args) { args = next_arg(args, ¶m, &val); - if (!strcmp(param, "mem") && val) { - memory_end = round_down(memparse(val, NULL), PAGE_SIZE); - memory_end_set = 1; - } + if (!strcmp(param, "mem") && val) + memory_limit = round_down(memparse(val, NULL), PAGE_SIZE); - if (!strcmp(param, "vmalloc") && val) - vmalloc_size = round_up(memparse(val, NULL), PAGE_SIZE); + if (!strcmp(param, "vmalloc") && val) { + vmalloc_size = round_up(memparse(val, NULL), _SEGMENT_SIZE); + vmalloc_size_set = 1; + } - if (!strcmp(param, "noexec")) { - rc = kstrtobool(val, &enabled); - if (!rc && !enabled) - noexec_disabled = 1; + if (!strcmp(param, "dfltcc") && val) { + if (!strcmp(val, "off")) + zlib_dfltcc_support = ZLIB_DFLTCC_DISABLED; + else if (!strcmp(val, "on")) + zlib_dfltcc_support = ZLIB_DFLTCC_FULL; + else if (!strcmp(val, "def_only")) + zlib_dfltcc_support = ZLIB_DFLTCC_DEFLATE_ONLY; + else if (!strcmp(val, "inf_only")) + zlib_dfltcc_support = ZLIB_DFLTCC_INFLATE_ONLY; + else if (!strcmp(val, "always")) + zlib_dfltcc_support = ZLIB_DFLTCC_FULL_DEBUG; } if (!strcmp(param, "facilities") && val) modify_fac_list(val); if (!strcmp(param, "nokaslr")) - kaslr_enabled = 0; - } -} + __kaslr_enabled = 0; -void setup_memory_end(void) -{ -#ifdef CONFIG_CRASH_DUMP - if (OLDMEM_BASE) { - kaslr_enabled = 0; - } else if (ipl_block_valid && - ipl_block.pb0_hdr.pbt == IPL_PBT_FCP && - ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) { - kaslr_enabled = 0; - if (!sclp_early_get_hsa_size(&memory_end) && memory_end) - memory_end_set = 1; - } + if (!strcmp(param, "cmma")) { + rc = kstrtobool(val, &enabled); + if (!rc && !enabled) + cmma_flag = 0; + } + +#if IS_ENABLED(CONFIG_KVM) + if 
(!strcmp(param, "prot_virt")) { + rc = kstrtobool(val, &enabled); + if (!rc && enabled) + prot_virt_host = 1; + } #endif + } } diff --git a/arch/s390/boot/ipl_report.c b/arch/s390/boot/ipl_report.c index 0b4965573656..1803035e68d2 100644 --- a/arch/s390/boot/ipl_report.c +++ b/arch/s390/boot/ipl_report.c @@ -5,6 +5,7 @@ #include <asm/sclp.h> #include <asm/sections.h> #include <asm/boot_data.h> +#include <asm/physmem_info.h> #include <uapi/asm/ipl.h> #include "boot.h" @@ -16,20 +17,16 @@ unsigned long __bootdata_preserved(ipl_cert_list_size); unsigned long __bootdata(early_ipl_comp_list_addr); unsigned long __bootdata(early_ipl_comp_list_size); +static struct ipl_rb_certificates *certs; +static struct ipl_rb_components *comps; +static bool ipl_report_needs_saving; + #define for_each_rb_entry(entry, rb) \ for (entry = rb->entries; \ (void *) entry + sizeof(*entry) <= (void *) rb + rb->len; \ entry++) -static inline bool intersects(unsigned long addr0, unsigned long size0, - unsigned long addr1, unsigned long size1) -{ - return addr0 + size0 > addr1 && addr1 + size1 > addr0; -} - -static unsigned long find_bootdata_space(struct ipl_rb_components *comps, - struct ipl_rb_certificates *certs, - unsigned long safe_addr) +static unsigned long get_cert_comp_list_size(void) { struct ipl_rb_certificate_entry *cert; struct ipl_rb_component_entry *comp; @@ -44,36 +41,27 @@ static unsigned long find_bootdata_space(struct ipl_rb_components *comps, ipl_cert_list_size = 0; for_each_rb_entry(cert, certs) ipl_cert_list_size += sizeof(unsigned int) + cert->len; - size = ipl_cert_list_size + early_ipl_comp_list_size; + return ipl_cert_list_size + early_ipl_comp_list_size; +} - /* - * Start from safe_addr to find a free memory area large - * enough for the IPL report boot data. This area is used - * for ipl_cert_list_addr/ipl_cert_list_size and - * early_ipl_comp_list_addr/early_ipl_comp_list_size. It must - * not overlap with any component or any certificate. 
- */ -repeat: - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE && - intersects(INITRD_START, INITRD_SIZE, safe_addr, size)) - safe_addr = INITRD_START + INITRD_SIZE; - for_each_rb_entry(comp, comps) - if (intersects(safe_addr, size, comp->addr, comp->len)) { - safe_addr = comp->addr + comp->len; - goto repeat; - } - for_each_rb_entry(cert, certs) - if (intersects(safe_addr, size, cert->addr, cert->len)) { - safe_addr = cert->addr + cert->len; - goto repeat; - } - early_ipl_comp_list_addr = safe_addr; - ipl_cert_list_addr = safe_addr + early_ipl_comp_list_size; +bool ipl_report_certs_intersects(unsigned long addr, unsigned long size, + unsigned long *intersection_start) +{ + struct ipl_rb_certificate_entry *cert; - return safe_addr + size; + if (!ipl_report_needs_saving) + return false; + + for_each_rb_entry(cert, certs) { + if (intersects(addr, size, cert->addr, cert->len)) { + *intersection_start = cert->addr; + return true; + } + } + return false; } -static void copy_components_bootdata(struct ipl_rb_components *comps) +static void copy_components_bootdata(void) { struct ipl_rb_component_entry *comp, *ptr; @@ -82,7 +70,7 @@ static void copy_components_bootdata(struct ipl_rb_components *comps) memcpy(ptr++, comp, sizeof(*ptr)); } -static void copy_certificates_bootdata(struct ipl_rb_certificates *certs) +static void copy_certificates_bootdata(void) { struct ipl_rb_certificate_entry *cert; void *ptr; @@ -96,10 +84,8 @@ static void copy_certificates_bootdata(struct ipl_rb_certificates *certs) } } -unsigned long read_ipl_report(unsigned long safe_addr) +int read_ipl_report(void) { - struct ipl_rb_certificates *certs; - struct ipl_rb_components *comps; struct ipl_pl_hdr *pl_hdr; struct ipl_rl_hdr *rl_hdr; struct ipl_rb_hdr *rb_hdr; @@ -112,7 +98,7 @@ unsigned long read_ipl_report(unsigned long safe_addr) */ if (!ipl_block_valid || !(ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR)) - return safe_addr; + return -1; ipl_secure_flag = !!(ipl_block.hdr.flags & IPL_PL_FLAG_SIPL); /* * There is an IPL report, to find it load the pointer to the @@ -150,16 +136,30 @@ unsigned long read_ipl_report(unsigned long safe_addr) * With either the component list or the certificate list * missing the kernel will stay ignorant of secure IPL. */ - if (!comps || !certs) - return safe_addr; + if (!comps || !certs) { + certs = NULL; + return -1; + } - /* - * Copy component and certificate list to a safe area - * where the decompressed kernel can find them. - */ - safe_addr = find_bootdata_space(comps, certs, safe_addr); - copy_components_bootdata(comps); - copy_certificates_bootdata(certs); + ipl_report_needs_saving = true; + physmem_reserve(RR_IPLREPORT, (unsigned long)pl_hdr, + (unsigned long)rl_end - (unsigned long)pl_hdr); + return 0; +} + +void save_ipl_cert_comp_list(void) +{ + unsigned long size; + + if (!ipl_report_needs_saving) + return; + + size = get_cert_comp_list_size(); + early_ipl_comp_list_addr = physmem_alloc_top_down(RR_CERT_COMP_LIST, size, sizeof(int)); + ipl_cert_list_addr = early_ipl_comp_list_addr + early_ipl_comp_list_size; - return safe_addr; + copy_components_bootdata(); + copy_certificates_bootdata(); + physmem_free(RR_IPLREPORT); + ipl_report_needs_saving = false; } diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c index 5d12352545c5..90602101e2ae 100644 --- a/arch/s390/boot/kaslr.c +++ b/arch/s390/boot/kaslr.c @@ -2,12 +2,13 @@ /* * Copyright IBM Corp. 
2019 */ -#include <asm/mem_detect.h> -#include <asm/pgtable.h> +#include <linux/pgtable.h> +#include <asm/physmem_info.h> #include <asm/cpacf.h> #include <asm/timex.h> #include <asm/sclp.h> -#include "compressed/decompressor.h" +#include <asm/kasan.h> +#include "decompressor.h" #include "boot.h" #define PRNG_MODE_TDES 1 @@ -42,7 +43,7 @@ static int check_prng(void) return PRNG_MODE_TDES; } -static unsigned long get_random(unsigned long limit) +static int get_random(unsigned long limit, unsigned long *value) { struct prng_parm prng = { /* initial parameter block for tdes mode, copied from libica */ @@ -75,7 +76,7 @@ static unsigned long get_random(unsigned long limit) *(unsigned long *) prng.parm_block ^= seed; for (i = 0; i < 16; i++) { cpacf_kmc(CPACF_KMC_PRNG, prng.parm_block, - (char *) entropy, (char *) entropy, + (u8 *) entropy, (u8 *) entropy, sizeof(entropy)); memcpy(prng.parm_block, entropy, sizeof(entropy)); } @@ -84,87 +85,114 @@ static unsigned long get_random(unsigned long limit) (u8 *) &random, sizeof(random)); break; default: - random = 0; + return -1; } - return random % limit; + *value = random % limit; + return 0; } -unsigned long get_random_base(unsigned long safe_addr) +static void sort_reserved_ranges(struct reserved_range *res, unsigned long size) { - unsigned long memory_limit = memory_end_set ? memory_end : 0; - unsigned long base, start, end, kernel_size; - unsigned long block_sum, offset; - unsigned long kasan_needs; - int i; + struct reserved_range tmp; + int i, j; - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE) { - if (safe_addr < INITRD_START + INITRD_SIZE) - safe_addr = INITRD_START + INITRD_SIZE; + for (i = 1; i < size; i++) { + tmp = res[i]; + for (j = i - 1; j >= 0 && res[j].start > tmp.start; j--) + res[j + 1] = res[j]; + res[j + 1] = tmp; } - safe_addr = ALIGN(safe_addr, THREAD_SIZE); +} - if ((IS_ENABLED(CONFIG_KASAN))) { - /* - * Estimate kasan memory requirements, which it will reserve - * at the very end of available physical memory. To estimate - * that, we take into account that kasan would require - * 1/8 of available physical memory (for shadow memory) + - * creating page tables for the whole memory + shadow memory - * region (1 + 1/8). To keep page tables estimates simple take - * the double of combined ptes size. 
- */ - memory_limit = get_mem_detect_end(); - if (memory_end_set && memory_limit > memory_end) - memory_limit = memory_end; +static unsigned long iterate_valid_positions(unsigned long size, unsigned long align, + unsigned long _min, unsigned long _max, + struct reserved_range *res, size_t res_count, + bool pos_count, unsigned long find_pos) +{ + unsigned long start, end, tmp_end, range_pos, pos = 0; + struct reserved_range *res_end = res + res_count; + struct reserved_range *skip_res; + int i; - /* for shadow memory */ - kasan_needs = memory_limit / 8; - /* for paging structures */ - kasan_needs += (memory_limit + kasan_needs) / PAGE_SIZE / - _PAGE_ENTRIES * _PAGE_TABLE_SIZE * 2; - memory_limit -= kasan_needs; - } + align = max(align, 8UL); + _min = round_up(_min, align); + for_each_physmem_usable_range(i, &start, &end) { + if (_min >= end) + continue; + start = round_up(start, align); + if (start >= _max) + break; + start = max(_min, start); + end = min(_max, end); - kernel_size = vmlinux.image_size + vmlinux.bss_size; - block_sum = 0; - for_each_mem_detect_block(i, &start, &end) { - if (memory_limit) { - if (start >= memory_limit) + while (start + size <= end) { + /* skip reserved ranges below the start */ + while (res && res->end <= start) { + res++; + if (res >= res_end) + res = NULL; + } + skip_res = NULL; + tmp_end = end; + /* has intersecting reserved range */ + if (res && res->start < end) { + skip_res = res; + tmp_end = res->start; + } + if (start + size <= tmp_end) { + range_pos = (tmp_end - start - size) / align + 1; + if (pos_count) { + pos += range_pos; + } else { + if (range_pos >= find_pos) + return start + (find_pos - 1) * align; + find_pos -= range_pos; + } + } + if (!skip_res) break; - if (end > memory_limit) - end = memory_limit; + start = round_up(skip_res->end, align); } - if (end - start < kernel_size) - continue; - block_sum += end - start - kernel_size; - } - if (!block_sum) { - sclp_early_printk("KASLR disabled: not enough memory\n"); - return 0; } - base = get_random(block_sum); - if (base == 0) + return pos_count ? pos : 0; +} + +/* + * Two types of decompressor memory allocations/reserves are considered + * differently. + * + * "Static" or "single" allocations are done via physmem_alloc_range() and + * physmem_reserve(), and they are listed in physmem_info.reserved[]. Each + * type of "static" allocation can only have one allocation per type and + * cannot have chains. + * + * On the other hand, "dynamic" or "repetitive" allocations are done via + * physmem_alloc_top_down(). These allocations are tightly packed together + * top down from the end of online memory. physmem_alloc_pos represents + * current position where those allocations start. + * + * Functions randomize_within_range() and iterate_valid_positions() + * only consider "dynamic" allocations by never looking above + * physmem_alloc_pos. "Static" allocations, however, are explicitly + * considered by checking the "res" (reserves) array. The first + * reserved_range of a "dynamic" allocation may also be checked along the + * way, but it will always be above the maximum value anyway. 
+ */ +unsigned long randomize_within_range(unsigned long size, unsigned long align, + unsigned long min, unsigned long max) +{ + struct reserved_range res[RR_MAX]; + unsigned long max_pos, pos; + + memcpy(res, physmem_info.reserved, sizeof(res)); + sort_reserved_ranges(res, ARRAY_SIZE(res)); + max = min(max, get_physmem_alloc_pos()); + + max_pos = iterate_valid_positions(size, align, min, max, res, ARRAY_SIZE(res), true, 0); + if (!max_pos) return 0; - if (base < safe_addr) - base = safe_addr; - block_sum = offset = 0; - for_each_mem_detect_block(i, &start, &end) { - if (memory_limit) { - if (start >= memory_limit) - break; - if (end > memory_limit) - end = memory_limit; - } - if (end - start < kernel_size) - continue; - block_sum += end - start - kernel_size; - if (base <= block_sum) { - base = start + base - offset; - base = ALIGN_DOWN(base, THREAD_SIZE); - break; - } - offset = block_sum; - } - return base; + if (get_random(max_pos, &pos)) + return 0; + return iterate_valid_positions(size, align, min, max, res, ARRAY_SIZE(res), false, pos + 1); } diff --git a/arch/s390/boot/mem_detect.c b/arch/s390/boot/mem_detect.c deleted file mode 100644 index 62e7c13ce85c..000000000000 --- a/arch/s390/boot/mem_detect.c +++ /dev/null @@ -1,175 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/errno.h> -#include <linux/init.h> -#include <asm/sclp.h> -#include <asm/sections.h> -#include <asm/mem_detect.h> -#include <asm/sparsemem.h> -#include "compressed/decompressor.h" -#include "boot.h" - -unsigned long __bootdata(max_physmem_end); -struct mem_detect_info __bootdata(mem_detect); - -/* up to 256 storage elements, 1020 subincrements each */ -#define ENTRIES_EXTENDED_MAX \ - (256 * (1020 / 2) * sizeof(struct mem_detect_block)) - -/* - * To avoid corrupting old kernel memory during dump, find lowest memory - * chunk possible either right after the kernel end (decompressed kernel) or - * after initrd (if it is present and there is no hole between the kernel end - * and initrd) - */ -static void *mem_detect_alloc_extended(void) -{ - unsigned long offset = ALIGN(mem_safe_offset(), sizeof(u64)); - - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE && - INITRD_START < offset + ENTRIES_EXTENDED_MAX) - offset = ALIGN(INITRD_START + INITRD_SIZE, sizeof(u64)); - - return (void *)offset; -} - -static struct mem_detect_block *__get_mem_detect_block_ptr(u32 n) -{ - if (n < MEM_INLINED_ENTRIES) - return &mem_detect.entries[n]; - if (unlikely(!mem_detect.entries_extended)) - mem_detect.entries_extended = mem_detect_alloc_extended(); - return &mem_detect.entries_extended[n - MEM_INLINED_ENTRIES]; -} - -/* - * sequential calls to add_mem_detect_block with adjacent memory areas - * are merged together into single memory block. 
- */ -void add_mem_detect_block(u64 start, u64 end) -{ - struct mem_detect_block *block; - - if (mem_detect.count) { - block = __get_mem_detect_block_ptr(mem_detect.count - 1); - if (block->end == start) { - block->end = end; - return; - } - } - - block = __get_mem_detect_block_ptr(mem_detect.count); - block->start = start; - block->end = end; - mem_detect.count++; -} - -static int __diag260(unsigned long rx1, unsigned long rx2) -{ - register unsigned long _rx1 asm("2") = rx1; - register unsigned long _rx2 asm("3") = rx2; - register unsigned long _ry asm("4") = 0x10; /* storage configuration */ - int rc = -1; /* fail */ - unsigned long reg1, reg2; - psw_t old = S390_lowcore.program_new_psw; - - asm volatile( - " epsw %0,%1\n" - " st %0,%[psw_pgm]\n" - " st %1,%[psw_pgm]+4\n" - " larl %0,1f\n" - " stg %0,%[psw_pgm]+8\n" - " diag %[rx],%[ry],0x260\n" - " ipm %[rc]\n" - " srl %[rc],28\n" - "1:\n" - : "=&d" (reg1), "=&a" (reg2), - [psw_pgm] "=Q" (S390_lowcore.program_new_psw), - [rc] "+&d" (rc), [ry] "+d" (_ry) - : [rx] "d" (_rx1), "d" (_rx2) - : "cc", "memory"); - S390_lowcore.program_new_psw = old; - return rc == 0 ? _ry : -1; -} - -static int diag260(void) -{ - int rc, i; - - struct { - unsigned long start; - unsigned long end; - } storage_extents[8] __aligned(16); /* VM supports up to 8 extends */ - - memset(storage_extents, 0, sizeof(storage_extents)); - rc = __diag260((unsigned long)storage_extents, sizeof(storage_extents)); - if (rc == -1) - return -1; - - for (i = 0; i < min_t(int, rc, ARRAY_SIZE(storage_extents)); i++) - add_mem_detect_block(storage_extents[i].start, storage_extents[i].end + 1); - return 0; -} - -static int tprot(unsigned long addr) -{ - unsigned long pgm_addr; - int rc = -EFAULT; - psw_t old = S390_lowcore.program_new_psw; - - S390_lowcore.program_new_psw.mask = __extract_psw(); - asm volatile( - " larl %[pgm_addr],1f\n" - " stg %[pgm_addr],%[psw_pgm_addr]\n" - " tprot 0(%[addr]),0\n" - " ipm %[rc]\n" - " srl %[rc],28\n" - "1:\n" - : [pgm_addr] "=&d"(pgm_addr), - [psw_pgm_addr] "=Q"(S390_lowcore.program_new_psw.addr), - [rc] "+&d"(rc) - : [addr] "a"(addr) - : "cc", "memory"); - S390_lowcore.program_new_psw = old; - return rc; -} - -static void search_mem_end(void) -{ - unsigned long range = 1 << (MAX_PHYSMEM_BITS - 20); /* in 1MB blocks */ - unsigned long offset = 0; - unsigned long pivot; - - while (range > 1) { - range >>= 1; - pivot = offset + range; - if (!tprot(pivot << 20)) - offset = pivot; - } - - add_mem_detect_block(0, (offset + 1) << 20); -} - -void detect_memory(void) -{ - sclp_early_get_memsize(&max_physmem_end); - - if (!sclp_early_read_storage_info()) { - mem_detect.info_source = MEM_DETECT_SCLP_STOR_INFO; - return; - } - - if (!diag260()) { - mem_detect.info_source = MEM_DETECT_DIAG260; - return; - } - - if (max_physmem_end) { - add_mem_detect_block(0, max_physmem_end); - mem_detect.info_source = MEM_DETECT_SCLP_READ_INFO; - return; - } - - search_mem_end(); - mem_detect.info_source = MEM_DETECT_BIN_SEARCH; - max_physmem_end = get_mem_detect_end(); -} diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check_info.c index 83b5b7915c32..97244cd7a206 100644 --- a/arch/s390/boot/pgm_check_info.c +++ b/arch/s390/boot/pgm_check_info.c @@ -1,90 +1,179 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> +#include <linux/stdarg.h> #include <linux/string.h> +#include <linux/ctype.h> +#include <asm/stacktrace.h> +#include <asm/boot_data.h> #include <asm/lowcore.h> +#include <asm/setup.h> #include <asm/sclp.h> +#include <asm/uv.h> 
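
The memory-size probe removed from mem_detect.c here (and re-added in physmem_info.c further down) is a plain binary search over 1 MB blocks: tprot faults above the installed memory, so the highest accessible block is found in MAX_PHYSMEM_BITS - 20 probes. A hedged userspace sketch of the same search; accessible() stands in for the tprot probe and the MAX_PHYSMEM_BITS value is only illustrative (assumes 64-bit unsigned long, as on s390):

#include <stdbool.h>
#include <stdio.h>

#define MAX_PHYSMEM_BITS 46	/* illustrative stand-in for the s390 limit */

static unsigned long mem_mb = 3500;	/* pretend 3500 MB are installed */

static bool accessible(unsigned long addr)
{
	return (addr >> 20) < mem_mb;	/* stand-in for the tprot probe */
}

/* Same binary search as search_mem_end(): keep the highest
 * accessible 1 MB block and return the address just past it. */
static unsigned long search_mem_end(void)
{
	unsigned long range = 1UL << (MAX_PHYSMEM_BITS - 20); /* in 1MB blocks */
	unsigned long offset = 0;
	unsigned long pivot;

	while (range > 1) {
		range >>= 1;
		pivot = offset + range;
		if (accessible(pivot << 20))
			offset = pivot;
	}
	return (offset + 1) << 20;
}

int main(void)
{
	printf("detected memory end: %lu MB\n", search_mem_end() >> 20);
	return 0;
}
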
#include "boot.h" const char hex_asc[] = "0123456789abcdef"; -#define add_val_as_hex(dst, val) \ - __add_val_as_hex(dst, (const unsigned char *)&val, sizeof(val)) +static char *as_hex(char *dst, unsigned long val, int pad) +{ + char *p, *end = p = dst + max(pad, (int)__fls(val | 1) / 4 + 1); + + for (*p-- = 0; p >= dst; val >>= 4) + *p-- = hex_asc[val & 0x0f]; + return end; +} -static char *__add_val_as_hex(char *dst, const unsigned char *src, size_t count) +static char *symstart(char *p) { - while (count--) - dst = hex_byte_pack(dst, *src++); - return dst; + while (*p) + p--; + return p + 1; } -static char *add_str(char *dst, char *src) +static noinline char *findsym(unsigned long ip, unsigned short *off, unsigned short *len) { - strcpy(dst, src); - return dst + strlen(dst); + /* symbol entries are in a form "10000 c4 startup\0" */ + char *a = _decompressor_syms_start; + char *b = _decompressor_syms_end; + unsigned long start; + unsigned long size; + char *pivot; + char *endp; + + while (a < b) { + pivot = symstart(a + (b - a) / 2); + start = simple_strtoull(pivot, &endp, 16); + size = simple_strtoull(endp + 1, &endp, 16); + if (ip < start) { + b = pivot; + continue; + } + if (ip > start + size) { + a = pivot + strlen(pivot) + 1; + continue; + } + *off = ip - start; + *len = size; + return endp + 1; + } + return NULL; } -void print_pgm_check_info(void) +static noinline char *strsym(void *ip) { - struct psw_bits *psw = &psw_bits(S390_lowcore.psw_save_area); - unsigned short ilc = S390_lowcore.pgm_ilc >> 1; - char buf[256]; - int row, col; + static char buf[64]; + unsigned short off; + unsigned short len; char *p; - add_str(buf, "Linux version "); - strlcat(buf, kernel_version, sizeof(buf)); - sclp_early_printk(buf); + p = findsym((unsigned long)ip, &off, &len); + if (p) { + strncpy(buf, p, sizeof(buf)); + /* reserve 15 bytes for offset/len in symbol+0x1234/0x1234 */ + p = buf + strnlen(buf, sizeof(buf) - 15); + strcpy(p, "+0x"); + p = as_hex(p + 3, off, 0); + strcpy(p, "/0x"); + as_hex(p + 3, len, 0); + } else { + as_hex(buf, (unsigned long)ip, 16); + } + return buf; +} - p = add_str(buf, "Kernel fault: interruption code "); - p = add_val_as_hex(buf + strlen(buf), S390_lowcore.pgm_code); - p = add_str(p, " ilc:"); - *p++ = hex_asc_lo(ilc); - add_str(p, "\n"); - sclp_early_printk(buf); +void decompressor_printk(const char *fmt, ...) +{ + char buf[1024] = { 0 }; + char *end = buf + sizeof(buf) - 1; /* make sure buf is 0 terminated */ + unsigned long pad; + char *p = buf; + va_list args; - p = add_str(buf, "PSW : "); - p = add_val_as_hex(p, S390_lowcore.psw_save_area.mask); - p = add_str(p, " "); - p = add_val_as_hex(p, S390_lowcore.psw_save_area.addr); - add_str(p, "\n"); + va_start(args, fmt); + for (; p < end && *fmt; fmt++) { + if (*fmt != '%') { + *p++ = *fmt; + continue; + } + pad = isdigit(*++fmt) ? 
simple_strtol(fmt, (char **)&fmt, 10) : 0; + switch (*fmt) { + case 's': + p = buf + strlcat(buf, va_arg(args, char *), sizeof(buf)); + break; + case 'p': + if (*++fmt != 'S') + goto out; + p = buf + strlcat(buf, strsym(va_arg(args, void *)), sizeof(buf)); + break; + case 'l': + if (*++fmt != 'x' || end - p <= max(sizeof(long) * 2, pad)) + goto out; + p = as_hex(p, va_arg(args, unsigned long), pad); + break; + case 'x': + if (end - p <= max(sizeof(int) * 2, pad)) + goto out; + p = as_hex(p, va_arg(args, unsigned int), pad); + break; + default: + goto out; + } + } +out: + va_end(args); sclp_early_printk(buf); +} - p = add_str(buf, " R:"); - *p++ = hex_asc_lo(psw->per); - p = add_str(p, " T:"); - *p++ = hex_asc_lo(psw->dat); - p = add_str(p, " IO:"); - *p++ = hex_asc_lo(psw->io); - p = add_str(p, " EX:"); - *p++ = hex_asc_lo(psw->ext); - p = add_str(p, " Key:"); - *p++ = hex_asc_lo(psw->key); - p = add_str(p, " M:"); - *p++ = hex_asc_lo(psw->mcheck); - p = add_str(p, " W:"); - *p++ = hex_asc_lo(psw->wait); - p = add_str(p, " P:"); - *p++ = hex_asc_lo(psw->pstate); - p = add_str(p, " AS:"); - *p++ = hex_asc_lo(psw->as); - p = add_str(p, " CC:"); - *p++ = hex_asc_lo(psw->cc); - p = add_str(p, " PM:"); - *p++ = hex_asc_lo(psw->pm); - p = add_str(p, " RI:"); - *p++ = hex_asc_lo(psw->ri); - p = add_str(p, " EA:"); - *p++ = hex_asc_lo(psw->eaba); - add_str(p, "\n"); - sclp_early_printk(buf); +void print_stacktrace(unsigned long sp) +{ + struct stack_info boot_stack = { STACK_TYPE_TASK, (unsigned long)_stack_start, + (unsigned long)_stack_end }; + bool first = true; - for (row = 0; row < 4; row++) { - p = add_str(buf, row == 0 ? "GPRS:" : " "); - for (col = 0; col < 4; col++) { - p = add_str(p, " "); - p = add_val_as_hex(p, S390_lowcore.gpregs_save_area[row * 4 + col]); - } - add_str(p, "\n"); - sclp_early_printk(buf); + decompressor_printk("Call Trace:\n"); + while (!(sp & 0x7) && on_stack(&boot_stack, sp, sizeof(struct stack_frame))) { + struct stack_frame *sf = (struct stack_frame *)sp; + + decompressor_printk(first ? 
"(sp:%016lx [<%016lx>] %pS)\n" : + " sp:%016lx [<%016lx>] %pS\n", + sp, sf->gprs[8], (void *)sf->gprs[8]); + if (sf->back_chain <= sp) + break; + sp = sf->back_chain; + first = false; } } + +void print_pgm_check_info(void) +{ + unsigned long *gpregs = (unsigned long *)S390_lowcore.gpregs_save_area; + struct psw_bits *psw = &psw_bits(S390_lowcore.psw_save_area); + + decompressor_printk("Linux version %s\n", kernel_version); + if (!is_prot_virt_guest() && early_command_line[0]) + decompressor_printk("Kernel command line: %s\n", early_command_line); + decompressor_printk("Kernel fault: interruption code %04x ilc:%x\n", + S390_lowcore.pgm_code, S390_lowcore.pgm_ilc >> 1); + if (kaslr_enabled()) + decompressor_printk("Kernel random base: %lx\n", __kaslr_offset); + decompressor_printk("PSW : %016lx %016lx (%pS)\n", + S390_lowcore.psw_save_area.mask, + S390_lowcore.psw_save_area.addr, + (void *)S390_lowcore.psw_save_area.addr); + decompressor_printk( + " R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x P:%x AS:%x CC:%x PM:%x RI:%x EA:%x\n", + psw->per, psw->dat, psw->io, psw->ext, psw->key, psw->mcheck, + psw->wait, psw->pstate, psw->as, psw->cc, psw->pm, psw->ri, + psw->eaba); + decompressor_printk("GPRS: %016lx %016lx %016lx %016lx\n", + gpregs[0], gpregs[1], gpregs[2], gpregs[3]); + decompressor_printk(" %016lx %016lx %016lx %016lx\n", + gpregs[4], gpregs[5], gpregs[6], gpregs[7]); + decompressor_printk(" %016lx %016lx %016lx %016lx\n", + gpregs[8], gpregs[9], gpregs[10], gpregs[11]); + decompressor_printk(" %016lx %016lx %016lx %016lx\n", + gpregs[12], gpregs[13], gpregs[14], gpregs[15]); + print_stacktrace(S390_lowcore.gpregs_save_area[15]); + decompressor_printk("Last Breaking-Event-Address:\n"); + decompressor_printk(" [<%016lx>] %pS\n", (unsigned long)S390_lowcore.pgm_last_break, + (void *)S390_lowcore.pgm_last_break); +} diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c new file mode 100644 index 000000000000..0cf79826eef9 --- /dev/null +++ b/arch/s390/boot/physmem_info.c @@ -0,0 +1,328 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/processor.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <asm/physmem_info.h> +#include <asm/stacktrace.h> +#include <asm/boot_data.h> +#include <asm/sparsemem.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <asm/sclp.h> +#include <asm/uv.h> +#include "decompressor.h" +#include "boot.h" + +struct physmem_info __bootdata(physmem_info); +static unsigned int physmem_alloc_ranges; +static unsigned long physmem_alloc_pos; + +/* up to 256 storage elements, 1020 subincrements each */ +#define ENTRIES_EXTENDED_MAX \ + (256 * (1020 / 2) * sizeof(struct physmem_range)) + +static struct physmem_range *__get_physmem_range_ptr(u32 n) +{ + if (n < MEM_INLINED_ENTRIES) + return &physmem_info.online[n]; + if (unlikely(!physmem_info.online_extended)) { + physmem_info.online_extended = (struct physmem_range *)physmem_alloc_range( + RR_MEM_DETECT_EXTENDED, ENTRIES_EXTENDED_MAX, sizeof(long), 0, + physmem_alloc_pos, true); + } + return &physmem_info.online_extended[n - MEM_INLINED_ENTRIES]; +} + +/* + * sequential calls to add_physmem_online_range with adjacent memory ranges + * are merged together into single memory range. 
+ */ +void add_physmem_online_range(u64 start, u64 end) +{ + struct physmem_range *range; + + if (physmem_info.range_count) { + range = __get_physmem_range_ptr(physmem_info.range_count - 1); + if (range->end == start) { + range->end = end; + return; + } + } + + range = __get_physmem_range_ptr(physmem_info.range_count); + range->start = start; + range->end = end; + physmem_info.range_count++; +} + +static int __diag260(unsigned long rx1, unsigned long rx2) +{ + unsigned long reg1, reg2, ry; + union register_pair rx; + psw_t old; + int rc; + + rx.even = rx1; + rx.odd = rx2; + ry = 0x10; /* storage configuration */ + rc = -1; /* fail */ + asm volatile( + " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" + " epsw %[reg1],%[reg2]\n" + " st %[reg1],0(%[psw_pgm])\n" + " st %[reg2],4(%[psw_pgm])\n" + " larl %[reg1],1f\n" + " stg %[reg1],8(%[psw_pgm])\n" + " diag %[rx],%[ry],0x260\n" + " ipm %[rc]\n" + " srl %[rc],28\n" + "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" + : [reg1] "=&d" (reg1), + [reg2] "=&a" (reg2), + [rc] "+&d" (rc), + [ry] "+&d" (ry), + "+Q" (S390_lowcore.program_new_psw), + "=Q" (old) + : [rx] "d" (rx.pair), + [psw_old] "a" (&old), + [psw_pgm] "a" (&S390_lowcore.program_new_psw) + : "cc", "memory"); + return rc == 0 ? ry : -1; +} + +static int diag260(void) +{ + int rc, i; + + struct { + unsigned long start; + unsigned long end; + } storage_extents[8] __aligned(16); /* VM supports up to 8 extents */ + + memset(storage_extents, 0, sizeof(storage_extents)); + rc = __diag260((unsigned long)storage_extents, sizeof(storage_extents)); + if (rc == -1) + return -1; + + for (i = 0; i < min_t(int, rc, ARRAY_SIZE(storage_extents)); i++) + add_physmem_online_range(storage_extents[i].start, storage_extents[i].end + 1); + return 0; +} + +static int tprot(unsigned long addr) +{ + unsigned long reg1, reg2; + int rc = -EFAULT; + psw_t old; + + asm volatile( + " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" + " epsw %[reg1],%[reg2]\n" + " st %[reg1],0(%[psw_pgm])\n" + " st %[reg2],4(%[psw_pgm])\n" + " larl %[reg1],1f\n" + " stg %[reg1],8(%[psw_pgm])\n" + " tprot 0(%[addr]),0\n" + " ipm %[rc]\n" + " srl %[rc],28\n" + "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" + : [reg1] "=&d" (reg1), + [reg2] "=&a" (reg2), + [rc] "+&d" (rc), + "=Q" (S390_lowcore.program_new_psw.addr), + "=Q" (old) + : [psw_old] "a" (&old), + [psw_pgm] "a" (&S390_lowcore.program_new_psw), + [addr] "a" (addr) + : "cc", "memory"); + return rc; +} + +static unsigned long search_mem_end(void) +{ + unsigned long range = 1 << (MAX_PHYSMEM_BITS - 20); /* in 1MB blocks */ + unsigned long offset = 0; + unsigned long pivot; + + while (range > 1) { + range >>= 1; + pivot = offset + range; + if (!tprot(pivot << 20)) + offset = pivot; + } + return (offset + 1) << 20; +} + +unsigned long detect_max_physmem_end(void) +{ + unsigned long max_physmem_end = 0; + + if (!sclp_early_get_memsize(&max_physmem_end)) { + physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO; + } else { + max_physmem_end = search_mem_end(); + physmem_info.info_source = MEM_DETECT_BIN_SEARCH; + } + return max_physmem_end; +} + +void detect_physmem_online_ranges(unsigned long max_physmem_end) +{ + if (!sclp_early_read_storage_info()) { + physmem_info.info_source = MEM_DETECT_SCLP_STOR_INFO; + } else if (!diag260()) { + physmem_info.info_source = MEM_DETECT_DIAG260; + } else if (max_physmem_end) { + add_physmem_online_range(0, max_physmem_end); + } +} + +void physmem_set_usable_limit(unsigned long limit) +{ + physmem_info.usable = limit; + physmem_alloc_pos = limit; +} + +static void die_oom(unsigned 
long size, unsigned long align, unsigned long min, unsigned long max) +{ + unsigned long start, end, total_mem = 0, total_reserved_mem = 0; + struct reserved_range *range; + enum reserved_range_type t; + int i; + + decompressor_printk("Linux version %s\n", kernel_version); + if (!is_prot_virt_guest() && early_command_line[0]) + decompressor_printk("Kernel command line: %s\n", early_command_line); + decompressor_printk("Out of memory allocating %lx bytes %lx aligned in range %lx:%lx\n", + size, align, min, max); + decompressor_printk("Reserved memory ranges:\n"); + for_each_physmem_reserved_range(t, range, &start, &end) { + decompressor_printk("%016lx %016lx %s\n", start, end, get_rr_type_name(t)); + total_reserved_mem += end - start; + } + decompressor_printk("Usable online memory ranges (info source: %s [%x]):\n", + get_physmem_info_source(), physmem_info.info_source); + for_each_physmem_usable_range(i, &start, &end) { + decompressor_printk("%016lx %016lx\n", start, end); + total_mem += end - start; + } + decompressor_printk("Usable online memory total: %lx Reserved: %lx Free: %lx\n", + total_mem, total_reserved_mem, + total_mem > total_reserved_mem ? total_mem - total_reserved_mem : 0); + print_stacktrace(current_frame_address()); + sclp_early_printk("\n\n -- System halted\n"); + disabled_wait(); +} + +void physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size) +{ + physmem_info.reserved[type].start = addr; + physmem_info.reserved[type].end = addr + size; +} + +void physmem_free(enum reserved_range_type type) +{ + physmem_info.reserved[type].start = 0; + physmem_info.reserved[type].end = 0; +} + +static bool __physmem_alloc_intersects(unsigned long addr, unsigned long size, + unsigned long *intersection_start) +{ + unsigned long res_addr, res_size; + int t; + + for (t = 0; t < RR_MAX; t++) { + if (!get_physmem_reserved(t, &res_addr, &res_size)) + continue; + if (intersects(addr, size, res_addr, res_size)) { + *intersection_start = res_addr; + return true; + } + } + return ipl_report_certs_intersects(addr, size, intersection_start); +} + +static unsigned long __physmem_alloc_range(unsigned long size, unsigned long align, + unsigned long min, unsigned long max, + unsigned int from_ranges, unsigned int *ranges_left, + bool die_on_oom) +{ + unsigned int nranges = from_ranges ?: physmem_info.range_count; + unsigned long range_start, range_end; + unsigned long intersection_start; + unsigned long addr, pos = max; + + align = max(align, 8UL); + while (nranges) { + __get_physmem_range(nranges - 1, &range_start, &range_end, false); + pos = min(range_end, pos); + + if (round_up(min, align) + size > pos) + break; + addr = round_down(pos - size, align); + if (range_start > addr) { + nranges--; + continue; + } + if (__physmem_alloc_intersects(addr, size, &intersection_start)) { + pos = intersection_start; + continue; + } + + if (ranges_left) + *ranges_left = nranges; + return addr; + } + if (die_on_oom) + die_oom(size, align, min, max); + return 0; +} + +unsigned long physmem_alloc_range(enum reserved_range_type type, unsigned long size, + unsigned long align, unsigned long min, unsigned long max, + bool die_on_oom) +{ + unsigned long addr; + + max = min(max, physmem_alloc_pos); + addr = __physmem_alloc_range(size, align, min, max, 0, NULL, die_on_oom); + if (addr) + physmem_reserve(type, addr, size); + return addr; +} + +unsigned long physmem_alloc_top_down(enum reserved_range_type type, unsigned long size, + unsigned long align) +{ + struct reserved_range *range = 
&physmem_info.reserved[type]; + struct reserved_range *new_range; + unsigned int ranges_left; + unsigned long addr; + + addr = __physmem_alloc_range(size, align, 0, physmem_alloc_pos, physmem_alloc_ranges, + &ranges_left, true); + /* if not a consecutive allocation of the same type or first allocation */ + if (range->start != addr + size) { + if (range->end) { + physmem_alloc_pos = __physmem_alloc_range( + sizeof(struct reserved_range), 0, 0, physmem_alloc_pos, + physmem_alloc_ranges, &ranges_left, true); + new_range = (struct reserved_range *)physmem_alloc_pos; + *new_range = *range; + range->chain = new_range; + addr = __physmem_alloc_range(size, align, 0, physmem_alloc_pos, + ranges_left, &ranges_left, true); + } + range->end = addr + size; + } + range->start = addr; + physmem_alloc_pos = addr; + physmem_alloc_ranges = ranges_left; + return addr; +} + +unsigned long get_physmem_alloc_pos(void) +{ + return physmem_alloc_pos; +} diff --git a/arch/s390/boot/sclp_early_core.c b/arch/s390/boot/sclp_early_core.c index 5a19fd7020b5..6f30646afbd0 100644 --- a/arch/s390/boot/sclp_early_core.c +++ b/arch/s390/boot/sclp_early_core.c @@ -1,2 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 +#include "boot.h" #include "../../../drivers/s390/char/sclp_early_core.c" + +/* SCLP early buffer must stay page-aligned and below 2GB */ +static char __sclp_early_sccb[EXT_SCCB_READ_SCP] __aligned(PAGE_SIZE); + +void sclp_early_setup_buffer(void) +{ + sclp_early_set_buffer(&__sclp_early_sccb); +} diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 3b3a11f95269..9cc76e631759 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -1,55 +1,41 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/string.h> #include <linux/elf.h> +#include <asm/page-states.h> +#include <asm/boot_data.h> #include <asm/sections.h> +#include <asm/maccess.h> +#include <asm/cpu_mf.h> #include <asm/setup.h> +#include <asm/kasan.h> #include <asm/kexec.h> #include <asm/sclp.h> #include <asm/diag.h> #include <asm/uv.h> -#include "compressed/decompressor.h" +#include <asm/abs_lowcore.h> +#include <asm/physmem_info.h> +#include "decompressor.h" #include "boot.h" +#include "uv.h" -extern char __boot_data_start[], __boot_data_end[]; -extern char __boot_data_preserved_start[], __boot_data_preserved_end[]; unsigned long __bootdata_preserved(__kaslr_offset); +unsigned long __bootdata_preserved(__abs_lowcore); +unsigned long __bootdata_preserved(__memcpy_real_area); +pte_t *__bootdata_preserved(memcpy_real_ptep); +unsigned long __bootdata_preserved(VMALLOC_START); +unsigned long __bootdata_preserved(VMALLOC_END); +struct page *__bootdata_preserved(vmemmap); +unsigned long __bootdata_preserved(vmemmap_size); +unsigned long __bootdata_preserved(MODULES_VADDR); +unsigned long __bootdata_preserved(MODULES_END); +unsigned long __bootdata_preserved(max_mappable); +unsigned long __bootdata(ident_map_size); -/* - * Some code and data needs to stay below 2 GB, even when the kernel would be - * relocated above 2 GB, because it has to use 31 bit addresses. - * Such code and data is part of the .dma section, and its location is passed - * over to the decompressed / relocated kernel via the .boot.preserved.data - * section. 
- */ -extern char _sdma[], _edma[]; -extern char _stext_dma[], _etext_dma[]; -extern struct exception_table_entry _start_dma_ex_table[]; -extern struct exception_table_entry _stop_dma_ex_table[]; -unsigned long __bootdata_preserved(__sdma) = __pa(&_sdma); -unsigned long __bootdata_preserved(__edma) = __pa(&_edma); -unsigned long __bootdata_preserved(__stext_dma) = __pa(&_stext_dma); -unsigned long __bootdata_preserved(__etext_dma) = __pa(&_etext_dma); -struct exception_table_entry * - __bootdata_preserved(__start_dma_ex_table) = _start_dma_ex_table; -struct exception_table_entry * - __bootdata_preserved(__stop_dma_ex_table) = _stop_dma_ex_table; - -int _diag210_dma(struct diag210 *addr); -int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode); -int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode); -void _diag0c_dma(struct hypfs_diag0c_entry *entry); -void _diag308_reset_dma(void); -struct diag_ops __bootdata_preserved(diag_dma_ops) = { - .diag210 = _diag210_dma, - .diag26c = _diag26c_dma, - .diag14 = _diag14_dma, - .diag0c = _diag0c_dma, - .diag308_reset = _diag308_reset_dma -}; -static struct diag210 _diag210_tmp_dma __section(.dma.data); -struct diag210 *__bootdata_preserved(__diag210_tmp_dma) = &_diag210_tmp_dma; -void _swsusp_reset_dma(void); -unsigned long __bootdata_preserved(__swsusp_reset_dma) = __pa(_swsusp_reset_dma); +u64 __bootdata_preserved(stfle_fac_list[16]); +u64 __bootdata_preserved(alt_stfle_fac_list[16]); +struct oldmem_data __bootdata_preserved(oldmem_data); + +struct machine_info machine; void error(char *x) { @@ -60,6 +46,68 @@ void error(char *x) disabled_wait(); } +static void detect_facilities(void) +{ + if (test_facility(8)) { + machine.has_edat1 = 1; + local_ctl_set_bit(0, CR0_EDAT_BIT); + } + if (test_facility(78)) + machine.has_edat2 = 1; + if (test_facility(130)) + machine.has_nx = 1; +} + +static int cmma_test_essa(void) +{ + unsigned long reg1, reg2, tmp = 0; + int rc = 1; + psw_t old; + + /* Test ESSA_GET_STATE */ + asm volatile( + " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" + " epsw %[reg1],%[reg2]\n" + " st %[reg1],0(%[psw_pgm])\n" + " st %[reg2],4(%[psw_pgm])\n" + " larl %[reg1],1f\n" + " stg %[reg1],8(%[psw_pgm])\n" + " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n" + " la %[rc],0\n" + "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" + : [reg1] "=&d" (reg1), + [reg2] "=&a" (reg2), + [rc] "+&d" (rc), + [tmp] "=&d" (tmp), + "+Q" (S390_lowcore.program_new_psw), + "=Q" (old) + : [psw_old] "a" (&old), + [psw_pgm] "a" (&S390_lowcore.program_new_psw), + [cmd] "i" (ESSA_GET_STATE) + : "cc", "memory"); + return rc; +} + +static void cmma_init(void) +{ + if (!cmma_flag) + return; + if (cmma_test_essa()) { + cmma_flag = 0; + return; + } + if (test_facility(147)) + cmma_flag = 2; +} + +static void setup_lpp(void) +{ + S390_lowcore.current_pid = 0; + S390_lowcore.lpp = LPP_MAGIC; + if (test_facility(40)) + lpp(&S390_lowcore.lpp); +} + #ifdef CONFIG_KERNEL_UNCOMPRESSED unsigned long mem_safe_offset(void) { @@ -67,16 +115,20 @@ unsigned long mem_safe_offset(void) } #endif -static void rescue_initrd(unsigned long addr) +static void rescue_initrd(unsigned long min, unsigned long max) { + unsigned long old_addr, addr, size; + if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD)) return; - if (!INITRD_START || !INITRD_SIZE) + if (!get_physmem_reserved(RR_INITRD, &addr, &size)) return; - if (addr <= INITRD_START) + if (addr >= min && addr + size <= max) return; - memmove((void *)addr, (void *)INITRD_START, INITRD_SIZE); - INITRD_START = addr; + old_addr = addr; + 
physmem_free(RR_INITRD); + addr = physmem_alloc_top_down(RR_INITRD, size, 0); + memmove((void *)addr, (void *)old_addr, size); } static void copy_bootdata(void) @@ -120,64 +172,252 @@ static void handle_relocs(unsigned long offset) } } -static void clear_bss_section(void) +/* + * Merge information from several sources into a single ident_map_size value. + * "ident_map_size" represents the upper limit of physical memory we may ever + * reach. It might not be all online memory, but might also include standby + * (offline) memory. "ident_map_size" could be lower than the actual standby or + * even online memory present, due to limiting factors. We should never go + * above this limit. It is the size of our identity mapping. + * + * Consider the following factors: + * 1. max_physmem_end - end of physical memory online or standby. + * Always >= end of the last online memory range (get_physmem_online_end()). + * 2. CONFIG_MAX_PHYSMEM_BITS - the maximum size of physical memory the + * kernel is able to support. + * 3. "mem=" kernel command line option which limits physical memory usage. + * 4. OLDMEM_BASE which is a kdump memory limit when the kernel is executed as + * a crash kernel. + * 5. "hsa" size which is a memory limit when the kernel is executed during + * zfcp/nvme dump. + */ +static void setup_ident_map_size(unsigned long max_physmem_end) +{ + unsigned long hsa_size; + + ident_map_size = max_physmem_end; + if (memory_limit) + ident_map_size = min(ident_map_size, memory_limit); + ident_map_size = min(ident_map_size, 1UL << MAX_PHYSMEM_BITS); + +#ifdef CONFIG_CRASH_DUMP + if (oldmem_data.start) { + __kaslr_enabled = 0; + ident_map_size = min(ident_map_size, oldmem_data.size); + } else if (ipl_block_valid && is_ipl_block_dump()) { + __kaslr_enabled = 0; + if (!sclp_early_get_hsa_size(&hsa_size) && hsa_size) + ident_map_size = min(ident_map_size, hsa_size); + } +#endif +} + +static unsigned long setup_kernel_memory_layout(void) +{ + unsigned long vmemmap_start; + unsigned long asce_limit; + unsigned long rte_size; + unsigned long pages; + unsigned long vsize; + unsigned long vmax; + + pages = ident_map_size / PAGE_SIZE; + /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */ + vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page); + + /* choose kernel address space layout: 4 or 3 levels. */ + vsize = round_up(ident_map_size, _REGION3_SIZE) + vmemmap_size + + MODULES_LEN + MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE; + vsize = size_add(vsize, vmalloc_size); + if (IS_ENABLED(CONFIG_KASAN) || (vsize > _REGION2_SIZE)) { + asce_limit = _REGION1_SIZE; + rte_size = _REGION2_SIZE; + } else { + asce_limit = _REGION2_SIZE; + rte_size = _REGION3_SIZE; + } + + /* + * Force modules and the vmalloc area below the ultravisor + * secure storage limit, so that any vmalloc allocation + * we do can be used to back secure guest storage. 
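+ * + * A minimal illustration (values assumed purely for the sake of example): + * on a prot_virt=1 host with asce_limit = _REGION2_SIZE and a machine + * reporting a smaller uv_info.max_sec_stor_addr, adjust_to_uv_max() below + * returns the latter, so everything laid out downwards from vmax stays + * addressable as backing for secure guest storage. 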
+ */ + vmax = adjust_to_uv_max(asce_limit); +#ifdef CONFIG_KASAN + /* force vmalloc and modules below kasan shadow */ + vmax = min(vmax, KASAN_SHADOW_START); +#endif + __memcpy_real_area = round_down(vmax - MEMCPY_REAL_SIZE, PAGE_SIZE); + __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE, + sizeof(struct lowcore)); + MODULES_END = round_down(__abs_lowcore, _SEGMENT_SIZE); + MODULES_VADDR = MODULES_END - MODULES_LEN; + VMALLOC_END = MODULES_VADDR; + + /* allow the vmalloc area to occupy up to about half of the remaining virtual space */ + vsize = round_down(VMALLOC_END / 2, _SEGMENT_SIZE); + vmalloc_size = min(vmalloc_size, vsize); + VMALLOC_START = VMALLOC_END - vmalloc_size; + + /* split remaining virtual space between 1:1 mapping & vmemmap array */ + pages = VMALLOC_START / (PAGE_SIZE + sizeof(struct page)); + pages = SECTION_ALIGN_UP(pages); + /* keep vmemmap_start aligned to a top level region table entry */ + vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size); + vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS); + /* maximum mappable address as seen by arch_get_mappable_range() */ + max_mappable = vmemmap_start; + /* make sure the identity map doesn't overlap with vmemmap */ + ident_map_size = min(ident_map_size, vmemmap_start); + vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page); + /* make sure vmemmap doesn't overlap with the vmalloc area */ + VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START); + vmemmap = (struct page *)vmemmap_start; + + return asce_limit; +} + +/* + * This function clears the BSS section of the decompressed Linux kernel and NOT the decompressor's. + */ +static void clear_bss_section(unsigned long vmlinux_lma) +{ + memset((void *)vmlinux_lma + vmlinux.image_size, 0, vmlinux.bss_size); +} + +/* + * Set the vmalloc area size to an eighth of the (potential) physical memory + * size, unless the size has been set by a kernel command line parameter. + */ +static void setup_vmalloc_size(void) +{ + unsigned long size; + + if (vmalloc_size_set) + return; + size = round_up(ident_map_size / 8, _SEGMENT_SIZE); + vmalloc_size = max(size, vmalloc_size); +} + +static void offset_vmlinux_info(unsigned long offset) { - memset((void *)vmlinux.default_lma + vmlinux.image_size, 0, vmlinux.bss_size); + *(unsigned long *)(&vmlinux.entry) += offset; + vmlinux.bootdata_off += offset; + vmlinux.bootdata_preserved_off += offset; + vmlinux.rela_dyn_start += offset; + vmlinux.rela_dyn_end += offset; + vmlinux.dynsym_start += offset; + vmlinux.init_mm_off += offset; + vmlinux.swapper_pg_dir_off += offset; + vmlinux.invalid_pg_dir_off += offset; +#ifdef CONFIG_KASAN + vmlinux.kasan_early_shadow_page_off += offset; + vmlinux.kasan_early_shadow_pte_off += offset; + vmlinux.kasan_early_shadow_pmd_off += offset; + vmlinux.kasan_early_shadow_pud_off += offset; + vmlinux.kasan_early_shadow_p4d_off += offset; +#endif } void startup_kernel(void) { - unsigned long random_lma; + unsigned long max_physmem_end; + unsigned long vmlinux_lma = 0; + unsigned long amode31_lma = 0; + unsigned long asce_limit; unsigned long safe_addr; void *img; + psw_t psw; - store_ipl_parmblock(); + setup_lpp(); safe_addr = mem_safe_offset(); - safe_addr = read_ipl_report(safe_addr); + + /* + * Reserve decompressor memory together with the decompression heap and + * buffer, and the memory which might be occupied by the uncompressed + * kernel at the default 1 MB position (if KASLR is off or has failed). 
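+ * + * Note that physmem_reserve() only records the range in + * physmem_info.reserved[]; nothing is mapped or zeroed here. Later + * allocations through __physmem_alloc_range() consult these records via + * __physmem_alloc_intersects() and step over any intersecting range. 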
+ */ + physmem_reserve(RR_DECOMPRESSOR, 0, safe_addr); + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && parmarea.initrd_size) + physmem_reserve(RR_INITRD, parmarea.initrd_start, parmarea.initrd_size); + oldmem_data.start = parmarea.oldmem_base; + oldmem_data.size = parmarea.oldmem_size; + + store_ipl_parmblock(); + read_ipl_report(); uv_query_info(); - rescue_initrd(safe_addr); sclp_early_read_info(); setup_boot_command_line(); parse_boot_command_line(); - setup_memory_end(); - detect_memory(); - - random_lma = __kaslr_offset = 0; - if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled) { - random_lma = get_random_base(safe_addr); - if (random_lma) { - __kaslr_offset = random_lma - vmlinux.default_lma; - img = (void *)vmlinux.default_lma; - vmlinux.default_lma += __kaslr_offset; - vmlinux.entry += __kaslr_offset; - vmlinux.bootdata_off += __kaslr_offset; - vmlinux.bootdata_preserved_off += __kaslr_offset; - vmlinux.rela_dyn_start += __kaslr_offset; - vmlinux.rela_dyn_end += __kaslr_offset; - vmlinux.dynsym_start += __kaslr_offset; + detect_facilities(); + cmma_init(); + sanitize_prot_virt_host(); + max_physmem_end = detect_max_physmem_end(); + setup_ident_map_size(max_physmem_end); + setup_vmalloc_size(); + asce_limit = setup_kernel_memory_layout(); + /* ident_map_size is final now, physmem allocations can be performed */ + physmem_set_usable_limit(ident_map_size); + detect_physmem_online_ranges(max_physmem_end); + save_ipl_cert_comp_list(); + rescue_initrd(safe_addr, ident_map_size); + + if (kaslr_enabled()) { + vmlinux_lma = randomize_within_range(vmlinux.image_size + vmlinux.bss_size, + THREAD_SIZE, vmlinux.default_lma, + ident_map_size); + if (vmlinux_lma) { + __kaslr_offset = vmlinux_lma - vmlinux.default_lma; + offset_vmlinux_info(__kaslr_offset); } } + vmlinux_lma = vmlinux_lma ?: vmlinux.default_lma; + physmem_reserve(RR_VMLINUX, vmlinux_lma, vmlinux.image_size + vmlinux.bss_size); if (!IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) { img = decompress_kernel(); - memmove((void *)vmlinux.default_lma, img, vmlinux.image_size); - } else if (__kaslr_offset) - memcpy((void *)vmlinux.default_lma, img, vmlinux.image_size); + memmove((void *)vmlinux_lma, img, vmlinux.image_size); + } else if (__kaslr_offset) { + img = (void *)vmlinux.default_lma; + memmove((void *)vmlinux_lma, img, vmlinux.image_size); + memset(img, 0, vmlinux.image_size); + } - clear_bss_section(); + /* vmlinux decompression is done, shrink reserved low memory */ + physmem_reserve(RR_DECOMPRESSOR, 0, (unsigned long)_decompressor_end); + if (kaslr_enabled()) + amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, 0, SZ_2G); + amode31_lma = amode31_lma ?: vmlinux.default_lma - vmlinux.amode31_size; + physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size); + + /* + * The order of the following operations is important: + * + * - handle_relocs() must follow clear_bss_section() to establish static + * memory references to data in .bss to be used by setup_vmem() + * (i.e. init_mm.pgd) + * + * - setup_vmem() must follow handle_relocs() to be able to use + * static memory references to data in .bss (i.e. init_mm.pgd) + * + * - copy_bootdata() must follow setup_vmem() to propagate changes to + * bootdata made by setup_vmem() + */ + clear_bss_section(vmlinux_lma); + handle_relocs(__kaslr_offset); + setup_vmem(asce_limit); copy_bootdata(); - if (IS_ENABLED(CONFIG_RELOCATABLE)) - handle_relocs(__kaslr_offset); - - if (__kaslr_offset) { - /* - * Save KASLR offset for early dumps, before vmcore_info is set. 
- * Mark as uneven to distinguish from real vmcore_info pointer. - */ - S390_lowcore.vmcore_info = __kaslr_offset | 0x1UL; - /* Clear non-relocated kernel */ - if (IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) - memset(img, 0, vmlinux.image_size); - } - vmlinux.entry(); + + /* + * Save KASLR offset for early dumps, before vmcore_info is set. + * Mark as uneven to distinguish from real vmcore_info pointer. + */ + S390_lowcore.vmcore_info = __kaslr_offset ? __kaslr_offset | 0x1UL : 0; + + /* + * Jump to the decompressed kernel entry point and switch DAT mode on. + */ + psw.addr = vmlinux.entry; + psw.mask = PSW_KERNEL_BITS; + __load_psw(psw); } diff --git a/arch/s390/boot/string.c b/arch/s390/boot/string.c index b11e8108773a..faccb33b462c 100644 --- a/arch/s390/boot/string.c +++ b/arch/s390/boot/string.c @@ -3,6 +3,7 @@ #include <linux/kernel.h> #include <linux/errno.h> #undef CONFIG_KASAN +#undef CONFIG_KASAN_GENERIC #include "../lib/string.c" int strncmp(const char *cs, const char *ct, size_t count) diff --git a/arch/s390/boot/text_dma.S b/arch/s390/boot/text_dma.S deleted file mode 100644 index 9715715c4c28..000000000000 --- a/arch/s390/boot/text_dma.S +++ /dev/null @@ -1,184 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Code that needs to run below 2 GB. - * - * Copyright IBM Corp. 2019 - */ - -#include <linux/linkage.h> -#include <asm/errno.h> -#include <asm/sigp.h> - -#ifdef CC_USING_EXPOLINE - .pushsection .dma.text.__s390_indirect_jump_r14,"axG" -__dma__s390_indirect_jump_r14: - larl %r1,0f - ex 0,0(%r1) - j . -0: br %r14 - .popsection -#endif - - .section .dma.text,"ax" -/* - * Simplified version of expoline thunk. The normal thunks can not be used here, - * because they might be more than 2 GB away, and not reachable by the relative - * branch. No comdat, exrl, etc. optimizations used here, because it only - * affects a few functions that are not performance-relevant. 
- */ - .macro BR_EX_DMA_r14 -#ifdef CC_USING_EXPOLINE - jg __dma__s390_indirect_jump_r14 -#else - br %r14 -#endif - .endm - -/* - * int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode) - */ -ENTRY(_diag14_dma) - lgr %r1,%r2 - lgr %r2,%r3 - lgr %r3,%r4 - lhi %r5,-EIO - sam31 - diag %r1,%r2,0x14 -.Ldiag14_ex: - ipm %r5 - srl %r5,28 -.Ldiag14_fault: - sam64 - lgfr %r2,%r5 - BR_EX_DMA_r14 - EX_TABLE_DMA(.Ldiag14_ex, .Ldiag14_fault) -ENDPROC(_diag14_dma) - -/* - * int _diag210_dma(struct diag210 *addr) - */ -ENTRY(_diag210_dma) - lgr %r1,%r2 - lhi %r2,-1 - sam31 - diag %r1,%r0,0x210 -.Ldiag210_ex: - ipm %r2 - srl %r2,28 -.Ldiag210_fault: - sam64 - lgfr %r2,%r2 - BR_EX_DMA_r14 - EX_TABLE_DMA(.Ldiag210_ex, .Ldiag210_fault) -ENDPROC(_diag210_dma) - -/* - * int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode) - */ -ENTRY(_diag26c_dma) - lghi %r5,-EOPNOTSUPP - sam31 - diag %r2,%r4,0x26c -.Ldiag26c_ex: - sam64 - lgfr %r2,%r5 - BR_EX_DMA_r14 - EX_TABLE_DMA(.Ldiag26c_ex, .Ldiag26c_ex) -ENDPROC(_diag26c_dma) - -/* - * void _diag0c_dma(struct hypfs_diag0c_entry *entry) - */ -ENTRY(_diag0c_dma) - sam31 - diag %r2,%r2,0x0c - sam64 - BR_EX_DMA_r14 -ENDPROC(_diag0c_dma) - -/* - * void _swsusp_reset_dma(void) - */ -ENTRY(_swsusp_reset_dma) - larl %r1,restart_entry - larl %r2,.Lrestart_diag308_psw - og %r1,0(%r2) - stg %r1,0(%r0) - lghi %r0,0 - diag %r0,%r0,0x308 -restart_entry: - lhi %r1,1 - sigp %r1,%r0,SIGP_SET_ARCHITECTURE - sam64 - BR_EX_DMA_r14 -ENDPROC(_swsusp_reset_dma) - -/* - * void _diag308_reset_dma(void) - * - * Calls diag 308 subcode 1 and continues execution - */ -ENTRY(_diag308_reset_dma) - larl %r4,.Lctlregs # Save control registers - stctg %c0,%c15,0(%r4) - lg %r2,0(%r4) # Disable lowcore protection - nilh %r2,0xefff - larl %r4,.Lctlreg0 - stg %r2,0(%r4) - lctlg %c0,%c0,0(%r4) - larl %r4,.Lfpctl # Floating point control register - stfpc 0(%r4) - larl %r4,.Lprefix # Save prefix register - stpx 0(%r4) - larl %r4,.Lprefix_zero # Set prefix register to 0 - spx 0(%r4) - larl %r4,.Lcontinue_psw # Save PSW flags - epsw %r2,%r3 - stm %r2,%r3,0(%r4) - larl %r4,restart_part2 # Setup restart PSW at absolute 0 - larl %r3,.Lrestart_diag308_psw - og %r4,0(%r3) # Save PSW - lghi %r3,0 - sturg %r4,%r3 # Use sturg, because of large pages - lghi %r1,1 - lghi %r0,0 - diag %r0,%r1,0x308 -restart_part2: - lhi %r0,0 # Load r0 with zero - lhi %r1,2 # Use mode 2 = ESAME (dump) - sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode - sam64 # Switch to 64 bit addressing mode - larl %r4,.Lctlregs # Restore control registers - lctlg %c0,%c15,0(%r4) - larl %r4,.Lfpctl # Restore floating point ctl register - lfpc 0(%r4) - larl %r4,.Lprefix # Restore prefix register - spx 0(%r4) - larl %r4,.Lcontinue_psw # Restore PSW flags - lpswe 0(%r4) -.Lcontinue: - BR_EX_DMA_r14 -ENDPROC(_diag308_reset_dma) - - .section .dma.data,"aw",@progbits -.align 8 -.Lrestart_diag308_psw: - .long 0x00080000,0x80000000 - -.align 8 -.Lcontinue_psw: - .quad 0,.Lcontinue - -.align 8 -.Lctlreg0: - .quad 0 -.Lctlregs: - .rept 16 - .quad 0 - .endr -.Lfpctl: - .long 0 -.Lprefix: - .long 0 -.Lprefix_zero: - .long 0 diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c index ed007f4a6444..1e66d2cbb096 100644 --- a/arch/s390/boot/uv.c +++ b/arch/s390/boot/uv.c @@ -1,9 +1,20 @@ // SPDX-License-Identifier: GPL-2.0 #include <asm/uv.h> +#include <asm/boot_data.h> #include <asm/facility.h> #include <asm/sections.h> +#include "boot.h" +#include "uv.h" + +/* will be used in arch/s390/kernel/uv.c */ +#ifdef 
CONFIG_PROTECTED_VIRTUALIZATION_GUEST int __bootdata_preserved(prot_virt_guest); +#endif +#if IS_ENABLED(CONFIG_KVM) +int __bootdata_preserved(prot_virt_host); +#endif +struct uv_info __bootdata_preserved(uv_info); void uv_query_info(void) { @@ -15,10 +26,70 @@ void uv_query_info(void) if (!test_facility(158)) return; - if (uv_call(0, (uint64_t)&uvcb)) + /* rc==0x100 means that there is additional data we do not process */ + if (uv_call(0, (uint64_t)&uvcb) && uvcb.header.rc != 0x100) return; + if (IS_ENABLED(CONFIG_KVM)) { + memcpy(uv_info.inst_calls_list, uvcb.inst_calls_list, sizeof(uv_info.inst_calls_list)); + uv_info.uv_base_stor_len = uvcb.uv_base_stor_len; + uv_info.guest_base_stor_len = uvcb.conf_base_phys_stor_len; + uv_info.guest_virt_base_stor_len = uvcb.conf_base_virt_stor_len; + uv_info.guest_virt_var_stor_len = uvcb.conf_virt_var_stor_len; + uv_info.guest_cpu_stor_len = uvcb.cpu_stor_len; + uv_info.max_sec_stor_addr = ALIGN(uvcb.max_guest_stor_addr, PAGE_SIZE); + uv_info.max_num_sec_conf = uvcb.max_num_sec_conf; + uv_info.max_guest_cpu_id = uvcb.max_guest_cpu_id; + uv_info.uv_feature_indications = uvcb.uv_feature_indications; + uv_info.supp_se_hdr_ver = uvcb.supp_se_hdr_versions; + uv_info.supp_se_hdr_pcf = uvcb.supp_se_hdr_pcf; + uv_info.conf_dump_storage_state_len = uvcb.conf_dump_storage_state_len; + uv_info.conf_dump_finalize_len = uvcb.conf_dump_finalize_len; + uv_info.supp_att_req_hdr_ver = uvcb.supp_att_req_hdr_ver; + uv_info.supp_att_pflags = uvcb.supp_att_pflags; + uv_info.supp_add_secret_req_ver = uvcb.supp_add_secret_req_ver; + uv_info.supp_add_secret_pcf = uvcb.supp_add_secret_pcf; + uv_info.supp_secret_types = uvcb.supp_secret_types; + uv_info.max_secrets = uvcb.max_secrets; + } + +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) && test_bit_inv(BIT_UVC_CMD_REMOVE_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list)) prot_virt_guest = 1; +#endif +} + +#if IS_ENABLED(CONFIG_KVM) +unsigned long adjust_to_uv_max(unsigned long limit) +{ + if (is_prot_virt_host() && uv_info.max_sec_stor_addr) + limit = min_t(unsigned long, limit, uv_info.max_sec_stor_addr); + return limit; +} + +static int is_prot_virt_host_capable(void) +{ + /* disable if no prot_virt=1 given on command-line */ + if (!is_prot_virt_host()) + return 0; + /* disable if protected guest virtualization is enabled */ + if (is_prot_virt_guest()) + return 0; + /* disable if no hardware support */ + if (!test_facility(158)) + return 0; + /* disable if kdump */ + if (oldmem_data.start) + return 0; + /* disable if stand-alone dump */ + if (ipl_block_valid && is_ipl_block_dump()) + return 0; + return 1; +} + +void sanitize_prot_virt_host(void) +{ + prot_virt_host = is_prot_virt_host_capable(); } +#endif diff --git a/arch/s390/boot/uv.h b/arch/s390/boot/uv.h new file mode 100644 index 000000000000..0f3070856f8d --- /dev/null +++ b/arch/s390/boot/uv.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_UV_H +#define BOOT_UV_H + +#if IS_ENABLED(CONFIG_KVM) +unsigned long adjust_to_uv_max(unsigned long limit); +void sanitize_prot_virt_host(void); +#else +static inline unsigned long adjust_to_uv_max(unsigned long limit) +{ + return limit; +} +static inline void sanitize_prot_virt_host(void) {} +#endif + +#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) +void uv_query_info(void); +#else +static inline void uv_query_info(void) {} +#endif + +#endif /* BOOT_UV_H */ diff --git 
a/arch/s390/boot/version.c b/arch/s390/boot/version.c index d32e58bdda6a..fd32f038777f 100644 --- a/arch/s390/boot/version.c +++ b/arch/s390/boot/version.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <generated/utsversion.h> #include <generated/utsrelease.h> #include <generated/compile.h> #include "boot.h" diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c new file mode 100644 index 000000000000..e3a4500a5a75 --- /dev/null +++ b/arch/s390/boot/vmem.c @@ -0,0 +1,474 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sched/task.h> +#include <linux/pgtable.h> +#include <linux/kasan.h> +#include <asm/page-states.h> +#include <asm/pgalloc.h> +#include <asm/facility.h> +#include <asm/sections.h> +#include <asm/ctlreg.h> +#include <asm/physmem_info.h> +#include <asm/maccess.h> +#include <asm/abs_lowcore.h> +#include "decompressor.h" +#include "boot.h" + +struct ctlreg __bootdata_preserved(s390_invalid_asce); + +#ifdef CONFIG_PROC_FS +atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); +#endif + +#define init_mm (*(struct mm_struct *)vmlinux.init_mm_off) +#define swapper_pg_dir vmlinux.swapper_pg_dir_off +#define invalid_pg_dir vmlinux.invalid_pg_dir_off + +enum populate_mode { + POPULATE_NONE, + POPULATE_DIRECT, + POPULATE_ABS_LOWCORE, +#ifdef CONFIG_KASAN + POPULATE_KASAN_MAP_SHADOW, + POPULATE_KASAN_ZERO_SHADOW, + POPULATE_KASAN_SHALLOW +#endif +}; + +static void pgtable_populate(unsigned long addr, unsigned long end, enum populate_mode mode); + +#ifdef CONFIG_KASAN + +#define kasan_early_shadow_page vmlinux.kasan_early_shadow_page_off +#define kasan_early_shadow_pte ((pte_t *)vmlinux.kasan_early_shadow_pte_off) +#define kasan_early_shadow_pmd ((pmd_t *)vmlinux.kasan_early_shadow_pmd_off) +#define kasan_early_shadow_pud ((pud_t *)vmlinux.kasan_early_shadow_pud_off) +#define kasan_early_shadow_p4d ((p4d_t *)vmlinux.kasan_early_shadow_p4d_off) +#define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) + +static pte_t pte_z; + +static inline void kasan_populate(unsigned long start, unsigned long end, enum populate_mode mode) +{ + start = PAGE_ALIGN_DOWN(__sha(start)); + end = PAGE_ALIGN(__sha(end)); + pgtable_populate(start, end, mode); +} + +static void kasan_populate_shadow(void) +{ + pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); + pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY); + p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY); + unsigned long memgap_start = 0; + unsigned long untracked_end; + unsigned long start, end; + int i; + + pte_z = __pte(__pa(kasan_early_shadow_page) | pgprot_val(PAGE_KERNEL_RO)); + if (!machine.has_nx) + pte_z = clear_pte_bit(pte_z, __pgprot(_PAGE_NOEXEC)); + crst_table_init((unsigned long *)kasan_early_shadow_p4d, p4d_val(p4d_z)); + crst_table_init((unsigned long *)kasan_early_shadow_pud, pud_val(pud_z)); + crst_table_init((unsigned long *)kasan_early_shadow_pmd, pmd_val(pmd_z)); + memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE); + __arch_set_page_dat(kasan_early_shadow_p4d, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat(kasan_early_shadow_pud, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat(kasan_early_shadow_pmd, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat(kasan_early_shadow_pte, 1); + + /* + * Current memory layout: + * +- 0 -------------+ +- shadow start -+ + * |1:1 ident mapping| /|1/8 of ident map| + * | | / | | + * +-end of ident map+ / +----------------+ + * | ... gap ... 
| / | kasan | + * | | / | zero page | + * +- vmalloc area -+ / | mapping | + * | vmalloc_size | / | (untracked) | + * +- modules vaddr -+ / +----------------+ + * | 2Gb |/ | unmapped | allocated per module + * +- shadow start -+ +----------------+ + * | 1/8 addr space | | zero pg mapping| (untracked) + * +- shadow end ----+---------+- shadow end ---+ + * + * Current memory layout (KASAN_VMALLOC): + * +- 0 -------------+ +- shadow start -+ + * |1:1 ident mapping| /|1/8 of ident map| + * | | / | | + * +-end of ident map+ / +----------------+ + * | ... gap ... | / | kasan zero page| (untracked) + * | | / | mapping | + * +- vmalloc area -+ / +----------------+ + * | vmalloc_size | / |shallow populate| + * +- modules vaddr -+ / +----------------+ + * | 2Gb |/ |shallow populate| + * +- shadow start -+ +----------------+ + * | 1/8 addr space | | zero pg mapping| (untracked) + * +- shadow end ----+---------+- shadow end ---+ + */ + + for_each_physmem_usable_range(i, &start, &end) { + kasan_populate(start, end, POPULATE_KASAN_MAP_SHADOW); + if (memgap_start && physmem_info.info_source == MEM_DETECT_DIAG260) + kasan_populate(memgap_start, start, POPULATE_KASAN_ZERO_SHADOW); + memgap_start = end; + } + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { + untracked_end = VMALLOC_START; + /* shallowly populate kasan shadow for vmalloc and modules */ + kasan_populate(VMALLOC_START, MODULES_END, POPULATE_KASAN_SHALLOW); + } else { + untracked_end = MODULES_VADDR; + } + /* populate kasan shadow for untracked memory */ + kasan_populate(ident_map_size, untracked_end, POPULATE_KASAN_ZERO_SHADOW); + kasan_populate(MODULES_END, _REGION1_SIZE, POPULATE_KASAN_ZERO_SHADOW); +} + +static bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) { + pgd_populate(&init_mm, pgd, kasan_early_shadow_p4d); + return true; + } + return false; +} + +static bool kasan_p4d_populate_zero_shadow(p4d_t *p4d, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) { + p4d_populate(&init_mm, p4d, kasan_early_shadow_pud); + return true; + } + return false; +} + +static bool kasan_pud_populate_zero_shadow(pud_t *pud, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { + pud_populate(&init_mm, pud, kasan_early_shadow_pmd); + return true; + } + return false; +} + +static bool kasan_pmd_populate_zero_shadow(pmd_t *pmd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { + pmd_populate(&init_mm, pmd, kasan_early_shadow_pte); + return true; + } + return false; +} + +static bool kasan_pte_populate_zero_shadow(pte_t *pte, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW) { + set_pte(pte, pte_z); + return true; + } + return false; +} +#else + +static inline void kasan_populate_shadow(void) {} + +static inline bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static inline bool kasan_p4d_populate_zero_shadow(p4d_t *p4d, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static inline bool 
kasan_pud_populate_zero_shadow(pud_t *pud, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static inline bool kasan_pmd_populate_zero_shadow(pmd_t *pmd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static bool kasan_pte_populate_zero_shadow(pte_t *pte, enum populate_mode mode) +{ + return false; +} + +#endif + +/* + * Mimic virt_to_kpte() in lack of init_mm symbol. Skip pmd NULL check though. + */ +static inline pte_t *__virt_to_kpte(unsigned long va) +{ + return pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va), va); +} + +static void *boot_crst_alloc(unsigned long val) +{ + unsigned long size = PAGE_SIZE << CRST_ALLOC_ORDER; + unsigned long *table; + + table = (unsigned long *)physmem_alloc_top_down(RR_VMEM, size, size); + crst_table_init(table, val); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + return table; +} + +static pte_t *boot_pte_alloc(void) +{ + static void *pte_leftover; + pte_t *pte; + + /* + * handling pte_leftovers this way helps to avoid memory fragmentation + * during POPULATE_KASAN_MAP_SHADOW when EDAT is off + */ + if (!pte_leftover) { + pte_leftover = (void *)physmem_alloc_top_down(RR_VMEM, PAGE_SIZE, PAGE_SIZE); + pte = pte_leftover + _PAGE_TABLE_SIZE; + __arch_set_page_dat(pte, 1); + } else { + pte = pte_leftover; + pte_leftover = NULL; + } + + memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + return pte; +} + +static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_mode mode) +{ + switch (mode) { + case POPULATE_NONE: + return -1; + case POPULATE_DIRECT: + return addr; + case POPULATE_ABS_LOWCORE: + return __abs_lowcore_pa(addr); +#ifdef CONFIG_KASAN + case POPULATE_KASAN_MAP_SHADOW: + addr = physmem_alloc_top_down(RR_VMEM, size, size); + memset((void *)addr, 0, size); + return addr; +#endif + default: + return -1; + } +} + +static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end) +{ + return machine.has_edat2 && + IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE; +} + +static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end) +{ + return machine.has_edat1 && + IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE; +} + +static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long pages = 0; + pte_t *pte, entry; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (pte_none(*pte)) { + if (kasan_pte_populate_zero_shadow(pte, mode)) + continue; + entry = __pte(_pa(addr, PAGE_SIZE, mode)); + entry = set_pte_bit(entry, PAGE_KERNEL); + if (!machine.has_nx) + entry = clear_pte_bit(entry, __pgprot(_PAGE_NOEXEC)); + set_pte(pte, entry); + pages++; + } + } + if (mode == POPULATE_DIRECT) + update_page_count(PG_DIRECT_MAP_4K, pages); +} + +static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long next, pages = 0; + pmd_t *pmd, entry; + pte_t *pte; + + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd)) { + if (kasan_pmd_populate_zero_shadow(pmd, addr, next, mode)) + continue; + if (can_large_pmd(pmd, addr, next)) { + entry = __pmd(_pa(addr, _SEGMENT_SIZE, mode)); + entry = set_pmd_bit(entry, SEGMENT_KERNEL); + if (!machine.has_nx) + entry = clear_pmd_bit(entry, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + set_pmd(pmd, entry); 
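+ /* + * The can_large_pmd() check above amounts to a pure alignment and + * extent test, e.g. addr == 0x100000 with next == 0x200000 (one + * whole, naturally aligned 1 MB segment) qualifies when EDAT1 is + * available; any partial segment instead falls through to + * boot_pte_alloc() and is mapped with 4 KB ptes. + */ 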
+ pages++; + continue; + } + pte = boot_pte_alloc(); + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_large(*pmd)) { + continue; + } + pgtable_pte_populate(pmd, addr, next, mode); + } + if (mode == POPULATE_DIRECT) + update_page_count(PG_DIRECT_MAP_1M, pages); +} + +static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long next, pages = 0; + pud_t *pud, entry; + pmd_t *pmd; + + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (pud_none(*pud)) { + if (kasan_pud_populate_zero_shadow(pud, addr, next, mode)) + continue; + if (can_large_pud(pud, addr, next)) { + entry = __pud(_pa(addr, _REGION3_SIZE, mode)); + entry = set_pud_bit(entry, REGION3_KERNEL); + if (!machine.has_nx) + entry = clear_pud_bit(entry, __pgprot(_REGION_ENTRY_NOEXEC)); + set_pud(pud, entry); + pages++; + continue; + } + pmd = boot_crst_alloc(_SEGMENT_ENTRY_EMPTY); + pud_populate(&init_mm, pud, pmd); + } else if (pud_large(*pud)) { + continue; + } + pgtable_pmd_populate(pud, addr, next, mode); + } + if (mode == POPULATE_DIRECT) + update_page_count(PG_DIRECT_MAP_2G, pages); +} + +static void pgtable_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long next; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) { + if (kasan_p4d_populate_zero_shadow(p4d, addr, next, mode)) + continue; + pud = boot_crst_alloc(_REGION3_ENTRY_EMPTY); + p4d_populate(&init_mm, p4d, pud); + } + pgtable_pud_populate(p4d, addr, next, mode); + } +} + +static void pgtable_populate(unsigned long addr, unsigned long end, enum populate_mode mode) +{ + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + + pgd = pgd_offset(&init_mm, addr); + for (; addr < end; addr = next, pgd++) { + next = pgd_addr_end(addr, end); + if (pgd_none(*pgd)) { + if (kasan_pgd_populate_zero_shadow(pgd, addr, next, mode)) + continue; + p4d = boot_crst_alloc(_REGION2_ENTRY_EMPTY); + pgd_populate(&init_mm, pgd, p4d); + } +#ifdef CONFIG_KASAN + if (mode == POPULATE_KASAN_SHALLOW) + continue; +#endif + pgtable_p4d_populate(pgd, addr, next, mode); + } +} + +void setup_vmem(unsigned long asce_limit) +{ + unsigned long start, end; + unsigned long asce_type; + unsigned long asce_bits; + int i; + + /* + * Mark whole memory as no-dat. This must be done before any + * page tables are allocated, or kernel image builtin pages + * are marked as dat tables. + */ + for_each_physmem_online_range(i, &start, &end) + __arch_set_page_nodat((void *)start, (end - start) >> PAGE_SHIFT); + + if (asce_limit == _REGION1_SIZE) { + asce_type = _REGION2_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + } else { + asce_type = _REGION3_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + } + s390_invalid_asce.val = invalid_pg_dir | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + + crst_table_init((unsigned long *)swapper_pg_dir, asce_type); + crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); + __arch_set_page_dat((void *)swapper_pg_dir, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat((void *)invalid_pg_dir, 1UL << CRST_ALLOC_ORDER); + + /* + * To allow prefixing the lowcore must be mapped with 4KB pages. + * To prevent creation of a large page at address 0 first map + * the lowcore and create the identity mapping only afterwards. 
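+ * + * This works because sizeof(struct lowcore) is far smaller than + * _SEGMENT_SIZE: can_large_pmd() rejects the first populate below, so + * the segment at address 0 receives a pte table, and the usable-range + * loop afterwards only fills the remaining ptes of that segment instead + * of installing a large mapping over the lowcore. 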
+ */ + pgtable_populate(0, sizeof(struct lowcore), POPULATE_DIRECT); + for_each_physmem_usable_range(i, &start, &end) + pgtable_populate(start, end, POPULATE_DIRECT); + pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore), + POPULATE_ABS_LOWCORE); + pgtable_populate(__memcpy_real_area, __memcpy_real_area + PAGE_SIZE, + POPULATE_NONE); + memcpy_real_ptep = __virt_to_kpte(__memcpy_real_area); + + kasan_populate_shadow(); + + S390_lowcore.kernel_asce.val = swapper_pg_dir | asce_bits; + S390_lowcore.user_asce = s390_invalid_asce; + + local_ctl_load(1, &S390_lowcore.kernel_asce); + local_ctl_load(7, &S390_lowcore.user_asce); + local_ctl_load(13, &S390_lowcore.kernel_asce); + + init_mm.context.asce = S390_lowcore.kernel_asce.val; +} diff --git a/arch/s390/boot/compressed/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S index 44561b2c3712..389df0e0d9e5 100644 --- a/arch/s390/boot/compressed/vmlinux.lds.S +++ b/arch/s390/boot/vmlinux.lds.S @@ -1,6 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <asm-generic/vmlinux.lds.h> #include <asm/vmlinux.lds.h> +#include <asm/thread_info.h> +#include <asm/page.h> +#include <asm/sclp.h> +#include "boot.h" OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") OUTPUT_ARCH(s390:64-bit) @@ -10,11 +14,19 @@ ENTRY(startup) SECTIONS { . = 0; + .ipldata : { + *(.ipldata) + } + . = IPL_START; .head.text : { _head = . ; HEAD_TEXT _ehead = . ; } + . = PARMAREA; + .parmarea : { + *(.parmarea) + } .text : { _text = .; /* Text */ *(.text) @@ -27,38 +39,42 @@ SECTIONS *(.rodata.*) _erodata = . ; } + NOTES .data : { _data = . ; *(.data) *(.data.*) _edata = . ; } - /* - * .dma section for code, data, ex_table that need to stay below 2 GB, - * even when the kernel is relocated above 2 GB. - */ - . = ALIGN(PAGE_SIZE); - _sdma = .; - .dma.text : { - _stext_dma = .; - *(.dma.text) - . = ALIGN(PAGE_SIZE); - _etext_dma = .; - } - . = ALIGN(16); - .dma.ex_table : { - _start_dma_ex_table = .; - KEEP(*(.dma.ex_table)) - _stop_dma_ex_table = .; - } - .dma.data : { *(.dma.data) } - . = ALIGN(PAGE_SIZE); - _edma = .; BOOT_DATA BOOT_DATA_PRESERVED /* + * This is the BSS section of the decompressor and not of the decompressed Linux kernel. + * It will consume space in the decompressor's image. + */ + . = ALIGN(8); + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + /* + * Stacks for the decompressor + */ + . = ALIGN(PAGE_SIZE); + _dump_info_stack_start = .; + . += PAGE_SIZE; + _dump_info_stack_end = .; + . = ALIGN(PAGE_SIZE); + _stack_start = .; + . += BOOT_STACK_SIZE; + _stack_end = .; + _ebss = .; + } + + /* * uncompressed image info used by the decompressor; it should match * struct vmlinux_info. It comes from the .vmlinux.info section of the * uncompressed vmlinux in the form of info.o @@ -69,6 +85,16 @@ SECTIONS *(.vmlinux.info) } + .decompressor.syms : { + . += 1; /* make sure we have \0 before the first entry */ + . = ALIGN(2); + _decompressor_syms_start = .; + *(.decompressor.syms) + _decompressor_syms_end = .; + } + + _decompressor_end = .; + #ifdef CONFIG_KERNEL_UNCOMPRESSED . = 0x100000; #else @@ -78,17 +104,17 @@ SECTIONS _compressed_start = .; *(.vmlinux.bin.compressed) _compressed_end = .; - FILL(0xff); - . = ALIGN(4096); } - . = ALIGN(256); - .bss : { - _bss = . ; - *(.bss) - *(.bss.*) - *(COMMON) - . = ALIGN(8); /* For convenience during zeroing */ - _ebss = .; + +#define SB_TRAILER_SIZE 32 + /* Trailer needed for Secure Boot */ + . += SB_TRAILER_SIZE; /* make sure .sb.trailer does not overwrite the previous section */ + . 
= ALIGN(4096) - SB_TRAILER_SIZE; + .sb.trailer : { + QUAD(0) + QUAD(0) + QUAD(0) + QUAD(0x000000207a49504c) } _end = .; diff --git a/arch/s390/configs/btf.config b/arch/s390/configs/btf.config new file mode 100644 index 000000000000..eb7f84f5925c --- /dev/null +++ b/arch/s390/configs/btf.config @@ -0,0 +1,2 @@ +# Help: Enable BTF debug info +CONFIG_DEBUG_INFO_BTF=y diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 2e60c80395ab..cae2dd34fbb4 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -1,9 +1,16 @@ +CONFIG_UAPI_HEADER_TEST=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y +CONFIG_WATCH_QUEUE=y CONFIG_AUDIT=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_LSM=y CONFIG_PREEMPT=y +CONFIG_SCHED_CORE=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_TASKSTATS=y @@ -14,10 +21,8 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_NUMA_BALANCING=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_BLK_CGROUP=y CONFIG_CFS_BANDWIDTH=y -CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y @@ -27,55 +32,57 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_BPF=y +CONFIG_CGROUP_MISC=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y -CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y # CONFIG_SYSFS_SYSCALL is not set -CONFIG_BPF_SYSCALL=y -CONFIG_USERFAULTFD=y -# CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y +CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_SIG=y +CONFIG_CRASH_DUMP=y CONFIG_LIVEPATCH=y -CONFIG_TUNE_ZEC12=y +CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=512 CONFIG_NUMA=y CONFIG_HZ_100=y -CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_SIG=y +CONFIG_CERT_STORE=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y CONFIG_VFIO_CCW=m CONFIG_VFIO_AP=m -CONFIG_CRASH_DUMP=y -CONFIG_HIBERNATION=y -CONFIG_PM_DEBUG=y +CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y +CONFIG_S390_HYPFS_FS=y CONFIG_KVM=m -CONFIG_VHOST_NET=m -CONFIG_VHOST_VSOCK=m -CONFIG_OPROFILE=m +CONFIG_S390_UNWIND_SELFTEST=m +CONFIG_S390_KPROBES_SANITY_TEST=m +CONFIG_S390_MODULES_SANITY_TEST=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_STATIC_KEYS_SELFTEST=y +CONFIG_SECCOMP_CACHE_DEBUG=y CONFIG_LOCK_EVENT_COUNTS=y +# CONFIG_GCC_PLUGINS is not set CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG_SHA256=y -CONFIG_UNUSED_SYMBOLS=y -CONFIG_BLK_DEV_INTEGRITY=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_BLK_WBT=y CONFIG_BLK_CGROUP_IOLATENCY=y CONFIG_BLK_CGROUP_IOCOST=y +CONFIG_BLK_CGROUP_IOPRIO=y +CONFIG_BLK_INLINE_ENCRYPTION=y +CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y CONFIG_PARTITION_ADVANCED=y CONFIG_IBM_PARTITION=y CONFIG_BSD_DISKLABEL=y @@ -83,25 +90,27 @@ CONFIG_MINIX_SUBPARTITION=y CONFIG_SOLARIS_X86_PARTITION=y CONFIG_UNIXWARE_DISKLABEL=y CONFIG_IOSCHED_BFQ=y -CONFIG_BFQ_GROUP_IOSCHED=y CONFIG_BINFMT_MISC=m +CONFIG_ZSWAP=y +CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y +CONFIG_ZSMALLOC_STAT=y +CONFIG_SLUB_STATS=y +# CONFIG_COMPAT_BRK is not set CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CLEANCACHE=y -CONFIG_FRONTSWAP=y CONFIG_CMA_DEBUG=y CONFIG_CMA_DEBUGFS=y +CONFIG_CMA_SYSFS=y +CONFIG_CMA_AREAS=7 CONFIG_MEM_SOFT_DIRTY=y -CONFIG_ZSWAP=y -CONFIG_ZBUD=m -CONFIG_ZSMALLOC=m 
-CONFIG_ZSMALLOC_STAT=y CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PERCPU_STATS=y -CONFIG_GUP_BENCHMARK=y +CONFIG_GUP_TEST=y +CONFIG_ANON_VMA_NAME=y +CONFIG_USERFAULTFD=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_PACKET_DIAG=m @@ -129,6 +138,7 @@ CONFIG_SYN_COOKIES=y CONFIG_NET_IPVTI=m CONFIG_INET_AH=m CONFIG_INET_ESP=m +CONFIG_INET_ESPINTCP=y CONFIG_INET_IPCOMP=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m @@ -143,6 +153,7 @@ CONFIG_TCP_CONG_ILLINOIS=m CONFIG_IPV6_ROUTER_PREF=y CONFIG_INET6_AH=m CONFIG_INET6_ESP=m +CONFIG_INET6_ESPINTCP=y CONFIG_INET6_IPCOMP=m CONFIG_IPV6_MIP6=m CONFIG_IPV6_VTI=m @@ -150,9 +161,14 @@ CONFIG_IPV6_SIT=m CONFIG_IPV6_GRE=m CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_RPL_LWTUNNEL=y +CONFIG_MPTCP=y CONFIG_NETFILTER=y +CONFIG_BRIDGE_NETFILTER=m +CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_PROCFS=y CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_TIMEOUT=y CONFIG_NF_CONNTRACK_TIMESTAMP=y @@ -169,13 +185,16 @@ CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_CT_NETLINK=m CONFIG_NF_CT_NETLINK_TIMEOUT=m CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_INET=y CONFIG_NFT_CT=m -CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m CONFIG_NFT_NAT=m +CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m CONFIG_NFT_HASH=m +CONFIG_NFT_FIB_INET=m +CONFIG_NETFILTER_XTABLES_COMPAT=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_AUDIT=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m @@ -264,10 +283,12 @@ CONFIG_IP_VS_DH=m CONFIG_IP_VS_SH=m CONFIG_IP_VS_SED=m CONFIG_IP_VS_NQ=m +CONFIG_IP_VS_TWOS=m CONFIG_IP_VS_FTP=m CONFIG_IP_VS_PE_SIP=m -CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y +CONFIG_NF_LOG_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -278,7 +299,6 @@ CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -286,7 +306,7 @@ CONFIG_IP_NF_SECURITY=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_FIB_IPV6=m CONFIG_IP6_NF_IPTABLES=m CONFIG_IP6_NF_MATCH_AH=m CONFIG_IP6_NF_MATCH_EUI64=m @@ -315,11 +335,11 @@ CONFIG_L2TP_DEBUGFS=m CONFIG_L2TP_V3=y CONFIG_L2TP_IP=m CONFIG_L2TP_ETH=m -CONFIG_BRIDGE=m +CONFIG_BRIDGE=y +CONFIG_BRIDGE_MRP=y CONFIG_VLAN_8021Q=m CONFIG_VLAN_8021Q_GVRP=y CONFIG_NET_SCHED=y -CONFIG_NET_SCH_CBQ=m CONFIG_NET_SCH_HTB=m CONFIG_NET_SCH_HFSC=m CONFIG_NET_SCH_PRIO=m @@ -330,7 +350,6 @@ CONFIG_NET_SCH_SFQ=m CONFIG_NET_SCH_TEQL=m CONFIG_NET_SCH_TBF=m CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m CONFIG_NET_SCH_NETEM=m CONFIG_NET_SCH_DRR=m CONFIG_NET_SCH_MQPRIO=m @@ -340,15 +359,13 @@ CONFIG_NET_SCH_CODEL=m CONFIG_NET_SCH_FQ_CODEL=m CONFIG_NET_SCH_INGRESS=m CONFIG_NET_SCH_PLUG=m +CONFIG_NET_SCH_ETS=m CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m CONFIG_NET_CLS_ROUTE4=m CONFIG_NET_CLS_FW=m CONFIG_NET_CLS_U32=m CONFIG_CLS_U32_PERF=y CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_FLOW=m CONFIG_NET_CLS_CGROUP=y CONFIG_NET_CLS_BPF=m @@ -357,30 +374,33 @@ CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_GATE=m +CONFIG_NET_TC_SKB_EXT=y CONFIG_DNS_RESOLVER=y CONFIG_OPENVSWITCH=m CONFIG_VSOCKETS=m 
CONFIG_VIRTIO_VSOCKETS=m CONFIG_NETLINK_DIAG=m +CONFIG_NET_SWITCHDEV=y CONFIG_CGROUP_NET_PRIO=y -CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m -# CONFIG_NET_DROP_MONITOR is not set CONFIG_PCI=y +# CONFIG_PCIEASPM is not set CONFIG_PCI_DEBUG=y +CONFIG_PCI_IOV=y CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_SAFE=y +# CONFIG_FW_LOADER is not set CONFIG_CONNECTOR=y -CONFIG_ZRAM=m +CONFIG_ZRAM=y CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y @@ -401,12 +421,12 @@ CONFIG_SCSI_ENCLOSURE=m CONFIG_SCSI_CONSTANTS=y CONFIG_SCSI_LOGGING=y CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=y +CONFIG_SCSI_FC_ATTRS=m CONFIG_SCSI_SAS_LIBSAS=m CONFIG_SCSI_SRP_ATTRS=m CONFIG_ISCSI_TCP=m CONFIG_SCSI_DEBUG=m -CONFIG_ZFCP=y +CONFIG_ZFCP=m CONFIG_SCSI_VIRTIO=m CONFIG_SCSI_DH=y CONFIG_SCSI_DH_RDAC=m @@ -415,12 +435,13 @@ CONFIG_SCSI_DH_EMC=m CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y +# CONFIG_MD_BITMAP_FILE is not set CONFIG_MD_LINEAR=m CONFIG_MD_MULTIPATH=m CONFIG_MD_FAULTY=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m -CONFIG_BLK_DEV_DM=m +CONFIG_BLK_DEV_DM=y CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m @@ -434,12 +455,16 @@ CONFIG_DM_ZERO=m CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_QL=m CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_MULTIPATH_HST=m +CONFIG_DM_MULTIPATH_IOA=m CONFIG_DM_DELAY=m +CONFIG_DM_INIT=y CONFIG_DM_UEVENT=y CONFIG_DM_FLAKEY=m CONFIG_DM_VERITY=m CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y CONFIG_DM_SWITCH=m +CONFIG_DM_INTEGRITY=m CONFIG_NETDEVICES=y CONFIG_BONDING=m CONFIG_DUMMY=m @@ -447,6 +472,9 @@ CONFIG_EQUALIZER=m CONFIG_IFB=m CONFIG_MACVLAN=m CONFIG_MACVTAP=m +CONFIG_VXLAN=m +CONFIG_BAREUDP=m +CONFIG_AMT=m CONFIG_TUN=m CONFIG_VETH=m CONFIG_VIRTIO_NET=m @@ -460,41 +488,44 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_AMD is not set # CONFIG_NET_VENDOR_AQUANTIA is not set # CONFIG_NET_VENDOR_ARC is not set +# CONFIG_NET_VENDOR_ASIX is not set # CONFIG_NET_VENDOR_ATHEROS is not set -# CONFIG_NET_VENDOR_AURORA is not set # CONFIG_NET_VENDOR_BROADCOM is not set -# CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_CADENCE is not set # CONFIG_NET_VENDOR_CAVIUM is not set # CONFIG_NET_VENDOR_CHELSIO is not set # CONFIG_NET_VENDOR_CISCO is not set # CONFIG_NET_VENDOR_CORTINA is not set +# CONFIG_NET_VENDOR_DAVICOM is not set # CONFIG_NET_VENDOR_DEC is not set # CONFIG_NET_VENDOR_DLINK is not set # CONFIG_NET_VENDOR_EMULEX is not set +# CONFIG_NET_VENDOR_ENGLEDER is not set # CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_FUNGIBLE is not set # CONFIG_NET_VENDOR_GOOGLE is not set -# CONFIG_NET_VENDOR_HP is not set # CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set +# CONFIG_NET_VENDOR_LITEX is not set # CONFIG_NET_VENDOR_MARVELL is not set CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y -# CONFIG_MLXFW is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set # CONFIG_NET_VENDOR_MICROSEMI is not set +# CONFIG_NET_VENDOR_MICROSOFT is not set # CONFIG_NET_VENDOR_MYRI is not set +# CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NATSEMI is not set # CONFIG_NET_VENDOR_NETERION is not set # CONFIG_NET_VENDOR_NETRONOME is not set -# CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NVIDIA is not set # CONFIG_NET_VENDOR_OKI is not set # CONFIG_NET_VENDOR_PACKET_ENGINES is not set # CONFIG_NET_VENDOR_PENSANDO is not set # CONFIG_NET_VENDOR_QLOGIC is not set +# CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_QUALCOMM is 
not set # CONFIG_NET_VENDOR_RDC is not set # CONFIG_NET_VENDOR_REALTEK is not set @@ -502,9 +533,9 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_ROCKER is not set # CONFIG_NET_VENDOR_SAMSUNG is not set # CONFIG_NET_VENDOR_SEEQ is not set -# CONFIG_NET_VENDOR_SOLARFLARE is not set # CONFIG_NET_VENDOR_SILAN is not set # CONFIG_NET_VENDOR_SIS is not set +# CONFIG_NET_VENDOR_SOLARFLARE is not set # CONFIG_NET_VENDOR_SMSC is not set # CONFIG_NET_VENDOR_SOCIONEXT is not set # CONFIG_NET_VENDOR_STMICRO is not set @@ -512,8 +543,11 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_SYNOPSYS is not set # CONFIG_NET_VENDOR_TEHUTI is not set # CONFIG_NET_VENDOR_TI is not set +# CONFIG_NET_VENDOR_VERTEXCOM is not set # CONFIG_NET_VENDOR_VIA is not set +# CONFIG_NET_VENDOR_WANGXUN is not set # CONFIG_NET_VENDOR_WIZNET is not set +# CONFIG_NET_VENDOR_XILINX is not set CONFIG_PPP=m CONFIG_PPP_BSDCOMP=m CONFIG_PPP_DEFLATE=m @@ -531,9 +565,9 @@ CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 -CONFIG_NULL_TTY=m +# CONFIG_LEGACY_TIOCSTI is not set +CONFIG_VIRTIO_CONSOLE=m CONFIG_HW_RANDOM_VIRTIO=m -CONFIG_RAW_DRIVER=m CONFIG_HANGCHECK_TIMER=m CONFIG_TN3270_FS=y CONFIG_PPS=m @@ -543,10 +577,12 @@ CONFIG_WATCHDOG=y CONFIG_WATCHDOG_NOWAYOUT=y CONFIG_SOFT_WATCHDOG=m CONFIG_DIAG288_WATCHDOG=m +# CONFIG_DRM_DEBUG_MODESET_LOCK is not set CONFIG_FB=y +# CONFIG_FB_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y -# CONFIG_HID is not set +# CONFIG_HID_SUPPORT is not set # CONFIG_USB_SUPPORT is not set CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_ACCESS=m @@ -555,13 +591,12 @@ CONFIG_MLX5_INFINIBAND=m CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m -CONFIG_VFIO_MDEV=m -CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_MLX5_VFIO_PCI=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y -CONFIG_S390_CCW_IOMMU=y -CONFIG_S390_AP_IOMMU=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_VSOCK=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -583,6 +618,9 @@ CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_BTRFS_DEBUG=y CONFIG_BTRFS_ASSERT=y CONFIG_NILFS2_FS=m +CONFIG_BCACHEFS_FS=y +CONFIG_BCACHEFS_QUOTA=y +CONFIG_BCACHEFS_POSIX_ACL=y CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y @@ -594,12 +632,14 @@ CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_QUOTA_DEBUG=y CONFIG_QFMT_V1=m CONFIG_QFMT_V2=m -CONFIG_AUTOFS4_FS=m +CONFIG_AUTOFS_FS=m CONFIG_FUSE_FS=y CONFIG_CUSE=m CONFIG_VIRTIO_FS=m CONFIG_OVERLAY_FS=m -CONFIG_FSCACHE=m +CONFIG_NETFS_SUPPORT=m +CONFIG_NETFS_STATS=y +CONFIG_FSCACHE=y CONFIG_CACHEFILES=m CONFIG_ISO9660_FS=y CONFIG_JOLIET=y @@ -607,16 +647,19 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_NTFS_FS=m CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_INODE64=y +CONFIG_TMPFS_QUOTA=y CONFIG_HUGETLBFS=y -CONFIG_CONFIGFS_FS=m CONFIG_ECRYPT_FS=m CONFIG_CRAMFS=m CONFIG_SQUASHFS=m +CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y CONFIG_SQUASHFS_XATTR=y CONFIG_SQUASHFS_LZ4=y CONFIG_SQUASHFS_LZO=y @@ -632,13 +675,12 @@ CONFIG_NFSD_V3_ACL=y CONFIG_NFSD_V4=y CONFIG_NFSD_V4_SECURITY_LABEL=y CONFIG_CIFS=m -CONFIG_CIFS_STATS2=y -CONFIG_CIFS_WEAK_PW_HASH=y CONFIG_CIFS_UPCALL=y CONFIG_CIFS_XATTR=y CONFIG_CIFS_POSIX=y # CONFIG_CIFS_DEBUG is not set CONFIG_CIFS_DFS_UPCALL=y +CONFIG_CIFS_SWN_UPCALL=y CONFIG_NLS_DEFAULT="utf8" CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -649,23 +691,26 @@ CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_UNICODE=y 
CONFIG_PERSISTENT_KEYRINGS=y -CONFIG_BIG_KEYS=y CONFIG_ENCRYPTED_KEYS=m +CONFIG_KEY_NOTIFICATIONS=y CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y +CONFIG_HARDENED_USERCOPY=y CONFIG_FORTIFY_SOURCE=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_SECURITY_LOCKDOWN_LSM=y CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y +CONFIG_SECURITY_LANDLOCK=y CONFIG_INTEGRITY_SIGNATURE=y CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y +CONFIG_INTEGRITY_PLATFORM_KEYRING=y CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" +CONFIG_INIT_STACK_NONE=y +CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_USER=m # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set CONFIG_CRYPTO_PCRYPT=m @@ -673,42 +718,45 @@ CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_CFB=m -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_ADIANTUM=m -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_VMAC=m -CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_XXHASH=m -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m -CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m -CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_TGR192=m -CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_SM2=m +CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_ARIA=m CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_SM4_GENERIC=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_HCTR2=m +CONFIG_CRYPTO_KEYWRAP=m +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SM3_GENERIC=m +CONFIG_CRYPTO_VMAC=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_CRC32=m CONFIG_CRYPTO_842=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m @@ -719,41 +767,45 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_CRYPTO_STATS=y -CONFIG_ZCRYPT=m -CONFIG_PKEY=m -CONFIG_CRYPTO_PAES_S390=m +CONFIG_CRYPTO_CRC32_S390=y +CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_SHA1_S390=m CONFIG_CRYPTO_SHA256_S390=m -CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_SHA3_256_S390=m CONFIG_CRYPTO_SHA3_512_S390=m -CONFIG_CRYPTO_DES_S390=m -CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_GHASH_S390=m -CONFIG_CRYPTO_CRC32_S390=y +CONFIG_CRYPTO_AES_S390=m +CONFIG_CRYPTO_DES_S390=m +CONFIG_CRYPTO_CHACHA_S390=m +CONFIG_ZCRYPT=m +CONFIG_PKEY=m +CONFIG_CRYPTO_PAES_S390=m +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_SYSTEM_BLACKLIST_KEYRING=y CONFIG_CORDIC=m +CONFIG_CRYPTO_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m CONFIG_CRC32_SELFTEST=y CONFIG_CRC4=m CONFIG_CRC7=m CONFIG_CRC8=m CONFIG_RANDOM32_SELFTEST=y +CONFIG_XZ_DEC_MICROLZMA=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=0 -CONFIG_DMA_API_DEBUG=y -CONFIG_STRING_SELFTEST=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_DWARF4=y 
CONFIG_GDB_SCRIPTS=y -CONFIG_FRAME_WARN=1024 CONFIG_HEADERS_INSTALL=y -CONFIG_HEADERS_CHECK=y CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_PAGEALLOC=y +CONFIG_SLUB_DEBUG_ON=y CONFIG_PAGE_OWNER=y CONFIG_DEBUG_RODATA_TEST=y +CONFIG_DEBUG_WX=y +CONFIG_PTDUMP_DEBUGFS=y CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_OBJECTS_SELFTEST=y CONFIG_DEBUG_OBJECTS_FREE=y @@ -761,58 +813,77 @@ CONFIG_DEBUG_OBJECTS_TIMERS=y CONFIG_DEBUG_OBJECTS_WORK=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=y CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y -CONFIG_SLUB_DEBUG_ON=y -CONFIG_SLUB_STATS=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUG_VM=y -CONFIG_DEBUG_VM_VMACACHE=y -CONFIG_DEBUG_VM_RB=y CONFIG_DEBUG_VM_PGFLAGS=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m CONFIG_DEBUG_PER_CPU_MAPS=y +CONFIG_KFENCE=y +CONFIG_KFENCE_DEFERRABLE=y +CONFIG_KFENCE_STATIC_KEYS=y CONFIG_DEBUG_SHIRQ=y +CONFIG_PANIC_ON_OOPS=y CONFIG_DETECT_HUNG_TASK=y CONFIG_WQ_WATCHDOG=y -CONFIG_PANIC_ON_OOPS=y -CONFIG_DEBUG_TIMEKEEPING=y +CONFIG_TEST_LOCKUP=m +CONFIG_DEBUG_PREEMPT=y CONFIG_PROVE_LOCKING=y CONFIG_LOCK_STAT=y -CONFIG_DEBUG_LOCKDEP=y CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DEBUG_LOCKING_API_SELFTESTS=y +CONFIG_DEBUG_IRQFLAGS=y +CONFIG_DEBUG_LIST=y CONFIG_DEBUG_SG=y CONFIG_DEBUG_NOTIFIERS=y -CONFIG_DEBUG_CREDENTIALS=y CONFIG_RCU_TORTURE_TEST=m +CONFIG_RCU_REF_SCALE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=300 +# CONFIG_RCU_TRACE is not set +CONFIG_LATENCYTOP=y +CONFIG_BOOTTIME_TRACING=y +CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FPROBE=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_STACK_TRACER=y +CONFIG_IRQSOFF_TRACER=y +CONFIG_PREEMPT_TRACER=y +CONFIG_SCHED_TRACER=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_USER_EVENTS=y +CONFIG_HIST_TRIGGERS=y +CONFIG_FTRACE_STARTUP_TEST=y +# CONFIG_EVENT_TRACE_STARTUP_TEST is not set +CONFIG_SAMPLES=y +CONFIG_SAMPLE_TRACE_PRINTK=m +CONFIG_SAMPLE_FTRACE_DIRECT=m +CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m +CONFIG_SAMPLE_FTRACE_OPS=m +CONFIG_DEBUG_ENTRY=y +CONFIG_CIO_INJECT=y +CONFIG_KUNIT=m +CONFIG_KUNIT_DEBUGFS=y CONFIG_NOTIFIER_ERROR_INJECTION=m CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m CONFIG_FAULT_INJECTION=y CONFIG_FAILSLAB=y CONFIG_FAIL_PAGE_ALLOC=y +CONFIG_FAULT_INJECTION_USERCOPY=y CONFIG_FAIL_MAKE_REQUEST=y CONFIG_FAIL_IO_TIMEOUT=y CONFIG_FAIL_FUTEX=y CONFIG_FAULT_INJECTION_DEBUG_FS=y +CONFIG_FAULT_INJECTION_CONFIGFS=y CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y -CONFIG_LATENCYTOP=y -CONFIG_IRQSOFF_TRACER=y -CONFIG_PREEMPT_TRACER=y -CONFIG_SCHED_TRACER=y -CONFIG_FTRACE_SYSCALLS=y -CONFIG_STACK_TRACER=y -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_FUNCTION_PROFILER=y -CONFIG_HIST_TRIGGERS=y CONFIG_LKDTM=m -CONFIG_TEST_LIST_SORT=y -CONFIG_TEST_SORT=y -CONFIG_KPROBES_SANITY_TEST=y +CONFIG_TEST_MIN_HEAP=y +CONFIG_KPROBES_SANITY_TEST=m CONFIG_RBTREE_TEST=y CONFIG_INTERVAL_TREE_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y +CONFIG_STRING_SELFTEST=y +CONFIG_TEST_BITOPS=m CONFIG_TEST_BPF=m -CONFIG_BUG_ON_DATA_CORRUPTION=y -CONFIG_S390_PTDUMP=y +CONFIG_TEST_LIVEPATCH=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 25f799849582..42b988873e54 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -1,8 +1,14 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y +CONFIG_WATCH_QUEUE=y CONFIG_AUDIT=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_LSM=y +CONFIG_SCHED_CORE=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_TASKSTATS=y @@ -13,10 +19,8 @@ CONFIG_IKCONFIG=y 
CONFIG_IKCONFIG_PROC=y CONFIG_NUMA_BALANCING=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_BLK_CGROUP=y CONFIG_CFS_BANDWIDTH=y -CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y @@ -26,53 +30,54 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_BPF=y +CONFIG_CGROUP_MISC=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y -CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y # CONFIG_SYSFS_SYSCALL is not set -CONFIG_BPF_SYSCALL=y -CONFIG_USERFAULTFD=y -# CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y +CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_SIG=y +CONFIG_CRASH_DUMP=y CONFIG_LIVEPATCH=y -CONFIG_TUNE_ZEC12=y +CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=512 CONFIG_NUMA=y -# CONFIG_NUMA_EMU is not set CONFIG_HZ_100=y -CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_SIG=y +CONFIG_CERT_STORE=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y CONFIG_VFIO_CCW=m CONFIG_VFIO_AP=m -CONFIG_CRASH_DUMP=y -CONFIG_HIBERNATION=y -CONFIG_PM_DEBUG=y +CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y +CONFIG_S390_HYPFS_FS=y CONFIG_KVM=m -CONFIG_VHOST_NET=m -CONFIG_VHOST_VSOCK=m -CONFIG_OPROFILE=m +CONFIG_S390_UNWIND_SELFTEST=m +CONFIG_S390_KPROBES_SANITY_TEST=m +CONFIG_S390_MODULES_SANITY_TEST=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y +# CONFIG_GCC_PLUGINS is not set CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG_SHA256=y -CONFIG_UNUSED_SYMBOLS=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_BLK_WBT=y CONFIG_BLK_CGROUP_IOLATENCY=y CONFIG_BLK_CGROUP_IOCOST=y +CONFIG_BLK_CGROUP_IOPRIO=y +CONFIG_BLK_INLINE_ENCRYPTION=y +CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y CONFIG_PARTITION_ADVANCED=y CONFIG_IBM_PARTITION=y CONFIG_BSD_DISKLABEL=y @@ -80,23 +85,23 @@ CONFIG_MINIX_SUBPARTITION=y CONFIG_SOLARIS_X86_PARTITION=y CONFIG_UNIXWARE_DISKLABEL=y CONFIG_IOSCHED_BFQ=y -CONFIG_BFQ_GROUP_IOSCHED=y CONFIG_BINFMT_MISC=m +CONFIG_ZSWAP=y +CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y +CONFIG_ZSMALLOC_STAT=y +# CONFIG_COMPAT_BRK is not set CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CLEANCACHE=y -CONFIG_FRONTSWAP=y +CONFIG_CMA_SYSFS=y +CONFIG_CMA_AREAS=7 CONFIG_MEM_SOFT_DIRTY=y -CONFIG_ZSWAP=y -CONFIG_ZBUD=m -CONFIG_ZSMALLOC=m -CONFIG_ZSMALLOC_STAT=y CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PERCPU_STATS=y -CONFIG_GUP_BENCHMARK=y +CONFIG_ANON_VMA_NAME=y +CONFIG_USERFAULTFD=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_PACKET_DIAG=m @@ -124,6 +129,7 @@ CONFIG_SYN_COOKIES=y CONFIG_NET_IPVTI=m CONFIG_INET_AH=m CONFIG_INET_ESP=m +CONFIG_INET_ESPINTCP=y CONFIG_INET_IPCOMP=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m @@ -138,6 +144,7 @@ CONFIG_TCP_CONG_ILLINOIS=m CONFIG_IPV6_ROUTER_PREF=y CONFIG_INET6_AH=m CONFIG_INET6_ESP=m +CONFIG_INET6_ESPINTCP=y CONFIG_INET6_IPCOMP=m CONFIG_IPV6_MIP6=m CONFIG_IPV6_VTI=m @@ -145,9 +152,14 @@ CONFIG_IPV6_SIT=m CONFIG_IPV6_GRE=m CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_RPL_LWTUNNEL=y +CONFIG_MPTCP=y CONFIG_NETFILTER=y +CONFIG_BRIDGE_NETFILTER=m +CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_PROCFS=y CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_TIMEOUT=y CONFIG_NF_CONNTRACK_TIMESTAMP=y @@ -164,13 +176,16 @@ CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_CT_NETLINK=m CONFIG_NF_CT_NETLINK_TIMEOUT=m 
CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_INET=y CONFIG_NFT_CT=m -CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m CONFIG_NFT_NAT=m +CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m CONFIG_NFT_HASH=m +CONFIG_NFT_FIB_INET=m +CONFIG_NETFILTER_XTABLES_COMPAT=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_AUDIT=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m @@ -259,10 +274,12 @@ CONFIG_IP_VS_DH=m CONFIG_IP_VS_SH=m CONFIG_IP_VS_SED=m CONFIG_IP_VS_NQ=m +CONFIG_IP_VS_TWOS=m CONFIG_IP_VS_FTP=m CONFIG_IP_VS_PE_SIP=m -CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y +CONFIG_NF_LOG_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -273,7 +290,6 @@ CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -281,7 +297,7 @@ CONFIG_IP_NF_SECURITY=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_FIB_IPV6=m CONFIG_IP6_NF_IPTABLES=m CONFIG_IP6_NF_MATCH_AH=m CONFIG_IP6_NF_MATCH_EUI64=m @@ -309,11 +325,11 @@ CONFIG_L2TP_DEBUGFS=m CONFIG_L2TP_V3=y CONFIG_L2TP_IP=m CONFIG_L2TP_ETH=m -CONFIG_BRIDGE=m +CONFIG_BRIDGE=y +CONFIG_BRIDGE_MRP=y CONFIG_VLAN_8021Q=m CONFIG_VLAN_8021Q_GVRP=y CONFIG_NET_SCHED=y -CONFIG_NET_SCH_CBQ=m CONFIG_NET_SCH_HTB=m CONFIG_NET_SCH_HFSC=m CONFIG_NET_SCH_PRIO=m @@ -324,7 +340,6 @@ CONFIG_NET_SCH_SFQ=m CONFIG_NET_SCH_TEQL=m CONFIG_NET_SCH_TBF=m CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m CONFIG_NET_SCH_NETEM=m CONFIG_NET_SCH_DRR=m CONFIG_NET_SCH_MQPRIO=m @@ -334,15 +349,13 @@ CONFIG_NET_SCH_CODEL=m CONFIG_NET_SCH_FQ_CODEL=m CONFIG_NET_SCH_INGRESS=m CONFIG_NET_SCH_PLUG=m +CONFIG_NET_SCH_ETS=m CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m CONFIG_NET_CLS_ROUTE4=m CONFIG_NET_CLS_FW=m CONFIG_NET_CLS_U32=m CONFIG_CLS_U32_PERF=y CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_FLOW=m CONFIG_NET_CLS_CGROUP=y CONFIG_NET_CLS_BPF=m @@ -351,35 +364,37 @@ CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_GATE=m +CONFIG_NET_TC_SKB_EXT=y CONFIG_DNS_RESOLVER=y CONFIG_OPENVSWITCH=m CONFIG_VSOCKETS=m CONFIG_VIRTIO_VSOCKETS=m CONFIG_NETLINK_DIAG=m +CONFIG_NET_SWITCHDEV=y CONFIG_CGROUP_NET_PRIO=y -CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m -# CONFIG_NET_DROP_MONITOR is not set CONFIG_PCI=y +# CONFIG_PCIEASPM is not set +CONFIG_PCI_IOV=y CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y CONFIG_UEVENT_HELPER=y CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_SAFE=y +# CONFIG_FW_LOADER is not set CONFIG_CONNECTOR=y -CONFIG_ZRAM=m +CONFIG_ZRAM=y CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 -# CONFIG_BLK_DEV_XPRAM is not set CONFIG_VIRTIO_BLK=y CONFIG_BLK_DEV_RBD=m CONFIG_BLK_DEV_NVME=m @@ -396,12 +411,12 @@ CONFIG_SCSI_ENCLOSURE=m CONFIG_SCSI_CONSTANTS=y CONFIG_SCSI_LOGGING=y CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=y +CONFIG_SCSI_FC_ATTRS=m CONFIG_SCSI_SAS_LIBSAS=m CONFIG_SCSI_SRP_ATTRS=m CONFIG_ISCSI_TCP=m CONFIG_SCSI_DEBUG=m -CONFIG_ZFCP=y +CONFIG_ZFCP=m CONFIG_SCSI_VIRTIO=m CONFIG_SCSI_DH=y CONFIG_SCSI_DH_RDAC=m @@ -410,12 +425,13 @@ CONFIG_SCSI_DH_EMC=m CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y +# CONFIG_MD_BITMAP_FILE is not 
set CONFIG_MD_LINEAR=m CONFIG_MD_MULTIPATH=m CONFIG_MD_FAULTY=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m -CONFIG_BLK_DEV_DM=m +CONFIG_BLK_DEV_DM=y CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m @@ -429,7 +445,10 @@ CONFIG_DM_ZERO=m CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_QL=m CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_MULTIPATH_HST=m +CONFIG_DM_MULTIPATH_IOA=m CONFIG_DM_DELAY=m +CONFIG_DM_INIT=y CONFIG_DM_UEVENT=y CONFIG_DM_FLAKEY=m CONFIG_DM_VERITY=m @@ -443,6 +462,9 @@ CONFIG_EQUALIZER=m CONFIG_IFB=m CONFIG_MACVLAN=m CONFIG_MACVTAP=m +CONFIG_VXLAN=m +CONFIG_BAREUDP=m +CONFIG_AMT=m CONFIG_TUN=m CONFIG_VETH=m CONFIG_VIRTIO_NET=m @@ -456,41 +478,44 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_AMD is not set # CONFIG_NET_VENDOR_AQUANTIA is not set # CONFIG_NET_VENDOR_ARC is not set +# CONFIG_NET_VENDOR_ASIX is not set # CONFIG_NET_VENDOR_ATHEROS is not set -# CONFIG_NET_VENDOR_AURORA is not set # CONFIG_NET_VENDOR_BROADCOM is not set -# CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_CADENCE is not set # CONFIG_NET_VENDOR_CAVIUM is not set # CONFIG_NET_VENDOR_CHELSIO is not set # CONFIG_NET_VENDOR_CISCO is not set # CONFIG_NET_VENDOR_CORTINA is not set +# CONFIG_NET_VENDOR_DAVICOM is not set # CONFIG_NET_VENDOR_DEC is not set # CONFIG_NET_VENDOR_DLINK is not set # CONFIG_NET_VENDOR_EMULEX is not set +# CONFIG_NET_VENDOR_ENGLEDER is not set # CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_FUNGIBLE is not set # CONFIG_NET_VENDOR_GOOGLE is not set -# CONFIG_NET_VENDOR_HP is not set # CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set +# CONFIG_NET_VENDOR_LITEX is not set # CONFIG_NET_VENDOR_MARVELL is not set CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y -# CONFIG_MLXFW is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set # CONFIG_NET_VENDOR_MICROSEMI is not set +# CONFIG_NET_VENDOR_MICROSOFT is not set # CONFIG_NET_VENDOR_MYRI is not set +# CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NATSEMI is not set # CONFIG_NET_VENDOR_NETERION is not set # CONFIG_NET_VENDOR_NETRONOME is not set -# CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NVIDIA is not set # CONFIG_NET_VENDOR_OKI is not set # CONFIG_NET_VENDOR_PACKET_ENGINES is not set # CONFIG_NET_VENDOR_PENSANDO is not set # CONFIG_NET_VENDOR_QLOGIC is not set +# CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_QUALCOMM is not set # CONFIG_NET_VENDOR_RDC is not set # CONFIG_NET_VENDOR_REALTEK is not set @@ -498,9 +523,9 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_ROCKER is not set # CONFIG_NET_VENDOR_SAMSUNG is not set # CONFIG_NET_VENDOR_SEEQ is not set -# CONFIG_NET_VENDOR_SOLARFLARE is not set # CONFIG_NET_VENDOR_SILAN is not set # CONFIG_NET_VENDOR_SIS is not set +# CONFIG_NET_VENDOR_SOLARFLARE is not set # CONFIG_NET_VENDOR_SMSC is not set # CONFIG_NET_VENDOR_SOCIONEXT is not set # CONFIG_NET_VENDOR_STMICRO is not set @@ -508,8 +533,11 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_SYNOPSYS is not set # CONFIG_NET_VENDOR_TEHUTI is not set # CONFIG_NET_VENDOR_TI is not set +# CONFIG_NET_VENDOR_VERTEXCOM is not set # CONFIG_NET_VENDOR_VIA is not set +# CONFIG_NET_VENDOR_WANGXUN is not set # CONFIG_NET_VENDOR_WIZNET is not set +# CONFIG_NET_VENDOR_XILINX is not set CONFIG_PPP=m CONFIG_PPP_BSDCOMP=m CONFIG_PPP_DEFLATE=m @@ -527,9 +555,9 @@ CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 -CONFIG_NULL_TTY=m +# CONFIG_LEGACY_TIOCSTI is not set +CONFIG_VIRTIO_CONSOLE=m 
CONFIG_HW_RANDOM_VIRTIO=m -CONFIG_RAW_DRIVER=m CONFIG_HANGCHECK_TIMER=m CONFIG_TN3270_FS=y # CONFIG_PTP_1588_CLOCK is not set @@ -540,9 +568,10 @@ CONFIG_WATCHDOG_NOWAYOUT=y CONFIG_SOFT_WATCHDOG=m CONFIG_DIAG288_WATCHDOG=m CONFIG_FB=y +# CONFIG_FB_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y -# CONFIG_HID is not set +# CONFIG_HID_SUPPORT is not set # CONFIG_USB_SUPPORT is not set CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_ACCESS=m @@ -551,13 +580,12 @@ CONFIG_MLX5_INFINIBAND=m CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m -CONFIG_VFIO_MDEV=m -CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_MLX5_VFIO_PCI=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y -CONFIG_S390_CCW_IOMMU=y -CONFIG_S390_AP_IOMMU=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_VSOCK=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -576,6 +604,9 @@ CONFIG_OCFS2_FS=m CONFIG_BTRFS_FS=y CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_NILFS2_FS=m +CONFIG_BCACHEFS_FS=m +CONFIG_BCACHEFS_QUOTA=y +CONFIG_BCACHEFS_POSIX_ACL=y CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y @@ -586,12 +617,14 @@ CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_QFMT_V1=m CONFIG_QFMT_V2=m -CONFIG_AUTOFS4_FS=m +CONFIG_AUTOFS_FS=m CONFIG_FUSE_FS=y CONFIG_CUSE=m CONFIG_VIRTIO_FS=m CONFIG_OVERLAY_FS=m -CONFIG_FSCACHE=m +CONFIG_NETFS_SUPPORT=m +CONFIG_NETFS_STATS=y +CONFIG_FSCACHE=y CONFIG_CACHEFILES=m CONFIG_ISO9660_FS=y CONFIG_JOLIET=y @@ -599,16 +632,20 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_NTFS_FS=m CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_INODE64=y +CONFIG_TMPFS_QUOTA=y CONFIG_HUGETLBFS=y CONFIG_CONFIGFS_FS=m CONFIG_ECRYPT_FS=m CONFIG_CRAMFS=m CONFIG_SQUASHFS=m +CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y CONFIG_SQUASHFS_XATTR=y CONFIG_SQUASHFS_LZ4=y CONFIG_SQUASHFS_LZO=y @@ -624,13 +661,12 @@ CONFIG_NFSD_V3_ACL=y CONFIG_NFSD_V4=y CONFIG_NFSD_V4_SECURITY_LABEL=y CONFIG_CIFS=m -CONFIG_CIFS_STATS2=y -CONFIG_CIFS_WEAK_PW_HASH=y CONFIG_CIFS_UPCALL=y CONFIG_CIFS_XATTR=y CONFIG_CIFS_POSIX=y # CONFIG_CIFS_DEBUG is not set CONFIG_CIFS_DFS_UPCALL=y +CONFIG_CIFS_SWN_UPCALL=y CONFIG_NLS_DEFAULT="utf8" CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -641,22 +677,24 @@ CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_UNICODE=y CONFIG_PERSISTENT_KEYRINGS=y -CONFIG_BIG_KEYS=y CONFIG_ENCRYPTED_KEYS=m +CONFIG_KEY_NOTIFICATIONS=y CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_SECURITY_LOCKDOWN_LSM=y CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y +CONFIG_SECURITY_LANDLOCK=y CONFIG_INTEGRITY_SIGNATURE=y CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y +CONFIG_INTEGRITY_PLATFORM_KEYRING=y CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" +CONFIG_INIT_STACK_NONE=y +CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_FIPS=y CONFIG_CRYPTO_USER=m # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set @@ -665,43 +703,46 @@ CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_CFB=m -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_OFB=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_ADIANTUM=m -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_VMAC=m 
-CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_XXHASH=m -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m -CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m -CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_TGR192=m -CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_SM2=m +CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_ARIA=m CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_SM4_GENERIC=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_HCTR2=m +CONFIG_CRYPTO_KEYWRAP=m +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_OFB=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SM3_GENERIC=m +CONFIG_CRYPTO_VMAC=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_CRC32=m CONFIG_CRYPTO_842=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m @@ -712,45 +753,66 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_CRYPTO_STATS=y -CONFIG_ZCRYPT=m -CONFIG_PKEY=m -CONFIG_CRYPTO_PAES_S390=m +CONFIG_CRYPTO_CRC32_S390=y +CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_SHA1_S390=m CONFIG_CRYPTO_SHA256_S390=m -CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_SHA3_256_S390=m CONFIG_CRYPTO_SHA3_512_S390=m -CONFIG_CRYPTO_DES_S390=m -CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_GHASH_S390=m -CONFIG_CRYPTO_CRC32_S390=y +CONFIG_CRYPTO_AES_S390=m +CONFIG_CRYPTO_DES_S390=m +CONFIG_CRYPTO_CHACHA_S390=m +CONFIG_ZCRYPT=m +CONFIG_PKEY=m +CONFIG_CRYPTO_PAES_S390=m +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_SYSTEM_BLACKLIST_KEYRING=y CONFIG_CORDIC=m +CONFIG_PRIME_NUMBERS=m +CONFIG_CRYPTO_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m CONFIG_CRC4=m CONFIG_CRC7=m CONFIG_CRC8=m +CONFIG_XZ_DEC_MICROLZMA=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=0 CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO_DWARF4=y CONFIG_GDB_SCRIPTS=y -CONFIG_FRAME_WARN=1024 CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_WX=y +CONFIG_PTDUMP_DEBUGFS=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_PANIC_ON_OOPS=y +CONFIG_TEST_LOCKUP=m CONFIG_RCU_TORTURE_TEST=m +CONFIG_RCU_REF_SCALE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_LATENCYTOP=y +CONFIG_BOOTTIME_TRACING=y +CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FPROBE=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_STACK_TRACER=y CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y -CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_FUNCTION_PROFILER=y +CONFIG_USER_EVENTS=y CONFIG_HIST_TRIGGERS=y +CONFIG_SAMPLES=y +CONFIG_SAMPLE_TRACE_PRINTK=m +CONFIG_SAMPLE_FTRACE_DIRECT=m +CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m +CONFIG_SAMPLE_FTRACE_OPS=m +CONFIG_KUNIT=m +CONFIG_KUNIT_DEBUGFS=y CONFIG_LKDTM=m +CONFIG_KPROBES_SANITY_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BPF=m -CONFIG_BUG_ON_DATA_CORRUPTION=y -CONFIG_S390_PTDUMP=y +CONFIG_TEST_LIVEPATCH=m diff --git a/arch/s390/configs/kasan.config b/arch/s390/configs/kasan.config new file mode 100644 index 000000000000..84c2b551e992 --- /dev/null +++ b/arch/s390/configs/kasan.config @@ -0,0 +1,4 @@ +# Help: Enable KASan for debugging +CONFIG_KASAN=y 
+CONFIG_KASAN_INLINE=y +CONFIG_KASAN_VMALLOC=y diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index 20c51e5d9353..30d2a1687665 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -1,38 +1,39 @@ -# CONFIG_SWAP is not set CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_SYSCALL=y # CONFIG_CPU_ISOLATION is not set # CONFIG_UTS_NS is not set +# CONFIG_TIME_NS is not set # CONFIG_PID_NS is not set # CONFIG_NET_NS is not set CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y -# CONFIG_COMPAT_BRK is not set -CONFIG_TUNE_ZEC12=y -# CONFIG_COMPAT is not set +CONFIG_CRASH_DUMP=y +CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=2 CONFIG_HZ_100=y -# CONFIG_ARCH_RANDOM is not set -# CONFIG_RELOCATABLE is not set # CONFIG_CHSC_SCH is not set # CONFIG_SCM_BUS is not set -CONFIG_CRASH_DUMP=y -# CONFIG_SECCOMP is not set # CONFIG_PFAULT is not set -# CONFIG_S390_HYPFS_FS is not set +# CONFIG_S390_HYPFS is not set # CONFIG_VIRTUALIZATION is not set # CONFIG_S390_GUEST is not set +# CONFIG_SECCOMP is not set +# CONFIG_GCC_PLUGINS is not set +# CONFIG_BLOCK_LEGACY_AUTOLOAD is not set CONFIG_PARTITION_ADVANCED=y -CONFIG_IBM_PARTITION=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +# CONFIG_SWAP is not set +# CONFIG_COMPAT_BRK is not set # CONFIG_COMPACTION is not set # CONFIG_MIGRATION is not set -# CONFIG_BOUNCE is not set CONFIG_NET=y # CONFIG_IUCV is not set +# CONFIG_PCPU_DEV_REFCNT is not set +# CONFIG_ETHTOOL_NETLINK is not set CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_SAFE=y CONFIG_BLK_DEV_RAM=y -# CONFIG_BLK_DEV_XPRAM is not set # CONFIG_DCSSBLK is not set # CONFIG_DASD is not set CONFIG_ENCLOSURE_SERVICES=y @@ -46,28 +47,34 @@ CONFIG_ZFCP=y # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set +# CONFIG_LEGACY_TIOCSTI is not set # CONFIG_HVC_IUCV is not set # CONFIG_HW_RANDOM_S390 is not set -CONFIG_RAW_DRIVER=y # CONFIG_HMC_DRV is not set # CONFIG_S390_TAPE is not set # CONFIG_VMCP is not set # CONFIG_MONWRITER is not set # CONFIG_S390_VMUR is not set -# CONFIG_HID is not set +# CONFIG_HID_SUPPORT is not set +# CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set # CONFIG_DNOTIFY is not set # CONFIG_INOTIFY_USER is not set -CONFIG_CONFIGFS_FS=y # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set CONFIG_LSM="yama,loadpin,safesetid,integrity" +CONFIG_INIT_STACK_NONE=y +# CONFIG_ZLIB_DFLTCC is not set +CONFIG_XZ_DEC_MICROLZMA=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_FS=y +# CONFIG_SYMBOLIC_ERRNAME is not set CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_DEBUG_FS=y CONFIG_PANIC_ON_OOPS=y # CONFIG_SCHED_DEBUG is not set CONFIG_RCU_CPU_STALL_TIMEOUT=60 +# CONFIG_RCU_TRACE is not set # CONFIG_FTRACE is not set # CONFIG_RUNTIME_TESTING_MENU is not set diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig new file mode 100644 index 000000000000..06ee706b0d78 --- /dev/null +++ b/arch/s390/crypto/Kconfig @@ -0,0 +1,135 @@ +# SPDX-License-Identifier: GPL-2.0 + +menu "Accelerated Cryptographic Algorithms for CPU (s390)" + +config CRYPTO_CRC32_S390 + tristate "CRC32c and CRC32" + depends on S390 + select CRYPTO_HASH + select CRC32 + help + CRC32c and CRC32 CRC algorithms + + Architecture: s390 + + It is available with IBM z13 or later. 
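The entry above only changes how the accelerated driver is built and selected; in-kernel consumers are unaffected, since the driver registers under the generic "crc32" and "crc32c" algorithm names and wins over the software implementation by priority. The following is a minimal, hypothetical sketch of such a consumer (a demo module, not code from this diff), assuming only the standard crypto shash API:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical demo (not part of this diff): one-shot CRC32c digest
 * through the generic shash API. With CONFIG_CRYPTO_CRC32_S390=y the
 * "crc32c" name resolves to the vector-accelerated s390 driver on
 * machines that support it. */
#include <linux/module.h>
#include <crypto/hash.h>

static int __init crc32c_demo_init(void)
{
	static const u8 data[] = "123456789";	/* standard CRC check vector */
	struct crypto_shash *tfm;
	u8 digest[4];				/* CRC32c digest is 4 bytes */
	int ret;

	tfm = crypto_alloc_shash("crc32c", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		ret = crypto_shash_digest(desc, data, sizeof(data) - 1,
					  digest);
	}
	if (!ret)
		pr_info("crc32c via %s: %*phN\n",
			crypto_shash_driver_name(tfm), 4, digest);

	crypto_free_shash(tfm);
	return ret;
}

static void __exit crc32c_demo_exit(void)
{
}

module_init(crc32c_demo_init);
module_exit(crc32c_demo_exit);
MODULE_DESCRIPTION("crc32c shash usage sketch");
MODULE_LICENSE("GPL");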
+ +config CRYPTO_SHA512_S390 + tristate "Hash functions: SHA-384 and SHA-512" + depends on S390 + select CRYPTO_HASH + help + SHA-384 and SHA-512 secure hash algorithms (FIPS 180) + + Architecture: s390 + + It is available as of z10. + +config CRYPTO_SHA1_S390 + tristate "Hash functions: SHA-1" + depends on S390 + select CRYPTO_HASH + help + SHA-1 secure hash algorithm (FIPS 180) + + Architecture: s390 + + It is available as of z990. + +config CRYPTO_SHA256_S390 + tristate "Hash functions: SHA-224 and SHA-256" + depends on S390 + select CRYPTO_HASH + help + SHA-224 and SHA-256 secure hash algorithms (FIPS 180) + + Architecture: s390 + + It is available as of z9. + +config CRYPTO_SHA3_256_S390 + tristate "Hash functions: SHA3-224 and SHA3-256" + depends on S390 + select CRYPTO_HASH + help + SHA3-224 and SHA3-256 secure hash algorithms (FIPS 202) + + Architecture: s390 + + It is available as of z14. + +config CRYPTO_SHA3_512_S390 + tristate "Hash functions: SHA3-384 and SHA3-512" + depends on S390 + select CRYPTO_HASH + help + SHA3-384 and SHA3-512 secure hash algorithms (FIPS 202) + + Architecture: s390 + + It is available as of z14. + +config CRYPTO_GHASH_S390 + tristate "Hash functions: GHASH" + depends on S390 + select CRYPTO_HASH + help + GCM GHASH hash function (NIST SP800-38D) + + Architecture: s390 + + It is available as of z196. + +config CRYPTO_AES_S390 + tristate "Ciphers: AES, modes: ECB, CBC, CTR, XTS, GCM" + depends on S390 + select CRYPTO_ALGAPI + select CRYPTO_SKCIPHER + help + Block cipher: AES cipher algorithms (FIPS 197) + AEAD cipher: AES with GCM + Length-preserving ciphers: AES with ECB, CBC, XTS, and CTR modes + + Architecture: s390 + + As of z9 the ECB and CBC modes are hardware accelerated + for 128 bit keys. + + As of z10 the ECB and CBC modes are hardware accelerated + for all AES key sizes. + + As of z196 the CTR mode is hardware accelerated for all AES + key sizes and XTS mode is hardware accelerated for 256 and + 512 bit keys. + +config CRYPTO_DES_S390 + tristate "Ciphers: DES and Triple DES EDE, modes: ECB, CBC, CTR" + depends on S390 + select CRYPTO_ALGAPI + select CRYPTO_SKCIPHER + select CRYPTO_LIB_DES + help + Block ciphers: DES (FIPS 46-2) cipher algorithm + Block ciphers: Triple DES EDE (FIPS 46-3) cipher algorithm + Length-preserving ciphers: DES with ECB, CBC, and CTR modes + Length-preserving ciphers: Triple DES EDE with ECB, CBC, and CTR modes + + Architecture: s390 + + As of z990 the ECB and CBC modes are hardware accelerated. + As of z196 the CTR mode is hardware accelerated. + +config CRYPTO_CHACHA_S390 + tristate "Ciphers: ChaCha20" + depends on S390 + select CRYPTO_SKCIPHER + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + help + Length-preserving cipher: ChaCha20 stream cipher (RFC 7539) + + Architecture: s390 + + It is available as of z13.
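Because all of these entries register ordinary shash/skcipher drivers under the standard algorithm names, callers need no changes to pick up the acceleration: a request for "chacha20", for instance, is served by the vectorized chacha20-s390 driver added later in this diff when it is loaded, and by the generic C code otherwise. A minimal, hypothetical sketch of such a caller (a demo module, not code from this diff), assuming only the standard skcipher API:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical demo (not part of this diff): encrypt one buffer with
 * the "chacha20" skcipher. With CRYPTO_CHACHA_S390 enabled on z13 or
 * later this resolves to the higher-priority chacha20-s390 driver. */
#include <linux/module.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <crypto/skcipher.h>
#include <crypto/chacha.h>

static int __init chacha_demo_init(void)
{
	u8 key[CHACHA_KEY_SIZE] = {};		/* demo key: all zero */
	u8 iv[CHACHA_IV_SIZE] = {};		/* demo nonce + counter */
	struct skcipher_request *req = NULL;
	struct crypto_skcipher *tfm;
	struct scatterlist sg;
	DECLARE_CRYPTO_WAIT(wait);
	u8 *buf;
	int ret;

	tfm = crypto_alloc_skcipher("chacha20", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* scatterlists need page-backed memory, so no stack buffers here */
	buf = kzalloc(256, GFP_KERNEL);
	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	ret = -ENOMEM;
	if (!buf || !req)
		goto out;

	ret = crypto_skcipher_setkey(tfm, key, sizeof(key));
	if (ret)
		goto out;

	sg_init_one(&sg, buf, 256);
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				      crypto_req_done, &wait);
	skcipher_request_set_crypt(req, &sg, &sg, 256, iv);	/* in place */
	ret = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
out:
	skcipher_request_free(req);
	kfree(buf);
	crypto_free_skcipher(tfm);
	return ret;
}

static void __exit chacha_demo_exit(void)
{
}

module_init(chacha_demo_init);
module_exit(chacha_demo_exit);
MODULE_DESCRIPTION("chacha20 skcipher usage sketch");
MODULE_LICENSE("GPL");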
+ +endmenu diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index 12889d4652cc..1b1cc478fa94 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -11,9 +11,11 @@ obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o +obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o obj-$(CONFIG_S390_PRNG) += prng.o obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o -obj-$(CONFIG_ARCH_RANDOM) += arch_random.o +obj-y += arch_random.o crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o +chacha_s390-y := chacha-glue.o chacha-s390.o diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index ead0b2c9881d..c6fe5405de4a 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -21,6 +21,7 @@ #include <crypto/algapi.h> #include <crypto/ghash.h> #include <crypto/internal/aead.h> +#include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/err.h> @@ -72,19 +73,12 @@ static int setkey_fallback_cip(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - int ret; sctx->fallback.cip->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK; sctx->fallback.cip->base.crt_flags |= (tfm->crt_flags & CRYPTO_TFM_REQ_MASK); - ret = crypto_cipher_setkey(sctx->fallback.cip, in_key, key_len); - if (ret) { - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->crt_flags |= (sctx->fallback.cip->base.crt_flags & - CRYPTO_TFM_RES_MASK); - } - return ret; + return crypto_cipher_setkey(sctx->fallback.cip, in_key, key_len); } static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, @@ -182,18 +176,13 @@ static int setkey_fallback_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int len) { struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); - int ret; crypto_skcipher_clear_flags(sctx->fallback.skcipher, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(sctx->fallback.skcipher, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - ret = crypto_skcipher_setkey(sctx->fallback.skcipher, key, len); - crypto_skcipher_set_flags(tfm, - crypto_skcipher_get_flags(sctx->fallback.skcipher) & - CRYPTO_TFM_RES_MASK); - return ret; + return crypto_skcipher_setkey(sctx->fallback.skcipher, key, len); } static int fallback_skcipher_crypt(struct s390_aes_ctx *sctx, @@ -354,6 +343,7 @@ static int cbc_aes_crypt(struct skcipher_request *req, unsigned long modifier) memcpy(walk.iv, param.iv, AES_BLOCK_SIZE); ret = skcipher_walk_done(&walk, nbytes - n); } + memzero_explicit(¶m, sizeof(param)); return ret; } @@ -389,17 +379,12 @@ static int xts_fallback_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int len) { struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); - int ret; crypto_skcipher_clear_flags(xts_ctx->fallback, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(xts_ctx->fallback, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - ret = crypto_skcipher_setkey(xts_ctx->fallback, key, len); - crypto_skcipher_set_flags(tfm, - crypto_skcipher_get_flags(xts_ctx->fallback) & - CRYPTO_TFM_RES_MASK); - return ret; + return crypto_skcipher_setkey(xts_ctx->fallback, key, len); } static int xts_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, @@ -413,12 +398,6 @@ static int xts_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, if (err) 
return err; - /* In fips mode only 128 bit or 256 bit keys are valid */ - if (fips_enabled && key_len != 32 && key_len != 64) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - /* Pick the correct function code based on the key length */ fc = (key_len == 32) ? CPACF_KM_XTS_128 : (key_len == 64) ? CPACF_KM_XTS_256 : 0; @@ -489,6 +468,8 @@ static int xts_aes_crypt(struct skcipher_request *req, unsigned long modifier) walk.dst.virt.addr, walk.src.virt.addr, n); ret = skcipher_walk_done(&walk, nbytes - n); } + memzero_explicit(&pcc_param, sizeof(pcc_param)); + memzero_explicit(&xts_param, sizeof(xts_param)); return ret; } @@ -616,7 +597,9 @@ static int ctr_aes_crypt(struct skcipher_request *req) * final block may be < AES_BLOCK_SIZE, copy only nbytes */ if (nbytes) { - cpacf_kmctr(sctx->fc, sctx->key, buf, walk.src.virt.addr, + memset(buf, 0, AES_BLOCK_SIZE); + memcpy(buf, walk.src.virt.addr, nbytes); + cpacf_kmctr(sctx->fc, sctx->key, buf, buf, AES_BLOCK_SIZE, walk.iv); memcpy(walk.dst.virt.addr, buf, nbytes); crypto_inc(walk.iv, AES_BLOCK_SIZE); @@ -716,7 +699,7 @@ static inline void _gcm_sg_unmap_and_advance(struct gcm_sg_walk *gw, unsigned int nbytes) { gw->walk_bytes_remain -= nbytes; - scatterwalk_unmap(&gw->walk); + scatterwalk_unmap(gw->walk_ptr); scatterwalk_advance(&gw->walk, nbytes); scatterwalk_done(&gw->walk, 0, gw->walk_bytes_remain); gw->walk_ptr = NULL; @@ -791,7 +774,7 @@ static int gcm_out_walk_go(struct gcm_sg_walk *gw, unsigned int minbytesneeded) goto out; } - scatterwalk_unmap(&gw->walk); + scatterwalk_unmap(gw->walk_ptr); gw->walk_ptr = NULL; gw->ptr = gw->buf; @@ -1064,10 +1047,11 @@ out_err: return ret; } -module_cpu_feature_match(MSA, aes_s390_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, aes_s390_init); module_exit(aes_s390_fini); MODULE_ALIAS_CRYPTO("aes-all"); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(CRYPTO_INTERNAL); diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c index dd95cdbd22ce..a8a2407381af 100644 --- a/arch/s390/crypto/arch_random.c +++ b/arch/s390/crypto/arch_random.c @@ -2,122 +2,18 @@ /* * s390 arch random implementation. * - * Copyright IBM Corp. 2017, 2018 + * Copyright IBM Corp. 2017, 2020 * Author(s): Harald Freudenberger - * - * The s390_arch_random_generate() function may be called from random.c - * in interrupt context. So this implementation does the best to be very - * fast. There is a buffer of random data which is asynchronously checked - * and filled by a workqueue thread. - * If there are enough bytes in the buffer the s390_arch_random_generate() - * just delivers these bytes. Otherwise false is returned until the - * worker thread refills the buffer. - * The worker fills the rng buffer by pulling fresh entropy from the - * high quality (but slow) true hardware random generator. This entropy - * is then spread over the buffer with an pseudo random generator PRNG. - * As the arch_get_random_seed_long() fetches 8 bytes and the calling - * function add_interrupt_randomness() counts this as 1 bit entropy the - * distribution needs to make sure there is in fact 1 bit entropy contained - * in 8 bytes of the buffer. The current values pull 32 byte entropy - * and scatter this into a 2048 byte buffer. So 8 byte in the buffer - * will contain 1 bit of entropy. - * The worker thread is rescheduled based on the charge level of the - * buffer but at least with 500 ms delay to avoid too much CPU consumption. - * So the max. 
amount of rng data delivered via arch_get_random_seed is - * limited to 4k bytes per second. */ #include <linux/kernel.h> #include <linux/atomic.h> #include <linux/random.h> -#include <linux/slab.h> #include <linux/static_key.h> -#include <linux/workqueue.h> +#include <asm/archrandom.h> #include <asm/cpacf.h> DEFINE_STATIC_KEY_FALSE(s390_arch_random_available); atomic64_t s390_arch_random_counter = ATOMIC64_INIT(0); EXPORT_SYMBOL(s390_arch_random_counter); - -#define ARCH_REFILL_TICKS (HZ/2) -#define ARCH_PRNG_SEED_SIZE 32 -#define ARCH_RNG_BUF_SIZE 2048 - -static DEFINE_SPINLOCK(arch_rng_lock); -static u8 *arch_rng_buf; -static unsigned int arch_rng_buf_idx; - -static void arch_rng_refill_buffer(struct work_struct *); -static DECLARE_DELAYED_WORK(arch_rng_work, arch_rng_refill_buffer); - -bool s390_arch_random_generate(u8 *buf, unsigned int nbytes) -{ - /* lock rng buffer */ - if (!spin_trylock(&arch_rng_lock)) - return false; - - /* try to resolve the requested amount of bytes from the buffer */ - arch_rng_buf_idx -= nbytes; - if (arch_rng_buf_idx < ARCH_RNG_BUF_SIZE) { - memcpy(buf, arch_rng_buf + arch_rng_buf_idx, nbytes); - atomic64_add(nbytes, &s390_arch_random_counter); - spin_unlock(&arch_rng_lock); - return true; - } - - /* not enough bytes in rng buffer, refill is done asynchronously */ - spin_unlock(&arch_rng_lock); - - return false; -} -EXPORT_SYMBOL(s390_arch_random_generate); - -static void arch_rng_refill_buffer(struct work_struct *unused) -{ - unsigned int delay = ARCH_REFILL_TICKS; - - spin_lock(&arch_rng_lock); - if (arch_rng_buf_idx > ARCH_RNG_BUF_SIZE) { - /* buffer is exhausted and needs refill */ - u8 seed[ARCH_PRNG_SEED_SIZE]; - u8 prng_wa[240]; - /* fetch ARCH_PRNG_SEED_SIZE bytes of entropy */ - cpacf_trng(NULL, 0, seed, sizeof(seed)); - /* blow this entropy up to ARCH_RNG_BUF_SIZE with PRNG */ - memset(prng_wa, 0, sizeof(prng_wa)); - cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, - &prng_wa, NULL, 0, seed, sizeof(seed)); - cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, - &prng_wa, arch_rng_buf, ARCH_RNG_BUF_SIZE, NULL, 0); - arch_rng_buf_idx = ARCH_RNG_BUF_SIZE; - } - delay += (ARCH_REFILL_TICKS * arch_rng_buf_idx) / ARCH_RNG_BUF_SIZE; - spin_unlock(&arch_rng_lock); - - /* kick next check */ - queue_delayed_work(system_long_wq, &arch_rng_work, delay); -} - -static int __init s390_arch_random_init(void) -{ - /* all the needed PRNO subfunctions available ? */ - if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG) && - cpacf_query_func(CPACF_PRNO, CPACF_PRNO_SHA512_DRNG_GEN)) { - - /* alloc arch random working buffer */ - arch_rng_buf = kmalloc(ARCH_RNG_BUF_SIZE, GFP_KERNEL); - if (!arch_rng_buf) - return -ENOMEM; - - /* kick worker queue job to fill the random buffer */ - queue_delayed_work(system_long_wq, - &arch_rng_work, ARCH_REFILL_TICKS); - - /* enable arch random to the outside world */ - static_branch_enable(&s390_arch_random_available); - } - - return 0; -} -arch_initcall(s390_arch_random_init); diff --git a/arch/s390/crypto/chacha-glue.c b/arch/s390/crypto/chacha-glue.c new file mode 100644 index 000000000000..ed9959e6f714 --- /dev/null +++ b/arch/s390/crypto/chacha-glue.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * s390 ChaCha stream cipher. + * + * Copyright IBM Corp. 
2021 + */ + +#define KMSG_COMPONENT "chacha_s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <crypto/internal/chacha.h> +#include <crypto/internal/skcipher.h> +#include <crypto/algapi.h> +#include <linux/cpufeature.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sizes.h> +#include <asm/fpu/api.h> +#include "chacha-s390.h" + +static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src, + unsigned int nbytes, const u32 *key, + u32 *counter) +{ + struct kernel_fpu vxstate; + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + chacha20_vx(dst, src, nbytes, key, counter); + kernel_fpu_end(&vxstate, KERNEL_VXR); + + *counter += round_up(nbytes, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; +} + +static int chacha20_s390(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + u32 state[CHACHA_STATE_WORDS] __aligned(16); + struct skcipher_walk walk; + unsigned int nbytes; + int rc; + + rc = skcipher_walk_virt(&walk, req, false); + chacha_init_generic(state, ctx->key, req->iv); + + while (walk.nbytes > 0) { + nbytes = walk.nbytes; + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + if (nbytes <= CHACHA_BLOCK_SIZE) { + chacha_crypt_generic(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + ctx->nrounds); + } else { + chacha20_crypt_s390(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + &state[4], &state[12]); + } + rc = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + return rc; +} + +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) +{ + /* TODO: implement hchacha_block_arch() in assembly */ + hchacha_block_generic(state, stream, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) +{ + chacha_init_generic(state, key, iv); +} +EXPORT_SYMBOL(chacha_init_arch); + +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + /* s390 chacha20 implementation has 20 rounds hard-coded, + * it cannot handle a block of data or less, but otherwise + * it can handle data of arbitrary size + */ + if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) + chacha_crypt_generic(state, dst, src, bytes, nrounds); + else + chacha20_crypt_s390(state, dst, src, bytes, + &state[4], &state[12]); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +static struct skcipher_alg chacha_algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-s390", + .base.cra_priority = 900, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = chacha20_setkey, + .encrypt = chacha20_s390, + .decrypt = chacha20_s390, + } +}; + +static int __init chacha_mod_init(void) +{ + return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? 
+ crypto_register_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs)) : 0; +} + +static void __exit chacha_mod_fini(void) +{ + if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) + crypto_unregister_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs)); +} + +module_cpu_feature_match(S390_CPU_FEATURE_VXRS, chacha_mod_init); +module_exit(chacha_mod_fini); + +MODULE_DESCRIPTION("ChaCha20 stream cipher"); +MODULE_LICENSE("GPL v2"); + +MODULE_ALIAS_CRYPTO("chacha20"); diff --git a/arch/s390/crypto/chacha-s390.S b/arch/s390/crypto/chacha-s390.S new file mode 100644 index 000000000000..37cb63f25b17 --- /dev/null +++ b/arch/s390/crypto/chacha-s390.S @@ -0,0 +1,908 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Original implementation written by Andy Polyakov, @dot-asm. + * This is an adaptation of the original code for kernel use. + * + * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. + */ + +#include <linux/linkage.h> +#include <asm/nospec-insn.h> +#include <asm/vx-insn.h> + +#define SP %r15 +#define FRAME (16 * 8 + 4 * 8) + + .data + .balign 32 + +SYM_DATA_START_LOCAL(sigma) + .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral + .long 1,0,0,0 + .long 2,0,0,0 + .long 3,0,0,0 + .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap + + .long 0,1,2,3 + .long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma + .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e + .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 + .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 +SYM_DATA_END(sigma) + + .previous + + GEN_BR_THUNK %r14 + + .text + +############################################################################# +# void chacha20_vx_4x(u8 *out, counst u8 *inp, size_t len, +# counst u32 *key, const u32 *counter) + +#define OUT %r2 +#define INP %r3 +#define LEN %r4 +#define KEY %r5 +#define COUNTER %r6 + +#define BEPERM %v31 +#define CTR %v26 + +#define K0 %v16 +#define K1 %v17 +#define K2 %v18 +#define K3 %v19 + +#define XA0 %v0 +#define XA1 %v1 +#define XA2 %v2 +#define XA3 %v3 + +#define XB0 %v4 +#define XB1 %v5 +#define XB2 %v6 +#define XB3 %v7 + +#define XC0 %v8 +#define XC1 %v9 +#define XC2 %v10 +#define XC3 %v11 + +#define XD0 %v12 +#define XD1 %v13 +#define XD2 %v14 +#define XD3 %v15 + +#define XT0 %v27 +#define XT1 %v28 +#define XT2 %v29 +#define XT3 %v30 + +SYM_FUNC_START(chacha20_vx_4x) + stmg %r6,%r7,6*8(SP) + + larl %r7,sigma + lhi %r0,10 + lhi %r1,0 + + VL K0,0,,%r7 # load sigma + VL K1,0,,KEY # load key + VL K2,16,,KEY + VL K3,0,,COUNTER # load counter + + VL BEPERM,0x40,,%r7 + VL CTR,0x50,,%r7 + + VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma + + VREPF XB0,K1,0 # smash the key + VREPF XB1,K1,1 + VREPF XB2,K1,2 + VREPF XB3,K1,3 + + VREPF XD0,K3,0 + VREPF XD1,K3,1 + VREPF XD2,K3,2 + VREPF XD3,K3,3 + VAF XD0,XD0,CTR + + VREPF XC0,K2,0 + VREPF XC1,K2,1 + VREPF XC2,K2,2 + VREPF XC3,K2,3 + +.Loop_4x: + VAF XA0,XA0,XB0 + VX XD0,XD0,XA0 + VERLLF XD0,XD0,16 + + VAF XA1,XA1,XB1 + VX XD1,XD1,XA1 + VERLLF XD1,XD1,16 + + VAF XA2,XA2,XB2 + VX XD2,XD2,XA2 + VERLLF XD2,XD2,16 + + VAF XA3,XA3,XB3 + VX XD3,XD3,XA3 + VERLLF XD3,XD3,16 + + VAF XC0,XC0,XD0 + VX XB0,XB0,XC0 + VERLLF XB0,XB0,12 + + VAF XC1,XC1,XD1 + VX XB1,XB1,XC1 + VERLLF XB1,XB1,12 + + VAF XC2,XC2,XD2 + VX XB2,XB2,XC2 + VERLLF XB2,XB2,12 + + VAF XC3,XC3,XD3 + VX XB3,XB3,XC3 + VERLLF XB3,XB3,12 + + VAF XA0,XA0,XB0 + VX XD0,XD0,XA0 + VERLLF XD0,XD0,8 + + VAF XA1,XA1,XB1 + VX XD1,XD1,XA1 + VERLLF XD1,XD1,8 + + VAF XA2,XA2,XB2 + VX XD2,XD2,XA2 + VERLLF XD2,XD2,8 + + VAF XA3,XA3,XB3 + VX XD3,XD3,XA3 + 
VERLLF XD3,XD3,8 + + VAF XC0,XC0,XD0 + VX XB0,XB0,XC0 + VERLLF XB0,XB0,7 + + VAF XC1,XC1,XD1 + VX XB1,XB1,XC1 + VERLLF XB1,XB1,7 + + VAF XC2,XC2,XD2 + VX XB2,XB2,XC2 + VERLLF XB2,XB2,7 + + VAF XC3,XC3,XD3 + VX XB3,XB3,XC3 + VERLLF XB3,XB3,7 + + VAF XA0,XA0,XB1 + VX XD3,XD3,XA0 + VERLLF XD3,XD3,16 + + VAF XA1,XA1,XB2 + VX XD0,XD0,XA1 + VERLLF XD0,XD0,16 + + VAF XA2,XA2,XB3 + VX XD1,XD1,XA2 + VERLLF XD1,XD1,16 + + VAF XA3,XA3,XB0 + VX XD2,XD2,XA3 + VERLLF XD2,XD2,16 + + VAF XC2,XC2,XD3 + VX XB1,XB1,XC2 + VERLLF XB1,XB1,12 + + VAF XC3,XC3,XD0 + VX XB2,XB2,XC3 + VERLLF XB2,XB2,12 + + VAF XC0,XC0,XD1 + VX XB3,XB3,XC0 + VERLLF XB3,XB3,12 + + VAF XC1,XC1,XD2 + VX XB0,XB0,XC1 + VERLLF XB0,XB0,12 + + VAF XA0,XA0,XB1 + VX XD3,XD3,XA0 + VERLLF XD3,XD3,8 + + VAF XA1,XA1,XB2 + VX XD0,XD0,XA1 + VERLLF XD0,XD0,8 + + VAF XA2,XA2,XB3 + VX XD1,XD1,XA2 + VERLLF XD1,XD1,8 + + VAF XA3,XA3,XB0 + VX XD2,XD2,XA3 + VERLLF XD2,XD2,8 + + VAF XC2,XC2,XD3 + VX XB1,XB1,XC2 + VERLLF XB1,XB1,7 + + VAF XC3,XC3,XD0 + VX XB2,XB2,XC3 + VERLLF XB2,XB2,7 + + VAF XC0,XC0,XD1 + VX XB3,XB3,XC0 + VERLLF XB3,XB3,7 + + VAF XC1,XC1,XD2 + VX XB0,XB0,XC1 + VERLLF XB0,XB0,7 + brct %r0,.Loop_4x + + VAF XD0,XD0,CTR + + VMRHF XT0,XA0,XA1 # transpose data + VMRHF XT1,XA2,XA3 + VMRLF XT2,XA0,XA1 + VMRLF XT3,XA2,XA3 + VPDI XA0,XT0,XT1,0b0000 + VPDI XA1,XT0,XT1,0b0101 + VPDI XA2,XT2,XT3,0b0000 + VPDI XA3,XT2,XT3,0b0101 + + VMRHF XT0,XB0,XB1 + VMRHF XT1,XB2,XB3 + VMRLF XT2,XB0,XB1 + VMRLF XT3,XB2,XB3 + VPDI XB0,XT0,XT1,0b0000 + VPDI XB1,XT0,XT1,0b0101 + VPDI XB2,XT2,XT3,0b0000 + VPDI XB3,XT2,XT3,0b0101 + + VMRHF XT0,XC0,XC1 + VMRHF XT1,XC2,XC3 + VMRLF XT2,XC0,XC1 + VMRLF XT3,XC2,XC3 + VPDI XC0,XT0,XT1,0b0000 + VPDI XC1,XT0,XT1,0b0101 + VPDI XC2,XT2,XT3,0b0000 + VPDI XC3,XT2,XT3,0b0101 + + VMRHF XT0,XD0,XD1 + VMRHF XT1,XD2,XD3 + VMRLF XT2,XD0,XD1 + VMRLF XT3,XD2,XD3 + VPDI XD0,XT0,XT1,0b0000 + VPDI XD1,XT0,XT1,0b0101 + VPDI XD2,XT2,XT3,0b0000 + VPDI XD3,XT2,XT3,0b0101 + + VAF XA0,XA0,K0 + VAF XB0,XB0,K1 + VAF XC0,XC0,K2 + VAF XD0,XD0,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + + VAF XA0,XA1,K0 + VAF XB0,XB1,K1 + VAF XC0,XC1,K2 + VAF XD0,XD1,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_4x + + VAF XA0,XA2,K0 + VAF XB0,XB2,K1 + VAF XC0,XC2,K2 + VAF XD0,XD2,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_4x + + VAF XA0,XA3,K0 + VAF XB0,XB3,K1 + VAF XC0,XC3,K2 + VAF XD0,XD3,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + +.Ldone_4x: + lmg %r6,%r7,6*8(SP) + BR_EX %r14 + +.Ltail_4x: + VLR XT0,XC0 + VLR XT1,XD0 + + VST 
XA0,8*8+0x00,,SP + VST XB0,8*8+0x10,,SP + VST XT0,8*8+0x20,,SP + VST XT1,8*8+0x30,,SP + + lghi %r1,0 + +.Loop_tail_4x: + llgc %r5,0(%r1,INP) + llgc %r6,8*8(%r1,SP) + xr %r6,%r5 + stc %r6,0(%r1,OUT) + la %r1,1(%r1) + brct LEN,.Loop_tail_4x + + lmg %r6,%r7,6*8(SP) + BR_EX %r14 +SYM_FUNC_END(chacha20_vx_4x) + +#undef OUT +#undef INP +#undef LEN +#undef KEY +#undef COUNTER + +#undef BEPERM + +#undef K0 +#undef K1 +#undef K2 +#undef K3 + + +############################################################################# +# void chacha20_vx(u8 *out, const u8 *inp, size_t len, +# const u32 *key, const u32 *counter) + +#define OUT %r2 +#define INP %r3 +#define LEN %r4 +#define KEY %r5 +#define COUNTER %r6 + +#define BEPERM %v31 + +#define K0 %v27 +#define K1 %v24 +#define K2 %v25 +#define K3 %v26 + +#define A0 %v0 +#define B0 %v1 +#define C0 %v2 +#define D0 %v3 + +#define A1 %v4 +#define B1 %v5 +#define C1 %v6 +#define D1 %v7 + +#define A2 %v8 +#define B2 %v9 +#define C2 %v10 +#define D2 %v11 + +#define A3 %v12 +#define B3 %v13 +#define C3 %v14 +#define D3 %v15 + +#define A4 %v16 +#define B4 %v17 +#define C4 %v18 +#define D4 %v19 + +#define A5 %v20 +#define B5 %v21 +#define C5 %v22 +#define D5 %v23 + +#define T0 %v27 +#define T1 %v28 +#define T2 %v29 +#define T3 %v30 + +SYM_FUNC_START(chacha20_vx) + clgfi LEN,256 + jle chacha20_vx_4x + stmg %r6,%r7,6*8(SP) + + lghi %r1,-FRAME + lgr %r0,SP + la SP,0(%r1,SP) + stg %r0,0(SP) # back-chain + + larl %r7,sigma + lhi %r0,10 + + VLM K1,K2,0,KEY,0 # load key + VL K3,0,,COUNTER # load counter + + VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ... + +.Loop_outer_vx: + VLR A0,K0 + VLR B0,K1 + VLR A1,K0 + VLR B1,K1 + VLR A2,K0 + VLR B2,K1 + VLR A3,K0 + VLR B3,K1 + VLR A4,K0 + VLR B4,K1 + VLR A5,K0 + VLR B5,K1 + + VLR D0,K3 + VAF D1,K3,T1 # K[3]+1 + VAF D2,K3,T2 # K[3]+2 + VAF D3,K3,T3 # K[3]+3 + VAF D4,D2,T2 # K[3]+4 + VAF D5,D2,T3 # K[3]+5 + + VLR C0,K2 + VLR C1,K2 + VLR C2,K2 + VLR C3,K2 + VLR C4,K2 + VLR C5,K2 + + VLR T1,D1 + VLR T2,D2 + VLR T3,D3 + +.Loop_vx: + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,16 + VERLLF D1,D1,16 + VERLLF D2,D2,16 + VERLLF D3,D3,16 + VERLLF D4,D4,16 + VERLLF D5,D5,16 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,12 + VERLLF B1,B1,12 + VERLLF B2,B2,12 + VERLLF B3,B3,12 + VERLLF B4,B4,12 + VERLLF B5,B5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,8 + VERLLF D1,D1,8 + VERLLF D2,D2,8 + VERLLF D3,D3,8 + VERLLF D4,D4,8 + VERLLF D5,D5,8 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,7 + VERLLF B1,B1,7 + VERLLF B2,B2,7 + VERLLF B3,B3,7 + VERLLF B4,B4,7 + VERLLF B5,B5,7 + + VSLDB C0,C0,C0,8 + VSLDB C1,C1,C1,8 + VSLDB C2,C2,C2,8 + VSLDB C3,C3,C3,8 + VSLDB C4,C4,C4,8 + VSLDB C5,C5,C5,8 + VSLDB B0,B0,B0,4 + VSLDB B1,B1,B1,4 + VSLDB B2,B2,B2,4 + VSLDB B3,B3,B3,4 + VSLDB B4,B4,B4,4 + VSLDB B5,B5,B5,4 + VSLDB D0,D0,D0,12 + VSLDB D1,D1,D1,12 + VSLDB D2,D2,D2,12 + VSLDB D3,D3,D3,12 + VSLDB D4,D4,D4,12 + VSLDB D5,D5,D5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF 
A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,16 + VERLLF D1,D1,16 + VERLLF D2,D2,16 + VERLLF D3,D3,16 + VERLLF D4,D4,16 + VERLLF D5,D5,16 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,12 + VERLLF B1,B1,12 + VERLLF B2,B2,12 + VERLLF B3,B3,12 + VERLLF B4,B4,12 + VERLLF B5,B5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,8 + VERLLF D1,D1,8 + VERLLF D2,D2,8 + VERLLF D3,D3,8 + VERLLF D4,D4,8 + VERLLF D5,D5,8 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,7 + VERLLF B1,B1,7 + VERLLF B2,B2,7 + VERLLF B3,B3,7 + VERLLF B4,B4,7 + VERLLF B5,B5,7 + + VSLDB C0,C0,C0,8 + VSLDB C1,C1,C1,8 + VSLDB C2,C2,C2,8 + VSLDB C3,C3,C3,8 + VSLDB C4,C4,C4,8 + VSLDB C5,C5,C5,8 + VSLDB B0,B0,B0,12 + VSLDB B1,B1,B1,12 + VSLDB B2,B2,B2,12 + VSLDB B3,B3,B3,12 + VSLDB B4,B4,B4,12 + VSLDB B5,B5,B5,12 + VSLDB D0,D0,D0,4 + VSLDB D1,D1,D1,4 + VSLDB D2,D2,D2,4 + VSLDB D3,D3,D3,4 + VSLDB D4,D4,D4,4 + VSLDB D5,D5,D5,4 + brct %r0,.Loop_vx + + VAF A0,A0,K0 + VAF B0,B0,K1 + VAF C0,C0,K2 + VAF D0,D0,K3 + VAF A1,A1,K0 + VAF D1,D1,T1 # +K[3]+1 + + VPERM A0,A0,A0,BEPERM + VPERM B0,B0,B0,BEPERM + VPERM C0,C0,C0,BEPERM + VPERM D0,D0,D0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VAF D2,D2,T2 # +K[3]+2 + VAF D3,D3,T3 # +K[3]+3 + VLM T0,T3,0,INP,0 + + VX A0,A0,T0 + VX B0,B0,T1 + VX C0,C0,T2 + VX D0,D0,T3 + + VLM K0,T3,0,%r7,4 # re-load sigma and increments + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF B1,B1,K1 + VAF C1,C1,K2 + + VPERM A0,A1,A1,BEPERM + VPERM B0,B1,B1,BEPERM + VPERM C0,C1,C1,BEPERM + VPERM D0,D1,D1,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A2,A2,K0 + VAF B2,B2,K1 + VAF C2,C2,K2 + + VPERM A0,A2,A2,BEPERM + VPERM B0,B2,B2,BEPERM + VPERM C0,C2,C2,BEPERM + VPERM D0,D2,D2,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A3,A3,K0 + VAF B3,B3,K1 + VAF C3,C3,K2 + VAF D2,K3,T3 # K[3]+3 + + VPERM A0,A3,A3,BEPERM + VPERM B0,B3,B3,BEPERM + VPERM C0,C3,C3,BEPERM + VPERM D0,D3,D3,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VAF D3,D2,T1 # K[3]+4 + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A4,A4,K0 + VAF B4,B4,K1 + VAF C4,C4,K2 + VAF D4,D4,D3 # +K[3]+4 + VAF D3,D3,T1 # K[3]+5 + VAF K3,D2,T3 # K[3]+=6 + + VPERM A0,A4,A4,BEPERM + VPERM B0,B4,B4,BEPERM + VPERM C0,C4,C4,BEPERM + VPERM D0,D4,D4,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A5,A5,K0 + VAF B5,B5,K1 + VAF C5,C5,K2 + VAF D5,D5,D3 # +K[3]+5 + + 
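Between the two half-rounds above, the VSLDB (VECTOR SHIFT LEFT DOUBLE BY BYTE) blocks rotate the b, c and d rows of each 4x4 state left by one, two and three 32-bit lanes respectively, which turns the column quarter-round into the diagonal quarter-round; the second VSLDB block rotates the rows back. A scalar C sketch of that lane rotation (illustrative helper, not from the patch):

#include <stdint.h>
#include <string.h>

/* Rotate a four-lane state row left by n lanes (VSLDB by 4*n bytes) */
static void rotate_row(uint32_t row[4], int n)
{
	uint32_t tmp[4];
	int i;

	for (i = 0; i < 4; i++)
		tmp[i] = row[(i + n) & 3];
	memcpy(row, tmp, sizeof(tmp));
}

/*
 * Column half-round; then rotate_row(b, 1), rotate_row(c, 2),
 * rotate_row(d, 3); diagonal half-round; then rotate back with
 * rotate_row(b, 3), rotate_row(c, 2), rotate_row(d, 1).
 */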
VPERM A0,A5,A5,BEPERM + VPERM B0,B5,B5,BEPERM + VPERM C0,C5,C5,BEPERM + VPERM D0,D5,D5,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + lhi %r0,10 + aghi LEN,-0x40 + jne .Loop_outer_vx + +.Ldone_vx: + lmg %r6,%r7,FRAME+6*8(SP) + la SP,FRAME(SP) + BR_EX %r14 + +.Ltail_vx: + VSTM A0,D0,8*8,SP,3 + lghi %r1,0 + +.Loop_tail_vx: + llgc %r5,0(%r1,INP) + llgc %r6,8*8(%r1,SP) + xr %r6,%r5 + stc %r6,0(%r1,OUT) + la %r1,1(%r1) + brct LEN,.Loop_tail_vx + + lmg %r6,%r7,FRAME+6*8(SP) + la SP,FRAME(SP) + BR_EX %r14 +SYM_FUNC_END(chacha20_vx) + +.previous diff --git a/arch/s390/crypto/chacha-s390.h b/arch/s390/crypto/chacha-s390.h new file mode 100644 index 000000000000..733744ce30f5 --- /dev/null +++ b/arch/s390/crypto/chacha-s390.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * s390 ChaCha stream cipher. + * + * Copyright IBM Corp. 2021 + */ + +#ifndef _CHACHA_S390_H +#define _CHACHA_S390_H + +void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key, + const u32 *counter); + +#endif /* _CHACHA_S390_H */ diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c index 423ee05887e6..017143e9cef7 100644 --- a/arch/s390/crypto/crc32-vx.c +++ b/arch/s390/crypto/crc32-vx.c @@ -111,10 +111,8 @@ static int crc32_vx_setkey(struct crypto_shash *tfm, const u8 *newkey, { struct crc_ctx *mctx = crypto_shash_ctx(tfm); - if (newkeylen != sizeof(mctx->key)) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (newkeylen != sizeof(mctx->key)) return -EINVAL; - } mctx->key = le32_to_cpu(*(__le32 *)newkey); return 0; } @@ -124,10 +122,8 @@ static int crc32be_vx_setkey(struct crypto_shash *tfm, const u8 *newkey, { struct crc_ctx *mctx = crypto_shash_ctx(tfm); - if (newkeylen != sizeof(mctx->key)) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (newkeylen != sizeof(mctx->key)) return -EINVAL; - } mctx->key = be32_to_cpu(*(__be32 *)newkey); return 0; } @@ -302,7 +298,7 @@ static void __exit crc_vx_mod_exit(void) crypto_unregister_shashes(crc32_vx_algs, ARRAY_SIZE(crc32_vx_algs)); } -module_cpu_feature_match(VXRS, crc_vx_mod_init); +module_cpu_feature_match(S390_CPU_FEATURE_VXRS, crc_vx_mod_init); module_exit(crc_vx_mod_exit); MODULE_AUTHOR("Hendrik Brueckner <brueckner@linux.vnet.ibm.com>"); diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/crypto/crc32be-vx.S index 0099044e2c86..34ee47926891 100644 --- a/arch/s390/crypto/crc32be-vx.S +++ b/arch/s390/crypto/crc32be-vx.S @@ -24,15 +24,15 @@ #define CONST_RU_POLY %v13 #define CONST_CRC_POLY %v14 -.data -.align 8 + .data + .balign 8 /* * The CRC-32 constant block contains reduction constants to fold and * process particular chunks of the input data stream in parallel. * * For the CRC-32 variants, the constants are precomputed according to - * these defintions: + * these definitions: * * R1 = x4*128+64 mod P(x) * R2 = x4*128 mod P(x) @@ -48,7 +48,7 @@ * * Note that the constant definitions below are extended in order to compute * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. - * The righmost doubleword can be 0 to prevent contribution to the result or + * The rightmost doubleword can be 0 to prevent contribution to the result or * can be multiplied by 1 to perform an XOR without the need for a separate * VECTOR EXCLUSIVE OR instruction. 
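The reduction constants described above feed VECTOR GALOIS FIELD MULTIPLY (SUM), i.e. carry-less multiplication of polynomials over GF(2). For reference, a bitwise scalar equivalent of that multiply, which makes the folding step "rem = clmul(hi, R1) ^ clmul(lo, R2)" concrete (sketch only, not part of the patch):

#include <stdint.h>

/* Carry-less (GF(2)[x]) multiply of two 32-bit polynomials */
static uint64_t clmul32(uint32_t a, uint32_t b)
{
	uint64_t r = 0;
	int i;

	for (i = 0; i < 32; i++)
		if (b & (1U << i))
			r ^= (uint64_t)a << i;
	return r;
}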
* @@ -58,19 +58,20 @@ * P'(x) = 0xEDB88320 */ -.Lconstants_CRC_32_BE: +SYM_DATA_START_LOCAL(constants_CRC_32_BE) .quad 0x08833794c, 0x0e6228b11 # R1, R2 .quad 0x0c5b9cd4c, 0x0e8a45605 # R3, R4 .quad 0x0f200aa66, 1 << 32 # R5, x32 .quad 0x0490d678d, 1 # R6, 1 .quad 0x104d101df, 0 # u .quad 0x104C11DB7, 0 # P(x) +SYM_DATA_END(constants_CRC_32_BE) -.previous + .previous GEN_BR_THUNK %r14 -.text + .text /* * The CRC-32 function(s) use these calling conventions: * @@ -90,9 +91,9 @@ * * V9..V14: CRC-32 constants. */ -ENTRY(crc32_be_vgfm_16) +SYM_FUNC_START(crc32_be_vgfm_16) /* Load CRC-32 constants */ - larl %r5,.Lconstants_CRC_32_BE + larl %r5,constants_CRC_32_BE VLM CONST_R1R2,CONST_CRC_POLY,0,%r5 /* Load the initial CRC value into the leftmost word of V0. */ @@ -189,7 +190,7 @@ ENTRY(crc32_be_vgfm_16) * Note: To compensate the division by x^32, use the vector unpack * instruction to move the leftmost word into the leftmost doubleword * of the vector register. The rightmost doubleword is multiplied - * with zero to not contribute to the intermedate results. + * with zero to not contribute to the intermediate results. */ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ @@ -207,6 +208,6 @@ ENTRY(crc32_be_vgfm_16) .Ldone: VLGVF %r2,%v2,3 BR_EX %r14 -ENDPROC(crc32_be_vgfm_16) +SYM_FUNC_END(crc32_be_vgfm_16) .previous diff --git a/arch/s390/crypto/crc32le-vx.S b/arch/s390/crypto/crc32le-vx.S index 71caf0f4ec08..5a819ae09a0b 100644 --- a/arch/s390/crypto/crc32le-vx.S +++ b/arch/s390/crypto/crc32le-vx.S @@ -25,8 +25,8 @@ #define CONST_RU_POLY %v13 #define CONST_CRC_POLY %v14 -.data -.align 8 + .data + .balign 8 /* * The CRC-32 constant block contains reduction constants to fold and @@ -59,27 +59,29 @@ * P'(x) = 0x82F63B78 */ -.Lconstants_CRC_32_LE: +SYM_DATA_START_LOCAL(constants_CRC_32_LE) .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask .quad 0x1c6e41596, 0x154442bd4 # R2, R1 .quad 0x0ccaa009e, 0x1751997d0 # R4, R3 .octa 0x163cd6124 # R5 .octa 0x1F7011641 # u' .octa 0x1DB710641 # P'(x) << 1 +SYM_DATA_END(constants_CRC_32_LE) -.Lconstants_CRC_32C_LE: +SYM_DATA_START_LOCAL(constants_CRC_32C_LE) .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask .quad 0x09e4addf8, 0x740eef02 # R2, R1 .quad 0x14cd00bd6, 0xf20c0dfe # R4, R3 .octa 0x0dd45aab8 # R5 .octa 0x0dea713f1 # u' .octa 0x105ec76f0 # P'(x) << 1 +SYM_DATA_END(constants_CRC_32C_LE) -.previous + .previous GEN_BR_THUNK %r14 -.text + .text /* * The CRC-32 functions use these calling conventions: @@ -102,17 +104,17 @@ * V10..V14: CRC-32 constants. */ -ENTRY(crc32_le_vgfm_16) - larl %r5,.Lconstants_CRC_32_LE +SYM_FUNC_START(crc32_le_vgfm_16) + larl %r5,constants_CRC_32_LE j crc32_le_vgfm_generic -ENDPROC(crc32_le_vgfm_16) +SYM_FUNC_END(crc32_le_vgfm_16) -ENTRY(crc32c_le_vgfm_16) - larl %r5,.Lconstants_CRC_32C_LE +SYM_FUNC_START(crc32c_le_vgfm_16) + larl %r5,constants_CRC_32C_LE j crc32_le_vgfm_generic -ENDPROC(crc32c_le_vgfm_16) +SYM_FUNC_END(crc32c_le_vgfm_16) -ENTRY(crc32_le_vgfm_generic) +SYM_FUNC_START(crc32_le_vgfm_generic) /* Load CRC-32 constants */ VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0,%r5 @@ -268,6 +270,6 @@ ENTRY(crc32_le_vgfm_generic) .Ldone: VLGVF %r2,%v2,2 BR_EX %r14 -ENDPROC(crc32_le_vgfm_generic) +SYM_FUNC_END(crc32_le_vgfm_generic) .previous diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c index bfbafd35bcbd..8e75b83a5ddc 100644 --- a/arch/s390/crypto/des_s390.c +++ b/arch/s390/crypto/des_s390.c @@ -194,7 +194,7 @@ static struct skcipher_alg cbc_des_alg = { * same as DES. 
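The uniqueness requirement this comment goes on to describe can be sketched with crypto_memneq() (hypothetical helper name; the in-tree code relies on the common DES key verification helpers):

#include <crypto/algapi.h>	/* crypto_memneq() */
#include <crypto/des.h>		/* DES_KEY_SIZE */

/* Reject 3DES keys where any two of K1, K2, K3 are identical */
static int des3_keys_unique(const u8 *key)
{
	if (!crypto_memneq(key, key + DES_KEY_SIZE, DES_KEY_SIZE) ||
	    !crypto_memneq(key + DES_KEY_SIZE, key + 2 * DES_KEY_SIZE,
			   DES_KEY_SIZE) ||
	    !crypto_memneq(key, key + 2 * DES_KEY_SIZE, DES_KEY_SIZE))
		return -EINVAL;
	return 0;
}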
Implementers MUST reject keys that exhibit this * property. * - * In fips mode additinally check for all 3 keys are unique. + * In fips mode additionally check for all 3 keys are unique. * */ static int des3_setkey(struct crypto_tfm *tfm, const u8 *key, @@ -492,7 +492,7 @@ out_err: return ret; } -module_cpu_feature_match(MSA, des_s390_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, des_s390_init); module_exit(des_s390_exit); MODULE_ALIAS_CRYPTO("des"); diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c index a3e7400e031c..0800a2a5799f 100644 --- a/arch/s390/crypto/ghash_s390.c +++ b/arch/s390/crypto/ghash_s390.c @@ -43,10 +43,8 @@ static int ghash_setkey(struct crypto_shash *tfm, { struct ghash_ctx *ctx = crypto_shash_ctx(tfm); - if (keylen != GHASH_BLOCK_SIZE) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; - } memcpy(ctx->key, key, GHASH_BLOCK_SIZE); @@ -147,7 +145,7 @@ static void __exit ghash_mod_exit(void) crypto_unregister_shash(&ghash_alg); } -module_cpu_feature_match(MSA, ghash_mod_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, ghash_mod_init); module_exit(ghash_mod_exit); MODULE_ALIAS_CRYPTO("ghash"); diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c index c7119c617b6e..55ee5567a5ea 100644 --- a/arch/s390/crypto/paes_s390.c +++ b/arch/s390/crypto/paes_s390.c @@ -5,7 +5,7 @@ * s390 implementation of the AES Cipher Algorithm with protected keys. * * s390 Version: - * Copyright IBM Corp. 2017,2019 + * Copyright IBM Corp. 2017, 2023 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> * Harald Freudenberger <freude@de.ibm.com> */ @@ -20,7 +20,9 @@ #include <linux/module.h> #include <linux/cpufeature.h> #include <linux/init.h> +#include <linux/mutex.h> #include <linux/spinlock.h> +#include <linux/delay.h> #include <crypto/internal/skcipher.h> #include <crypto/xts.h> #include <asm/cpacf.h> @@ -32,11 +34,11 @@ * is called. As paes can handle different kinds of key blobs * and padding is also possible, the limits need to be generous. 
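As a concrete illustration of the key-blob handling described above, the rework below also accepts raw 16/24/32-byte AES clear keys by wrapping them into a pkey clear-key token. The header built in front of the raw key looks like this (field comments inferred from the hunk, not authoritative):

struct clearkey_header {
	u8  type;
	u8  res0[3];
	u8  version;	/* 0x02 == TOKVER_CLEAR_KEY */
	u8  res1[3];
	u32 keytype;	/* (keylen - 8) >> 3: 1, 2 or 3 for AES 128/192/256 */
	u32 len;	/* clear key length in bytes */
} __packed;		/* the raw key value follows the header */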
*/ -#define PAES_MIN_KEYSIZE 64 -#define PAES_MAX_KEYSIZE 256 +#define PAES_MIN_KEYSIZE 16 +#define PAES_MAX_KEYSIZE MAXEP11AESKEYBLOBSIZE static u8 *ctrblk; -static DEFINE_SPINLOCK(ctrblk_lock); +static DEFINE_MUTEX(ctrblk_lock); static cpacf_mask_t km_functions, kmc_functions, kmctr_functions; @@ -53,19 +55,46 @@ struct key_blob { unsigned int keylen; }; -static inline int _copy_key_to_kb(struct key_blob *kb, - const u8 *key, - unsigned int keylen) -{ - if (keylen <= sizeof(kb->keybuf)) +static inline int _key_to_kb(struct key_blob *kb, + const u8 *key, + unsigned int keylen) +{ + struct clearkey_header { + u8 type; + u8 res0[3]; + u8 version; + u8 res1[3]; + u32 keytype; + u32 len; + } __packed * h; + + switch (keylen) { + case 16: + case 24: + case 32: + /* clear key value, prepare pkey clear key token in keybuf */ + memset(kb->keybuf, 0, sizeof(kb->keybuf)); + h = (struct clearkey_header *) kb->keybuf; + h->version = 0x02; /* TOKVER_CLEAR_KEY */ + h->keytype = (keylen - 8) >> 3; + h->len = keylen; + memcpy(kb->keybuf + sizeof(*h), key, keylen); + kb->keylen = sizeof(*h) + keylen; kb->key = kb->keybuf; - else { - kb->key = kmalloc(keylen, GFP_KERNEL); - if (!kb->key) - return -ENOMEM; + break; + default: + /* other key material, let pkey handle this */ + if (keylen <= sizeof(kb->keybuf)) + kb->key = kb->keybuf; + else { + kb->key = kmalloc(keylen, GFP_KERNEL); + if (!kb->key) + return -ENOMEM; + } + memcpy(kb->key, key, keylen); + kb->keylen = keylen; + break; } - memcpy(kb->key, key, keylen); - kb->keylen = keylen; return 0; } @@ -74,7 +103,7 @@ static inline void _free_kb_keybuf(struct key_blob *kb) { if (kb->key && kb->key != kb->keybuf && kb->keylen > sizeof(kb->keybuf)) { - kfree(kb->key); + kfree_sensitive(kb->key); kb->key = NULL; } } @@ -82,23 +111,29 @@ static inline void _free_kb_keybuf(struct key_blob *kb) struct s390_paes_ctx { struct key_blob kb; struct pkey_protkey pk; + spinlock_t pk_lock; unsigned long fc; }; struct s390_pxts_ctx { struct key_blob kb[2]; struct pkey_protkey pk[2]; + spinlock_t pk_lock; unsigned long fc; }; -static inline int __paes_convert_key(struct key_blob *kb, +static inline int __paes_keyblob2pkey(struct key_blob *kb, struct pkey_protkey *pk) { int i, ret; /* try three times in case of failure */ for (i = 0; i < 3; i++) { - ret = pkey_keyblob2pkey(kb->key, kb->keylen, pk); + if (i > 0 && ret == -EAGAIN && in_task()) + if (msleep_interruptible(1000)) + return -EINTR; + ret = pkey_keyblob2pkey(kb->key, kb->keylen, + pk->protkey, &pk->len, &pk->type); if (ret == 0) break; } @@ -106,22 +141,21 @@ static inline int __paes_convert_key(struct key_blob *kb, return ret; } -static int __paes_set_key(struct s390_paes_ctx *ctx) +static inline int __paes_convert_key(struct s390_paes_ctx *ctx) { - unsigned long fc; - - if (__paes_convert_key(&ctx->kb, &ctx->pk)) - return -EINVAL; + int ret; + struct pkey_protkey pkey; - /* Pick the correct function code based on the protected key type */ - fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PAES_128 : - (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KM_PAES_192 : - (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KM_PAES_256 : 0; + pkey.len = sizeof(pkey.protkey); + ret = __paes_keyblob2pkey(&ctx->kb, &pkey); + if (ret) + return ret; - /* Check if the function code is available */ - ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + spin_lock_bh(&ctx->pk_lock); + memcpy(&ctx->pk, &pkey, sizeof(pkey)); + spin_unlock_bh(&ctx->pk_lock); - return ctx->fc ? 
0 : -EINVAL; + return 0; } static int ecb_paes_init(struct crypto_skcipher *tfm) @@ -129,6 +163,7 @@ static int ecb_paes_init(struct crypto_skcipher *tfm) struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); ctx->kb.key = NULL; + spin_lock_init(&ctx->pk_lock); return 0; } @@ -140,6 +175,26 @@ static void ecb_paes_exit(struct crypto_skcipher *tfm) _free_kb_keybuf(&ctx->kb); } +static inline int __ecb_paes_set_key(struct s390_paes_ctx *ctx) +{ + int rc; + unsigned long fc; + + rc = __paes_convert_key(ctx); + if (rc) + return rc; + + /* Pick the correct function code based on the protected key type */ + fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PAES_128 : + (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KM_PAES_192 : + (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KM_PAES_256 : 0; + + /* Check if the function code is available */ + ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + + return ctx->fc ? 0 : -EINVAL; +} + static int ecb_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { @@ -147,15 +202,11 @@ static int ecb_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); _free_kb_keybuf(&ctx->kb); - rc = _copy_key_to_kb(&ctx->kb, in_key, key_len); + rc = _key_to_kb(&ctx->kb, in_key, key_len); if (rc) return rc; - if (__paes_set_key(ctx)) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - return 0; + return __ecb_paes_set_key(ctx); } static int ecb_paes_crypt(struct skcipher_request *req, unsigned long modifier) @@ -165,18 +216,31 @@ static int ecb_paes_crypt(struct skcipher_request *req, unsigned long modifier) struct skcipher_walk walk; unsigned int nbytes, n, k; int ret; + struct { + u8 key[MAXPROTKEYSIZE]; + } param; ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); + while ((nbytes = walk.nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); - k = cpacf_km(ctx->fc | modifier, ctx->pk.protkey, + k = cpacf_km(ctx->fc | modifier, ¶m, walk.dst.virt.addr, walk.src.virt.addr, n); if (k) ret = skcipher_walk_done(&walk, nbytes - k); if (k < n) { - if (__paes_set_key(ctx) != 0) + if (__paes_convert_key(ctx)) return skcipher_walk_done(&walk, -EIO); + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); } } return ret; @@ -214,6 +278,7 @@ static int cbc_paes_init(struct crypto_skcipher *tfm) struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); ctx->kb.key = NULL; + spin_lock_init(&ctx->pk_lock); return 0; } @@ -225,12 +290,14 @@ static void cbc_paes_exit(struct crypto_skcipher *tfm) _free_kb_keybuf(&ctx->kb); } -static int __cbc_paes_set_key(struct s390_paes_ctx *ctx) +static inline int __cbc_paes_set_key(struct s390_paes_ctx *ctx) { + int rc; unsigned long fc; - if (__paes_convert_key(&ctx->kb, &ctx->pk)) - return -EINVAL; + rc = __paes_convert_key(ctx); + if (rc) + return rc; /* Pick the correct function code based on the protected key type */ fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? 
CPACF_KMC_PAES_128 : @@ -250,15 +317,11 @@ static int cbc_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); _free_kb_keybuf(&ctx->kb); - rc = _copy_key_to_kb(&ctx->kb, in_key, key_len); + rc = _key_to_kb(&ctx->kb, in_key, key_len); if (rc) return rc; - if (__cbc_paes_set_key(ctx)) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - return 0; + return __cbc_paes_set_key(ctx); } static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier) @@ -276,8 +339,12 @@ static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier) ret = skcipher_walk_virt(&walk, req, false); if (ret) return ret; + memcpy(param.iv, walk.iv, AES_BLOCK_SIZE); + spin_lock_bh(&ctx->pk_lock); memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); + while ((nbytes = walk.nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); @@ -288,9 +355,11 @@ static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier) ret = skcipher_walk_done(&walk, nbytes - k); } if (k < n) { - if (__cbc_paes_set_key(ctx) != 0) + if (__paes_convert_key(ctx)) return skcipher_walk_done(&walk, -EIO); + spin_lock_bh(&ctx->pk_lock); memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); } } return ret; @@ -330,6 +399,7 @@ static int xts_paes_init(struct crypto_skcipher *tfm) ctx->kb[0].key = NULL; ctx->kb[1].key = NULL; + spin_lock_init(&ctx->pk_lock); return 0; } @@ -342,12 +412,30 @@ static void xts_paes_exit(struct crypto_skcipher *tfm) _free_kb_keybuf(&ctx->kb[1]); } -static int __xts_paes_set_key(struct s390_pxts_ctx *ctx) +static inline int __xts_paes_convert_key(struct s390_pxts_ctx *ctx) +{ + struct pkey_protkey pkey0, pkey1; + + pkey0.len = sizeof(pkey0.protkey); + pkey1.len = sizeof(pkey1.protkey); + + if (__paes_keyblob2pkey(&ctx->kb[0], &pkey0) || + __paes_keyblob2pkey(&ctx->kb[1], &pkey1)) + return -EINVAL; + + spin_lock_bh(&ctx->pk_lock); + memcpy(&ctx->pk[0], &pkey0, sizeof(pkey0)); + memcpy(&ctx->pk[1], &pkey1, sizeof(pkey1)); + spin_unlock_bh(&ctx->pk_lock); + + return 0; +} + +static inline int __xts_paes_set_key(struct s390_pxts_ctx *ctx) { unsigned long fc; - if (__paes_convert_key(&ctx->kb[0], &ctx->pk[0]) || - __paes_convert_key(&ctx->kb[1], &ctx->pk[1])) + if (__xts_paes_convert_key(ctx)) return -EINVAL; if (ctx->pk[0].type != ctx->pk[1].type) @@ -379,20 +467,19 @@ static int xts_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, _free_kb_keybuf(&ctx->kb[0]); _free_kb_keybuf(&ctx->kb[1]); - rc = _copy_key_to_kb(&ctx->kb[0], in_key, key_len); + rc = _key_to_kb(&ctx->kb[0], in_key, key_len); if (rc) return rc; - rc = _copy_key_to_kb(&ctx->kb[1], in_key + key_len, key_len); + rc = _key_to_kb(&ctx->kb[1], in_key + key_len, key_len); if (rc) return rc; - if (__xts_paes_set_key(ctx)) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } + rc = __xts_paes_set_key(ctx); + if (rc) + return rc; /* - * xts_check_key verifies the key length is not odd and makes + * xts_verify_key verifies the key length is not odd and makes * sure that the two keys are not the same. This can be done * on the two protected keys as well */ @@ -425,15 +512,17 @@ static int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier) ret = skcipher_walk_virt(&walk, req, false); if (ret) return ret; + keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 
48 : 64; offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 16 : 0; -retry: + memset(&pcc_param, 0, sizeof(pcc_param)); memcpy(pcc_param.tweak, walk.iv, sizeof(pcc_param.tweak)); + spin_lock_bh(&ctx->pk_lock); memcpy(pcc_param.key + offset, ctx->pk[1].protkey, keylen); - cpacf_pcc(ctx->fc, pcc_param.key + offset); - memcpy(xts_param.key + offset, ctx->pk[0].protkey, keylen); + spin_unlock_bh(&ctx->pk_lock); + cpacf_pcc(ctx->fc, pcc_param.key + offset); memcpy(xts_param.init, pcc_param.xts, 16); while ((nbytes = walk.nbytes) != 0) { @@ -444,11 +533,15 @@ retry: if (k) ret = skcipher_walk_done(&walk, nbytes - k); if (k < n) { - if (__xts_paes_set_key(ctx) != 0) + if (__xts_paes_convert_key(ctx)) return skcipher_walk_done(&walk, -EIO); - goto retry; + spin_lock_bh(&ctx->pk_lock); + memcpy(xts_param.key + offset, + ctx->pk[0].protkey, keylen); + spin_unlock_bh(&ctx->pk_lock); } } + return ret; } @@ -485,6 +578,7 @@ static int ctr_paes_init(struct crypto_skcipher *tfm) struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); ctx->kb.key = NULL; + spin_lock_init(&ctx->pk_lock); return 0; } @@ -496,12 +590,14 @@ static void ctr_paes_exit(struct crypto_skcipher *tfm) _free_kb_keybuf(&ctx->kb); } -static int __ctr_paes_set_key(struct s390_paes_ctx *ctx) +static inline int __ctr_paes_set_key(struct s390_paes_ctx *ctx) { + int rc; unsigned long fc; - if (__paes_convert_key(&ctx->kb, &ctx->pk)) - return -EINVAL; + rc = __paes_convert_key(ctx); + if (rc) + return rc; /* Pick the correct function code based on the protected key type */ fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMCTR_PAES_128 : @@ -522,15 +618,11 @@ static int ctr_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); _free_kb_keybuf(&ctx->kb); - rc = _copy_key_to_kb(&ctx->kb, in_key, key_len); + rc = _key_to_kb(&ctx->kb, in_key, key_len); if (rc) return rc; - if (__ctr_paes_set_key(ctx)) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - return 0; + return __ctr_paes_set_key(ctx); } static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) @@ -556,49 +648,67 @@ static int ctr_paes_crypt(struct skcipher_request *req) struct skcipher_walk walk; unsigned int nbytes, n, k; int ret, locked; - - locked = spin_trylock(&ctrblk_lock); + struct { + u8 key[MAXPROTKEYSIZE]; + } param; ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); + + locked = mutex_trylock(&ctrblk_lock); + while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { n = AES_BLOCK_SIZE; if (nbytes >= 2*AES_BLOCK_SIZE && locked) n = __ctrblk_init(ctrblk, walk.iv, nbytes); ctrptr = (n > AES_BLOCK_SIZE) ? 
ctrblk : walk.iv; - k = cpacf_kmctr(ctx->fc, ctx->pk.protkey, walk.dst.virt.addr, + k = cpacf_kmctr(ctx->fc, ¶m, walk.dst.virt.addr, walk.src.virt.addr, n, ctrptr); if (k) { if (ctrptr == ctrblk) memcpy(walk.iv, ctrptr + k - AES_BLOCK_SIZE, AES_BLOCK_SIZE); crypto_inc(walk.iv, AES_BLOCK_SIZE); - ret = skcipher_walk_done(&walk, nbytes - n); + ret = skcipher_walk_done(&walk, nbytes - k); } if (k < n) { - if (__ctr_paes_set_key(ctx) != 0) { + if (__paes_convert_key(ctx)) { if (locked) - spin_unlock(&ctrblk_lock); + mutex_unlock(&ctrblk_lock); return skcipher_walk_done(&walk, -EIO); } + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); } } if (locked) - spin_unlock(&ctrblk_lock); + mutex_unlock(&ctrblk_lock); /* * final block may be < AES_BLOCK_SIZE, copy only nbytes */ if (nbytes) { + memset(buf, 0, AES_BLOCK_SIZE); + memcpy(buf, walk.src.virt.addr, nbytes); while (1) { - if (cpacf_kmctr(ctx->fc, ctx->pk.protkey, buf, - walk.src.virt.addr, AES_BLOCK_SIZE, + if (cpacf_kmctr(ctx->fc, ¶m, buf, + buf, AES_BLOCK_SIZE, walk.iv) == AES_BLOCK_SIZE) break; - if (__ctr_paes_set_key(ctx) != 0) + if (__paes_convert_key(ctx)) return skcipher_walk_done(&walk, -EIO); + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); } memcpy(walk.dst.virt.addr, buf, nbytes); crypto_inc(walk.iv, AES_BLOCK_SIZE); - ret = skcipher_walk_done(&walk, 0); + ret = skcipher_walk_done(&walk, nbytes); } return ret; @@ -631,12 +741,12 @@ static inline void __crypto_unregister_skcipher(struct skcipher_alg *alg) static void paes_s390_fini(void) { - if (ctrblk) - free_page((unsigned long) ctrblk); __crypto_unregister_skcipher(&ctr_paes_alg); __crypto_unregister_skcipher(&xts_paes_alg); __crypto_unregister_skcipher(&cbc_paes_alg); __crypto_unregister_skcipher(&ecb_paes_alg); + if (ctrblk) + free_page((unsigned long) ctrblk); } static int __init paes_s390_init(void) @@ -674,14 +784,14 @@ static int __init paes_s390_init(void) if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_128) || cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_192) || cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_256)) { - ret = crypto_register_skcipher(&ctr_paes_alg); - if (ret) - goto out_err; ctrblk = (u8 *) __get_free_page(GFP_KERNEL); if (!ctrblk) { ret = -ENOMEM; goto out_err; } + ret = crypto_register_skcipher(&ctr_paes_alg); + if (ret) + goto out_err; } return 0; diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index d977643fa627..a077087bc6cc 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -249,7 +249,7 @@ static void prng_tdes_deinstantiate(void) { pr_debug("The prng module stopped " "after running in triple DES mode\n"); - kzfree(prng_data); + kfree_sensitive(prng_data); } @@ -414,7 +414,7 @@ static int __init prng_sha512_instantiate(void) } /* append the seed by 16 bytes of unique nonce */ - get_tod_clock_ext(seed + seedlen); + store_tod_clock_ext((union tod_clock *)(seed + seedlen)); seedlen += 16; /* now initial seed of the prno drng */ @@ -442,7 +442,7 @@ outfree: static void prng_sha512_deinstantiate(void) { pr_debug("The prng module stopped after running in SHA-512 mode\n"); - kzfree(prng_data); + kfree_sensitive(prng_data); } @@ -528,7 +528,7 @@ static ssize_t prng_tdes_read(struct file *file, char __user *ubuf, /* give mutex free before calling schedule() */ mutex_unlock(&prng_data->mutex); schedule(); - /* occopy mutex again */ + /* occupy mutex again */ if 
(mutex_lock_interruptible(&prng_data->mutex)) { if (ret == 0) ret = -ERESTARTSYS; @@ -674,26 +674,12 @@ static const struct file_operations prng_tdes_fops = { .llseek = noop_llseek, }; -static struct miscdevice prng_sha512_dev = { - .name = "prandom", - .minor = MISC_DYNAMIC_MINOR, - .mode = 0644, - .fops = &prng_sha512_fops, -}; -static struct miscdevice prng_tdes_dev = { - .name = "prandom", - .minor = MISC_DYNAMIC_MINOR, - .mode = 0644, - .fops = &prng_tdes_fops, -}; - - /* chunksize attribute (ro) */ static ssize_t prng_chunksize_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%u\n", prng_chunk_size); + return scnprintf(buf, PAGE_SIZE, "%u\n", prng_chunk_size); } static DEVICE_ATTR(chunksize, 0444, prng_chunksize_show, NULL); @@ -712,7 +698,7 @@ static ssize_t prng_counter_show(struct device *dev, counter = prng_data->prngws.byte_counter; mutex_unlock(&prng_data->mutex); - return snprintf(buf, PAGE_SIZE, "%llu\n", counter); + return scnprintf(buf, PAGE_SIZE, "%llu\n", counter); } static DEVICE_ATTR(byte_counter, 0444, prng_counter_show, NULL); @@ -721,7 +707,7 @@ static ssize_t prng_errorflag_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d\n", prng_errorflag); + return scnprintf(buf, PAGE_SIZE, "%d\n", prng_errorflag); } static DEVICE_ATTR(errorflag, 0444, prng_errorflag_show, NULL); @@ -731,9 +717,9 @@ static ssize_t prng_mode_show(struct device *dev, char *buf) { if (prng_mode == PRNG_MODE_TDES) - return snprintf(buf, PAGE_SIZE, "TDES\n"); + return scnprintf(buf, PAGE_SIZE, "TDES\n"); else - return snprintf(buf, PAGE_SIZE, "SHA512\n"); + return scnprintf(buf, PAGE_SIZE, "SHA512\n"); } static DEVICE_ATTR(mode, 0444, prng_mode_show, NULL); @@ -756,7 +742,7 @@ static ssize_t prng_reseed_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%u\n", prng_reseed_limit); + return scnprintf(buf, PAGE_SIZE, "%u\n", prng_reseed_limit); } static ssize_t prng_reseed_limit_store(struct device *dev, struct device_attribute *attr, @@ -787,7 +773,7 @@ static ssize_t prng_strength_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "256\n"); + return scnprintf(buf, PAGE_SIZE, "256\n"); } static DEVICE_ATTR(strength, 0444, prng_strength_show, NULL); @@ -801,18 +787,30 @@ static struct attribute *prng_sha512_dev_attrs[] = { &dev_attr_strength.attr, NULL }; +ATTRIBUTE_GROUPS(prng_sha512_dev); + static struct attribute *prng_tdes_dev_attrs[] = { &dev_attr_chunksize.attr, &dev_attr_byte_counter.attr, &dev_attr_mode.attr, NULL }; +ATTRIBUTE_GROUPS(prng_tdes_dev); -static struct attribute_group prng_sha512_dev_attr_group = { - .attrs = prng_sha512_dev_attrs +static struct miscdevice prng_sha512_dev = { + .name = "prandom", + .minor = MISC_DYNAMIC_MINOR, + .mode = 0644, + .fops = &prng_sha512_fops, + .groups = prng_sha512_dev_groups, }; -static struct attribute_group prng_tdes_dev_attr_group = { - .attrs = prng_tdes_dev_attrs + +static struct miscdevice prng_tdes_dev = { + .name = "prandom", + .minor = MISC_DYNAMIC_MINOR, + .mode = 0644, + .fops = &prng_tdes_fops, + .groups = prng_tdes_dev_groups, }; @@ -867,13 +865,6 @@ static int __init prng_init(void) prng_sha512_deinstantiate(); goto out; } - ret = sysfs_create_group(&prng_sha512_dev.this_device->kobj, - &prng_sha512_dev_attr_group); - if (ret) { - misc_deregister(&prng_sha512_dev); - prng_sha512_deinstantiate(); - goto out; - } } else { 
@@ -898,14 +889,6 @@ static int __init prng_init(void) prng_tdes_deinstantiate(); goto out; } - ret = sysfs_create_group(&prng_tdes_dev.this_device->kobj, - &prng_tdes_dev_attr_group); - if (ret) { - misc_deregister(&prng_tdes_dev); - prng_tdes_deinstantiate(); - goto out; - } - } out: @@ -916,17 +899,13 @@ out: static void __exit prng_exit(void) { if (prng_mode == PRNG_MODE_SHA512) { - sysfs_remove_group(&prng_sha512_dev.this_device->kobj, - &prng_sha512_dev_attr_group); misc_deregister(&prng_sha512_dev); prng_sha512_deinstantiate(); } else { - sysfs_remove_group(&prng_tdes_dev.this_device->kobj, - &prng_tdes_dev_attr_group); misc_deregister(&prng_tdes_dev); prng_tdes_deinstantiate(); } } -module_cpu_feature_match(MSA, prng_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, prng_init); module_exit(prng_exit); diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h index ada2f98c27b7..65ea12fc87a1 100644 --- a/arch/s390/crypto/sha.h +++ b/arch/s390/crypto/sha.h @@ -11,7 +11,8 @@ #define _CRYPTO_ARCH_S390_SHA_H #include <linux/crypto.h> -#include <crypto/sha.h> +#include <crypto/sha1.h> +#include <crypto/sha2.h> #include <crypto/sha3.h> /* must be big enough for the largest SHA variant */ diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c index 7c15542d3685..bc3a22704e09 100644 --- a/arch/s390/crypto/sha1_s390.c +++ b/arch/s390/crypto/sha1_s390.c @@ -22,12 +22,12 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/cpufeature.h> -#include <crypto/sha.h> +#include <crypto/sha1.h> #include <asm/cpacf.h> #include "sha.h" -static int sha1_init(struct shash_desc *desc) +static int s390_sha1_init(struct shash_desc *desc) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); @@ -42,7 +42,7 @@ static int sha1_init(struct shash_desc *desc) return 0; } -static int sha1_export(struct shash_desc *desc, void *out) +static int s390_sha1_export(struct shash_desc *desc, void *out) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); struct sha1_state *octx = out; @@ -53,7 +53,7 @@ static int sha1_export(struct shash_desc *desc, void *out) return 0; } -static int sha1_import(struct shash_desc *desc, const void *in) +static int s390_sha1_import(struct shash_desc *desc, const void *in) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); const struct sha1_state *ictx = in; @@ -67,11 +67,11 @@ static int sha1_import(struct shash_desc *desc, const void *in) static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_init, + .init = s390_sha1_init, .update = s390_sha_update, .final = s390_sha_final, - .export = sha1_export, - .import = sha1_import, + .export = s390_sha1_export, + .import = s390_sha1_import, .descsize = sizeof(struct s390_sha_ctx), .statesize = sizeof(struct sha1_state), .base = { @@ -95,7 +95,7 @@ static void __exit sha1_s390_fini(void) crypto_unregister_shash(&alg); } -module_cpu_feature_match(MSA, sha1_s390_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha1_s390_init); module_exit(sha1_s390_fini); MODULE_ALIAS_CRYPTO("sha1"); diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c index b52c87e44939..6f1ccdf93d3e 100644 --- a/arch/s390/crypto/sha256_s390.c +++ b/arch/s390/crypto/sha256_s390.c @@ -12,7 +12,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/cpufeature.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <asm/cpacf.h> #include "sha.h" @@ -134,7 +134,7 @@ static void __exit sha256_s390_fini(void) crypto_unregister_shash(&sha256_alg); } 
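The prng rework above replaces open-coded sysfs_create_group() calls and their error unwinding with attribute groups attached directly to the miscdevice, so misc_register() creates and removes the sysfs files. The resulting pattern, reduced to a minimal sketch (the example_* names and fops are placeholders):

static struct attribute *example_attrs[] = {
	&dev_attr_chunksize.attr,
	NULL
};
ATTRIBUTE_GROUPS(example);		/* defines example_groups[] */

static struct miscdevice example_dev = {
	.name	= "prandom",
	.minor	= MISC_DYNAMIC_MINOR,
	.mode	= 0644,
	.fops	= &example_fops,	/* placeholder */
	.groups	= example_groups,	/* sysfs files managed by misc core */
};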
-module_cpu_feature_match(MSA, sha256_s390_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha256_s390_init); module_exit(sha256_s390_fini); MODULE_ALIAS_CRYPTO("sha256"); diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c index 460cbbbaa44a..e1350e033a32 100644 --- a/arch/s390/crypto/sha3_256_s390.c +++ b/arch/s390/crypto/sha3_256_s390.c @@ -12,7 +12,6 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/cpufeature.h> -#include <crypto/sha.h> #include <crypto/sha3.h> #include <asm/cpacf.h> @@ -138,7 +137,7 @@ static void __exit sha3_256_s390_fini(void) crypto_unregister_shash(&sha3_256_alg); } -module_cpu_feature_match(MSA, sha3_256_s390_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha3_256_s390_init); module_exit(sha3_256_s390_fini); MODULE_ALIAS_CRYPTO("sha3-256"); diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c index 72cf460a53e5..06c142ed9bb1 100644 --- a/arch/s390/crypto/sha3_512_s390.c +++ b/arch/s390/crypto/sha3_512_s390.c @@ -11,7 +11,6 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/cpufeature.h> -#include <crypto/sha.h> #include <crypto/sha3.h> #include <asm/cpacf.h> @@ -148,7 +147,7 @@ static void __exit fini(void) crypto_unregister_shash(&sha3_384_alg); } -module_cpu_feature_match(MSA, init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c index ad29db085a18..04f11c407763 100644 --- a/arch/s390/crypto/sha512_s390.c +++ b/arch/s390/crypto/sha512_s390.c @@ -8,7 +8,7 @@ * Author(s): Jan Glauber (jang@de.ibm.com) */ #include <crypto/internal/hash.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> @@ -22,14 +22,14 @@ static int sha512_init(struct shash_desc *desc) { struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - *(__u64 *)&ctx->state[0] = 0x6a09e667f3bcc908ULL; - *(__u64 *)&ctx->state[2] = 0xbb67ae8584caa73bULL; - *(__u64 *)&ctx->state[4] = 0x3c6ef372fe94f82bULL; - *(__u64 *)&ctx->state[6] = 0xa54ff53a5f1d36f1ULL; - *(__u64 *)&ctx->state[8] = 0x510e527fade682d1ULL; - *(__u64 *)&ctx->state[10] = 0x9b05688c2b3e6c1fULL; - *(__u64 *)&ctx->state[12] = 0x1f83d9abfb41bd6bULL; - *(__u64 *)&ctx->state[14] = 0x5be0cd19137e2179ULL; + *(__u64 *)&ctx->state[0] = SHA512_H0; + *(__u64 *)&ctx->state[2] = SHA512_H1; + *(__u64 *)&ctx->state[4] = SHA512_H2; + *(__u64 *)&ctx->state[6] = SHA512_H3; + *(__u64 *)&ctx->state[8] = SHA512_H4; + *(__u64 *)&ctx->state[10] = SHA512_H5; + *(__u64 *)&ctx->state[12] = SHA512_H6; + *(__u64 *)&ctx->state[14] = SHA512_H7; ctx->count = 0; ctx->func = CPACF_KIMD_SHA_512; @@ -87,14 +87,14 @@ static int sha384_init(struct shash_desc *desc) { struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - *(__u64 *)&ctx->state[0] = 0xcbbb9d5dc1059ed8ULL; - *(__u64 *)&ctx->state[2] = 0x629a292a367cd507ULL; - *(__u64 *)&ctx->state[4] = 0x9159015a3070dd17ULL; - *(__u64 *)&ctx->state[6] = 0x152fecd8f70e5939ULL; - *(__u64 *)&ctx->state[8] = 0x67332667ffc00b31ULL; - *(__u64 *)&ctx->state[10] = 0x8eb44a8768581511ULL; - *(__u64 *)&ctx->state[12] = 0xdb0c2e0d64f98fa7ULL; - *(__u64 *)&ctx->state[14] = 0x47b5481dbefa4fa4ULL; + *(__u64 *)&ctx->state[0] = SHA384_H0; + *(__u64 *)&ctx->state[2] = SHA384_H1; + *(__u64 *)&ctx->state[4] = SHA384_H2; + *(__u64 *)&ctx->state[6] = SHA384_H3; + *(__u64 *)&ctx->state[8] = SHA384_H4; + *(__u64 *)&ctx->state[10] = SHA384_H5; + *(__u64 
*)&ctx->state[12] = SHA384_H6; + *(__u64 *)&ctx->state[14] = SHA384_H7; ctx->count = 0; ctx->func = CPACF_KIMD_SHA_512; @@ -142,7 +142,7 @@ static void __exit fini(void) crypto_unregister_shash(&sha384_alg); } -module_cpu_feature_match(MSA, init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/arch/s390/hypfs/Makefile b/arch/s390/hypfs/Makefile index 06f601509ce9..c34854d298f8 100644 --- a/arch/s390/hypfs/Makefile +++ b/arch/s390/hypfs/Makefile @@ -3,7 +3,12 @@ # Makefile for the linux hypfs filesystem routines. # -obj-$(CONFIG_S390_HYPFS_FS) += s390_hypfs.o +obj-$(CONFIG_S390_HYPFS) += hypfs_dbfs.o +obj-$(CONFIG_S390_HYPFS) += hypfs_diag.o +obj-$(CONFIG_S390_HYPFS) += hypfs_diag0c.o +obj-$(CONFIG_S390_HYPFS) += hypfs_sprp.o +obj-$(CONFIG_S390_HYPFS) += hypfs_vm.o -s390_hypfs-objs := inode.o hypfs_diag.o hypfs_vm.o hypfs_dbfs.o hypfs_sprp.o -s390_hypfs-objs += hypfs_diag0c.o +obj-$(CONFIG_S390_HYPFS_FS) += hypfs_diag_fs.o +obj-$(CONFIG_S390_HYPFS_FS) += hypfs_vm_fs.o +obj-$(CONFIG_S390_HYPFS_FS) += inode.o diff --git a/arch/s390/hypfs/hypfs.h b/arch/s390/hypfs/hypfs.h index 05f3f9aee5fc..65f4036fd541 100644 --- a/arch/s390/hypfs/hypfs.h +++ b/arch/s390/hypfs/hypfs.h @@ -46,6 +46,15 @@ void hypfs_diag0c_exit(void); void hypfs_sprp_init(void); void hypfs_sprp_exit(void); +int __hypfs_fs_init(void); + +static inline int hypfs_fs_init(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + return __hypfs_fs_init(); + return 0; +} + /* debugfs interface */ struct hypfs_dbfs_file; @@ -69,7 +78,6 @@ struct hypfs_dbfs_file { struct dentry *dentry; }; -extern void hypfs_dbfs_init(void); extern void hypfs_dbfs_exit(void); extern void hypfs_dbfs_create_file(struct hypfs_dbfs_file *df); extern void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df); diff --git a/arch/s390/hypfs/hypfs_dbfs.c b/arch/s390/hypfs/hypfs_dbfs.c index f4c7dbfaf8ee..4024599eb448 100644 --- a/arch/s390/hypfs/hypfs_dbfs.c +++ b/arch/s390/hypfs/hypfs_dbfs.c @@ -90,12 +90,33 @@ void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df) debugfs_remove(df->dentry); } -void hypfs_dbfs_init(void) +static int __init hypfs_dbfs_init(void) { - dbfs_dir = debugfs_create_dir("s390_hypfs", NULL); -} + int rc = -ENODATA; -void hypfs_dbfs_exit(void) -{ + dbfs_dir = debugfs_create_dir("s390_hypfs", NULL); + if (hypfs_diag_init()) + goto fail_dbfs_exit; + if (hypfs_vm_init()) + goto fail_hypfs_diag_exit; + hypfs_sprp_init(); + if (hypfs_diag0c_init()) + goto fail_hypfs_sprp_exit; + rc = hypfs_fs_init(); + if (rc) + goto fail_hypfs_diag0c_exit; + return 0; + +fail_hypfs_diag0c_exit: + hypfs_diag0c_exit(); +fail_hypfs_sprp_exit: + hypfs_sprp_exit(); + hypfs_vm_exit(); +fail_hypfs_diag_exit: + hypfs_diag_exit(); + pr_err("Initialization of hypfs failed with rc=%i\n", rc); +fail_dbfs_exit: debugfs_remove(dbfs_dir); + return rc; } +device_initcall(hypfs_dbfs_init) diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c index f0bc4dc3e9bf..279b7bba4d43 100644 --- a/arch/s390/hypfs/hypfs_diag.c +++ b/arch/s390/hypfs/hypfs_diag.c @@ -18,196 +18,27 @@ #include <linux/mm.h> #include <asm/diag.h> #include <asm/ebcdic.h> +#include "hypfs_diag.h" #include "hypfs.h" -#define TMP_SIZE 64 /* size of temporary buffers */ - #define DBFS_D204_HDR_VERSION 0 -static char *diag224_cpu_names; /* diag 224 name table */ static enum diag204_sc diag204_store_sc; /* used subcode for store */ static enum diag204_format diag204_info_type; /* used diag 204 data format */ static void *diag204_buf; /* 4K 
aligned buffer for diag204 data */ -static void *diag204_buf_vmalloc; /* vmalloc pointer for diag204 data */ static int diag204_buf_pages; /* number of pages for diag204 data */ static struct dentry *dbfs_d204_file; -/* - * DIAG 204 member access functions. - * - * Since we have two different diag 204 data formats for old and new s390 - * machines, we do not access the structs directly, but use getter functions for - * each struct member instead. This should make the code more readable. - */ - -/* Time information block */ - -static inline int info_blk_hdr__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_info_blk_hdr); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_info_blk_hdr); -} - -static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_info_blk_hdr *)hdr)->npar; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_info_blk_hdr *)hdr)->npar; -} - -static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_info_blk_hdr *)hdr)->flags; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_info_blk_hdr *)hdr)->flags; -} - -static inline __u16 info_blk_hdr__pcpus(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_info_blk_hdr *)hdr)->phys_cpus; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_info_blk_hdr *)hdr)->phys_cpus; -} - -/* Partition header */ - -static inline int part_hdr__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_part_hdr); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_part_hdr); -} - -static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_part_hdr *)hdr)->cpus; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_part_hdr *)hdr)->rcpus; -} - -static inline void part_hdr__part_name(enum diag204_format type, void *hdr, - char *name) -{ - if (type == DIAG204_INFO_SIMPLE) - memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name, - DIAG204_LPAR_NAME_LEN); - else /* DIAG204_INFO_EXT */ - memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name, - DIAG204_LPAR_NAME_LEN); - EBCASC(name, DIAG204_LPAR_NAME_LEN); - name[DIAG204_LPAR_NAME_LEN] = 0; - strim(name); -} - -/* CPU info block */ - -static inline int cpu_info__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_cpu_info); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_cpu_info); -} - -static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->ctidx; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->ctidx; -} - -static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->cpu_addr; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->cpu_addr; -} - -static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->acc_time; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->acc_time; -} - -static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr) -{ - 
if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->lp_time; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->lp_time; -} - -static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return 0; /* online_time not available in simple info */ - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->online_time; -} - -/* Physical header */ - -static inline int phys_hdr__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_phys_hdr); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_phys_hdr); -} - -static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_hdr *)hdr)->cpus; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_hdr *)hdr)->cpus; -} - -/* Physical CPU info block */ - -static inline int phys_cpu__size(enum diag204_format type) +enum diag204_format diag204_get_info_type(void) { - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_phys_cpu); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_phys_cpu); + return diag204_info_type; } -static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr) +static void diag204_set_info_type(enum diag204_format type) { - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_cpu *)hdr)->cpu_addr; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr; -} - -static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_cpu *)hdr)->mgm_time; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_cpu *)hdr)->mgm_time; -} - -static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_cpu *)hdr)->ctidx; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_cpu *)hdr)->ctidx; + diag204_info_type = type; } /* Diagnose 204 functions */ @@ -220,43 +51,11 @@ static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) static void diag204_free_buffer(void) { - if (!diag204_buf) - return; - if (diag204_buf_vmalloc) { - vfree(diag204_buf_vmalloc); - diag204_buf_vmalloc = NULL; - } else { - free_pages((unsigned long) diag204_buf, 0); - } + vfree(diag204_buf); diag204_buf = NULL; } -static void *page_align_ptr(void *ptr) -{ - return (void *) PAGE_ALIGN((unsigned long) ptr); -} - -static void *diag204_alloc_vbuf(int pages) -{ - /* The buffer has to be page aligned! 
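Both the removed helper below and its replacement address the same constraint: the diag204 data buffer must be page aligned. The old code over-allocated with vmalloc() and aligned the pointer by hand; the rework later in this patch requests the alignment directly, roughly as follows (sketch of the new allocation, as it appears further down):

buf = __vmalloc_node(array_size(pages, PAGE_SIZE),
		     PAGE_SIZE,		/* alignment */
		     GFP_KERNEL, NUMA_NO_NODE,
		     __builtin_return_address(0));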
*/ - diag204_buf_vmalloc = vmalloc(array_size(PAGE_SIZE, (pages + 1))); - if (!diag204_buf_vmalloc) - return ERR_PTR(-ENOMEM); - diag204_buf = page_align_ptr(diag204_buf_vmalloc); - diag204_buf_pages = pages; - return diag204_buf; -} - -static void *diag204_alloc_rbuf(void) -{ - diag204_buf = (void*)__get_free_pages(GFP_KERNEL,0); - if (!diag204_buf) - return ERR_PTR(-ENOMEM); - diag204_buf_pages = 1; - return diag204_buf; -} - -static void *diag204_get_buffer(enum diag204_format fmt, int *pages) +void *diag204_get_buffer(enum diag204_format fmt, int *pages) { if (diag204_buf) { *pages = diag204_buf_pages; @@ -264,15 +63,19 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages) } if (fmt == DIAG204_INFO_SIMPLE) { *pages = 1; - return diag204_alloc_rbuf(); } else {/* DIAG204_INFO_EXT */ *pages = diag204((unsigned long)DIAG204_SUBC_RSI | (unsigned long)DIAG204_INFO_EXT, 0, NULL); if (*pages <= 0) - return ERR_PTR(-ENOSYS); - else - return diag204_alloc_vbuf(*pages); + return ERR_PTR(-EOPNOTSUPP); } + diag204_buf = __vmalloc_node(array_size(*pages, PAGE_SIZE), + PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); + if (!diag204_buf) + return ERR_PTR(-ENOMEM); + diag204_buf_pages = *pages; + return diag204_buf; } /* @@ -299,13 +102,13 @@ static int diag204_probe(void) if (diag204((unsigned long)DIAG204_SUBC_STIB7 | (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { diag204_store_sc = DIAG204_SUBC_STIB7; - diag204_info_type = DIAG204_INFO_EXT; + diag204_set_info_type(DIAG204_INFO_EXT); goto out; } if (diag204((unsigned long)DIAG204_SUBC_STIB6 | (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { diag204_store_sc = DIAG204_SUBC_STIB6; - diag204_info_type = DIAG204_INFO_EXT; + diag204_set_info_type(DIAG204_INFO_EXT); goto out; } diag204_free_buffer(); @@ -321,10 +124,10 @@ static int diag204_probe(void) if (diag204((unsigned long)DIAG204_SUBC_STIB4 | (unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) { diag204_store_sc = DIAG204_SUBC_STIB4; - diag204_info_type = DIAG204_INFO_SIMPLE; + diag204_set_info_type(DIAG204_INFO_SIMPLE); goto out; } else { - rc = -ENOSYS; + rc = -EOPNOTSUPP; goto fail_store; } out: @@ -335,58 +138,13 @@ fail_alloc: return rc; } -static int diag204_do_store(void *buf, int pages) +int diag204_store(void *buf, int pages) { int rc; - rc = diag204((unsigned long) diag204_store_sc | - (unsigned long) diag204_info_type, pages, buf); - return rc < 0 ? 
-ENOSYS : 0; -} - -static void *diag204_store(void) -{ - void *buf; - int pages, rc; - - buf = diag204_get_buffer(diag204_info_type, &pages); - if (IS_ERR(buf)) - goto out; - rc = diag204_do_store(buf, pages); - if (rc) - return ERR_PTR(rc); -out: - return buf; -} - -/* Diagnose 224 functions */ - -static int diag224_get_name_table(void) -{ - /* memory must be below 2GB */ - diag224_cpu_names = (char *) __get_free_page(GFP_KERNEL | GFP_DMA); - if (!diag224_cpu_names) - return -ENOMEM; - if (diag224(diag224_cpu_names)) { - free_page((unsigned long) diag224_cpu_names); - return -EOPNOTSUPP; - } - EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16); - return 0; -} - -static void diag224_delete_name_table(void) -{ - free_page((unsigned long) diag224_cpu_names); -} - -static int diag224_idx2name(int index, char *name) -{ - memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN), - DIAG204_CPU_NAME_LEN); - name[DIAG204_CPU_NAME_LEN] = 0; - strim(name); - return 0; + rc = diag204((unsigned long)diag204_store_sc | + (unsigned long)diag204_get_info_type(), pages, buf); + return rc < 0 ? -EOPNOTSUPP : 0; } struct dbfs_d204_hdr { @@ -411,8 +169,8 @@ static int dbfs_d204_create(void **data, void **data_free_ptr, size_t *size) base = vzalloc(buf_size); if (!base) return -ENOMEM; - d204 = page_align_ptr(base + sizeof(d204->hdr)) - sizeof(d204->hdr); - rc = diag204_do_store(d204->buf, diag204_buf_pages); + d204 = PTR_ALIGN(base + sizeof(d204->hdr), PAGE_SIZE) - sizeof(d204->hdr); + rc = diag204_store(d204->buf, diag204_buf_pages); if (rc) { vfree(base); return rc; @@ -437,180 +195,25 @@ __init int hypfs_diag_init(void) int rc; if (diag204_probe()) { - pr_err("The hardware system does not support hypfs\n"); + pr_info("The hardware system does not support hypfs\n"); return -ENODATA; } - if (diag204_info_type == DIAG204_INFO_EXT) + if (diag204_get_info_type() == DIAG204_INFO_EXT) hypfs_dbfs_create_file(&dbfs_file_d204); - if (MACHINE_IS_LPAR) { - rc = diag224_get_name_table(); - if (rc) { - pr_err("The hardware system does not provide all " - "functions required by hypfs\n"); - debugfs_remove(dbfs_d204_file); - return rc; - } + rc = hypfs_diag_fs_init(); + if (rc) { + pr_err("The hardware system does not provide all functions required by hypfs\n"); + debugfs_remove(dbfs_d204_file); } - return 0; + return rc; } void hypfs_diag_exit(void) { debugfs_remove(dbfs_d204_file); - diag224_delete_name_table(); + hypfs_diag_fs_exit(); diag204_free_buffer(); hypfs_dbfs_remove_file(&dbfs_file_d204); } - -/* - * Functions to create the directory structure - * ******************************************* - */ - -static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info) -{ - struct dentry *cpu_dir; - char buffer[TMP_SIZE]; - void *rc; - - snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_info_type, - cpu_info)); - cpu_dir = hypfs_mkdir(cpus_dir, buffer); - rc = hypfs_create_u64(cpu_dir, "mgmtime", - cpu_info__acc_time(diag204_info_type, cpu_info) - - cpu_info__lp_time(diag204_info_type, cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - rc = hypfs_create_u64(cpu_dir, "cputime", - cpu_info__lp_time(diag204_info_type, cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - if (diag204_info_type == DIAG204_INFO_EXT) { - rc = hypfs_create_u64(cpu_dir, "onlinetime", - cpu_info__online_time(diag204_info_type, - cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - } - diag224_idx2name(cpu_info__ctidx(diag204_info_type, cpu_info), buffer); - rc = hypfs_create_str(cpu_dir, "type", 
buffer); - return PTR_ERR_OR_ZERO(rc); -} - -static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr) -{ - struct dentry *cpus_dir; - struct dentry *lpar_dir; - char lpar_name[DIAG204_LPAR_NAME_LEN + 1]; - void *cpu_info; - int i; - - part_hdr__part_name(diag204_info_type, part_hdr, lpar_name); - lpar_name[DIAG204_LPAR_NAME_LEN] = 0; - lpar_dir = hypfs_mkdir(systems_dir, lpar_name); - if (IS_ERR(lpar_dir)) - return lpar_dir; - cpus_dir = hypfs_mkdir(lpar_dir, "cpus"); - if (IS_ERR(cpus_dir)) - return cpus_dir; - cpu_info = part_hdr + part_hdr__size(diag204_info_type); - for (i = 0; i < part_hdr__rcpus(diag204_info_type, part_hdr); i++) { - int rc; - rc = hypfs_create_cpu_files(cpus_dir, cpu_info); - if (rc) - return ERR_PTR(rc); - cpu_info += cpu_info__size(diag204_info_type); - } - return cpu_info; -} - -static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info) -{ - struct dentry *cpu_dir; - char buffer[TMP_SIZE]; - void *rc; - - snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_info_type, - cpu_info)); - cpu_dir = hypfs_mkdir(cpus_dir, buffer); - if (IS_ERR(cpu_dir)) - return PTR_ERR(cpu_dir); - rc = hypfs_create_u64(cpu_dir, "mgmtime", - phys_cpu__mgm_time(diag204_info_type, cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - diag224_idx2name(phys_cpu__ctidx(diag204_info_type, cpu_info), buffer); - rc = hypfs_create_str(cpu_dir, "type", buffer); - return PTR_ERR_OR_ZERO(rc); -} - -static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr) -{ - int i; - void *cpu_info; - struct dentry *cpus_dir; - - cpus_dir = hypfs_mkdir(parent_dir, "cpus"); - if (IS_ERR(cpus_dir)) - return cpus_dir; - cpu_info = phys_hdr + phys_hdr__size(diag204_info_type); - for (i = 0; i < phys_hdr__cpus(diag204_info_type, phys_hdr); i++) { - int rc; - rc = hypfs_create_phys_cpu_files(cpus_dir, cpu_info); - if (rc) - return ERR_PTR(rc); - cpu_info += phys_cpu__size(diag204_info_type); - } - return cpu_info; -} - -int hypfs_diag_create_files(struct dentry *root) -{ - struct dentry *systems_dir, *hyp_dir; - void *time_hdr, *part_hdr; - int i, rc; - void *buffer, *ptr; - - buffer = diag204_store(); - if (IS_ERR(buffer)) - return PTR_ERR(buffer); - - systems_dir = hypfs_mkdir(root, "systems"); - if (IS_ERR(systems_dir)) { - rc = PTR_ERR(systems_dir); - goto err_out; - } - time_hdr = (struct x_info_blk_hdr *)buffer; - part_hdr = time_hdr + info_blk_hdr__size(diag204_info_type); - for (i = 0; i < info_blk_hdr__npar(diag204_info_type, time_hdr); i++) { - part_hdr = hypfs_create_lpar_files(systems_dir, part_hdr); - if (IS_ERR(part_hdr)) { - rc = PTR_ERR(part_hdr); - goto err_out; - } - } - if (info_blk_hdr__flags(diag204_info_type, time_hdr) & - DIAG204_LPAR_PHYS_FLG) { - ptr = hypfs_create_phys_files(root, part_hdr); - if (IS_ERR(ptr)) { - rc = PTR_ERR(ptr); - goto err_out; - } - } - hyp_dir = hypfs_mkdir(root, "hyp"); - if (IS_ERR(hyp_dir)) { - rc = PTR_ERR(hyp_dir); - goto err_out; - } - ptr = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor"); - if (IS_ERR(ptr)) { - rc = PTR_ERR(ptr); - goto err_out; - } - rc = 0; - -err_out: - return rc; -} diff --git a/arch/s390/hypfs/hypfs_diag.h b/arch/s390/hypfs/hypfs_diag.h new file mode 100644 index 000000000000..7090eff27fef --- /dev/null +++ b/arch/s390/hypfs/hypfs_diag.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hypervisor filesystem for Linux on s390. Diag 204 and 224 + * implementation. + * + * Copyright IBM Corp. 
2006, 2008 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#ifndef _S390_HYPFS_DIAG_H_ +#define _S390_HYPFS_DIAG_H_ + +#include <asm/diag.h> + +enum diag204_format diag204_get_info_type(void); +void *diag204_get_buffer(enum diag204_format fmt, int *pages); +int diag204_store(void *buf, int pages); + +int __hypfs_diag_fs_init(void); +void __hypfs_diag_fs_exit(void); + +static inline int hypfs_diag_fs_init(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + return __hypfs_diag_fs_init(); + return 0; +} + +static inline void hypfs_diag_fs_exit(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + __hypfs_diag_fs_exit(); +} + +#endif /* _S390_HYPFS_DIAG_H_ */ diff --git a/arch/s390/hypfs/hypfs_diag0c.c b/arch/s390/hypfs/hypfs_diag0c.c index 3235e4d82f2d..9a2786079e3a 100644 --- a/arch/s390/hypfs/hypfs_diag0c.c +++ b/arch/s390/hypfs/hypfs_diag0c.c @@ -21,7 +21,7 @@ static void diag0c_fn(void *data) { diag_stat_inc(DIAG_STAT_X00C); - diag_dma_ops.diag0c(((void **) data)[smp_processor_id()]); + diag_amode31_ops.diag0c(((void **)data)[smp_processor_id()]); } /* @@ -33,12 +33,12 @@ static void *diag0c_store(unsigned int *count) unsigned int cpu_count, cpu, i; void **cpu_vec; - get_online_cpus(); + cpus_read_lock(); cpu_count = num_online_cpus(); cpu_vec = kmalloc_array(num_possible_cpus(), sizeof(*cpu_vec), GFP_KERNEL); if (!cpu_vec) - goto fail_put_online_cpus; + goto fail_unlock_cpus; /* Note: Diag 0c needs 8 byte alignment and real storage */ diag0c_data = kzalloc(struct_size(diag0c_data, entry, cpu_count), GFP_KERNEL | GFP_DMA); @@ -54,13 +54,13 @@ static void *diag0c_store(unsigned int *count) on_each_cpu(diag0c_fn, cpu_vec, 1); *count = cpu_count; kfree(cpu_vec); - put_online_cpus(); + cpus_read_unlock(); return diag0c_data; fail_kfree_cpu_vec: kfree(cpu_vec); -fail_put_online_cpus: - put_online_cpus(); +fail_unlock_cpus: + cpus_read_unlock(); return ERR_PTR(-ENOMEM); } @@ -84,7 +84,7 @@ static int dbfs_diag0c_create(void **data, void **data_free_ptr, size_t *size) if (IS_ERR(diag0c_data)) return PTR_ERR(diag0c_data); memset(&diag0c_data->hdr, 0, sizeof(diag0c_data->hdr)); - get_tod_clock_ext(diag0c_data->hdr.tod_ext); + store_tod_clock_ext((union tod_clock *)diag0c_data->hdr.tod_ext); diag0c_data->hdr.len = count * sizeof(struct hypfs_diag0c_entry); diag0c_data->hdr.version = DBFS_D0C_HDR_VERSION; diag0c_data->hdr.count = count; diff --git a/arch/s390/hypfs/hypfs_diag_fs.c b/arch/s390/hypfs/hypfs_diag_fs.c new file mode 100644 index 000000000000..00a6d370a280 --- /dev/null +++ b/arch/s390/hypfs/hypfs_diag_fs.c @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390. Diag 204 and 224 + * implementation. + * + * Copyright IBM Corp. 2006, 2008 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#define KMSG_COMPONENT "hypfs" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <asm/diag.h> +#include <asm/ebcdic.h> +#include "hypfs_diag.h" +#include "hypfs.h" + +#define TMP_SIZE 64 /* size of temporary buffers */ + +static char *diag224_cpu_names; /* diag 224 name table */ +static int diag224_idx2name(int index, char *name); + +/* + * DIAG 204 member access functions. + * + * Since we have two different diag 204 data formats for old and new s390 + * machines, we do not access the structs directly, but use getter functions for + * each struct member instead. 
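For example, instead of dereferencing struct diag204_cpu_info or
+ * struct diag204_x_cpu_info by hand, a caller reads a member through
+ * the accessor that matches the currently probed format, e.g.:
+ *
+ *	addr = cpu_info__cpu_addr(diag204_get_info_type(), cpu_info);
+ *
+ * (addr being a local variable of the caller.)
+ *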
This should make the code more readable. + */ + +/* Time information block */ + +static inline int info_blk_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_info_blk_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_info_blk_hdr); +} + +static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->npar; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->npar; +} + +static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->flags; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->flags; +} + +/* Partition header */ + +static inline int part_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_part_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_part_hdr); +} + +static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_part_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_part_hdr *)hdr)->rcpus; +} + +static inline void part_hdr__part_name(enum diag204_format type, void *hdr, + char *name) +{ + if (type == DIAG204_INFO_SIMPLE) + memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + else /* DIAG204_INFO_EXT */ + memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + EBCASC(name, DIAG204_LPAR_NAME_LEN); + name[DIAG204_LPAR_NAME_LEN] = 0; + strim(name); +} + +/* CPU info block */ + +static inline int cpu_info__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_cpu_info); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_cpu_info); +} + +static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->ctidx; +} + +static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->cpu_addr; +} + +static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->acc_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->acc_time; +} + +static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->lp_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->lp_time; +} + +static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return 0; /* online_time not available in simple info */ + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->online_time; +} + +/* Physical header */ + +static inline int phys_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_hdr); +} + +static inline __u8 
phys_hdr__cpus(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_hdr *)hdr)->cpus; +} + +/* Physical CPU info block */ + +static inline int phys_cpu__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_cpu); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_cpu); +} + +static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr; +} + +static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->mgm_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->mgm_time; +} + +static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->ctidx; +} + +/* + * Functions to create the directory structure + * ******************************************* + */ + +static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info) +{ + struct dentry *cpu_dir; + char buffer[TMP_SIZE]; + void *rc; + + snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_get_info_type(), + cpu_info)); + cpu_dir = hypfs_mkdir(cpus_dir, buffer); + rc = hypfs_create_u64(cpu_dir, "mgmtime", + cpu_info__acc_time(diag204_get_info_type(), cpu_info) - + cpu_info__lp_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + rc = hypfs_create_u64(cpu_dir, "cputime", + cpu_info__lp_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + if (diag204_get_info_type() == DIAG204_INFO_EXT) { + rc = hypfs_create_u64(cpu_dir, "onlinetime", + cpu_info__online_time(diag204_get_info_type(), + cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + } + diag224_idx2name(cpu_info__ctidx(diag204_get_info_type(), cpu_info), buffer); + rc = hypfs_create_str(cpu_dir, "type", buffer); + return PTR_ERR_OR_ZERO(rc); +} + +static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr) +{ + struct dentry *cpus_dir; + struct dentry *lpar_dir; + char lpar_name[DIAG204_LPAR_NAME_LEN + 1]; + void *cpu_info; + int i; + + part_hdr__part_name(diag204_get_info_type(), part_hdr, lpar_name); + lpar_name[DIAG204_LPAR_NAME_LEN] = 0; + lpar_dir = hypfs_mkdir(systems_dir, lpar_name); + if (IS_ERR(lpar_dir)) + return lpar_dir; + cpus_dir = hypfs_mkdir(lpar_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return cpus_dir; + cpu_info = part_hdr + part_hdr__size(diag204_get_info_type()); + for (i = 0; i < part_hdr__rcpus(diag204_get_info_type(), part_hdr); i++) { + int rc; + + rc = hypfs_create_cpu_files(cpus_dir, cpu_info); + if (rc) + return ERR_PTR(rc); + cpu_info += cpu_info__size(diag204_get_info_type()); + } + return cpu_info; +} + +static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info) +{ + struct dentry *cpu_dir; + char buffer[TMP_SIZE]; + void *rc; + + snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_get_info_type(), + cpu_info)); + cpu_dir = hypfs_mkdir(cpus_dir, buffer); + if (IS_ERR(cpu_dir)) + return PTR_ERR(cpu_dir); + rc = hypfs_create_u64(cpu_dir, "mgmtime", + 
phys_cpu__mgm_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + diag224_idx2name(phys_cpu__ctidx(diag204_get_info_type(), cpu_info), buffer); + rc = hypfs_create_str(cpu_dir, "type", buffer); + return PTR_ERR_OR_ZERO(rc); +} + +static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr) +{ + int i; + void *cpu_info; + struct dentry *cpus_dir; + + cpus_dir = hypfs_mkdir(parent_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return cpus_dir; + cpu_info = phys_hdr + phys_hdr__size(diag204_get_info_type()); + for (i = 0; i < phys_hdr__cpus(diag204_get_info_type(), phys_hdr); i++) { + int rc; + + rc = hypfs_create_phys_cpu_files(cpus_dir, cpu_info); + if (rc) + return ERR_PTR(rc); + cpu_info += phys_cpu__size(diag204_get_info_type()); + } + return cpu_info; +} + +int hypfs_diag_create_files(struct dentry *root) +{ + struct dentry *systems_dir, *hyp_dir; + void *time_hdr, *part_hdr; + void *buffer, *ptr; + int i, rc, pages; + + buffer = diag204_get_buffer(diag204_get_info_type(), &pages); + if (IS_ERR(buffer)) + return PTR_ERR(buffer); + rc = diag204_store(buffer, pages); + if (rc) + return rc; + + systems_dir = hypfs_mkdir(root, "systems"); + if (IS_ERR(systems_dir)) { + rc = PTR_ERR(systems_dir); + goto err_out; + } + time_hdr = (struct x_info_blk_hdr *)buffer; + part_hdr = time_hdr + info_blk_hdr__size(diag204_get_info_type()); + for (i = 0; i < info_blk_hdr__npar(diag204_get_info_type(), time_hdr); i++) { + part_hdr = hypfs_create_lpar_files(systems_dir, part_hdr); + if (IS_ERR(part_hdr)) { + rc = PTR_ERR(part_hdr); + goto err_out; + } + } + if (info_blk_hdr__flags(diag204_get_info_type(), time_hdr) & + DIAG204_LPAR_PHYS_FLG) { + ptr = hypfs_create_phys_files(root, part_hdr); + if (IS_ERR(ptr)) { + rc = PTR_ERR(ptr); + goto err_out; + } + } + hyp_dir = hypfs_mkdir(root, "hyp"); + if (IS_ERR(hyp_dir)) { + rc = PTR_ERR(hyp_dir); + goto err_out; + } + ptr = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor"); + if (IS_ERR(ptr)) { + rc = PTR_ERR(ptr); + goto err_out; + } + rc = 0; + +err_out: + return rc; +} + +/* Diagnose 224 functions */ + +static int diag224_idx2name(int index, char *name) +{ + memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN), + DIAG204_CPU_NAME_LEN); + name[DIAG204_CPU_NAME_LEN] = 0; + strim(name); + return 0; +} + +static int diag224_get_name_table(void) +{ + /* memory must be below 2GB */ + diag224_cpu_names = (char *)__get_free_page(GFP_KERNEL | GFP_DMA); + if (!diag224_cpu_names) + return -ENOMEM; + if (diag224(diag224_cpu_names)) { + free_page((unsigned long)diag224_cpu_names); + return -EOPNOTSUPP; + } + EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16); + return 0; +} + +static void diag224_delete_name_table(void) +{ + free_page((unsigned long)diag224_cpu_names); +} + +int __init __hypfs_diag_fs_init(void) +{ + if (MACHINE_IS_LPAR) + return diag224_get_name_table(); + return 0; +} + +void __hypfs_diag_fs_exit(void) +{ + diag224_delete_name_table(); +} diff --git a/arch/s390/hypfs/hypfs_sprp.c b/arch/s390/hypfs/hypfs_sprp.c index 7d9fb496d155..f5f7e78ddc0c 100644 --- a/arch/s390/hypfs/hypfs_sprp.c +++ b/arch/s390/hypfs/hypfs_sprp.c @@ -25,14 +25,13 @@ static inline unsigned long __hypfs_sprp_diag304(void *data, unsigned long cmd) { - register unsigned long _data asm("2") = (unsigned long) data; - register unsigned long _rc asm("3"); - register unsigned long _cmd asm("4") = cmd; + union register_pair r1 = { .even = (unsigned long)data, }; - asm volatile("diag %1,%2,0x304\n" - : "=d" (_rc) : 
"d" (_data), "d" (_cmd) : "memory"); - - return _rc; + asm volatile("diag %[r1],%[r3],0x304\n" + : [r1] "+&d" (r1.pair) + : [r3] "d" (cmd) + : "memory"); + return r1.odd; } static unsigned long hypfs_sprp_diag304(void *data, unsigned long cmd) diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c index e1fcc03159ef..3db40ad853e0 100644 --- a/arch/s390/hypfs/hypfs_vm.c +++ b/arch/s390/hypfs/hypfs_vm.c @@ -10,49 +10,19 @@ #include <linux/errno.h> #include <linux/string.h> #include <linux/vmalloc.h> +#include <asm/extable.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/timex.h> +#include "hypfs_vm.h" #include "hypfs.h" -#define NAME_LEN 8 #define DBFS_D2FC_HDR_VERSION 0 static char local_guest[] = " "; static char all_guests[] = "* "; -static char *guest_query; - -struct diag2fc_data { - __u32 version; - __u32 flags; - __u64 used_cpu; - __u64 el_time; - __u64 mem_min_kb; - __u64 mem_max_kb; - __u64 mem_share_kb; - __u64 mem_used_kb; - __u32 pcpus; - __u32 lcpus; - __u32 vcpus; - __u32 ocpus; - __u32 cpu_max; - __u32 cpu_shares; - __u32 cpu_use_samp; - __u32 cpu_delay_samp; - __u32 page_wait_samp; - __u32 idle_samp; - __u32 other_samp; - __u32 total_samp; - char guest_name[NAME_LEN]; -}; - -struct diag2fc_parm_list { - char userid[NAME_LEN]; - char aci_grp[NAME_LEN]; - __u64 addr; - __u32 size; - __u32 fmt; -}; +static char *all_groups = all_guests; +char *diag2fc_guest_query; static int diag2fc(int size, char* query, void *addr) { @@ -60,12 +30,13 @@ static int diag2fc(int size, char* query, void *addr) unsigned long rc; struct diag2fc_parm_list parm_list; - memcpy(parm_list.userid, query, NAME_LEN); - ASCEBC(parm_list.userid, NAME_LEN); - parm_list.addr = (unsigned long) addr ; + memcpy(parm_list.userid, query, DIAG2FC_NAME_LEN); + ASCEBC(parm_list.userid, DIAG2FC_NAME_LEN); + memcpy(parm_list.aci_grp, all_groups, DIAG2FC_NAME_LEN); + ASCEBC(parm_list.aci_grp, DIAG2FC_NAME_LEN); + parm_list.addr = (unsigned long)addr; parm_list.size = size; parm_list.fmt = 0x02; - memset(parm_list.aci_grp, 0x40, NAME_LEN); rc = -1; diag_stat_inc(DIAG_STAT_X2FC); @@ -84,7 +55,7 @@ static int diag2fc(int size, char* query, void *addr) /* * Allocate buffer for "query" and store diag 2fc at "offset" */ -static void *diag2fc_store(char *query, unsigned int *count, int offset) +void *diag2fc_store(char *query, unsigned int *count, int offset) { void *data; int size; @@ -105,136 +76,15 @@ static void *diag2fc_store(char *query, unsigned int *count, int offset) return data; } -static void diag2fc_free(const void *data) +void diag2fc_free(const void *data) { vfree(data); } -#define ATTRIBUTE(dir, name, member) \ -do { \ - void *rc; \ - rc = hypfs_create_u64(dir, name, member); \ - if (IS_ERR(rc)) \ - return PTR_ERR(rc); \ -} while(0) - -static int hypfs_vm_create_guest(struct dentry *systems_dir, - struct diag2fc_data *data) -{ - char guest_name[NAME_LEN + 1] = {}; - struct dentry *guest_dir, *cpus_dir, *samples_dir, *mem_dir; - int dedicated_flag, capped_value; - - capped_value = (data->flags & 0x00000006) >> 1; - dedicated_flag = (data->flags & 0x00000008) >> 3; - - /* guest dir */ - memcpy(guest_name, data->guest_name, NAME_LEN); - EBCASC(guest_name, NAME_LEN); - strim(guest_name); - guest_dir = hypfs_mkdir(systems_dir, guest_name); - if (IS_ERR(guest_dir)) - return PTR_ERR(guest_dir); - ATTRIBUTE(guest_dir, "onlinetime_us", data->el_time); - - /* logical cpu information */ - cpus_dir = hypfs_mkdir(guest_dir, "cpus"); - if (IS_ERR(cpus_dir)) - return PTR_ERR(cpus_dir); - 
ATTRIBUTE(cpus_dir, "cputime_us", data->used_cpu); - ATTRIBUTE(cpus_dir, "capped", capped_value); - ATTRIBUTE(cpus_dir, "dedicated", dedicated_flag); - ATTRIBUTE(cpus_dir, "count", data->vcpus); - /* - * Note: The "weight_min" attribute got the wrong name. - * The value represents the number of non-stopped (operating) - * CPUS. - */ - ATTRIBUTE(cpus_dir, "weight_min", data->ocpus); - ATTRIBUTE(cpus_dir, "weight_max", data->cpu_max); - ATTRIBUTE(cpus_dir, "weight_cur", data->cpu_shares); - - /* memory information */ - mem_dir = hypfs_mkdir(guest_dir, "mem"); - if (IS_ERR(mem_dir)) - return PTR_ERR(mem_dir); - ATTRIBUTE(mem_dir, "min_KiB", data->mem_min_kb); - ATTRIBUTE(mem_dir, "max_KiB", data->mem_max_kb); - ATTRIBUTE(mem_dir, "used_KiB", data->mem_used_kb); - ATTRIBUTE(mem_dir, "share_KiB", data->mem_share_kb); - - /* samples */ - samples_dir = hypfs_mkdir(guest_dir, "samples"); - if (IS_ERR(samples_dir)) - return PTR_ERR(samples_dir); - ATTRIBUTE(samples_dir, "cpu_using", data->cpu_use_samp); - ATTRIBUTE(samples_dir, "cpu_delay", data->cpu_delay_samp); - ATTRIBUTE(samples_dir, "mem_delay", data->page_wait_samp); - ATTRIBUTE(samples_dir, "idle", data->idle_samp); - ATTRIBUTE(samples_dir, "other", data->other_samp); - ATTRIBUTE(samples_dir, "total", data->total_samp); - return 0; -} - -int hypfs_vm_create_files(struct dentry *root) -{ - struct dentry *dir, *file; - struct diag2fc_data *data; - unsigned int count = 0; - int rc, i; - - data = diag2fc_store(guest_query, &count, 0); - if (IS_ERR(data)) - return PTR_ERR(data); - - /* Hpervisor Info */ - dir = hypfs_mkdir(root, "hyp"); - if (IS_ERR(dir)) { - rc = PTR_ERR(dir); - goto failed; - } - file = hypfs_create_str(dir, "type", "z/VM Hypervisor"); - if (IS_ERR(file)) { - rc = PTR_ERR(file); - goto failed; - } - - /* physical cpus */ - dir = hypfs_mkdir(root, "cpus"); - if (IS_ERR(dir)) { - rc = PTR_ERR(dir); - goto failed; - } - file = hypfs_create_u64(dir, "count", data->lcpus); - if (IS_ERR(file)) { - rc = PTR_ERR(file); - goto failed; - } - - /* guests */ - dir = hypfs_mkdir(root, "systems"); - if (IS_ERR(dir)) { - rc = PTR_ERR(dir); - goto failed; - } - - for (i = 0; i < count; i++) { - rc = hypfs_vm_create_guest(dir, &(data[i])); - if (rc) - goto failed; - } - diag2fc_free(data); - return 0; - -failed: - diag2fc_free(data); - return rc; -} - struct dbfs_d2fc_hdr { u64 len; /* Length of d2fc buffer without header */ u16 version; /* Version of header */ - char tod_ext[STORE_CLOCK_EXT_SIZE]; /* TOD clock for d2fc */ + union tod_clock tod_ext; /* TOD clock for d2fc */ u64 count; /* Number of VM guests in d2fc buffer */ char reserved[30]; } __attribute__ ((packed)); @@ -249,10 +99,10 @@ static int dbfs_diag2fc_create(void **data, void **data_free_ptr, size_t *size) struct dbfs_d2fc *d2fc; unsigned int count; - d2fc = diag2fc_store(guest_query, &count, sizeof(d2fc->hdr)); + d2fc = diag2fc_store(diag2fc_guest_query, &count, sizeof(d2fc->hdr)); if (IS_ERR(d2fc)) return PTR_ERR(d2fc); - get_tod_clock_ext(d2fc->hdr.tod_ext); + store_tod_clock_ext(&d2fc->hdr.tod_ext); d2fc->hdr.len = count * sizeof(struct diag2fc_data); d2fc->hdr.version = DBFS_D2FC_HDR_VERSION; d2fc->hdr.count = count; @@ -274,9 +124,9 @@ int hypfs_vm_init(void) if (!MACHINE_IS_VM) return 0; if (diag2fc(0, all_guests, NULL) > 0) - guest_query = all_guests; + diag2fc_guest_query = all_guests; else if (diag2fc(0, local_guest, NULL) > 0) - guest_query = local_guest; + diag2fc_guest_query = local_guest; else return -EACCES; hypfs_dbfs_create_file(&dbfs_file_2fc); diff --git 
a/arch/s390/hypfs/hypfs_vm.h b/arch/s390/hypfs/hypfs_vm.h new file mode 100644 index 000000000000..fe2e5851addd --- /dev/null +++ b/arch/s390/hypfs/hypfs_vm.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hypervisor filesystem for Linux on s390. z/VM implementation. + * + * Copyright IBM Corp. 2006 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#ifndef _S390_HYPFS_VM_H_ +#define _S390_HYPFS_VM_H_ + +#define DIAG2FC_NAME_LEN 8 + +struct diag2fc_data { + __u32 version; + __u32 flags; + __u64 used_cpu; + __u64 el_time; + __u64 mem_min_kb; + __u64 mem_max_kb; + __u64 mem_share_kb; + __u64 mem_used_kb; + __u32 pcpus; + __u32 lcpus; + __u32 vcpus; + __u32 ocpus; + __u32 cpu_max; + __u32 cpu_shares; + __u32 cpu_use_samp; + __u32 cpu_delay_samp; + __u32 page_wait_samp; + __u32 idle_samp; + __u32 other_samp; + __u32 total_samp; + char guest_name[DIAG2FC_NAME_LEN]; +}; + +struct diag2fc_parm_list { + char userid[DIAG2FC_NAME_LEN]; + char aci_grp[DIAG2FC_NAME_LEN]; + __u64 addr; + __u32 size; + __u32 fmt; +}; + +void *diag2fc_store(char *query, unsigned int *count, int offset); +void diag2fc_free(const void *data); +extern char *diag2fc_guest_query; + +#endif /* _S390_HYPFS_VM_H_ */ diff --git a/arch/s390/hypfs/hypfs_vm_fs.c b/arch/s390/hypfs/hypfs_vm_fs.c new file mode 100644 index 000000000000..6011289afa8c --- /dev/null +++ b/arch/s390/hypfs/hypfs_vm_fs.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390. z/VM implementation. + * + * Copyright IBM Corp. 2006 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/vmalloc.h> +#include <asm/extable.h> +#include <asm/diag.h> +#include <asm/ebcdic.h> +#include <asm/timex.h> +#include "hypfs_vm.h" +#include "hypfs.h" + +#define ATTRIBUTE(dir, name, member) \ +do { \ + void *rc; \ + rc = hypfs_create_u64(dir, name, member); \ + if (IS_ERR(rc)) \ + return PTR_ERR(rc); \ +} while (0) + +static int hypfs_vm_create_guest(struct dentry *systems_dir, + struct diag2fc_data *data) +{ + char guest_name[DIAG2FC_NAME_LEN + 1] = {}; + struct dentry *guest_dir, *cpus_dir, *samples_dir, *mem_dir; + int dedicated_flag, capped_value; + + capped_value = (data->flags & 0x00000006) >> 1; + dedicated_flag = (data->flags & 0x00000008) >> 3; + + /* guest dir */ + memcpy(guest_name, data->guest_name, DIAG2FC_NAME_LEN); + EBCASC(guest_name, DIAG2FC_NAME_LEN); + strim(guest_name); + guest_dir = hypfs_mkdir(systems_dir, guest_name); + if (IS_ERR(guest_dir)) + return PTR_ERR(guest_dir); + ATTRIBUTE(guest_dir, "onlinetime_us", data->el_time); + + /* logical cpu information */ + cpus_dir = hypfs_mkdir(guest_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return PTR_ERR(cpus_dir); + ATTRIBUTE(cpus_dir, "cputime_us", data->used_cpu); + ATTRIBUTE(cpus_dir, "capped", capped_value); + ATTRIBUTE(cpus_dir, "dedicated", dedicated_flag); + ATTRIBUTE(cpus_dir, "count", data->vcpus); + /* + * Note: The "weight_min" attribute got the wrong name. + * The value represents the number of non-stopped (operating) + * CPUS. 
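+	 * For example, a guest with three operating virtual CPUs reports
+	 * weight_min=3 here, independent of its actual share weight.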
+ */ + ATTRIBUTE(cpus_dir, "weight_min", data->ocpus); + ATTRIBUTE(cpus_dir, "weight_max", data->cpu_max); + ATTRIBUTE(cpus_dir, "weight_cur", data->cpu_shares); + + /* memory information */ + mem_dir = hypfs_mkdir(guest_dir, "mem"); + if (IS_ERR(mem_dir)) + return PTR_ERR(mem_dir); + ATTRIBUTE(mem_dir, "min_KiB", data->mem_min_kb); + ATTRIBUTE(mem_dir, "max_KiB", data->mem_max_kb); + ATTRIBUTE(mem_dir, "used_KiB", data->mem_used_kb); + ATTRIBUTE(mem_dir, "share_KiB", data->mem_share_kb); + + /* samples */ + samples_dir = hypfs_mkdir(guest_dir, "samples"); + if (IS_ERR(samples_dir)) + return PTR_ERR(samples_dir); + ATTRIBUTE(samples_dir, "cpu_using", data->cpu_use_samp); + ATTRIBUTE(samples_dir, "cpu_delay", data->cpu_delay_samp); + ATTRIBUTE(samples_dir, "mem_delay", data->page_wait_samp); + ATTRIBUTE(samples_dir, "idle", data->idle_samp); + ATTRIBUTE(samples_dir, "other", data->other_samp); + ATTRIBUTE(samples_dir, "total", data->total_samp); + return 0; +} + +int hypfs_vm_create_files(struct dentry *root) +{ + struct dentry *dir, *file; + struct diag2fc_data *data; + unsigned int count = 0; + int rc, i; + + data = diag2fc_store(diag2fc_guest_query, &count, 0); + if (IS_ERR(data)) + return PTR_ERR(data); + + /* Hypervisor Info */ + dir = hypfs_mkdir(root, "hyp"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + file = hypfs_create_str(dir, "type", "z/VM Hypervisor"); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + goto failed; + } + + /* physical cpus */ + dir = hypfs_mkdir(root, "cpus"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + file = hypfs_create_u64(dir, "count", data->lcpus); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + goto failed; + } + + /* guests */ + dir = hypfs_mkdir(root, "systems"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + + for (i = 0; i < count; i++) { + rc = hypfs_vm_create_guest(dir, &data[i]); + if (rc) + goto failed; + } + diag2fc_free(data); + return 0; + +failed: + diag2fc_free(data); + return rc; +} diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 70139d0791b6..858beaf4a8cb 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -53,7 +53,7 @@ static void hypfs_update_update(struct super_block *sb) struct inode *inode = d_inode(sb_info->update_file); sb_info->last_update = ktime_get_seconds(); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); } /* directory tree removal functions */ @@ -101,7 +101,7 @@ static struct inode *hypfs_make_inode(struct super_block *sb, umode_t mode) ret->i_mode = mode; ret->i_uid = hypfs_info->uid; ret->i_gid = hypfs_info->gid; - ret->i_atime = ret->i_mtime = ret->i_ctime = current_time(ret); + simple_inode_init_ts(ret); if (S_ISDIR(mode)) set_nlink(ret, 2); } @@ -209,17 +209,12 @@ static int hypfs_release(struct inode *inode, struct file *filp) enum { Opt_uid, Opt_gid, }; -static const struct fs_parameter_spec hypfs_param_specs[] = { +static const struct fs_parameter_spec hypfs_fs_parameters[] = { fsparam_u32("gid", Opt_gid), fsparam_u32("uid", Opt_uid), {} }; -static const struct fs_parameter_description hypfs_fs_parameters = { - .name = "hypfs", - .specs = hypfs_param_specs, -}; - static int hypfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hypfs_sb_info *hypfs_info = fc->s_fs_info; @@ -228,7 +223,7 @@ static int hypfs_parse_param(struct fs_context *fc, struct fs_parameter *param) kgid_t gid; int opt; - opt = fs_parse(fc, &hypfs_fs_parameters, param, &result); + opt 
= fs_parse(fc, hypfs_fs_parameters, param, &result); if (opt < 0) return opt; @@ -455,7 +450,7 @@ static struct file_system_type hypfs_type = { .owner = THIS_MODULE, .name = "s390_hypfs", .init_fs_context = hypfs_init_fs_context, - .parameters = &hypfs_fs_parameters, + .parameters = hypfs_fs_parameters, .kill_sb = hypfs_kill_super }; @@ -465,45 +460,18 @@ static const struct super_operations hypfs_s_ops = { .show_options = hypfs_show_options, }; -static int __init hypfs_init(void) +int __init __hypfs_fs_init(void) { int rc; - hypfs_dbfs_init(); - - if (hypfs_diag_init()) { - rc = -ENODATA; - goto fail_dbfs_exit; - } - if (hypfs_vm_init()) { - rc = -ENODATA; - goto fail_hypfs_diag_exit; - } - hypfs_sprp_init(); - if (hypfs_diag0c_init()) { - rc = -ENODATA; - goto fail_hypfs_sprp_exit; - } rc = sysfs_create_mount_point(hypervisor_kobj, "s390"); if (rc) - goto fail_hypfs_diag0c_exit; + return rc; rc = register_filesystem(&hypfs_type); if (rc) - goto fail_filesystem; + goto fail; return 0; - -fail_filesystem: +fail: sysfs_remove_mount_point(hypervisor_kobj, "s390"); -fail_hypfs_diag0c_exit: - hypfs_diag0c_exit(); -fail_hypfs_sprp_exit: - hypfs_sprp_exit(); - hypfs_vm_exit(); -fail_hypfs_diag_exit: - hypfs_diag_exit(); -fail_dbfs_exit: - hypfs_dbfs_exit(); - pr_err("Initialization of hypfs failed with rc=%i\n", rc); return rc; } -device_initcall(hypfs_init) diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 2531f673f099..4b904110d27c 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -5,22 +5,5 @@ generated-y += syscall_table.h generated-y += unistd_nr.h generic-y += asm-offsets.h -generic-y += cacheflush.h -generic-y += device.h -generic-y += dma-contiguous.h -generic-y += dma-mapping.h -generic-y += div64.h -generic-y += emergency-restart.h -generic-y += export.h -generic-y += fb.h -generic-y += irq_regs.h -generic-y += irq_work.h -generic-y += kmap_types.h -generic-y += local.h -generic-y += local64.h +generic-y += kvm_types.h generic-y += mcs_spinlock.h -generic-y += mm-arch-hooks.h -generic-y += mmiowb.h -generic-y += trace_clock.h -generic-y += unaligned.h -generic-y += word-at-a-time.h diff --git a/arch/s390/include/asm/abs_lowcore.h b/arch/s390/include/asm/abs_lowcore.h new file mode 100644 index 000000000000..6f264b79e377 --- /dev/null +++ b/arch/s390/include/asm/abs_lowcore.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ABS_LOWCORE_H +#define _ASM_S390_ABS_LOWCORE_H + +#include <asm/lowcore.h> + +#define ABS_LOWCORE_MAP_SIZE (NR_CPUS * sizeof(struct lowcore)) + +extern unsigned long __abs_lowcore; + +int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc); +void abs_lowcore_unmap(int cpu); + +static inline struct lowcore *get_abs_lowcore(void) +{ + int cpu; + + cpu = get_cpu(); + return ((struct lowcore *)__abs_lowcore) + cpu; +} + +static inline void put_abs_lowcore(struct lowcore *lc) +{ + put_cpu(); +} + +#endif /* _ASM_S390_ABS_LOWCORE_H */ diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h index 01936fdfaddb..c4c28c2609a5 100644 --- a/arch/s390/include/asm/airq.h +++ b/arch/s390/include/asm/airq.h @@ -12,12 +12,12 @@ #include <linux/bit_spinlock.h> #include <linux/dma-mapping.h> +#include <asm/tpi.h> struct airq_struct { struct hlist_node list; /* Handler queueing. 
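Entries are hooked into this list via register_adapter_interrupt().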
*/ - void (*handler)(struct airq_struct *airq, bool floating); + void (*handler)(struct airq_struct *airq, struct tpi_info *tpi_info); u8 *lsi_ptr; /* Local-Summary-Indicator pointer */ - u8 lsi_mask; /* Local-Summary-Indicator mask */ u8 isc; /* Interrupt-subclass */ u8 flags; }; @@ -46,8 +46,10 @@ struct airq_iv { #define AIRQ_IV_PTR 4 /* Allocate the ptr array */ #define AIRQ_IV_DATA 8 /* Allocate the data array */ #define AIRQ_IV_CACHELINE 16 /* Cacheline alignment for the vector */ +#define AIRQ_IV_GUESTVEC 32 /* Vector is a pinned guest page */ -struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags); +struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags, + unsigned long *vec); void airq_iv_release(struct airq_iv *iv); unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num); void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num); diff --git a/arch/s390/include/asm/alternative-asm.h b/arch/s390/include/asm/alternative-asm.h index 955d620db23e..7db046596b93 100644 --- a/arch/s390/include/asm/alternative-asm.h +++ b/arch/s390/include/asm/alternative-asm.h @@ -5,19 +5,6 @@ #ifdef __ASSEMBLY__ /* - * Check the length of an instruction sequence. The length may not be larger - * than 254 bytes and it has to be divisible by 2. - */ -.macro alt_len_check start,end - .if ( \end - \start ) > 254 - .error "cpu alternatives does not support instructions blocks > 254 bytes\n" - .endif - .if ( \end - \start ) % 2 - .error "cpu alternatives instructions length is odd\n" - .endif -.endm - -/* * Issue one struct alt_instr descriptor entry (need to put it into * the section .altinstructions, see below). This entry contains * enough information for the alternatives patching code to patch an @@ -28,60 +15,29 @@ .long \alt_start - . .word \feature .byte \orig_end - \orig_start - .byte \alt_end - \alt_start -.endm - -/* - * Fill up @bytes with nops. The macro emits 6-byte nop instructions - * for the bulk of the area, possibly followed by a 4-byte and/or - * a 2-byte nop if the size of the area is not divisible by 6. - */ -.macro alt_pad_fill bytes - .fill ( \bytes ) / 6, 6, 0xc0040000 - .fill ( \bytes ) % 6 / 4, 4, 0x47000000 - .fill ( \bytes ) % 6 % 4 / 2, 2, 0x0700 -.endm - -/* - * Fill up @bytes with nops. If the number of bytes is larger - * than 6, emit a jg instruction to branch over all nops, then - * fill an area of size (@bytes - 6) with nop instructions. - */ -.macro alt_pad bytes - .if ( \bytes > 0 ) - .if ( \bytes > 6 ) - jg . + \bytes - alt_pad_fill \bytes - 6 - .else - alt_pad_fill \bytes - .endif - .endif + .org . - ( \orig_end - \orig_start ) + ( \alt_end - \alt_start ) + .org . - ( \alt_end - \alt_start ) + ( \orig_end - \orig_start ) .endm /* * Define an alternative between two instructions. If @feature is * present, early code in apply_alternatives() replaces @oldinstr with - * @newinstr. ".skip" directive takes care of proper instruction padding - * in case @newinstr is longer than @oldinstr. + * @newinstr. */ .macro ALTERNATIVE oldinstr, newinstr, feature .pushsection .altinstr_replacement,"ax" 770: \newinstr 771: .popsection 772: \oldinstr -773: alt_len_check 770b, 771b - alt_len_check 772b, 773b - alt_pad ( ( 771b - 770b ) - ( 773b - 772b ) ) -774: .pushsection .altinstructions,"a" - alt_entry 772b, 774b, 770b, 771b, \feature +773: .pushsection .altinstructions,"a" + alt_entry 772b, 773b, 770b, 771b, \feature .popsection .endm /* * Define an alternative between two instructions. 
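Both instruction sequences must have the
 * same size; the paired .org directives emitted by alt_entry turn any
 * size mismatch into an assembly-time error.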
If @feature is * present, early code in apply_alternatives() replaces @oldinstr with - * @newinstr. ".skip" directive takes care of proper instruction padding - * in case @newinstr is longer than @oldinstr. + * @newinstr. */ .macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 .pushsection .altinstr_replacement,"ax" @@ -89,17 +45,9 @@ 771: \newinstr2 772: .popsection 773: \oldinstr -774: alt_len_check 770b, 771b - alt_len_check 771b, 772b - alt_len_check 773b, 774b - .if ( 771b - 770b > 772b - 771b ) - alt_pad ( ( 771b - 770b ) - ( 774b - 773b ) ) - .else - alt_pad ( ( 772b - 771b ) - ( 774b - 773b ) ) - .endif -775: .pushsection .altinstructions,"a" - alt_entry 773b, 775b, 770b, 771b,\feature1 - alt_entry 773b, 775b, 771b, 772b,\feature2 +774: .pushsection .altinstructions,"a" + alt_entry 773b, 774b, 770b, 771b,\feature1 + alt_entry 773b, 774b, 771b, 772b,\feature2 .popsection .endm diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h index 1c8a38f762a3..904dd049f954 100644 --- a/arch/s390/include/asm/alternative.h +++ b/arch/s390/include/asm/alternative.h @@ -13,32 +13,25 @@ struct alt_instr { s32 repl_offset; /* offset to replacement instruction */ u16 facility; /* facility bit set for replacement */ u8 instrlen; /* length of original instruction */ - u8 replacementlen; /* length of new instruction */ } __packed; void apply_alternative_instructions(void); void apply_alternatives(struct alt_instr *start, struct alt_instr *end); /* - * |661: |662: |6620 |663: - * +-----------+---------------------+ - * | oldinstr | oldinstr_padding | - * | +----------+----------+ - * | | | | - * | | >6 bytes |6/4/2 nops| - * | |6 bytes jg-----------> - * +-----------+---------------------+ - * ^^ static padding ^^ + * +---------------------------------+ + * |661: |662: + * | oldinstr | + * +---------------------------------+ * * .altinstr_replacement section - * +---------------------+-----------+ + * +---------------------------------+ * |6641: |6651: * | alternative instr 1 | - * +-----------+---------+- - - - - -+ - * |6642: |6652: | - * | alternative instr 2 | padding - * +---------------------+- - - - - -+ - * ^ runtime ^ + * +---------------------------------+ + * |6642: |6652: + * | alternative instr 2 | + * +---------------------------------+ * * .altinstructions section * +---------------------------------+ @@ -47,70 +40,31 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); * +---------------------------------+ */ -#define b_altinstr(num) "664"#num -#define e_altinstr(num) "665"#num - -#define e_oldinstr_pad_end "663" +#define b_altinstr(num) "664"#num +#define e_altinstr(num) "665"#num #define oldinstr_len "662b-661b" -#define oldinstr_total_len e_oldinstr_pad_end"b-661b" #define altinstr_len(num) e_altinstr(num)"b-"b_altinstr(num)"b" -#define oldinstr_pad_len(num) \ - "-(((" altinstr_len(num) ")-(" oldinstr_len ")) > 0) * " \ - "((" altinstr_len(num) ")-(" oldinstr_len "))" - -#define INSTR_LEN_SANITY_CHECK(len) \ - ".if " len " > 254\n" \ - "\t.error \"cpu alternatives does not support instructions " \ - "blocks > 254 bytes\"\n" \ - ".endif\n" \ - ".if (" len ") %% 2\n" \ - "\t.error \"cpu alternatives instructions length is odd\"\n" \ - ".endif\n" - -#define OLDINSTR_PADDING(oldinstr, num) \ - ".if " oldinstr_pad_len(num) " > 6\n" \ - "\tjg " e_oldinstr_pad_end "f\n" \ - "6620:\n" \ - "\t.fill (" oldinstr_pad_len(num) " - (6620b-662b)) / 2, 2, 0x0700\n" \ - ".else\n" \ - "\t.fill " oldinstr_pad_len(num) " / 6, 6, 
0xc0040000\n" \ - "\t.fill " oldinstr_pad_len(num) " %% 6 / 4, 4, 0x47000000\n" \ - "\t.fill " oldinstr_pad_len(num) " %% 6 %% 4 / 2, 2, 0x0700\n" \ - ".endif\n" - -#define OLDINSTR(oldinstr, num) \ - "661:\n\t" oldinstr "\n662:\n" \ - OLDINSTR_PADDING(oldinstr, num) \ - e_oldinstr_pad_end ":\n" \ - INSTR_LEN_SANITY_CHECK(oldinstr_len) - -#define OLDINSTR_2(oldinstr, num1, num2) \ - "661:\n\t" oldinstr "\n662:\n" \ - ".if " altinstr_len(num1) " < " altinstr_len(num2) "\n" \ - OLDINSTR_PADDING(oldinstr, num2) \ - ".else\n" \ - OLDINSTR_PADDING(oldinstr, num1) \ - ".endif\n" \ - e_oldinstr_pad_end ":\n" \ - INSTR_LEN_SANITY_CHECK(oldinstr_len) + +#define OLDINSTR(oldinstr) \ + "661:\n\t" oldinstr "\n662:\n" #define ALTINSTR_ENTRY(facility, num) \ "\t.long 661b - .\n" /* old instruction */ \ "\t.long " b_altinstr(num)"b - .\n" /* alt instruction */ \ "\t.word " __stringify(facility) "\n" /* facility bit */ \ - "\t.byte " oldinstr_total_len "\n" /* source len */ \ - "\t.byte " altinstr_len(num) "\n" /* alt instruction len */ + "\t.byte " oldinstr_len "\n" /* instruction len */ \ + "\t.org . - (" oldinstr_len ") + (" altinstr_len(num) ")\n" \ + "\t.org . - (" altinstr_len(num) ") + (" oldinstr_len ")\n" #define ALTINSTR_REPLACEMENT(altinstr, num) /* replacement */ \ - b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" \ - INSTR_LEN_SANITY_CHECK(altinstr_len(num)) + b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, altinstr, facility) \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(altinstr, 1) \ ".popsection\n" \ - OLDINSTR(oldinstr, 1) \ + OLDINSTR(oldinstr) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(facility, 1) \ ".popsection\n" @@ -120,7 +74,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); ALTINSTR_REPLACEMENT(altinstr1, 1) \ ALTINSTR_REPLACEMENT(altinstr2, 2) \ ".popsection\n" \ - OLDINSTR_2(oldinstr, 1, 2) \ + OLDINSTR(oldinstr) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(facility1, 1) \ ALTINSTR_ENTRY(facility2, 2) \ @@ -145,6 +99,22 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); asm_inline volatile(ALTERNATIVE_2(oldinstr, altinstr1, facility1, \ altinstr2, facility2) ::: "memory") +/* Alternative inline assembly with input. */ +#define alternative_input(oldinstr, newinstr, feature, input...) \ + asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ + : : input) + +/* Like alternative_input, but with a single output argument */ +#define alternative_io(oldinstr, altinstr, facility, output, input...) \ + asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, facility) \ + : output : input) + +/* Use this macro if more than one output parameter is needed. */ +#define ASM_OUTPUT2(a...) a + +/* Use this macro if clobbers are needed without inputs. */ +#define ASM_NO_INPUT_CLOBBER(clobber...) : clobber + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_ALTERNATIVE_H */ diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h index aea32dda3d14..43ac4a64f49b 100644 --- a/arch/s390/include/asm/ap.h +++ b/arch/s390/include/asm/ap.h @@ -12,6 +12,9 @@ #ifndef _ASM_S390_AP_H_ #define _ASM_S390_AP_H_ +#include <linux/io.h> +#include <asm/asm-extable.h> + /** * The ap_qid_t identifier of an ap queue. 
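It encodes the adapter number and the
 * domain index and is typically built with the AP_MKQID() macro.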
* If the AP facilities test (APFT) facility is available, @@ -40,10 +43,24 @@ struct ap_queue_status { unsigned int queue_empty : 1; unsigned int replies_waiting : 1; unsigned int queue_full : 1; - unsigned int _pad1 : 4; + unsigned int : 3; + unsigned int async : 1; unsigned int irq_enabled : 1; unsigned int response_code : 8; - unsigned int _pad2 : 16; + unsigned int : 16; +}; + +/* + * AP queue status reg union to access the reg1 + * register with the lower 32 bits comprising the + * ap queue status. + */ +union ap_queue_status_reg { + unsigned long value; + struct { + u32 _pad; + struct ap_queue_status status; + }; }; /** @@ -53,54 +70,98 @@ struct ap_queue_status { */ static inline bool ap_instructions_available(void) { - register unsigned long reg0 asm ("0") = AP_MKQID(0, 0); - register unsigned long reg1 asm ("1") = 0; - register unsigned long reg2 asm ("2") = 0; + unsigned long reg0 = AP_MKQID(0, 0); + unsigned long reg1 = 0; asm volatile( - " .long 0xb2af0000\n" /* PQAP(TAPQ) */ - "0: la %0,1\n" + " lgr 0,%[reg0]\n" /* qid into gr0 */ + " lghi 1,0\n" /* 0 into gr1 */ + " lghi 2,0\n" /* 0 into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(TAPQ) */ + "0: la %[reg1],1\n" /* 1 into reg1 */ "1:\n" EX_TABLE(0b, 1b) - : "+d" (reg1), "+d" (reg2) - : "d" (reg0) - : "cc"); + : [reg1] "+&d" (reg1) + : [reg0] "d" (reg0) + : "cc", "0", "1", "2"); return reg1 != 0; } +/* TAPQ register GR2 response struct */ +struct ap_tapq_hwinfo { + union { + unsigned long value; + struct { + unsigned int fac : 32; /* facility bits */ + unsigned int apinfo : 32; /* ap type, ... */ + }; + struct { + unsigned int apsc : 1; /* APSC */ + unsigned int mex4k : 1; /* AP4KM */ + unsigned int crt4k : 1; /* AP4KC */ + unsigned int cca : 1; /* D */ + unsigned int accel : 1; /* A */ + unsigned int ep11 : 1; /* X */ + unsigned int apxa : 1; /* APXA */ + unsigned int : 1; + unsigned int class : 8; + unsigned int bs : 2; /* SE bind/assoc */ + unsigned int : 14; + unsigned int at : 8; /* ap type */ + unsigned int nd : 8; /* nr of domains */ + unsigned int : 4; + unsigned int ml : 4; /* apxl ml */ + unsigned int : 4; + unsigned int qd : 4; /* queue depth */ + }; + }; +}; + +/* + * Convenience defines to be used with the bs field from struct ap_tapq_gr2 + */ +#define AP_BS_Q_USABLE 0 +#define AP_BS_Q_USABLE_NO_SECURE_KEY 1 +#define AP_BS_Q_AVAIL_FOR_BINDING 2 +#define AP_BS_Q_UNUSABLE 3 + /** * ap_tapq(): Test adjunct processor queue. * @qid: The AP queue number - * @info: Pointer to queue descriptor + * @info: Pointer to tapq hwinfo struct * * Returns AP queue status structure. */ -static inline struct ap_queue_status ap_tapq(ap_qid_t qid, unsigned long *info) +static inline struct ap_queue_status ap_tapq(ap_qid_t qid, + struct ap_tapq_hwinfo *info) { - register unsigned long reg0 asm ("0") = qid; - register struct ap_queue_status reg1 asm ("1"); - register unsigned long reg2 asm ("2"); - - asm volatile(".long 0xb2af0000" /* PQAP(TAPQ) */ - : "=d" (reg1), "=d" (reg2) - : "d" (reg0) - : "cc"); + union ap_queue_status_reg reg1; + unsigned long reg2; + + asm volatile( + " lgr 0,%[qid]\n" /* qid into gr0 */ + " lghi 2,0\n" /* 0 into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(TAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg2],2\n" /* gr2 into reg2 */ + : [reg1] "=&d" (reg1.value), [reg2] "=&d" (reg2) + : [qid] "d" (qid) + : "cc", "0", "1", "2"); if (info) - *info = reg2; - return reg1; + info->value = reg2; + return reg1.status; } /** * ap_test_queue(): Test adjunct processor queue. 
* @qid: The AP queue number * @tbit: Test facilities bit - * @info: Pointer to queue descriptor + * @info: Ptr to tapq gr2 struct * * Returns AP queue status structure. */ -static inline struct ap_queue_status ap_test_queue(ap_qid_t qid, - int tbit, - unsigned long *info) +static inline struct ap_queue_status ap_test_queue(ap_qid_t qid, int tbit, + struct ap_tapq_hwinfo *info) { if (tbit) qid |= 1UL << 23; /* set T bit*/ @@ -110,39 +171,51 @@ static inline struct ap_queue_status ap_test_queue(ap_qid_t qid, /** * ap_pqap_rapq(): Reset adjunct processor queue. * @qid: The AP queue number + * @fbit: if != 0 set F bit * * Returns AP queue status structure. */ -static inline struct ap_queue_status ap_rapq(ap_qid_t qid) +static inline struct ap_queue_status ap_rapq(ap_qid_t qid, int fbit) { - register unsigned long reg0 asm ("0") = qid | (1UL << 24); - register struct ap_queue_status reg1 asm ("1"); + unsigned long reg0 = qid | (1UL << 24); /* fc 1UL is RAPQ */ + union ap_queue_status_reg reg1; + + if (fbit) + reg0 |= 1UL << 22; asm volatile( - ".long 0xb2af0000" /* PQAP(RAPQ) */ - : "=d" (reg1) - : "d" (reg0) - : "cc"); - return reg1; + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(RAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1.value) + : [reg0] "d" (reg0) + : "cc", "0", "1"); + return reg1.status; } /** * ap_pqap_zapq(): Reset and zeroize adjunct processor queue. * @qid: The AP queue number + * @fbit: if != 0 set F bit * * Returns AP queue status structure. */ -static inline struct ap_queue_status ap_zapq(ap_qid_t qid) +static inline struct ap_queue_status ap_zapq(ap_qid_t qid, int fbit) { - register unsigned long reg0 asm ("0") = qid | (2UL << 24); - register struct ap_queue_status reg1 asm ("1"); + unsigned long reg0 = qid | (2UL << 24); /* fc 2UL is ZAPQ */ + union ap_queue_status_reg reg1; + + if (fbit) + reg0 |= 1UL << 22; asm volatile( - ".long 0xb2af0000" /* PQAP(ZAPQ) */ - : "=d" (reg1) - : "d" (reg0) - : "cc"); - return reg1; + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(ZAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1.value) + : [reg0] "d" (reg0) + : "cc", "0", "1"); + return reg1.status; } /** @@ -154,15 +227,16 @@ struct ap_config_info { unsigned int apxa : 1; /* N bit */ unsigned int qact : 1; /* C bit */ unsigned int rc8a : 1; /* R bit */ - unsigned char _reserved1 : 4; - unsigned char _reserved2[3]; - unsigned char Na; /* max # of APs - 1 */ - unsigned char Nd; /* max # of Domains - 1 */ - unsigned char _reserved3[10]; + unsigned int : 4; + unsigned int apsb : 1; /* B bit */ + unsigned int : 23; + unsigned char na; /* max # of APs - 1 */ + unsigned char nd; /* max # of Domains - 1 */ + unsigned char _reserved0[10]; unsigned int apm[8]; /* AP ID mask */ unsigned int aqm[8]; /* AP (usage) queue mask */ unsigned int adm[8]; /* AP (control) domain mask */ - unsigned char _reserved4[16]; + unsigned char _reserved1[16]; } __aligned(8); /** @@ -172,18 +246,20 @@ struct ap_config_info { */ static inline int ap_qci(struct ap_config_info *config) { - register unsigned long reg0 asm ("0") = 4UL << 24; - register unsigned long reg1 asm ("1") = -EOPNOTSUPP; - register struct ap_config_info *reg2 asm ("2") = config; + unsigned long reg0 = 4UL << 24; /* fc 4UL is QCI */ + unsigned long reg1 = -EOPNOTSUPP; + struct ap_config_info *reg2 = config; asm volatile( - ".long 0xb2af0000\n" /* PQAP(QCI) */ - "0: la %0,0\n" + " lgr 0,%[reg0]\n" /* QCI fc into gr0 
*/ + " lgr 2,%[reg2]\n" /* ptr to config into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(QCI) */ + "0: la %[reg1],0\n" /* good case, QCI fc available */ "1:\n" EX_TABLE(0b, 1b) - : "+d" (reg1) - : "d" (reg0), "d" (reg2) - : "cc", "memory"); + : [reg1] "+&d" (reg1) + : [reg0] "d" (reg0), [reg2] "d" (reg2) + : "cc", "memory", "0", "2"); return reg1; } @@ -194,47 +270,50 @@ static inline int ap_qci(struct ap_config_info *config) * parameter to the PQAP(AQIC) instruction. For details please * see the AR documentation. */ -struct ap_qirq_ctrl { - unsigned int _res1 : 8; - unsigned int zone : 8; /* zone info */ - unsigned int ir : 1; /* ir flag: enable (1) or disable (0) irq */ - unsigned int _res2 : 4; - unsigned int gisc : 3; /* guest isc field */ - unsigned int _res3 : 6; - unsigned int gf : 2; /* gisa format */ - unsigned int _res4 : 1; - unsigned int gisa : 27; /* gisa origin */ - unsigned int _res5 : 1; - unsigned int isc : 3; /* irq sub class */ +union ap_qirq_ctrl { + unsigned long value; + struct { + unsigned int : 8; + unsigned int zone : 8; /* zone info */ + unsigned int ir : 1; /* ir flag: enable (1) or disable (0) irq */ + unsigned int : 4; + unsigned int gisc : 3; /* guest isc field */ + unsigned int : 6; + unsigned int gf : 2; /* gisa format */ + unsigned int : 1; + unsigned int gisa : 27; /* gisa origin */ + unsigned int : 1; + unsigned int isc : 3; /* irq sub class */ + }; }; /** * ap_aqic(): Control interruption for a specific AP. * @qid: The AP queue number * @qirqctrl: struct ap_qirq_ctrl (64 bit value) - * @ind: The notification indicator byte + * @pa_ind: Physical address of the notification indicator byte * * Returns AP queue status. */ static inline struct ap_queue_status ap_aqic(ap_qid_t qid, - struct ap_qirq_ctrl qirqctrl, - void *ind) + union ap_qirq_ctrl qirqctrl, + phys_addr_t pa_ind) { - register unsigned long reg0 asm ("0") = qid | (3UL << 24); - register union { - unsigned long value; - struct ap_qirq_ctrl qirqctrl; - struct ap_queue_status status; - } reg1 asm ("1"); - register void *reg2 asm ("2") = ind; + unsigned long reg0 = qid | (3UL << 24); /* fc 3UL is AQIC */ + union ap_queue_status_reg reg1; + unsigned long reg2 = pa_ind; - reg1.qirqctrl = qirqctrl; + reg1.value = qirqctrl.value; asm volatile( - ".long 0xb2af0000" /* PQAP(AQIC) */ - : "+d" (reg1) - : "d" (reg0), "d" (reg2) - : "cc"); + " lgr 0,%[reg0]\n" /* qid param into gr0 */ + " lgr 1,%[reg1]\n" /* irq ctrl into gr1 */ + " lgr 2,%[reg2]\n" /* ni addr into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(AQIC) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "+&d" (reg1.value) + : [reg0] "d" (reg0), [reg2] "d" (reg2) + : "cc", "memory", "0", "1", "2"); return reg1.status; } @@ -257,7 +336,7 @@ union ap_qact_ap_info { }; /** - * ap_qact(): Query AP combatibility type. + * ap_qact(): Query AP compatibility type. * @qid: The AP queue number * @apinfo: On input the info about the AP queue. 
On output the * alternate AP queue info provided by the qact function @@ -268,25 +347,78 @@ static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit, union ap_qact_ap_info *apinfo) { - register unsigned long reg0 asm ("0") = qid | (5UL << 24) - | ((ifbit & 0x01) << 22); - register union { - unsigned long value; - struct ap_queue_status status; - } reg1 asm ("1"); - register unsigned long reg2 asm ("2"); + unsigned long reg0 = qid | (5UL << 24) | ((ifbit & 0x01) << 22); + union ap_queue_status_reg reg1; + unsigned long reg2; reg1.value = apinfo->val; asm volatile( - ".long 0xb2af0000" /* PQAP(QACT) */ - : "+d" (reg1), "=d" (reg2) - : "d" (reg0) - : "cc"); + " lgr 0,%[reg0]\n" /* qid param into gr0 */ + " lgr 1,%[reg1]\n" /* qact in info into gr1 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(QACT) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg2],2\n" /* qact out info into reg2 */ + : [reg1] "+&d" (reg1.value), [reg2] "=&d" (reg2) + : [reg0] "d" (reg0) + : "cc", "0", "1", "2"); apinfo->val = reg2; return reg1.status; } +/* + * ap_bapq(): SE bind AP queue. + * @qid: The AP queue number + * + * Returns AP queue status structure. + * + * Invoking this function in a non-SE environment + * may cause a specification exception. + */ +static inline struct ap_queue_status ap_bapq(ap_qid_t qid) +{ + unsigned long reg0 = qid | (7UL << 24); /* fc 7 is BAPQ */ + union ap_queue_status_reg reg1; + + asm volatile( + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(BAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1.value) + : [reg0] "d" (reg0) + : "cc", "0", "1"); + + return reg1.status; +} + +/* + * ap_aapq(): SE associate AP queue. + * @qid: The AP queue number + * @sec_idx: The secret index + * + * Returns AP queue status structure. + * + * Invoking this function in a non-SE environment + * may cause a specification exception. + */ +static inline struct ap_queue_status ap_aapq(ap_qid_t qid, unsigned int sec_idx) +{ + unsigned long reg0 = qid | (8UL << 24); /* fc 8 is AAPQ */ + unsigned long reg2 = sec_idx; + union ap_queue_status_reg reg1; + + asm volatile( + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " lgr 2,%[reg2]\n" /* secret index into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(AAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1.value) + : [reg0] "d" (reg0), [reg2] "d" (reg2) + : "cc", "0", "1", "2"); + + return reg1.status; +} + /** * ap_nqap(): Send message to adjunct processor queue. * @qid: The AP queue number @@ -303,28 +435,36 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid, unsigned long long psmid, void *msg, size_t length) { - register unsigned long reg0 asm ("0") = qid | 0x40000000UL; - register struct ap_queue_status reg1 asm ("1"); - register unsigned long reg2 asm ("2") = (unsigned long) msg; - register unsigned long reg3 asm ("3") = (unsigned long) length; - register unsigned long reg4 asm ("4") = (unsigned int) (psmid >> 32); - register unsigned long reg5 asm ("5") = psmid & 0xffffffff; + unsigned long reg0 = qid | 0x40000000UL; /* 0x4... 
is last msg part */ + union register_pair nqap_r1, nqap_r2; + union ap_queue_status_reg reg1; + + nqap_r1.even = (unsigned int)(psmid >> 32); + nqap_r1.odd = psmid & 0xffffffff; + nqap_r2.even = (unsigned long)msg; + nqap_r2.odd = (unsigned long)length; asm volatile ( - "0: .long 0xb2ad0042\n" /* NQAP */ - " brc 2,0b" - : "+d" (reg0), "=d" (reg1), "+d" (reg2), "+d" (reg3) - : "d" (reg4), "d" (reg5) - : "cc", "memory"); - return reg1; + " lgr 0,%[reg0]\n" /* qid param in gr0 */ + "0: .insn rre,0xb2ad0000,%[nqap_r1],%[nqap_r2]\n" + " brc 2,0b\n" /* handle partial completion */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1.value), + [nqap_r2] "+&d" (nqap_r2.pair) + : [nqap_r1] "d" (nqap_r1.pair) + : "cc", "memory", "0", "1"); + return reg1.status; } /** * ap_dqap(): Receive message from adjunct processor queue. * @qid: The AP queue number * @psmid: Pointer to program supplied message identifier - * @msg: The message text - * @length: The message length + * @msg: Pointer to message buffer + * @msglen: Message buffer size + * @length: Pointer to length of actually written bytes + * @reslength: Residual length on return + * @resgr0: input: gr0 value (only used if != 0), output: residual gr0 content * * Returns AP queue status structure. * Condition code 1 on DQAP means the receive has taken place @@ -336,28 +476,72 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid, * Note that gpr2 is used by the DQAP instruction to keep track of * any 'residual' length, in case the instruction gets interrupted. * Hence it gets zeroed before the instruction. + * If the message does not fit into the buffer, this function will + * return with a truncated message and the reply in the firmware queue + * is not removed. This is indicated to the caller with an + * ap_queue_status response_code value of all bits on (0xFF) and (if + * the reslength ptr is given) the remaining length is stored in + * *reslength and (if the resgr0 ptr is given) the updated gr0 value + * for further processing of this msg entry is stored in *resgr0. The + * caller needs to detect this situation and should invoke ap_dqap + * with a valid resgr0 ptr and a value in there != 0 to indicate that + * *resgr0 is to be used instead of qid to further process this entry. */ static inline struct ap_queue_status ap_dqap(ap_qid_t qid, - unsigned long long *psmid, - void *msg, size_t length) + unsigned long *psmid, + void *msg, size_t msglen, + size_t *length, + size_t *reslength, + unsigned long *resgr0) { - register unsigned long reg0 asm("0") = qid | 0x80000000UL; - register struct ap_queue_status reg1 asm ("1"); - register unsigned long reg2 asm("2") = 0UL; - register unsigned long reg4 asm("4") = (unsigned long) msg; - register unsigned long reg5 asm("5") = (unsigned long) length; - register unsigned long reg6 asm("6") = 0UL; - register unsigned long reg7 asm("7") = 0UL; + unsigned long reg0 = resgr0 && *resgr0 ? 
*resgr0 : qid | 0x80000000UL; + union ap_queue_status_reg reg1; + unsigned long reg2; + union register_pair rp1, rp2; + rp1.even = 0UL; + rp1.odd = 0UL; + rp2.even = (unsigned long)msg; + rp2.odd = (unsigned long)msglen; asm volatile( - "0: .long 0xb2ae0064\n" /* DQAP */ - " brc 6,0b\n" - : "+d" (reg0), "=d" (reg1), "+d" (reg2), - "+d" (reg4), "+d" (reg5), "+d" (reg6), "+d" (reg7) - : : "cc", "memory"); - *psmid = (((unsigned long long) reg6) << 32) + reg7; - return reg1; + " lgr 0,%[reg0]\n" /* qid param into gr0 */ + " lghi 2,0\n" /* 0 into gr2 (res length) */ + "0: ltgr %N[rp2],%N[rp2]\n" /* check buf len */ + " jz 2f\n" /* go out if buf len is 0 */ + "1: .insn rre,0xb2ae0000,%[rp1],%[rp2]\n" + " brc 6,0b\n" /* handle partial complete */ + "2: lgr %[reg0],0\n" /* gr0 (qid + info) into reg0 */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg2],2\n" /* gr2 (res length) into reg2 */ + : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1.value), + [reg2] "=&d" (reg2), [rp1] "+&d" (rp1.pair), + [rp2] "+&d" (rp2.pair) + : + : "cc", "memory", "0", "1", "2"); + + if (reslength) + *reslength = reg2; + if (reg2 != 0 && rp2.odd == 0) { + /* + * Partially complete, status in gr1 is not set. + * Signal the caller that this dqap is only partially received + * with a special status response code 0xFF and *resgr0 updated + */ + reg1.status.response_code = 0xFF; + if (resgr0) + *resgr0 = reg0; + } else { + *psmid = (rp1.even << 32) + rp1.odd; + if (resgr0) + *resgr0 = 0; + } + + /* update *length with the nr of bytes stored into the msg buffer */ + if (length) + *length = msglen - rp2.odd; + + return reg1.status; } /* @@ -368,7 +552,7 @@ static inline struct ap_queue_status ap_dqap(ap_qid_t qid, #if IS_ENABLED(CONFIG_ZCRYPT) void ap_bus_cfg_chg(void); #else -static inline void ap_bus_cfg_chg(void){}; +static inline void ap_bus_cfg_chg(void){} #endif #endif /* _ASM_S390_AP_H_ */ diff --git a/arch/s390/include/asm/appldata.h b/arch/s390/include/asm/appldata.h index c5bd9f4437e5..f2240392c708 100644 --- a/arch/s390/include/asm/appldata.h +++ b/arch/s390/include/asm/appldata.h @@ -8,8 +8,8 @@ #ifndef _ASM_S390_APPLDATA_H #define _ASM_S390_APPLDATA_H +#include <linux/io.h> #include <asm/diag.h> -#include <asm/io.h> #define APPLDATA_START_INTERVAL_REC 0x80 #define APPLDATA_STOP_REC 0x81 diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h index c67b82dfa558..1594049893e0 100644 --- a/arch/s390/include/asm/archrandom.h +++ b/arch/s390/include/asm/archrandom.h @@ -2,7 +2,7 @@ /* * Kernel interface for the s390 arch_random_* functions * - * Copyright IBM Corp. 2017 + * Copyright IBM Corp. 
2017, 2022 * * Author: Harald Freudenberger <freude@de.ibm.com> * @@ -11,53 +11,28 @@ #ifndef _ASM_S390_ARCHRANDOM_H #define _ASM_S390_ARCHRANDOM_H -#ifdef CONFIG_ARCH_RANDOM - #include <linux/static_key.h> +#include <linux/preempt.h> #include <linux/atomic.h> +#include <asm/cpacf.h> DECLARE_STATIC_KEY_FALSE(s390_arch_random_available); extern atomic64_t s390_arch_random_counter; -bool s390_arch_random_generate(u8 *buf, unsigned int nbytes); - -static inline bool arch_has_random(void) -{ - return false; -} - -static inline bool arch_has_random_seed(void) -{ - if (static_branch_likely(&s390_arch_random_available)) - return true; - return false; -} - -static inline bool arch_get_random_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { - return false; -} - -static inline bool arch_get_random_int(unsigned int *v) -{ - return false; -} - -static inline bool arch_get_random_seed_long(unsigned long *v) -{ - if (static_branch_likely(&s390_arch_random_available)) { - return s390_arch_random_generate((u8 *)v, sizeof(*v)); - } - return false; + return 0; } -static inline bool arch_get_random_seed_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { - if (static_branch_likely(&s390_arch_random_available)) { - return s390_arch_random_generate((u8 *)v, sizeof(*v)); + if (static_branch_likely(&s390_arch_random_available) && + in_task()) { + cpacf_trng(NULL, 0, (u8 *)v, max_longs * sizeof(*v)); + atomic64_add(max_longs * sizeof(*v), &s390_arch_random_counter); + return max_longs; } - return false; + return 0; } -#endif /* CONFIG_ARCH_RANDOM */ #endif /* _ASM_S390_ARCHRANDOM_H */ diff --git a/arch/s390/include/asm/asm-const.h b/arch/s390/include/asm/asm-const.h new file mode 100644 index 000000000000..11f615eb0066 --- /dev/null +++ b/arch/s390/include/asm/asm-const.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ASM_CONST_H +#define _ASM_S390_ASM_CONST_H + +#ifdef __ASSEMBLY__ +# define stringify_in_c(...) __VA_ARGS__ +#else +/* This version of stringify will deal with commas... */ +# define __stringify_in_c(...) #__VA_ARGS__ +# define stringify_in_c(...) 
__stringify_in_c(__VA_ARGS__) " " +#endif +#endif /* _ASM_S390_ASM_CONST_H */ diff --git a/arch/s390/include/asm/asm-extable.h b/arch/s390/include/asm/asm-extable.h new file mode 100644 index 000000000000..4a6b0a8b6412 --- /dev/null +++ b/arch/s390/include/asm/asm-extable.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_EXTABLE_H +#define __ASM_EXTABLE_H + +#include <linux/stringify.h> +#include <linux/bits.h> +#include <asm/asm-const.h> + +#define EX_TYPE_NONE 0 +#define EX_TYPE_FIXUP 1 +#define EX_TYPE_BPF 2 +#define EX_TYPE_UA_STORE 3 +#define EX_TYPE_UA_LOAD_MEM 4 +#define EX_TYPE_UA_LOAD_REG 5 +#define EX_TYPE_UA_LOAD_REGPAIR 6 +#define EX_TYPE_ZEROPAD 7 + +#define EX_DATA_REG_ERR_SHIFT 0 +#define EX_DATA_REG_ERR GENMASK(3, 0) + +#define EX_DATA_REG_ADDR_SHIFT 4 +#define EX_DATA_REG_ADDR GENMASK(7, 4) + +#define EX_DATA_LEN_SHIFT 8 +#define EX_DATA_LEN GENMASK(11, 8) + +#define __EX_TABLE(_section, _fault, _target, _type, _regerr, _regaddr, _len) \ + stringify_in_c(.section _section,"a";) \ + stringify_in_c(.balign 4;) \ + stringify_in_c(.long (_fault) - .;) \ + stringify_in_c(.long (_target) - .;) \ + stringify_in_c(.short (_type);) \ + stringify_in_c(.macro extable_reg regerr, regaddr;) \ + stringify_in_c(.set .Lfound, 0;) \ + stringify_in_c(.set .Lcurr, 0;) \ + stringify_in_c(.irp rs,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15;) \ + stringify_in_c( .ifc "\regerr", "%%r\rs";) \ + stringify_in_c( .set .Lfound, 1;) \ + stringify_in_c( .set .Lregerr, .Lcurr;) \ + stringify_in_c( .endif;) \ + stringify_in_c( .set .Lcurr, .Lcurr+1;) \ + stringify_in_c(.endr;) \ + stringify_in_c(.ifne (.Lfound != 1);) \ + stringify_in_c( .error "extable_reg: bad register argument1";) \ + stringify_in_c(.endif;) \ + stringify_in_c(.set .Lfound, 0;) \ + stringify_in_c(.set .Lcurr, 0;) \ + stringify_in_c(.irp rs,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15;) \ + stringify_in_c( .ifc "\regaddr", "%%r\rs";) \ + stringify_in_c( .set .Lfound, 1;) \ + stringify_in_c( .set .Lregaddr, .Lcurr;) \ + stringify_in_c( .endif;) \ + stringify_in_c( .set .Lcurr, .Lcurr+1;) \ + stringify_in_c(.endr;) \ + stringify_in_c(.ifne (.Lfound != 1);) \ + stringify_in_c( .error "extable_reg: bad register argument2";) \ + stringify_in_c(.endif;) \ + stringify_in_c(.short .Lregerr << EX_DATA_REG_ERR_SHIFT | \ + .Lregaddr << EX_DATA_REG_ADDR_SHIFT | \ + _len << EX_DATA_LEN_SHIFT;) \ + stringify_in_c(.endm;) \ + stringify_in_c(extable_reg _regerr,_regaddr;) \ + stringify_in_c(.purgem extable_reg;) \ + stringify_in_c(.previous) + +#define EX_TABLE(_fault, _target) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_FIXUP, __stringify(%%r0), __stringify(%%r0), 0) + +#define EX_TABLE_AMODE31(_fault, _target) \ + __EX_TABLE(.amode31.ex_table, _fault, _target, EX_TYPE_FIXUP, __stringify(%%r0), __stringify(%%r0), 0) + +#define EX_TABLE_UA_STORE(_fault, _target, _regerr) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_STORE, _regerr, _regerr, 0) + +#define EX_TABLE_UA_LOAD_MEM(_fault, _target, _regerr, _regmem, _len) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_MEM, _regerr, _regmem, _len) + +#define EX_TABLE_UA_LOAD_REG(_fault, _target, _regerr, _regzero) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REG, _regerr, _regzero, 0) + +#define EX_TABLE_UA_LOAD_REGPAIR(_fault, _target, _regerr, _regzero) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REGPAIR, _regerr, _regzero, 0) + +#define EX_TABLE_ZEROPAD(_fault, _target, _regdata, _regaddr) \ + __EX_TABLE(__ex_table, _fault, _target, 
EX_TYPE_ZEROPAD, _regdata, _regaddr, 0) + +#endif /* __ASM_EXTABLE_H */ diff --git a/arch/s390/include/asm/asm-prototypes.h b/arch/s390/include/asm/asm-prototypes.h index c37eb921bfbf..a873e873e1ee 100644 --- a/arch/s390/include/asm/asm-prototypes.h +++ b/arch/s390/include/asm/asm-prototypes.h @@ -6,4 +6,8 @@ #include <asm/fpu/api.h> #include <asm-generic/asm-prototypes.h> +__int128_t __ashlti3(__int128_t a, int b); +__int128_t __ashrti3(__int128_t a, int b); +__int128_t __lshrti3(__int128_t a, int b); + #endif /* _ASM_S390_PROTOTYPES_H */ diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index 491ad53a0d4e..7138d189cc42 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -15,56 +15,46 @@ #include <asm/barrier.h> #include <asm/cmpxchg.h> -#define ATOMIC_INIT(i) { (i) } - -static inline int atomic_read(const atomic_t *v) +static inline int arch_atomic_read(const atomic_t *v) { - int c; - - asm volatile( - " l %0,%1\n" - : "=d" (c) : "Q" (v->counter)); - return c; + return __atomic_read(v); } +#define arch_atomic_read arch_atomic_read -static inline void atomic_set(atomic_t *v, int i) +static inline void arch_atomic_set(atomic_t *v, int i) { - asm volatile( - " st %1,%0\n" - : "=Q" (v->counter) : "d" (i)); + __atomic_set(v, i); } +#define arch_atomic_set arch_atomic_set -static inline int atomic_add_return(int i, atomic_t *v) +static inline int arch_atomic_add_return(int i, atomic_t *v) { return __atomic_add_barrier(i, &v->counter) + i; } +#define arch_atomic_add_return arch_atomic_add_return -static inline int atomic_fetch_add(int i, atomic_t *v) +static inline int arch_atomic_fetch_add(int i, atomic_t *v) { return __atomic_add_barrier(i, &v->counter); } +#define arch_atomic_fetch_add arch_atomic_fetch_add -static inline void atomic_add(int i, atomic_t *v) +static inline void arch_atomic_add(int i, atomic_t *v) { -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - if (__builtin_constant_p(i) && (i > -129) && (i < 128)) { - __atomic_add_const(i, &v->counter); - return; - } -#endif __atomic_add(i, &v->counter); } +#define arch_atomic_add arch_atomic_add -#define atomic_sub(_i, _v) atomic_add(-(int)(_i), _v) -#define atomic_sub_return(_i, _v) atomic_add_return(-(int)(_i), _v) -#define atomic_fetch_sub(_i, _v) atomic_fetch_add(-(int)(_i), _v) +#define arch_atomic_sub(_i, _v) arch_atomic_add(-(int)(_i), _v) +#define arch_atomic_sub_return(_i, _v) arch_atomic_add_return(-(int)(_i), _v) +#define arch_atomic_fetch_sub(_i, _v) arch_atomic_fetch_add(-(int)(_i), _v) #define ATOMIC_OPS(op) \ -static inline void atomic_##op(int i, atomic_t *v) \ +static inline void arch_atomic_##op(int i, atomic_t *v) \ { \ __atomic_##op(i, &v->counter); \ } \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ +static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \ { \ return __atomic_##op##_barrier(i, &v->counter); \ } @@ -75,66 +65,67 @@ ATOMIC_OPS(xor) #undef ATOMIC_OPS -#define atomic_xchg(v, new) (xchg(&((v)->counter), new)) +#define arch_atomic_and arch_atomic_and +#define arch_atomic_or arch_atomic_or +#define arch_atomic_xor arch_atomic_xor +#define arch_atomic_fetch_and arch_atomic_fetch_and +#define arch_atomic_fetch_or arch_atomic_fetch_or +#define arch_atomic_fetch_xor arch_atomic_fetch_xor + +#define arch_atomic_xchg(v, new) (arch_xchg(&((v)->counter), new)) -static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +static inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { return __atomic_cmpxchg(&v->counter, old, new); } 
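The arch_atomic_cmpxchg() conversion above compiles down to a single CS (compare and swap) on the counter word; the retry loops live in generic code layered on top of these arch_ primitives. A minimal sketch of that usual pattern, using the generic atomic_read()/atomic_cmpxchg() wrappers from <linux/atomic.h> — the helper name atomic_set_max is illustrative, not part of this patch:

/* assumes <linux/atomic.h>; sketch only, not in-tree code */
static inline void atomic_set_max(atomic_t *max, int new)
{
	int old = atomic_read(max);

	while (old < new) {
		int prev = atomic_cmpxchg(max, old, new);

		if (prev == old)
			break;		/* our CS stored the new maximum */
		old = prev;		/* raced with another CPU, retry */
	}
}

Each iteration costs one cs; the loop ends as soon as the CS succeeds or the observed value is already large enough.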
+#define arch_atomic_cmpxchg arch_atomic_cmpxchg #define ATOMIC64_INIT(i) { (i) } -static inline s64 atomic64_read(const atomic64_t *v) +static inline s64 arch_atomic64_read(const atomic64_t *v) { - s64 c; - - asm volatile( - " lg %0,%1\n" - : "=d" (c) : "Q" (v->counter)); - return c; + return __atomic64_read(v); } +#define arch_atomic64_read arch_atomic64_read -static inline void atomic64_set(atomic64_t *v, s64 i) +static inline void arch_atomic64_set(atomic64_t *v, s64 i) { - asm volatile( - " stg %1,%0\n" - : "=Q" (v->counter) : "d" (i)); + __atomic64_set(v, i); } +#define arch_atomic64_set arch_atomic64_set -static inline s64 atomic64_add_return(s64 i, atomic64_t *v) +static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { return __atomic64_add_barrier(i, (long *)&v->counter) + i; } +#define arch_atomic64_add_return arch_atomic64_add_return -static inline s64 atomic64_fetch_add(s64 i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { return __atomic64_add_barrier(i, (long *)&v->counter); } +#define arch_atomic64_fetch_add arch_atomic64_fetch_add -static inline void atomic64_add(s64 i, atomic64_t *v) +static inline void arch_atomic64_add(s64 i, atomic64_t *v) { -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - if (__builtin_constant_p(i) && (i > -129) && (i < 128)) { - __atomic64_add_const(i, (long *)&v->counter); - return; - } -#endif __atomic64_add(i, (long *)&v->counter); } +#define arch_atomic64_add arch_atomic64_add -#define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) +#define arch_atomic64_xchg(v, new) (arch_xchg(&((v)->counter), new)) -static inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) +static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { return __atomic64_cmpxchg((long *)&v->counter, old, new); } +#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg #define ATOMIC64_OPS(op) \ -static inline void atomic64_##op(s64 i, atomic64_t *v) \ +static inline void arch_atomic64_##op(s64 i, atomic64_t *v) \ { \ __atomic64_##op(i, (long *)&v->counter); \ } \ -static inline long atomic64_fetch_##op(s64 i, atomic64_t *v) \ +static inline long arch_atomic64_fetch_##op(s64 i, atomic64_t *v) \ { \ return __atomic64_##op##_barrier(i, (long *)&v->counter); \ } @@ -145,8 +136,15 @@ ATOMIC64_OPS(xor) #undef ATOMIC64_OPS -#define atomic64_sub_return(_i, _v) atomic64_add_return(-(s64)(_i), _v) -#define atomic64_fetch_sub(_i, _v) atomic64_fetch_add(-(s64)(_i), _v) -#define atomic64_sub(_i, _v) atomic64_add(-(s64)(_i), _v) +#define arch_atomic64_and arch_atomic64_and +#define arch_atomic64_or arch_atomic64_or +#define arch_atomic64_xor arch_atomic64_xor +#define arch_atomic64_fetch_and arch_atomic64_fetch_and +#define arch_atomic64_fetch_or arch_atomic64_fetch_or +#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor + +#define arch_atomic64_sub_return(_i, _v) arch_atomic64_add_return(-(s64)(_i), _v) +#define arch_atomic64_fetch_sub(_i, _v) arch_atomic64_fetch_add(-(s64)(_i), _v) +#define arch_atomic64_sub(_i, _v) arch_atomic64_add(-(s64)(_i), _v) #endif /* __ARCH_S390_ATOMIC__ */ diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h index 61467b9eecc7..50510e08b893 100644 --- a/arch/s390/include/asm/atomic_ops.h +++ b/arch/s390/include/asm/atomic_ops.h @@ -8,6 +8,40 @@ #ifndef __ARCH_S390_ATOMIC_OPS__ #define __ARCH_S390_ATOMIC_OPS__ +static inline int __atomic_read(const atomic_t *v) +{ + int c; + + asm volatile( + " l %0,%1\n" + : "=d" (c) : "R" (v->counter)); + return c; +} + +static inline 
void __atomic_set(atomic_t *v, int i) +{ + asm volatile( + " st %1,%0\n" + : "=R" (v->counter) : "d" (i)); +} + +static inline s64 __atomic64_read(const atomic64_t *v) +{ + s64 c; + + asm volatile( + " lg %0,%1\n" + : "=d" (c) : "RT" (v->counter)); + return c; +} + +static inline void __atomic64_set(atomic64_t *v, s64 i) +{ + asm volatile( + " stg %1,%0\n" + : "=RT" (v->counter) : "d" (i)); +} + #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES #define __ATOMIC_OP(op_name, op_type, op_string, op_barrier) \ @@ -18,7 +52,7 @@ static inline op_type op_name(op_type val, op_type *ptr) \ asm volatile( \ op_string " %[old],%[val],%[ptr]\n" \ op_barrier \ - : [old] "=d" (old), [ptr] "+Q" (*ptr) \ + : [old] "=d" (old), [ptr] "+QS" (*ptr) \ : [val] "d" (val) : "cc", "memory"); \ return old; \ } \ @@ -46,7 +80,7 @@ static __always_inline void op_name(op_type val, op_type *ptr) \ asm volatile( \ op_string " %[ptr],%[val]\n" \ op_barrier \ - : [ptr] "+Q" (*ptr) : [val] "i" (val) : "cc", "memory");\ + : [ptr] "+QS" (*ptr) : [val] "i" (val) : "cc", "memory");\ } #define __ATOMIC_CONST_OPS(op_name, op_type, op_string) \ @@ -97,7 +131,7 @@ static inline long op_name(long val, long *ptr) \ op_string " %[new],%[val]\n" \ " csg %[old],%[new],%[ptr]\n" \ " jl 0b" \ - : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\ + : [old] "=d" (old), [new] "=&d" (new), [ptr] "+QS" (*ptr)\ : [val] "d" (val), "0" (*ptr) : "cc", "memory"); \ return old; \ } @@ -122,22 +156,46 @@ __ATOMIC64_OPS(__atomic64_xor, "xgr") static inline int __atomic_cmpxchg(int *ptr, int old, int new) { - return __sync_val_compare_and_swap(ptr, old, new); + asm volatile( + " cs %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+Q" (*ptr) + : [new] "d" (new) + : "cc", "memory"); + return old; } -static inline int __atomic_cmpxchg_bool(int *ptr, int old, int new) +static inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new) { - return __sync_bool_compare_and_swap(ptr, old, new); + int old_expected = old; + + asm volatile( + " cs %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+Q" (*ptr) + : [new] "d" (new) + : "cc", "memory"); + return old == old_expected; } static inline long __atomic64_cmpxchg(long *ptr, long old, long new) { - return __sync_val_compare_and_swap(ptr, old, new); + asm volatile( + " csg %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+QS" (*ptr) + : [new] "d" (new) + : "cc", "memory"); + return old; } -static inline long __atomic64_cmpxchg_bool(long *ptr, long old, long new) +static inline bool __atomic64_cmpxchg_bool(long *ptr, long old, long new) { - return __sync_bool_compare_and_swap(ptr, old, new); + long old_expected = old; + + asm volatile( + " csg %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+QS" (*ptr) + : [new] "d" (new) + : "cc", "memory"); + return old == old_expected; } #endif /* __ARCH_S390_ATOMIC_OPS__ */ diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h index f9eddbca79d2..82de2a7c4160 100644 --- a/arch/s390/include/asm/barrier.h +++ b/arch/s390/include/asm/barrier.h @@ -16,20 +16,24 @@ #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES /* Fast-BCR without checkpoint synchronization */ -#define __ASM_BARRIER "bcr 14,0\n" +#define __ASM_BCR_SERIALIZE "bcr 14,0\n" #else -#define __ASM_BARRIER "bcr 15,0\n" +#define __ASM_BCR_SERIALIZE "bcr 15,0\n" #endif -#define mb() do { asm volatile(__ASM_BARRIER : : : "memory"); } while (0) +static __always_inline void bcr_serialize(void) +{ + asm volatile(__ASM_BCR_SERIALIZE : : : "memory"); +} -#define rmb() barrier() -#define wmb() barrier() 
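The barrier.h rework in progress here keeps smp_rmb()/smp_wmb() as pure compiler barriers — s390 is strongly ordered — while mb() and the dma_* flavours still emit the serializing BCR. A short sketch of the message-passing pairing these hooks serve, assuming the usual WRITE_ONCE()/READ_ONCE()/cpu_relax() kernel helpers; producer/consumer are illustrative names:

/* sketch only; assumes <linux/compiler.h> and <asm/barrier.h> */
static int payload;
static int ready;

static void producer(void)
{
	payload = 42;
	smp_wmb();		/* compiler barrier only on s390 */
	WRITE_ONCE(ready, 1);
}

static int consumer(void)
{
	while (!READ_ONCE(ready))
		cpu_relax();
	smp_rmb();		/* pairs with the smp_wmb() above */
	return payload;		/* guaranteed to observe 42 */
}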
-#define dma_rmb() mb() -#define dma_wmb() mb() -#define __smp_mb() mb() -#define __smp_rmb() rmb() -#define __smp_wmb() wmb() +#define __mb() bcr_serialize() +#define __rmb() barrier() +#define __wmb() barrier() +#define __dma_rmb() __mb() +#define __dma_wmb() __mb() +#define __smp_mb() __mb() +#define __smp_rmb() __rmb() +#define __smp_wmb() __wmb() #define __smp_store_release(p, v) \ do { \ diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 431e208a5ea4..c467dffa8c12 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -42,7 +42,7 @@ #define __BITOPS_WORDS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG) static inline unsigned long * -__bitops_word(unsigned long nr, volatile unsigned long *ptr) +__bitops_word(unsigned long nr, const volatile unsigned long *ptr) { unsigned long addr; @@ -50,73 +50,33 @@ __bitops_word(unsigned long nr, volatile unsigned long *ptr) return (unsigned long *)addr; } -static inline unsigned char * -__bitops_byte(unsigned long nr, volatile unsigned long *ptr) +static inline unsigned long __bitops_mask(unsigned long nr) { - return ((unsigned char *)ptr) + ((nr ^ (BITS_PER_LONG - 8)) >> 3); + return 1UL << (nr & (BITS_PER_LONG - 1)); } static __always_inline void arch_set_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask; + unsigned long mask = __bitops_mask(nr); -#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES - if (__builtin_constant_p(nr)) { - unsigned char *caddr = __bitops_byte(nr, ptr); - - asm volatile( - "oi %0,%b1\n" - : "+Q" (*caddr) - : "i" (1 << (nr & 7)) - : "cc", "memory"); - return; - } -#endif - mask = 1UL << (nr & (BITS_PER_LONG - 1)); __atomic64_or(mask, (long *)addr); } static __always_inline void arch_clear_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask; + unsigned long mask = __bitops_mask(nr); -#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES - if (__builtin_constant_p(nr)) { - unsigned char *caddr = __bitops_byte(nr, ptr); - - asm volatile( - "ni %0,%b1\n" - : "+Q" (*caddr) - : "i" (~(1 << (nr & 7))) - : "cc", "memory"); - return; - } -#endif - mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - __atomic64_and(mask, (long *)addr); + __atomic64_and(~mask, (long *)addr); } static __always_inline void arch_change_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask; - -#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES - if (__builtin_constant_p(nr)) { - unsigned char *caddr = __bitops_byte(nr, ptr); + unsigned long mask = __bitops_mask(nr); - asm volatile( - "xi %0,%b1\n" - : "+Q" (*caddr) - : "i" (1 << (nr & 7)) - : "cc", "memory"); - return; - } -#endif - mask = 1UL << (nr & (BITS_PER_LONG - 1)); __atomic64_xor(mask, (long *)addr); } @@ -124,106 +84,106 @@ static inline bool arch_test_and_set_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); - unsigned long old, mask; + unsigned long mask = __bitops_mask(nr); + unsigned long old; - mask = 1UL << (nr & (BITS_PER_LONG - 1)); old = __atomic64_or_barrier(mask, (long *)addr); - return (old & mask) != 0; + return old & mask; } static inline bool arch_test_and_clear_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); - unsigned long old, mask; + unsigned long mask = __bitops_mask(nr); + unsigned long old; - mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - old = 
__atomic64_and_barrier(mask, (long *)addr); - return (old & ~mask) != 0; + old = __atomic64_and_barrier(~mask, (long *)addr); + return old & mask; } static inline bool arch_test_and_change_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); - unsigned long old, mask; + unsigned long mask = __bitops_mask(nr); + unsigned long old; - mask = 1UL << (nr & (BITS_PER_LONG - 1)); old = __atomic64_xor_barrier(mask, (long *)addr); - return (old & mask) != 0; + return old & mask; } -static inline void arch___set_bit(unsigned long nr, volatile unsigned long *ptr) +static __always_inline void +arch___set_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned char *addr = __bitops_byte(nr, ptr); + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); - *addr |= 1 << (nr & 7); + *p |= mask; } -static inline void arch___clear_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline void +arch___clear_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned char *addr = __bitops_byte(nr, ptr); + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); - *addr &= ~(1 << (nr & 7)); + *p &= ~mask; } -static inline void arch___change_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline void +arch___change_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned char *addr = __bitops_byte(nr, ptr); + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); - *addr ^= 1 << (nr & 7); + *p ^= mask; } -static inline bool arch___test_and_set_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline bool +arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned char *addr = __bitops_byte(nr, ptr); - unsigned char ch; + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); + unsigned long old; - ch = *addr; - *addr |= 1 << (nr & 7); - return (ch >> (nr & 7)) & 1; + old = *p; + *p |= mask; + return old & mask; } -static inline bool arch___test_and_clear_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline bool +arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned char *addr = __bitops_byte(nr, ptr); - unsigned char ch; + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); + unsigned long old; - ch = *addr; - *addr &= ~(1 << (nr & 7)); - return (ch >> (nr & 7)) & 1; + old = *p; + *p &= ~mask; + return old & mask; } -static inline bool arch___test_and_change_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline bool +arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned char *addr = __bitops_byte(nr, ptr); - unsigned char ch; + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); + unsigned long old; - ch = *addr; - *addr ^= 1 << (nr & 7); - return (ch >> (nr & 7)) & 1; + old = *p; + *p ^= mask; + return old & mask; } -static inline bool arch_test_bit(unsigned long nr, - const volatile unsigned long *ptr) -{ - const volatile unsigned char *addr; - - addr = ((const volatile unsigned char *)ptr); - addr += (nr ^ (BITS_PER_LONG - 8)) >> 3; - return (*addr >> (nr & 7)) & 1; -} +#define arch_test_bit generic_test_bit +#define arch_test_bit_acquire generic_test_bit_acquire static inline bool arch_test_and_set_bit_lock(unsigned long nr, volatile unsigned long *ptr) { 
if (arch_test_bit(nr, ptr)) - return 1; + return true; return arch_test_and_set_bit(nr, ptr); } @@ -241,6 +201,16 @@ static inline void arch___clear_bit_unlock(unsigned long nr, arch___clear_bit(nr, ptr); } +static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *ptr) +{ + unsigned long old; + + old = __atomic64_xor_barrier(mask, (long *)ptr); + return old & BIT(7); +} +#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte + #include <asm-generic/bitops/instrumented-atomic.h> #include <asm-generic/bitops/instrumented-non-atomic.h> #include <asm-generic/bitops/instrumented-lock.h> @@ -291,8 +261,6 @@ static inline bool test_bit_inv(unsigned long nr, return test_bit(nr ^ (BITS_PER_LONG - 1), ptr); } -#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES - /** * __flogr - find leftmost one * @word - The word to search @@ -334,13 +302,13 @@ static inline unsigned char __flogr(unsigned long word) } return bit; } else { - register unsigned long bit asm("4") = word; - register unsigned long out asm("5"); + union register_pair rp; + rp.even = word; asm volatile( - " flogr %[bit],%[bit]\n" - : [bit] "+d" (bit), [out] "=d" (out) : : "cc"); - return bit; + " flogr %[rp],%[rp]\n" + : [rp] "+d" (rp.pair) : : "cc"); + return rp.even; } } @@ -411,18 +379,7 @@ static inline int fls(unsigned int word) return fls64(word); } -#else /* CONFIG_HAVE_MARCH_Z9_109_FEATURES */ - -#include <asm-generic/bitops/__ffs.h> -#include <asm-generic/bitops/ffs.h> -#include <asm-generic/bitops/__fls.h> -#include <asm-generic/bitops/fls.h> -#include <asm-generic/bitops/fls64.h> - -#endif /* CONFIG_HAVE_MARCH_Z9_109_FEATURES */ - #include <asm-generic/bitops/ffz.h> -#include <asm-generic/bitops/find.h> #include <asm-generic/bitops/hweight.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/le.h> diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index a2b11ac00f60..aebe1e22c7be 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -2,7 +2,7 @@ #ifndef _ASM_S390_BUG_H #define _ASM_S390_BUG_H -#include <linux/kernel.h> +#include <linux/compiler.h> #ifdef CONFIG_BUG @@ -10,15 +10,15 @@ #define __EMIT_BUG(x) do { \ asm_inline volatile( \ - "0: j 0b+2\n" \ - "1:\n" \ + "0: mc 0,0\n" \ ".section .rodata.str,\"aMS\",@progbits,1\n" \ - "2: .asciz \""__FILE__"\"\n" \ + "1: .asciz \""__FILE__"\"\n" \ ".previous\n" \ ".section __bug_table,\"awM\",@progbits,%2\n" \ - "3: .long 1b-3b,2b-3b\n" \ + "2: .long 0b-.\n" \ + " .long 1b-.\n" \ " .short %0,%1\n" \ - " .org 3b+%2\n" \ + " .org 2b+%2\n" \ ".previous\n" \ : : "i" (__LINE__), \ "i" (x), \ @@ -29,12 +29,11 @@ #define __EMIT_BUG(x) do { \ asm_inline volatile( \ - "0: j 0b+2\n" \ - "1:\n" \ + "0: mc 0,0\n" \ ".section __bug_table,\"awM\",@progbits,%1\n" \ - "2: .long 1b-2b\n" \ + "1: .long 0b-.\n" \ " .short %0\n" \ - " .org 2b+%1\n" \ + " .org 1b+%1\n" \ ".previous\n" \ : : "i" (x), \ "i" (sizeof(struct bug_entry))); \ diff --git a/arch/s390/include/asm/bugs.h b/arch/s390/include/asm/bugs.h deleted file mode 100644 index aa42a179be33..000000000000 --- a/arch/s390/include/asm/bugs.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * S390 version - * Copyright IBM Corp. 1999 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - * - * Derived from "include/asm-i386/bugs.h" - * Copyright (C) 1994 Linus Torvalds - */ - -/* - * This is included by init/main.c to check for architecture-dependent bugs. 
- * - * Needs: - * void check_bugs(void); - */ - -static inline void check_bugs(void) -{ - /* s390 has no bugs ... */ -} diff --git a/arch/s390/include/asm/cache.h b/arch/s390/include/asm/cache.h index d5e22e837416..00128174c025 100644 --- a/arch/s390/include/asm/cache.h +++ b/arch/s390/include/asm/cache.h @@ -14,6 +14,6 @@ #define L1_CACHE_SHIFT 8 #define NET_SKB_PAD 32 -#define __read_mostly __section(.data..read_mostly) +#define __read_mostly __section(".data..read_mostly") #endif diff --git a/arch/s390/include/asm/ccwdev.h b/arch/s390/include/asm/ccwdev.h index 865ce1cb86d5..91d261751d25 100644 --- a/arch/s390/include/asm/ccwdev.h +++ b/arch/s390/include/asm/ccwdev.h @@ -11,9 +11,11 @@ #include <linux/device.h> #include <linux/mod_devicetable.h> +#include <asm/chsc.h> #include <asm/fcx.h> #include <asm/irq.h> #include <asm/schid.h> +#include <linux/mutex.h> /* structs from asm/cio.h */ struct irb; @@ -86,6 +88,7 @@ struct ccw_device { spinlock_t *ccwlock; /* private: */ struct ccw_device_private *private; /* cio private information */ + struct mutex reg_mutex; /* public: */ struct ccw_device_id id; struct ccw_driver *drv; @@ -103,6 +106,8 @@ struct ccw_device { was successfully verified. */ #define PE_PATHGROUP_ESTABLISHED 0x4 /* A pathgroup was reset and had to be established again. */ +#define PE_PATH_FCES_EVENT 0x8 /* The FCES Status of a path has + * changed. */ /* * Possible CIO actions triggered by the unit check handler. @@ -114,7 +119,7 @@ enum uc_todo { }; /** - * struct ccw driver - device driver for channel attached devices + * struct ccw_driver - device driver for channel attached devices * @ids: ids supported by this driver * @probe: function called on probe * @remove: function called on remove @@ -123,11 +128,6 @@ enum uc_todo { * @notify: notify driver of device state changes * @path_event: notify driver of channel path events * @shutdown: called at device shutdown - * @prepare: prepare for pm state transition - * @complete: undo work done in @prepare - * @freeze: callback for freezing during hibernation snapshotting - * @thaw: undo work done in @freeze - * @restore: callback for restoring after hibernation * @uc_handler: callback for unit check handler * @driver: embedded device driver structure * @int_class: interruption class to use for accounting interrupts @@ -141,11 +141,6 @@ struct ccw_driver { int (*notify) (struct ccw_device *, int); void (*path_event) (struct ccw_device *, int *); void (*shutdown) (struct ccw_device *); - int (*prepare) (struct ccw_device *); - void (*complete) (struct ccw_device *); - int (*freeze)(struct ccw_device *); - int (*thaw) (struct ccw_device *); - int (*restore)(struct ccw_device *); enum uc_todo (*uc_handler) (struct ccw_device *, struct irb *); struct device_driver driver; enum interruption_class int_class; @@ -159,9 +154,6 @@ extern struct ccw_device *get_ccwdev_by_busid(struct ccw_driver *cdrv, * when new devices for its type pop up */ extern int ccw_driver_register (struct ccw_driver *driver); extern void ccw_driver_unregister (struct ccw_driver *driver); - -struct ccw1; - extern int ccw_device_set_options_mask(struct ccw_device *, unsigned long); extern int ccw_device_set_options(struct ccw_device *, unsigned long); extern void ccw_device_clear_options(struct ccw_device *, unsigned long); @@ -224,7 +216,6 @@ extern struct ccw_device *ccw_device_create_console(struct ccw_driver *); extern void ccw_device_destroy_console(struct ccw_device *); extern int ccw_device_enable_console(struct ccw_device *); extern void 
ccw_device_wait_idle(struct ccw_device *); -extern int ccw_device_force_console(struct ccw_device *); extern void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size); extern void ccw_device_dma_free(struct ccw_device *cdev, @@ -236,4 +227,11 @@ extern void ccw_device_get_schid(struct ccw_device *, struct subchannel_id *); struct channel_path_desc_fmt0 *ccw_device_get_chp_desc(struct ccw_device *, int); u8 *ccw_device_get_util_str(struct ccw_device *cdev, int chp_idx); +int ccw_device_pnso(struct ccw_device *cdev, + struct chsc_pnso_area *pnso_area, u8 oc, + struct chsc_pnso_resume_token resume_token, int cnc); +int ccw_device_get_cssid(struct ccw_device *cdev, u8 *cssid); +int ccw_device_get_iid(struct ccw_device *cdev, u8 *iid); +int ccw_device_get_chpid(struct ccw_device *cdev, int chp_idx, u8 *chpid); +int ccw_device_get_chid(struct ccw_device *cdev, int chp_idx, u16 *chid); #endif /* _S390_CCWDEV_H_ */ diff --git a/arch/s390/include/asm/ccwgroup.h b/arch/s390/include/asm/ccwgroup.h index 7293c139dd79..11d2fb3de4f5 100644 --- a/arch/s390/include/asm/ccwgroup.h +++ b/arch/s390/include/asm/ccwgroup.h @@ -11,8 +11,7 @@ struct ccw_driver; * @count: number of attached slave devices * @dev: embedded device structure * @cdev: variable number of slave devices, allocated as needed - * @ungroup_work: work to be done when a ccwgroup notifier has action - * type %BUS_NOTIFY_UNBIND_DRIVER + * @ungroup_work: used to ungroup the ccwgroup device */ struct ccwgroup_device { enum { @@ -26,7 +25,7 @@ struct ccwgroup_device { unsigned int count; struct device dev; struct work_struct ungroup_work; - struct ccw_device *cdev[0]; + struct ccw_device *cdev[]; }; /** @@ -36,11 +35,6 @@ struct ccwgroup_device { * @set_online: function called when device is set online * @set_offline: function called when device is set offline * @shutdown: function called when device is shut down - * @prepare: prepare for pm state transition - * @complete: undo work done in @prepare - * @freeze: callback for freezing during hibernation snapshotting - * @thaw: undo work done in @freeze - * @restore: callback for restoring after hibernation * @driver: embedded driver structure * @ccw_driver: supported ccw_driver (optional) */ @@ -50,11 +44,6 @@ struct ccwgroup_driver { int (*set_online) (struct ccwgroup_device *); int (*set_offline) (struct ccwgroup_device *); void (*shutdown)(struct ccwgroup_device *); - int (*prepare) (struct ccwgroup_device *); - void (*complete) (struct ccwgroup_device *); - int (*freeze)(struct ccwgroup_device *); - int (*thaw) (struct ccwgroup_device *); - int (*restore)(struct ccwgroup_device *); struct device_driver driver; struct ccw_driver *ccw_driver; @@ -64,11 +53,9 @@ extern int ccwgroup_driver_register (struct ccwgroup_driver *cdriver); extern void ccwgroup_driver_unregister (struct ccwgroup_driver *cdriver); int ccwgroup_create_dev(struct device *root, struct ccwgroup_driver *gdrv, int num_devices, const char *buf); -struct ccwgroup_device *get_ccwgroupdev_by_busid(struct ccwgroup_driver *gdrv, - char *bus_id); extern int ccwgroup_set_online(struct ccwgroup_device *gdev); -extern int ccwgroup_set_offline(struct ccwgroup_device *gdev); +int ccwgroup_set_offline(struct ccwgroup_device *gdev, bool call_gdrv); extern int ccwgroup_probe_ccwdev(struct ccw_device *cdev); extern void ccwgroup_remove_ccwdev(struct ccw_device *cdev); diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h index 91e376b0d28c..69837eec2ff5 100644 --- a/arch/s390/include/asm/checksum.h +++ 
b/arch/s390/include/asm/checksum.h @@ -12,128 +12,122 @@ #ifndef _S390_CHECKSUM_H #define _S390_CHECKSUM_H -#include <linux/uaccess.h> +#include <linux/kasan-checks.h> +#include <linux/in6.h> /* - * computes the checksum of a memory block at buff, length len, - * and adds in "sum" (32-bit) + * Computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit). * - * returns a 32-bit number suitable for feeding into itself - * or csum_tcpudp_magic + * Returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic. * - * this function must be called with even lengths, except - * for the last fragment, which may be odd + * This function must be called with even lengths, except + * for the last fragment, which may be odd. * - * it's best to have buff aligned on a 32-bit boundary + * It's best to have buff aligned on a 32-bit boundary. */ -static inline __wsum -csum_partial(const void *buff, int len, __wsum sum) +static inline __wsum csum_partial(const void *buff, int len, __wsum sum) { - register unsigned long reg2 asm("2") = (unsigned long) buff; - register unsigned long reg3 asm("3") = (unsigned long) len; + union register_pair rp = { + .even = (unsigned long) buff, + .odd = (unsigned long) len, + }; + kasan_check_read(buff, len); asm volatile( - "0: cksm %0,%1\n" /* do checksum on longs */ + "0: cksm %[sum],%[rp]\n" " jo 0b\n" - : "+d" (sum), "+d" (reg2), "+d" (reg3) : : "cc", "memory"); + : [sum] "+&d" (sum), [rp] "+&d" (rp.pair) : : "cc", "memory"); return sum; } /* - * the same as csum_partial_copy, but copies from user space. - * - * here even more important to align src and dst on a 32-bit (or even - * better 64-bit) boundary - * - * Copy from userspace and compute checksum. - */ -static inline __wsum -csum_partial_copy_from_user(const void __user *src, void *dst, - int len, __wsum sum, - int *err_ptr) -{ - if (unlikely(copy_from_user(dst, src, len))) - *err_ptr = -EFAULT; - return csum_partial(dst, len, sum); -} - - -static inline __wsum -csum_partial_copy_nocheck (const void *src, void *dst, int len, __wsum sum) -{ - memcpy(dst,src,len); - return csum_partial(dst, len, sum); -} - -/* - * Fold a partial checksum without adding pseudo headers + * Fold a partial checksum without adding pseudo headers. */ static inline __sum16 csum_fold(__wsum sum) { u32 csum = (__force u32) sum; - csum += (csum >> 16) + (csum << 16); + csum += (csum >> 16) | (csum << 16); csum >>= 16; return (__force __sum16) ~csum; } /* - * This is a version of ip_compute_csum() optimized for IP headers, - * which always checksum on 4 octet boundaries. - * + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksums on 4 octet boundaries. */ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { - return csum_fold(csum_partial(iph, ihl*4, 0)); + __u64 csum = 0; + __u32 *ptr = (u32 *)iph; + + csum += *ptr++; + csum += *ptr++; + csum += *ptr++; + csum += *ptr++; + ihl -= 4; + while (ihl--) + csum += *ptr++; + csum += (csum >> 32) | (csum << 32); + return csum_fold((__force __wsum)(csum >> 32)); } /* - * computes the checksum of the TCP/UDP pseudo-header - * returns a 32-bit checksum + * Computes the checksum of the TCP/UDP pseudo-header. + * Returns a 32-bit checksum. 
*/ -static inline __wsum -csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, - __wsum sum) +static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) { - __u32 csum = (__force __u32)sum; + __u64 csum = (__force __u64)sum; csum += (__force __u32)saddr; - if (csum < (__force __u32)saddr) - csum++; - csum += (__force __u32)daddr; - if (csum < (__force __u32)daddr) - csum++; - - csum += len + proto; - if (csum < len + proto) - csum++; - - return (__force __wsum)csum; + csum += len; + csum += proto; + csum += (csum >> 32) | (csum << 32); + return (__force __wsum)(csum >> 32); } /* - * computes the checksum of the TCP/UDP pseudo-header - * returns a 16-bit checksum, already complemented + * Computes the checksum of the TCP/UDP pseudo-header. + * Returns a 16-bit checksum, already complemented. */ - -static inline __sum16 -csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, - __wsum sum) +static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) { - return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum)); + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } /* - * this routine is used for miscellaneous IP-like checksums, mainly - * in icmp.c + * Used for miscellaneous IP-like checksums, mainly icmp. */ - static inline __sum16 ip_compute_csum(const void *buff, int len) { return csum_fold(csum_partial(buff, len, 0)); } -#endif /* _S390_CHECKSUM_H */ - +#define _HAVE_ARCH_IPV6_CSUM +static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, __u8 proto, __wsum csum) +{ + __u64 sum = (__force __u64)csum; + + sum += (__force __u32)saddr->s6_addr32[0]; + sum += (__force __u32)saddr->s6_addr32[1]; + sum += (__force __u32)saddr->s6_addr32[2]; + sum += (__force __u32)saddr->s6_addr32[3]; + sum += (__force __u32)daddr->s6_addr32[0]; + sum += (__force __u32)daddr->s6_addr32[1]; + sum += (__force __u32)daddr->s6_addr32[2]; + sum += (__force __u32)daddr->s6_addr32[3]; + sum += len; + sum += proto; + sum += (sum >> 32) | (sum << 32); + return csum_fold((__force __wsum)(sum >> 32)); +} +#endif /* _S390_CHECKSUM_H */ diff --git a/arch/s390/include/asm/chsc.h b/arch/s390/include/asm/chsc.h new file mode 100644 index 000000000000..bb48ea380c0d --- /dev/null +++ b/arch/s390/include/asm/chsc.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 
2020 + * + * Author(s): Alexandra Winter <wintera@linux.ibm.com> + * + * Interface for Channel Subsystem Call + */ +#ifndef _ASM_S390_CHSC_H +#define _ASM_S390_CHSC_H + +#include <uapi/asm/chsc.h> + +/** + * Operation codes for CHSC PNSO: + * PNSO_OC_NET_BRIDGE_INFO - only addresses that are visible to a bridgeport + * PNSO_OC_NET_ADDR_INFO - all addresses + */ +#define PNSO_OC_NET_BRIDGE_INFO 0 +#define PNSO_OC_NET_ADDR_INFO 3 +/** + * struct chsc_pnso_naid_l2 - network address information descriptor + * @nit: Network interface token + * @addr_lnid: network address and logical network id (VLAN ID) + */ +struct chsc_pnso_naid_l2 { + u64 nit; + struct { u8 mac[6]; u16 lnid; } addr_lnid; +} __packed; + +struct chsc_pnso_resume_token { + u64 t1; + u64 t2; +} __packed; + +struct chsc_pnso_naihdr { + struct chsc_pnso_resume_token resume_token; + u32:32; + u32 instance; + u32:24; + u8 naids; + u32 reserved[3]; +} __packed; + +struct chsc_pnso_area { + struct chsc_header request; + u8:2; + u8 m:1; + u8:5; + u8:2; + u8 ssid:2; + u8 fmt:4; + u16 sch; + u8:8; + u8 cssid; + u16:16; + u8 oc; + u32:24; + struct chsc_pnso_resume_token resume_token; + u32 n:1; + u32:31; + u32 reserved[3]; + struct chsc_header response; + u32:32; + struct chsc_pnso_naihdr naihdr; + struct chsc_pnso_naid_l2 entries[]; +} __packed __aligned(PAGE_SIZE); + +#endif /* _ASM_S390_CHSC_H */ diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index b5bfb3123cb1..1c4f585dd39b 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -5,10 +5,10 @@ #ifndef _ASM_S390_CIO_H_ #define _ASM_S390_CIO_H_ -#include <linux/spinlock.h> #include <linux/bitops.h> #include <linux/genalloc.h> #include <asm/types.h> +#include <asm/tpi.h> #define LPM_ANYPATH 0xff #define __MAX_CSSID 0 @@ -329,7 +329,7 @@ struct ccw_dev_id { }; /** - * ccw_device_id_is_equal() - compare two ccw_dev_ids + * ccw_dev_id_is_equal() - compare two ccw_dev_ids * @dev_id1: a ccw_dev_id * @dev_id2: another ccw_dev_id * Returns: @@ -356,7 +356,6 @@ static inline u8 pathmask_to_pos(u8 mask) return 8 - ffs(mask); } -void channel_subsystem_reinit(void); extern void css_schedule_reprobe(void); extern void *cio_dma_zalloc(size_t size); @@ -370,8 +369,10 @@ void cio_gp_dma_destroy(struct gen_pool *gp_dma, struct device *dma_dev); struct gen_pool *cio_gp_dma_create(struct device *dma_dev, int nr_pages); /* Function from drivers/s390/cio/chsc.c */ -int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); +int chsc_sstpc(void *page, unsigned int op, u16 ctrl, long *clock_delta); int chsc_sstpi(void *page, void *result, size_t size); +int chsc_stzi(void *page, void *result, size_t size); int chsc_sgib(u32 origin); +int chsc_scud(u16 cu, u64 *esm, u8 *esm_valid); #endif diff --git a/arch/s390/include/asm/clocksource.h b/arch/s390/include/asm/clocksource.h new file mode 100644 index 000000000000..03434369fce4 --- /dev/null +++ b/arch/s390/include/asm/clocksource.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* s390-specific clocksource additions */ + +#ifndef _ASM_S390_CLOCKSOURCE_H +#define _ASM_S390_CLOCKSOURCE_H + +#endif /* _ASM_S390_CLOCKSOURCE_H */ diff --git a/arch/s390/include/asm/clp.h b/arch/s390/include/asm/clp.h index 3925b0f085b7..10919eeb7533 100644 --- a/arch/s390/include/asm/clp.h +++ b/arch/s390/include/asm/clp.h @@ -5,6 +5,9 @@ /* CLP common request & response block size */ #define CLP_BLK_SIZE PAGE_SIZE +/* Call Logical Processor - Command Code */ +#define CLP_SLPC 0x0001 + #define CLP_LPS_BASE 0 
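The cmpxchg.h rewrite just below drops the __sync builtins in favour of open-coded CS/CSG loops, and emulates 1- and 2-byte exchanges on the containing aligned 32-bit word: derive the big-endian byte lane from the low address bits, mask that lane out of the old word, merge the new value in, and retry a word-sized compare-and-swap until it succeeds. The same idea rendered portably with GCC's __atomic builtins — a sketch for illustration only, with a hypothetical function name; the inline asm below is the real implementation:

/* sketch: sub-word xchg via word-sized CAS, big-endian lane math */
static unsigned char xchg_u8(unsigned char *p, unsigned char new)
{
	unsigned int *word = (unsigned int *)((unsigned long)p & ~3UL);
	int shift = (3 ^ ((unsigned long)p & 3)) << 3;	/* bit offset of the byte lane */
	unsigned int mask = 0xffU << shift;
	unsigned int old, repl;

	old = __atomic_load_n(word, __ATOMIC_RELAXED);
	do {
		repl = (old & ~mask) | ((unsigned int)new << shift);
	} while (!__atomic_compare_exchange_n(word, &old, repl, false,
					      __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));
	return (old & mask) >> shift;
}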
#define CLP_LPS_PCI 2 diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h index af99c1f66f12..aae0315374de 100644 --- a/arch/s390/include/asm/cmpxchg.h +++ b/arch/s390/include/asm/cmpxchg.h @@ -12,55 +12,196 @@ #include <linux/types.h> #include <linux/bug.h> -#define cmpxchg(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __o = (o); \ - __typeof__(*(ptr)) __n = (n); \ - (__typeof__(*(ptr))) __sync_val_compare_and_swap((ptr),__o,__n);\ -}) +void __xchg_called_with_bad_pointer(void); -#define cmpxchg64 cmpxchg -#define cmpxchg_local cmpxchg -#define cmpxchg64_local cmpxchg +static __always_inline unsigned long +__arch_xchg(unsigned long x, unsigned long address, int size) +{ + unsigned long old; + int shift; -#define xchg(ptr, x) \ -({ \ - __typeof__(ptr) __ptr = (ptr); \ - __typeof__(*(ptr)) __old; \ - do { \ - __old = *__ptr; \ - } while (!__sync_bool_compare_and_swap(__ptr, __old, x)); \ - __old; \ -}) + switch (size) { + case 1: + shift = (3 ^ (address & 3)) << 3; + address ^= address & 3; + asm volatile( + " l %0,%1\n" + "0: lr 0,%0\n" + " nr 0,%3\n" + " or 0,%2\n" + " cs %0,0,%1\n" + " jl 0b\n" + : "=&d" (old), "+Q" (*(int *) address) + : "d" ((x & 0xff) << shift), "d" (~(0xff << shift)) + : "memory", "cc", "0"); + return old >> shift; + case 2: + shift = (2 ^ (address & 2)) << 3; + address ^= address & 2; + asm volatile( + " l %0,%1\n" + "0: lr 0,%0\n" + " nr 0,%3\n" + " or 0,%2\n" + " cs %0,0,%1\n" + " jl 0b\n" + : "=&d" (old), "+Q" (*(int *) address) + : "d" ((x & 0xffff) << shift), "d" (~(0xffff << shift)) + : "memory", "cc", "0"); + return old >> shift; + case 4: + asm volatile( + " l %0,%1\n" + "0: cs %0,%2,%1\n" + " jl 0b\n" + : "=&d" (old), "+Q" (*(int *) address) + : "d" (x) + : "memory", "cc"); + return old; + case 8: + asm volatile( + " lg %0,%1\n" + "0: csg %0,%2,%1\n" + " jl 0b\n" + : "=&d" (old), "+QS" (*(long *) address) + : "d" (x) + : "memory", "cc"); + return old; + } + __xchg_called_with_bad_pointer(); + return x; +} -#define __cmpxchg_double(p1, p2, o1, o2, n1, n2) \ +#define arch_xchg(ptr, x) \ ({ \ - register __typeof__(*(p1)) __old1 asm("2") = (o1); \ - register __typeof__(*(p2)) __old2 asm("3") = (o2); \ - register __typeof__(*(p1)) __new1 asm("4") = (n1); \ - register __typeof__(*(p2)) __new2 asm("5") = (n2); \ - int cc; \ - asm volatile( \ - " cdsg %[old],%[new],%[ptr]\n" \ - " ipm %[cc]\n" \ - " srl %[cc],28" \ - : [cc] "=d" (cc), [old] "+d" (__old1), "+d" (__old2) \ - : [new] "d" (__new1), "d" (__new2), \ - [ptr] "Q" (*(p1)), "Q" (*(p2)) \ - : "memory", "cc"); \ - !cc; \ + __typeof__(*(ptr)) __ret; \ + \ + __ret = (__typeof__(*(ptr))) \ + __arch_xchg((unsigned long)(x), (unsigned long)(ptr), \ + sizeof(*(ptr))); \ + __ret; \ }) -#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \ +void __cmpxchg_called_with_bad_pointer(void); + +static __always_inline unsigned long __cmpxchg(unsigned long address, + unsigned long old, + unsigned long new, int size) +{ + switch (size) { + case 1: { + unsigned int prev, shift, mask; + + shift = (3 ^ (address & 3)) << 3; + address ^= address & 3; + old = (old & 0xff) << shift; + new = (new & 0xff) << shift; + mask = ~(0xff << shift); + asm volatile( + " l %[prev],%[address]\n" + " nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "0: lr %[tmp],%[prev]\n" + " cs %[prev],%[new],%[address]\n" + " jnl 1f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jz 0b\n" + "1:" + : [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + 
[tmp] "+&d" (old), + [new] "+&d" (new), + [mask] "+&d" (mask) + :: "memory", "cc"); + return prev >> shift; + } + case 2: { + unsigned int prev, shift, mask; + + shift = (2 ^ (address & 2)) << 3; + address ^= address & 2; + old = (old & 0xffff) << shift; + new = (new & 0xffff) << shift; + mask = ~(0xffff << shift); + asm volatile( + " l %[prev],%[address]\n" + " nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "0: lr %[tmp],%[prev]\n" + " cs %[prev],%[new],%[address]\n" + " jnl 1f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jz 0b\n" + "1:" + : [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (old), + [new] "+&d" (new), + [mask] "+&d" (mask) + :: "memory", "cc"); + return prev >> shift; + } + case 4: { + unsigned int prev = old; + + asm volatile( + " cs %[prev],%[new],%[address]\n" + : [prev] "+&d" (prev), + [address] "+Q" (*(int *)address) + : [new] "d" (new) + : "memory", "cc"); + return prev; + } + case 8: { + unsigned long prev = old; + + asm volatile( + " csg %[prev],%[new],%[address]\n" + : [prev] "+&d" (prev), + [address] "+QS" (*(long *)address) + : [new] "d" (new) + : "memory", "cc"); + return prev; + } + } + __cmpxchg_called_with_bad_pointer(); + return old; +} + +#define arch_cmpxchg(ptr, o, n) \ ({ \ - __typeof__(p1) __p1 = (p1); \ - __typeof__(p2) __p2 = (p2); \ - BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long)); \ - BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \ - VM_BUG_ON((unsigned long)((__p1) + 1) != (unsigned long)(__p2));\ - __cmpxchg_double(__p1, __p2, o1, o2, n1, n2); \ + __typeof__(*(ptr)) __ret; \ + \ + __ret = (__typeof__(*(ptr))) \ + __cmpxchg((unsigned long)(ptr), (unsigned long)(o), \ + (unsigned long)(n), sizeof(*(ptr))); \ + __ret; \ }) -#define system_has_cmpxchg_double() 1 +#define arch_cmpxchg64 arch_cmpxchg +#define arch_cmpxchg_local arch_cmpxchg +#define arch_cmpxchg64_local arch_cmpxchg + +#define system_has_cmpxchg128() 1 + +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new) +{ + asm volatile( + " cdsg %[old],%[new],%[ptr]\n" + : [old] "+d" (old), [ptr] "+QS" (*ptr) + : [new] "d" (new) + : "memory", "cc"); + return old; +} + +#define arch_cmpxchg128 arch_cmpxchg128 #endif /* __ASM_CMPXCHG_H */ diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 63b46e30b2c3..3cb9d813f022 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -8,6 +8,22 @@ #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/thread_info.h> +#include <asm/ptrace.h> + +#define compat_mode_t compat_mode_t +typedef u16 compat_mode_t; + +#define __compat_uid_t __compat_uid_t +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; + +#define compat_dev_t compat_dev_t +typedef u16 compat_dev_t; + +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_statfs compat_statfs #include <asm-generic/compat.h> @@ -19,52 +35,16 @@ (__force t)(__TYPE_IS_PTR(t) ? 
((v) & 0x7fffffff) : (v)); \ }) -#define PSW32_MASK_PER 0x40000000UL -#define PSW32_MASK_DAT 0x04000000UL -#define PSW32_MASK_IO 0x02000000UL -#define PSW32_MASK_EXT 0x01000000UL -#define PSW32_MASK_KEY 0x00F00000UL -#define PSW32_MASK_BASE 0x00080000UL /* Always one */ -#define PSW32_MASK_MCHECK 0x00040000UL -#define PSW32_MASK_WAIT 0x00020000UL -#define PSW32_MASK_PSTATE 0x00010000UL -#define PSW32_MASK_ASC 0x0000C000UL -#define PSW32_MASK_CC 0x00003000UL -#define PSW32_MASK_PM 0x00000f00UL -#define PSW32_MASK_RI 0x00000080UL - #define PSW32_MASK_USER 0x0000FF00UL -#define PSW32_ADDR_AMODE 0x80000000UL -#define PSW32_ADDR_INSN 0x7FFFFFFFUL - -#define PSW32_DEFAULT_KEY (((u32) PAGE_DEFAULT_ACC) << 20) - -#define PSW32_ASC_PRIMARY 0x00000000UL -#define PSW32_ASC_ACCREG 0x00004000UL -#define PSW32_ASC_SECONDARY 0x00008000UL -#define PSW32_ASC_HOME 0x0000C000UL - #define PSW32_USER_BITS (PSW32_MASK_DAT | PSW32_MASK_IO | PSW32_MASK_EXT | \ PSW32_DEFAULT_KEY | PSW32_MASK_BASE | \ PSW32_MASK_MCHECK | PSW32_MASK_PSTATE | \ PSW32_ASC_PRIMARY) -#define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "s390\0\0\0\0" -typedef u16 __compat_uid_t; -typedef u16 __compat_gid_t; -typedef u32 __compat_uid32_t; -typedef u32 __compat_gid32_t; -typedef u16 compat_mode_t; -typedef u16 compat_dev_t; typedef u16 compat_nlink_t; -typedef u16 compat_ipc_pid_t; -typedef u32 compat_caddr_t; -typedef __kernel_fsid_t compat_fsid_t; -typedef s64 compat_s64; -typedef u64 compat_u64; typedef struct { u32 mask; @@ -105,26 +85,6 @@ struct compat_stat { u32 __unused5; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - compat_pid_t l_pid; -}; - -#define F_GETLK64 12 -#define F_SETLK64 13 -#define F_SETLKW64 14 - -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -}; - struct compat_statfs { u32 f_type; u32 f_bsize; @@ -152,20 +112,9 @@ struct compat_statfs64 { u32 f_namelen; u32 f_frsize; u32 f_flags; - u32 f_spare[4]; + u32 f_spare[5]; }; -#define COMPAT_RLIM_INFINITY 0xffffffff - -typedef u32 compat_old_sigset_t; /* at least 32 bits */ - -#define _COMPAT_NSIG 64 -#define _COMPAT_NSIG_BPW 32 - -typedef u32 compat_sigset_word; - -#define COMPAT_OFF_T_MAX 0x7fffffff - /* * A pointer passed in from user mode. 
This should not * be used for syscall parameters, just declare them @@ -177,11 +126,7 @@ static inline void __user *compat_ptr(compat_uptr_t uptr) { return (void __user *)(unsigned long)(uptr & 0x7fffffffUL); } - -static inline compat_uptr_t ptr_to_compat(void __user *uptr) -{ - return (u32)(unsigned long)uptr; -} +#define compat_ptr(uptr) compat_ptr(uptr) #ifdef CONFIG_COMPAT @@ -190,73 +135,6 @@ static inline int is_compat_task(void) return test_thread_flag(TIF_31BIT); } -static inline void __user *arch_compat_alloc_user_space(long len) -{ - unsigned long stack; - - stack = KSTK_ESP(current); - if (is_compat_task()) - stack &= 0x7fffffffUL; - return (void __user *) (stack - len); -} - #endif -struct compat_ipc64_perm { - compat_key_t key; - __compat_uid32_t uid; - __compat_gid32_t gid; - __compat_uid32_t cuid; - __compat_gid32_t cgid; - compat_mode_t mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - unsigned int __unused1; - unsigned int __unused2; -}; - -struct compat_semid64_ds { - struct compat_ipc64_perm sem_perm; - compat_ulong_t sem_otime; - compat_ulong_t sem_otime_high; - compat_ulong_t sem_ctime; - compat_ulong_t sem_ctime_high; - compat_ulong_t sem_nsems; - compat_ulong_t __unused1; - compat_ulong_t __unused2; -}; - -struct compat_msqid64_ds { - struct compat_ipc64_perm msg_perm; - compat_ulong_t msg_stime; - compat_ulong_t msg_stime_high; - compat_ulong_t msg_rtime; - compat_ulong_t msg_rtime_high; - compat_ulong_t msg_ctime; - compat_ulong_t msg_ctime_high; - compat_ulong_t msg_cbytes; - compat_ulong_t msg_qnum; - compat_ulong_t msg_qbytes; - compat_pid_t msg_lspid; - compat_pid_t msg_lrpid; - compat_ulong_t __unused1; - compat_ulong_t __unused2; -}; - -struct compat_shmid64_ds { - struct compat_ipc64_perm shm_perm; - compat_size_t shm_segsz; - compat_ulong_t shm_atime; - compat_ulong_t shm_atime_high; - compat_ulong_t shm_dtime; - compat_ulong_t shm_dtime_high; - compat_ulong_t shm_ctime; - compat_ulong_t shm_ctime_high; - compat_pid_t shm_cpid; - compat_pid_t shm_lpid; - compat_ulong_t shm_nattch; - compat_ulong_t __unused1; - compat_ulong_t __unused2; -}; #endif /* _ASM_S390X_COMPAT_H */ diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h index c0f3bfeddcbe..b378e2b57ad8 100644 --- a/arch/s390/include/asm/cpacf.h +++ b/arch/s390/include/asm/cpacf.h @@ -2,7 +2,7 @@ /* * CP Assist for Cryptographic Functions (CPACF) * - * Copyright IBM Corp. 2003, 2017 + * Copyright IBM Corp. 
2003, 2023 * Author(s): Thomas Spatzier * Jan Glauber * Harald Freudenberger (freude@de.ibm.com) @@ -132,6 +132,11 @@ #define CPACF_PCKMO_ENC_AES_128_KEY 0x12 #define CPACF_PCKMO_ENC_AES_192_KEY 0x13 #define CPACF_PCKMO_ENC_AES_256_KEY 0x14 +#define CPACF_PCKMO_ENC_ECC_P256_KEY 0x20 +#define CPACF_PCKMO_ENC_ECC_P384_KEY 0x21 +#define CPACF_PCKMO_ENC_ECC_P521_KEY 0x22 +#define CPACF_PCKMO_ENC_ECC_ED25519_KEY 0x28 +#define CPACF_PCKMO_ENC_ECC_ED448_KEY 0x29 /* * Function codes for the PRNO (PERFORM RANDOM NUMBER OPERATION) @@ -173,17 +178,16 @@ typedef struct { unsigned char bytes[16]; } cpacf_mask_t; */ static __always_inline void __cpacf_query(unsigned int opcode, cpacf_mask_t *mask) { - register unsigned long r0 asm("0") = 0; /* query function */ - register unsigned long r1 asm("1") = (unsigned long) mask; - asm volatile( - " spm 0\n" /* pckmo doesn't change the cc */ + " lghi 0,0\n" /* query function */ + " lgr 1,%[mask]\n" + " spm 0\n" /* pckmo doesn't change the cc */ /* Parameter regs are ignored, but must be nonzero and unique */ "0: .insn rrf,%[opc] << 16,2,4,6,0\n" " brc 1,0b\n" /* handle partial completion */ : "=m" (*mask) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (opcode) - : "cc"); + : [mask] "d" ((unsigned long)mask), [opc] "i" (opcode) + : "cc", "0", "1"); } static __always_inline int __cpacf_check_opcode(unsigned int opcode) @@ -249,20 +253,22 @@ static __always_inline int cpacf_query_func(unsigned int opcode, unsigned int fu static inline int cpacf_km(unsigned long func, void *param, u8 *dest, const u8 *src, long src_len) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) src; - register unsigned long r3 asm("3") = (unsigned long) src_len; - register unsigned long r4 asm("4") = (unsigned long) dest; + union register_pair d, s; + d.even = (unsigned long)dest; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,%[dst],%[src]\n" " brc 1,0b\n" /* handle partial completion */ - : [src] "+a" (r2), [len] "+d" (r3), [dst] "+a" (r4) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KM) - : "cc", "memory"); + : [src] "+&d" (s.pair), [dst] "+&d" (d.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KM) + : "cc", "memory", "0", "1"); - return src_len - r3; + return src_len - s.odd; } /** @@ -279,20 +285,22 @@ static inline int cpacf_km(unsigned long func, void *param, static inline int cpacf_kmc(unsigned long func, void *param, u8 *dest, const u8 *src, long src_len) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) src; - register unsigned long r3 asm("3") = (unsigned long) src_len; - register unsigned long r4 asm("4") = (unsigned long) dest; + union register_pair d, s; + d.even = (unsigned long)dest; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,%[dst],%[src]\n" " brc 1,0b\n" /* handle partial completion */ - : [src] "+a" (r2), [len] "+d" (r3), [dst] "+a" (r4) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMC) - : "cc", "memory"); + : [src] "+&d" (s.pair), [dst] "+&d" (d.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KMC) + : "cc", "memory", "0", "1"); - return src_len - 
r3; + return src_len - s.odd; } /** @@ -306,17 +314,19 @@ static inline int cpacf_kmc(unsigned long func, void *param, static inline void cpacf_kimd(unsigned long func, void *param, const u8 *src, long src_len) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) src; - register unsigned long r3 asm("3") = (unsigned long) src_len; + union register_pair s; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,0,%[src]\n" " brc 1,0b\n" /* handle partial completion */ - : [src] "+a" (r2), [len] "+d" (r3) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KIMD) - : "cc", "memory"); + : [src] "+&d" (s.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)(param)), + [opc] "i" (CPACF_KIMD) + : "cc", "memory", "0", "1"); } /** @@ -329,17 +339,19 @@ static inline void cpacf_kimd(unsigned long func, void *param, static inline void cpacf_klmd(unsigned long func, void *param, const u8 *src, long src_len) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) src; - register unsigned long r3 asm("3") = (unsigned long) src_len; + union register_pair s; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,0,%[src]\n" " brc 1,0b\n" /* handle partial completion */ - : [src] "+a" (r2), [len] "+d" (r3) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KLMD) - : "cc", "memory"); + : [src] "+&d" (s.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KLMD) + : "cc", "memory", "0", "1"); } /** @@ -355,19 +367,21 @@ static inline void cpacf_klmd(unsigned long func, void *param, static inline int cpacf_kmac(unsigned long func, void *param, const u8 *src, long src_len) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) src; - register unsigned long r3 asm("3") = (unsigned long) src_len; + union register_pair s; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,0,%[src]\n" " brc 1,0b\n" /* handle partial completion */ - : [src] "+a" (r2), [len] "+d" (r3) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMAC) - : "cc", "memory"); + : [src] "+&d" (s.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KMAC) + : "cc", "memory", "0", "1"); - return src_len - r3; + return src_len - s.odd; } /** @@ -385,22 +399,24 @@ static inline int cpacf_kmac(unsigned long func, void *param, static inline int cpacf_kmctr(unsigned long func, void *param, u8 *dest, const u8 *src, long src_len, u8 *counter) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) src; - register unsigned long r3 asm("3") = (unsigned long) src_len; - register unsigned long r4 asm("4") = (unsigned long) dest; - register unsigned long r6 asm("6") = (unsigned long) counter; + union register_pair d, s, c; + d.even = (unsigned long)dest; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; + c.even = (unsigned 
long)counter; asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rrf,%[opc] << 16,%[dst],%[src],%[ctr],0\n" " brc 1,0b\n" /* handle partial completion */ - : [src] "+a" (r2), [len] "+d" (r3), - [dst] "+a" (r4), [ctr] "+a" (r6) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMCTR) - : "cc", "memory"); + : [src] "+&d" (s.pair), [dst] "+&d" (d.pair), + [ctr] "+&d" (c.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KMCTR) + : "cc", "memory", "0", "1"); - return src_len - r3; + return src_len - s.odd; } /** @@ -417,20 +433,21 @@ static inline void cpacf_prno(unsigned long func, void *param, u8 *dest, unsigned long dest_len, const u8 *seed, unsigned long seed_len) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) dest; - register unsigned long r3 asm("3") = (unsigned long) dest_len; - register unsigned long r4 asm("4") = (unsigned long) seed; - register unsigned long r5 asm("5") = (unsigned long) seed_len; + union register_pair d, s; + d.even = (unsigned long)dest; + d.odd = (unsigned long)dest_len; + s.even = (unsigned long)seed; + s.odd = (unsigned long)seed_len; asm volatile ( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,%[dst],%[seed]\n" " brc 1,0b\n" /* handle partial completion */ - : [dst] "+a" (r2), [dlen] "+d" (r3) - : [fc] "d" (r0), [pba] "a" (r1), - [seed] "a" (r4), [slen] "d" (r5), [opc] "i" (CPACF_PRNO) - : "cc", "memory"); + : [dst] "+&d" (d.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [seed] "d" (s.pair), [opc] "i" (CPACF_PRNO) + : "cc", "memory", "0", "1"); } /** @@ -443,19 +460,19 @@ static inline void cpacf_prno(unsigned long func, void *param, static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len, u8 *cbuf, unsigned long cbuf_len) { - register unsigned long r0 asm("0") = (unsigned long) CPACF_PRNO_TRNG; - register unsigned long r2 asm("2") = (unsigned long) ucbuf; - register unsigned long r3 asm("3") = (unsigned long) ucbuf_len; - register unsigned long r4 asm("4") = (unsigned long) cbuf; - register unsigned long r5 asm("5") = (unsigned long) cbuf_len; + union register_pair u, c; + u.even = (unsigned long)ucbuf; + u.odd = (unsigned long)ucbuf_len; + c.even = (unsigned long)cbuf; + c.odd = (unsigned long)cbuf_len; asm volatile ( + " lghi 0,%[fc]\n" "0: .insn rre,%[opc] << 16,%[ucbuf],%[cbuf]\n" " brc 1,0b\n" /* handle partial completion */ - : [ucbuf] "+a" (r2), [ucbuflen] "+d" (r3), - [cbuf] "+a" (r4), [cbuflen] "+d" (r5) - : [fc] "d" (r0), [opc] "i" (CPACF_PRNO) - : "cc", "memory"); + : [ucbuf] "+&d" (u.pair), [cbuf] "+&d" (c.pair) + : [fc] "K" (CPACF_PRNO_TRNG), [opc] "i" (CPACF_PRNO) + : "cc", "memory", "0"); } /** @@ -466,15 +483,15 @@ static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len, */ static inline void cpacf_pcc(unsigned long func, void *param) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,0,0\n" /* PCC opcode */ " brc 1,0b\n" /* handle partial completion */ : - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_PCC) - : "cc", "memory"); + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_PCC) + : "cc", "memory", "0", "1"); } /** @@ -487,14 +504,14 @@ static inline void cpacf_pcc(unsigned long func, void *param) */ static inline void cpacf_pckmo(long 
func, void *param) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" " .insn rre,%[opc] << 16,0,0\n" /* PCKMO opcode */ : - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_PCKMO) - : "cc", "memory"); + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_PCKMO) + : "cc", "memory", "0", "1"); } /** @@ -512,21 +529,23 @@ static inline void cpacf_kma(unsigned long func, void *param, u8 *dest, const u8 *src, unsigned long src_len, const u8 *aad, unsigned long aad_len) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) src; - register unsigned long r3 asm("3") = (unsigned long) src_len; - register unsigned long r4 asm("4") = (unsigned long) aad; - register unsigned long r5 asm("5") = (unsigned long) aad_len; - register unsigned long r6 asm("6") = (unsigned long) dest; + union register_pair d, s, a; + d.even = (unsigned long)dest; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; + a.even = (unsigned long)aad; + a.odd = (unsigned long)aad_len; asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rrf,%[opc] << 16,%[dst],%[src],%[aad],0\n" " brc 1,0b\n" /* handle partial completion */ - : [dst] "+a" (r6), [src] "+a" (r2), [slen] "+d" (r3), - [aad] "+a" (r4), [alen] "+d" (r5) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMA) - : "cc", "memory"); + : [dst] "+&d" (d.pair), [src] "+&d" (s.pair), + [aad] "+&d" (a.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KMA) + : "cc", "memory", "0", "1"); } #endif /* _ASM_S390_CPACF_H */ diff --git a/arch/s390/include/asm/cpu.h b/arch/s390/include/asm/cpu.h index 62228a884e06..26c710cd3485 100644 --- a/arch/s390/include/asm/cpu.h +++ b/arch/s390/include/asm/cpu.h @@ -12,6 +12,7 @@ #ifndef __ASSEMBLY__ #include <linux/types.h> +#include <linux/jump_label.h> struct cpuid { @@ -21,5 +22,7 @@ struct cpuid unsigned int unused : 16; } __attribute__ ((packed, aligned(8))); +DECLARE_STATIC_KEY_FALSE(cpu_has_bear); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_CPU_H */ diff --git a/arch/s390/include/asm/cpu_mcf.h b/arch/s390/include/asm/cpu_mcf.h deleted file mode 100644 index 649b9fc60685..000000000000 --- a/arch/s390/include/asm/cpu_mcf.h +++ /dev/null @@ -1,126 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Counter facility support definitions for the Linux perf - * - * Copyright IBM Corp. 
2019 - * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> - */ -#ifndef _ASM_S390_CPU_MCF_H -#define _ASM_S390_CPU_MCF_H - -#include <linux/perf_event.h> -#include <asm/cpu_mf.h> - -enum cpumf_ctr_set { - CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */ - CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */ - CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */ - CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */ - CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */ - - /* Maximum number of counter sets */ - CPUMF_CTR_SET_MAX, -}; - -#define CPUMF_LCCTL_ENABLE_SHIFT 16 -#define CPUMF_LCCTL_ACTCTL_SHIFT 0 -static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = { - [CPUMF_CTR_SET_BASIC] = 0x02, - [CPUMF_CTR_SET_USER] = 0x04, - [CPUMF_CTR_SET_CRYPTO] = 0x08, - [CPUMF_CTR_SET_EXT] = 0x01, - [CPUMF_CTR_SET_MT_DIAG] = 0x20, -}; - -static inline void ctr_set_enable(u64 *state, int ctr_set) -{ - *state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT; -} -static inline void ctr_set_disable(u64 *state, int ctr_set) -{ - *state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT); -} -static inline void ctr_set_start(u64 *state, int ctr_set) -{ - *state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT; -} -static inline void ctr_set_stop(u64 *state, int ctr_set) -{ - *state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT); -} - -static inline void ctr_set_multiple_enable(u64 *state, u64 ctrsets) -{ - *state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT; -} - -static inline void ctr_set_multiple_disable(u64 *state, u64 ctrsets) -{ - *state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT); -} - -static inline void ctr_set_multiple_start(u64 *state, u64 ctrsets) -{ - *state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT; -} - -static inline void ctr_set_multiple_stop(u64 *state, u64 ctrsets) -{ - *state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT); -} - -static inline int ctr_stcctm(enum cpumf_ctr_set set, u64 range, u64 *dest) -{ - switch (set) { - case CPUMF_CTR_SET_BASIC: - return stcctm(BASIC, range, dest); - case CPUMF_CTR_SET_USER: - return stcctm(PROBLEM_STATE, range, dest); - case CPUMF_CTR_SET_CRYPTO: - return stcctm(CRYPTO_ACTIVITY, range, dest); - case CPUMF_CTR_SET_EXT: - return stcctm(EXTENDED, range, dest); - case CPUMF_CTR_SET_MT_DIAG: - return stcctm(MT_DIAG_CLEARING, range, dest); - case CPUMF_CTR_SET_MAX: - return 3; - } - return 3; -} - -struct cpu_cf_events { - struct cpumf_ctr_info info; - atomic_t ctr_set[CPUMF_CTR_SET_MAX]; - atomic64_t alert; - u64 state, tx_state; - unsigned int flags; - unsigned int txn_flags; -}; -DECLARE_PER_CPU(struct cpu_cf_events, cpu_cf_events); - -bool kernel_cpumcf_avail(void); -int __kernel_cpumcf_begin(void); -unsigned long kernel_cpumcf_alert(int clear); -void __kernel_cpumcf_end(void); - -static inline int kernel_cpumcf_begin(void) -{ - if (!cpum_cf_avail()) - return -ENODEV; - - preempt_disable(); - return __kernel_cpumcf_begin(); -} -static inline void kernel_cpumcf_end(void) -{ - __kernel_cpumcf_end(); - preempt_enable(); -} - -/* Return true if store counter set multiple instruction is available */ -static inline int stccm_avail(void) -{ - return test_facility(142); -} - -#endif /* _ASM_S390_CPU_MCF_H */ diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index 0d90cbeb89b4..a0de5b9b02ea 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -10,6 +10,7 @@ #define _ASM_S390_CPU_MF_H #include <linux/errno.h> +#include <asm/asm-extable.h> #include <asm/facility.h> asm(".include 
\"asm/cpu_mf-insn.h\"\n"); @@ -41,7 +42,6 @@ static inline int cpum_sf_avail(void) return test_facility(40) && test_facility(68); } - struct cpumf_ctr_info { u16 cfvn; u16 auth_ctl; @@ -109,7 +109,9 @@ struct hws_basic_entry { unsigned int AS:2; /* 29-30 PSW address-space control */ unsigned int I:1; /* 31 entry valid or invalid */ unsigned int CL:2; /* 32-33 Configuration Level */ - unsigned int:14; + unsigned int H:1; /* 34 Host Indicator */ + unsigned int LS:1; /* 35 Limited Sampling */ + unsigned int:12; unsigned int prim_asn:16; /* primary ASN */ unsigned long long ia; /* Instruction Address */ unsigned long long gpp; /* Guest Program Parameter */ @@ -128,19 +130,21 @@ struct hws_combined_entry { struct hws_diag_entry diag; /* Diagnostic-sampling data entry */ } __packed; -struct hws_trailer_entry { - union { - struct { - unsigned int f:1; /* 0 - Block Full Indicator */ - unsigned int a:1; /* 1 - Alert request control */ - unsigned int t:1; /* 2 - Timestamp format */ - unsigned int :29; /* 3 - 31: Reserved */ - unsigned int bsdes:16; /* 32-47: size of basic SDE */ - unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */ - }; - unsigned long long flags; /* 0 - 63: All indicators */ +union hws_trailer_header { + struct { + unsigned int f:1; /* 0 - Block Full Indicator */ + unsigned int a:1; /* 1 - Alert request control */ + unsigned int t:1; /* 2 - Timestamp format */ + unsigned int :29; /* 3 - 31: Reserved */ + unsigned int bsdes:16; /* 32-47: size of basic SDE */ + unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */ + unsigned long long overflow; /* 64 - Overflow Count */ }; - unsigned long long overflow; /* 64 - sample Overflow count */ + u128 val; +}; + +struct hws_trailer_entry { + union hws_trailer_header header; /* 0 - 15 Flags + Overflow Count */ unsigned char timestamp[16]; /* 16 - 31 timestamp */ unsigned long long reserved1; /* 32 -Reserved */ unsigned long long reserved2; /* */ @@ -157,7 +161,7 @@ struct hws_trailer_entry { /* Load program parameter */ static inline void lpp(void *pp) { - asm volatile(".insn s,0xb2800000,0(%0)\n":: "a" (pp) : "memory"); + asm volatile("lpp 0(%0)\n" :: "a" (pp) : "memory"); } /* Query counter information */ @@ -166,7 +170,7 @@ static inline int qctri(struct cpumf_ctr_info *info) int rc = -EINVAL; asm volatile ( - "0: .insn s,0xb28e0000,%1\n" + "0: qctri %1\n" "1: lhi %0,0\n" "2:\n" EX_TABLE(1b, 2b) @@ -180,7 +184,7 @@ static inline int lcctl(u64 ctl) int cc; asm volatile ( - " .insn s,0xb2840000,%1\n" + " lcctl %1\n" " ipm %0\n" " srl %0,28\n" : "=d" (cc) : "Q" (ctl) : "cc"); @@ -194,7 +198,7 @@ static inline int __ecctr(u64 ctr, u64 *content) int cc; asm volatile ( - " .insn rre,0xb2e40000,%0,%2\n" + " ecctr %0,%2\n" " ipm %1\n" " srl %1,28\n" : "=d" (_content), "=d" (cc) : "d" (ctr) : "cc"); @@ -244,7 +248,7 @@ static inline int qsi(struct hws_qsi_info_block *info) int cc = 1; asm volatile( - "0: .insn s,0xb2860000,%1\n" + "0: qsi %1\n" "1: lhi %0,0\n" "2:\n" EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) @@ -259,7 +263,7 @@ static inline int lsctl(struct hws_lsctl_request_block *req) cc = 1; asm volatile( - "0: .insn s,0xb2870000,0(%1)\n" + "0: lsctl 0(%1)\n" "1: ipm %0\n" " srl %0,28\n" "2:\n" @@ -270,59 +274,4 @@ static inline int lsctl(struct hws_lsctl_request_block *req) return cc ? 
-EINVAL : 0; } - -/* Sampling control helper functions */ - -#include <linux/time.h> - -static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi, - unsigned long freq) -{ - return (USEC_PER_SEC / freq) * qsi->cpu_speed; -} - -static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi, - unsigned long rate) -{ - return USEC_PER_SEC * qsi->cpu_speed / rate; -} - -#define SDB_TE_ALERT_REQ_MASK 0x4000000000000000UL -#define SDB_TE_BUFFER_FULL_MASK 0x8000000000000000UL - -/* Return TOD timestamp contained in an trailer entry */ -static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te) -{ - /* TOD in STCKE format */ - if (te->t) - return *((unsigned long long *) &te->timestamp[1]); - - /* TOD in STCK format */ - return *((unsigned long long *) &te->timestamp[0]); -} - -/* Return pointer to trailer entry of an sample data block */ -static inline unsigned long *trailer_entry_ptr(unsigned long v) -{ - void *ret; - - ret = (void *) v; - ret += PAGE_SIZE; - ret -= sizeof(struct hws_trailer_entry); - - return (unsigned long *) ret; -} - -/* Return true if the entry in the sample data block table (sdbt) - * is a link to the next sdbt */ -static inline int is_link_entry(unsigned long *s) -{ - return *s & 0x1ul ? 1 : 0; -} - -/* Return pointer to the linked sdbt */ -static inline unsigned long *get_next_sdbt(unsigned long *s) -{ - return (unsigned long *) (*s & ~0x1ul); -} #endif /* _ASM_S390_CPU_MF_H */ diff --git a/arch/s390/include/asm/cpufeature.h b/arch/s390/include/asm/cpufeature.h index 1d007c6ede95..931204613753 100644 --- a/arch/s390/include/asm/cpufeature.h +++ b/arch/s390/include/asm/cpufeature.h @@ -2,28 +2,21 @@ /* * Module interface for CPU features * - * Copyright IBM Corp. 2015 + * Copyright IBM Corp. 2015, 2022 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ #ifndef __ASM_S390_CPUFEATURE_H #define __ASM_S390_CPUFEATURE_H -#include <asm/elf.h> +enum { + S390_CPU_FEATURE_MSA, + S390_CPU_FEATURE_VXRS, + S390_CPU_FEATURE_UV, + MAX_CPU_FEATURES +}; -/* Hardware features on Linux on z Systems are indicated by facility bits that - * are mapped to the so-called machine flags. Particular machine flags are - * then used to define ELF hardware capabilities; most notably hardware flags - * that are essential for user space / glibc. - * - * Restrict the set of exposed CPU features to ELF hardware capabilities for - * now. Additional machine flags can be indicated by values larger than - * MAX_ELF_HWCAP_FEATURES. - */ -#define MAX_ELF_HWCAP_FEATURES (8 * sizeof(elf_hwcap)) -#define MAX_CPU_FEATURES MAX_ELF_HWCAP_FEATURES - -#define cpu_feature(feat) ilog2(HWCAP_S390_ ## feat) +#define cpu_feature(feature) (feature) int cpu_have_feature(unsigned int nr); diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index cb729d111e20..30bb3ec4e5fc 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -11,28 +11,11 @@ #include <linux/types.h> #include <asm/timex.h> -#define CPUTIME_PER_USEC 4096ULL -#define CPUTIME_PER_SEC (CPUTIME_PER_USEC * USEC_PER_SEC) - -/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ - -#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) - -/* - * Convert cputime to microseconds. - */ -static inline u64 cputime_to_usecs(const u64 cputime) -{ - return cputime >> 12; -} - /* * Convert cputime to nanoseconds. 
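 *
 * One CPU timer unit is 2**-12 microseconds, i.e. 125/512 of a nanosecond,
 * which is why the removed cputime_to_usecs() above was a plain right shift
 * by 12. A sketch of the multiply-and-shift that tod_to_ns() boils down to
 * (an illustration inferred from the unit size, not part of this patch):
 *
 *	static inline u64 cputime_to_ns_sketch(u64 t)
 *	{
 *		return ((t >> 9) * 125) + (((t & 0x1ff) * 125) >> 9);
 *	}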
*/ #define cputime_to_nsecs(cputime) tod_to_ns(cputime) -u64 arch_cpu_idle_time(int cpu); - -#define arch_idle_time(cpu) arch_cpu_idle_time(cpu) +void account_idle_time_irq(void); #endif /* _S390_CPUTIME_H */ diff --git a/arch/s390/include/asm/crw.h b/arch/s390/include/asm/crw.h index c6ebfd31f1db..97456d98fe76 100644 --- a/arch/s390/include/asm/crw.h +++ b/arch/s390/include/asm/crw.h @@ -5,7 +5,6 @@ * Author(s): Ingo Adlung <adlung@de.ibm.com>, * Martin Schwidefsky <schwidefsky@de.ibm.com>, * Cornelia Huck <cornelia.huck@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, */ #ifndef _ASM_S390_CRW_H diff --git a/arch/s390/include/asm/css_chars.h b/arch/s390/include/asm/css_chars.h index 480bb02ccacd..638137d46c85 100644 --- a/arch/s390/include/asm/css_chars.h +++ b/arch/s390/include/asm/css_chars.h @@ -36,7 +36,9 @@ struct css_general_char { u64 alt_ssi : 1; /* bit 108 */ u64 : 1; u64 narf : 1; /* bit 110 */ - u64 : 12; + u64 : 5; + u64 enarf: 1; /* bit 116 */ + u64 : 6; u64 util_str : 1;/* bit 123 */ } __packed; diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h deleted file mode 100644 index ed5efbb531c4..000000000000 --- a/arch/s390/include/asm/ctl_reg.h +++ /dev/null @@ -1,120 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright IBM Corp. 1999, 2009 - * - * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> - */ - -#ifndef __ASM_CTL_REG_H -#define __ASM_CTL_REG_H - -#include <linux/bits.h> - -#define CR0_CLOCK_COMPARATOR_SIGN BIT(63 - 10) -#define CR0_LOW_ADDRESS_PROTECTION BIT(63 - 35) -#define CR0_EMERGENCY_SIGNAL_SUBMASK BIT(63 - 49) -#define CR0_EXTERNAL_CALL_SUBMASK BIT(63 - 50) -#define CR0_CLOCK_COMPARATOR_SUBMASK BIT(63 - 52) -#define CR0_CPU_TIMER_SUBMASK BIT(63 - 53) -#define CR0_SERVICE_SIGNAL_SUBMASK BIT(63 - 54) -#define CR0_UNUSED_56 BIT(63 - 56) -#define CR0_INTERRUPT_KEY_SUBMASK BIT(63 - 57) -#define CR0_MEASUREMENT_ALERT_SUBMASK BIT(63 - 58) - -#define CR2_GUARDED_STORAGE BIT(63 - 59) - -#define CR14_UNUSED_32 BIT(63 - 32) -#define CR14_UNUSED_33 BIT(63 - 33) -#define CR14_CHANNEL_REPORT_SUBMASK BIT(63 - 35) -#define CR14_RECOVERY_SUBMASK BIT(63 - 36) -#define CR14_DEGRADATION_SUBMASK BIT(63 - 37) -#define CR14_EXTERNAL_DAMAGE_SUBMASK BIT(63 - 38) -#define CR14_WARNING_SUBMASK BIT(63 - 39) - -#ifndef __ASSEMBLY__ - -#include <linux/bug.h> - -#define __ctl_load(array, low, high) do { \ - typedef struct { char _[sizeof(array)]; } addrtype; \ - \ - BUILD_BUG_ON(sizeof(addrtype) != (high - low + 1) * sizeof(long));\ - asm volatile( \ - " lctlg %1,%2,%0\n" \ - : \ - : "Q" (*(addrtype *)(&array)), "i" (low), "i" (high) \ - : "memory"); \ -} while (0) - -#define __ctl_store(array, low, high) do { \ - typedef struct { char _[sizeof(array)]; } addrtype; \ - \ - BUILD_BUG_ON(sizeof(addrtype) != (high - low + 1) * sizeof(long));\ - asm volatile( \ - " stctg %1,%2,%0\n" \ - : "=Q" (*(addrtype *)(&array)) \ - : "i" (low), "i" (high)); \ -} while (0) - -static __always_inline void __ctl_set_bit(unsigned int cr, unsigned int bit) -{ - unsigned long reg; - - __ctl_store(reg, cr, cr); - reg |= 1UL << bit; - __ctl_load(reg, cr, cr); -} - -static __always_inline void __ctl_clear_bit(unsigned int cr, unsigned int bit) -{ - unsigned long reg; - - __ctl_store(reg, cr, cr); - reg &= ~(1UL << bit); - __ctl_load(reg, cr, cr); -} - -void smp_ctl_set_bit(int cr, int bit); -void smp_ctl_clear_bit(int cr, int bit); - -union ctlreg0 { - unsigned long val; - struct { - unsigned long : 8; - unsigned long tcx : 1; /* Transactional-Execution control 
*/ - unsigned long pifo : 1; /* Transactional-Execution Program- - Interruption-Filtering Override */ - unsigned long : 22; - unsigned long : 3; - unsigned long lap : 1; /* Low-address-protection control */ - unsigned long : 4; - unsigned long edat : 1; /* Enhanced-DAT-enablement control */ - unsigned long : 2; - unsigned long iep : 1; /* Instruction-Execution-Protection */ - unsigned long : 1; - unsigned long afp : 1; /* AFP-register control */ - unsigned long vx : 1; /* Vector enablement control */ - unsigned long : 7; - unsigned long sssm : 1; /* Service signal subclass mask */ - unsigned long : 9; - }; -}; - -union ctlreg2 { - unsigned long val; - struct { - unsigned long : 33; - unsigned long ducto : 25; - unsigned long : 1; - unsigned long gse : 1; - unsigned long : 1; - unsigned long tds : 1; - unsigned long tdc : 2; - }; -}; - -#define ctl_set_bit(cr, bit) smp_ctl_set_bit(cr, bit) -#define ctl_clear_bit(cr, bit) smp_ctl_clear_bit(cr, bit) - -#endif /* __ASSEMBLY__ */ -#endif /* __ASM_CTL_REG_H */ diff --git a/arch/s390/include/asm/ctlreg.h b/arch/s390/include/asm/ctlreg.h new file mode 100644 index 000000000000..72a9556d04f3 --- /dev/null +++ b/arch/s390/include/asm/ctlreg.h @@ -0,0 +1,255 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2009 + * + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#ifndef __ASM_S390_CTLREG_H +#define __ASM_S390_CTLREG_H + +#include <linux/bits.h> + +#define CR0_TRANSACTIONAL_EXECUTION_BIT (63 - 8) +#define CR0_CLOCK_COMPARATOR_SIGN_BIT (63 - 10) +#define CR0_CRYPTOGRAPHY_COUNTER_BIT (63 - 13) +#define CR0_PAI_EXTENSION_BIT (63 - 14) +#define CR0_CPUMF_EXTRACTION_AUTH_BIT (63 - 15) +#define CR0_WARNING_TRACK_BIT (63 - 30) +#define CR0_LOW_ADDRESS_PROTECTION_BIT (63 - 35) +#define CR0_FETCH_PROTECTION_OVERRIDE_BIT (63 - 38) +#define CR0_STORAGE_PROTECTION_OVERRIDE_BIT (63 - 39) +#define CR0_EDAT_BIT (63 - 40) +#define CR0_INSTRUCTION_EXEC_PROTECTION_BIT (63 - 43) +#define CR0_VECTOR_BIT (63 - 46) +#define CR0_MALFUNCTION_ALERT_SUBMASK_BIT (63 - 48) +#define CR0_EMERGENCY_SIGNAL_SUBMASK_BIT (63 - 49) +#define CR0_EXTERNAL_CALL_SUBMASK_BIT (63 - 50) +#define CR0_CLOCK_COMPARATOR_SUBMASK_BIT (63 - 52) +#define CR0_CPU_TIMER_SUBMASK_BIT (63 - 53) +#define CR0_SERVICE_SIGNAL_SUBMASK_BIT (63 - 54) +#define CR0_UNUSED_56_BIT (63 - 56) +#define CR0_INTERRUPT_KEY_SUBMASK_BIT (63 - 57) +#define CR0_MEASUREMENT_ALERT_SUBMASK_BIT (63 - 58) +#define CR0_ETR_SUBMASK_BIT (63 - 59) +#define CR0_IUCV_BIT (63 - 62) + +#define CR0_TRANSACTIONAL_EXECUTION BIT(CR0_TRANSACTIONAL_EXECUTION_BIT) +#define CR0_CLOCK_COMPARATOR_SIGN BIT(CR0_CLOCK_COMPARATOR_SIGN_BIT) +#define CR0_CRYPTOGRAPHY_COUNTER BIT(CR0_CRYPTOGRAPHY_COUNTER_BIT) +#define CR0_PAI_EXTENSION BIT(CR0_PAI_EXTENSION_BIT) +#define CR0_CPUMF_EXTRACTION_AUTH BIT(CR0_CPUMF_EXTRACTION_AUTH_BIT) +#define CR0_WARNING_TRACK BIT(CR0_WARNING_TRACK_BIT) +#define CR0_LOW_ADDRESS_PROTECTION BIT(CR0_LOW_ADDRESS_PROTECTION_BIT) +#define CR0_FETCH_PROTECTION_OVERRIDE BIT(CR0_FETCH_PROTECTION_OVERRIDE_BIT) +#define CR0_STORAGE_PROTECTION_OVERRIDE BIT(CR0_STORAGE_PROTECTION_OVERRIDE_BIT) +#define CR0_EDAT BIT(CR0_EDAT_BIT) +#define CR0_INSTRUCTION_EXEC_PROTECTION BIT(CR0_INSTRUCTION_EXEC_PROTECTION_BIT) +#define CR0_VECTOR BIT(CR0_VECTOR_BIT) +#define CR0_MALFUNCTION_ALERT_SUBMASK BIT(CR0_MALFUNCTION_ALERT_SUBMASK_BIT) +#define CR0_EMERGENCY_SIGNAL_SUBMASK BIT(CR0_EMERGENCY_SIGNAL_SUBMASK_BIT) +#define CR0_EXTERNAL_CALL_SUBMASK BIT(CR0_EXTERNAL_CALL_SUBMASK_BIT) +#define 
CR0_CLOCK_COMPARATOR_SUBMASK BIT(CR0_CLOCK_COMPARATOR_SUBMASK_BIT) +#define CR0_CPU_TIMER_SUBMASK BIT(CR0_CPU_TIMER_SUBMASK_BIT) +#define CR0_SERVICE_SIGNAL_SUBMASK BIT(CR0_SERVICE_SIGNAL_SUBMASK_BIT) +#define CR0_UNUSED_56 BIT(CR0_UNUSED_56_BIT) +#define CR0_INTERRUPT_KEY_SUBMASK BIT(CR0_INTERRUPT_KEY_SUBMASK_BIT) +#define CR0_MEASUREMENT_ALERT_SUBMASK BIT(CR0_MEASUREMENT_ALERT_SUBMASK_BIT) +#define CR0_ETR_SUBMASK BIT(CR0_ETR_SUBMASK_BIT) +#define CR0_IUCV BIT(CR0_IUCV_BIT) + +#define CR2_MIO_ADDRESSING_BIT (63 - 58) +#define CR2_GUARDED_STORAGE_BIT (63 - 59) + +#define CR2_MIO_ADDRESSING BIT(CR2_MIO_ADDRESSING_BIT) +#define CR2_GUARDED_STORAGE BIT(CR2_GUARDED_STORAGE_BIT) + +#define CR14_UNUSED_32_BIT (63 - 32) +#define CR14_UNUSED_33_BIT (63 - 33) +#define CR14_CHANNEL_REPORT_SUBMASK_BIT (63 - 35) +#define CR14_RECOVERY_SUBMASK_BIT (63 - 36) +#define CR14_DEGRADATION_SUBMASK_BIT (63 - 37) +#define CR14_EXTERNAL_DAMAGE_SUBMASK_BIT (63 - 38) +#define CR14_WARNING_SUBMASK_BIT (63 - 39) + +#define CR14_UNUSED_32 BIT(CR14_UNUSED_32_BIT) +#define CR14_UNUSED_33 BIT(CR14_UNUSED_33_BIT) +#define CR14_CHANNEL_REPORT_SUBMASK BIT(CR14_CHANNEL_REPORT_SUBMASK_BIT) +#define CR14_RECOVERY_SUBMASK BIT(CR14_RECOVERY_SUBMASK_BIT) +#define CR14_DEGRADATION_SUBMASK BIT(CR14_DEGRADATION_SUBMASK_BIT) +#define CR14_EXTERNAL_DAMAGE_SUBMASK BIT(CR14_EXTERNAL_DAMAGE_SUBMASK_BIT) +#define CR14_WARNING_SUBMASK BIT(CR14_WARNING_SUBMASK_BIT) + +#ifndef __ASSEMBLY__ + +#include <linux/bug.h> + +struct ctlreg { + unsigned long val; +}; + +#define __local_ctl_load(low, high, array) do { \ + struct addrtype { \ + char _[sizeof(array)]; \ + }; \ + int _high = high; \ + int _low = low; \ + int _esize; \ + \ + _esize = (_high - _low + 1) * sizeof(struct ctlreg); \ + BUILD_BUG_ON(sizeof(struct addrtype) != _esize); \ + typecheck(struct ctlreg, array[0]); \ + asm volatile( \ + " lctlg %[_low],%[_high],%[_arr]\n" \ + : \ + : [_arr] "Q" (*(struct addrtype *)(&array)), \ + [_low] "i" (low), [_high] "i" (high) \ + : "memory"); \ +} while (0) + +#define __local_ctl_store(low, high, array) do { \ + struct addrtype { \ + char _[sizeof(array)]; \ + }; \ + int _high = high; \ + int _low = low; \ + int _esize; \ + \ + _esize = (_high - _low + 1) * sizeof(struct ctlreg); \ + BUILD_BUG_ON(sizeof(struct addrtype) != _esize); \ + typecheck(struct ctlreg, array[0]); \ + asm volatile( \ + " stctg %[_low],%[_high],%[_arr]\n" \ + : [_arr] "=Q" (*(struct addrtype *)(&array)) \ + : [_low] "i" (low), [_high] "i" (high)); \ +} while (0) + +static __always_inline void local_ctl_load(unsigned int cr, struct ctlreg *reg) +{ + asm volatile( + " lctlg %[cr],%[cr],%[reg]\n" + : + : [reg] "Q" (*reg), [cr] "i" (cr) + : "memory"); +} + +static __always_inline void local_ctl_store(unsigned int cr, struct ctlreg *reg) +{ + asm volatile( + " stctg %[cr],%[cr],%[reg]\n" + : [reg] "=Q" (*reg) + : [cr] "i" (cr)); +} + +static __always_inline struct ctlreg local_ctl_set_bit(unsigned int cr, unsigned int bit) +{ + struct ctlreg new, old; + + local_ctl_store(cr, &old); + new = old; + new.val |= 1UL << bit; + local_ctl_load(cr, &new); + return old; +} + +static __always_inline struct ctlreg local_ctl_clear_bit(unsigned int cr, unsigned int bit) +{ + struct ctlreg new, old; + + local_ctl_store(cr, &old); + new = old; + new.val &= ~(1UL << bit); + local_ctl_load(cr, &new); + return old; +} + +struct lowcore; + +void system_ctlreg_lock(void); +void system_ctlreg_unlock(void); +void system_ctlreg_init_save_area(struct lowcore *lc); +void system_ctlreg_modify(unsigned 
int cr, unsigned long data, int request); + +enum { + CTLREG_SET_BIT, + CTLREG_CLEAR_BIT, + CTLREG_LOAD, +}; + +static inline void system_ctl_set_bit(unsigned int cr, unsigned int bit) +{ + system_ctlreg_modify(cr, bit, CTLREG_SET_BIT); +} + +static inline void system_ctl_clear_bit(unsigned int cr, unsigned int bit) +{ + system_ctlreg_modify(cr, bit, CTLREG_CLEAR_BIT); +} + +static inline void system_ctl_load(unsigned int cr, struct ctlreg *reg) +{ + system_ctlreg_modify(cr, reg->val, CTLREG_LOAD); +} + +union ctlreg0 { + unsigned long val; + struct ctlreg reg; + struct { + unsigned long : 8; + unsigned long tcx : 1; /* Transactional-Execution control */ + unsigned long pifo : 1; /* Transactional-Execution Program- + Interruption-Filtering Override */ + unsigned long : 3; + unsigned long ccc : 1; /* Cryptography counter control */ + unsigned long pec : 1; /* PAI extension control */ + unsigned long : 17; + unsigned long : 3; + unsigned long lap : 1; /* Low-address-protection control */ + unsigned long : 4; + unsigned long edat : 1; /* Enhanced-DAT-enablement control */ + unsigned long : 2; + unsigned long iep : 1; /* Instruction-Execution-Protection */ + unsigned long : 1; + unsigned long afp : 1; /* AFP-register control */ + unsigned long vx : 1; /* Vector enablement control */ + unsigned long : 7; + unsigned long sssm : 1; /* Service signal subclass mask */ + unsigned long : 9; + }; +}; + +union ctlreg2 { + unsigned long val; + struct ctlreg reg; + struct { + unsigned long : 33; + unsigned long ducto : 25; + unsigned long : 1; + unsigned long gse : 1; + unsigned long : 1; + unsigned long tds : 1; + unsigned long tdc : 2; + }; +}; + +union ctlreg5 { + unsigned long val; + struct ctlreg reg; + struct { + unsigned long : 33; + unsigned long pasteo: 25; + unsigned long : 6; + }; +}; + +union ctlreg15 { + unsigned long val; + struct ctlreg reg; + struct { + unsigned long lsea : 61; + unsigned long : 3; + }; +}; + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_S390_CTLREG_H */ diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index 310134015541..ccd4e148b5ed 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -2,17 +2,18 @@ /* * S/390 debug facility * - * Copyright IBM Corp. 1999, 2000 + * Copyright IBM Corp. 1999, 2020 */ -#ifndef DEBUG_H -#define DEBUG_H +#ifndef _ASM_S390_DEBUG_H +#define _ASM_S390_DEBUG_H #include <linux/string.h> #include <linux/spinlock.h> #include <linux/kernel.h> #include <linux/time.h> #include <linux/refcount.h> -#include <uapi/asm/debug.h> +#include <linux/fs.h> +#include <linux/init.h> #define DEBUG_MAX_LEVEL 6 /* debug levels range from 0 to 6 */ #define DEBUG_OFF_LEVEL -1 /* level where debug is switched off */ @@ -26,6 +27,16 @@ #define DEBUG_DATA(entry) (char *)(entry + 1) /* data is stored behind */ /* the entry information */ +#define __DEBUG_FEATURE_VERSION 3 /* version of debug feature */ + +struct __debug_entry { + unsigned long clock : 60; + unsigned long exception : 1; + unsigned long level : 3; + void *caller; + unsigned short cpu; +} __packed; + typedef struct __debug_entry debug_entry_t; struct debug_view; @@ -82,7 +93,6 @@ struct debug_view { }; extern struct debug_view debug_hex_ascii_view; -extern struct debug_view debug_raw_view; extern struct debug_view debug_sprintf_view; /* do NOT use the _common functions */ @@ -212,7 +222,7 @@ static inline debug_entry_t *debug_text_event(debug_info_t *id, int level, /* * IMPORTANT: Use "%s" in sprintf format strings with care! 
Only pointers are - * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details! + * stored in the s390dbf. See Documentation/arch/s390/s390dbf.rst for more details! */ extern debug_entry_t * __debug_sprintf_event(debug_info_t *id, int level, char *string, ...) @@ -340,7 +350,7 @@ static inline debug_entry_t *debug_text_exception(debug_info_t *id, int level, /* * IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are - * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details! + * stored in the s390dbf. See Documentation/arch/s390/s390dbf.rst for more details! */ extern debug_entry_t * __debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) @@ -382,38 +392,99 @@ int debug_register_view(debug_info_t *id, struct debug_view *view); int debug_unregister_view(debug_info_t *id, struct debug_view *view); +#ifndef MODULE + /* - define the debug levels: - - 0 No debugging output to console or syslog - - 1 Log internal errors to syslog, ignore check conditions - - 2 Log internal errors and check conditions to syslog - - 3 Log internal errors to console, log check conditions to syslog - - 4 Log internal errors and check conditions to console - - 5 panic on internal errors, log check conditions to console - - 6 panic on both, internal errors and check conditions + * Note: Initial page and area numbers must be fixed to allow static + * initialization. This enables very early tracing. Changes to these values + * must be reflected in __DEFINE_STATIC_AREA. + */ +#define EARLY_PAGES 8 +#define EARLY_AREAS 1 + +#define VNAME(var, suffix) __##var##_##suffix + +/* + * Define static areas for early trace data. During boot debug_register_static() + * will replace these with dynamically allocated areas to allow custom page and + * area sizes, and dynamic resizing. 
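+ *
+ * A typical user declares one of these at file scope through the
+ * DEFINE_STATIC_DEBUG_INFO() macro documented below, for example (sketch;
+ * the log name and sizes are illustrative only):
+ *
+ *	DEFINE_STATIC_DEBUG_INFO(early_dbf, "early_trace", 8, 1, 16,
+ *				 &debug_hex_ascii_view);
+ *
+ * and can then trace with debug_event(&early_dbf, level, data, 16) long
+ * before debugfs is available.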
+ */ +#define __DEFINE_STATIC_AREA(var) \ +static char VNAME(var, data)[EARLY_PAGES][PAGE_SIZE] __initdata; \ +static debug_entry_t *VNAME(var, pages)[EARLY_PAGES] __initdata = { \ + (debug_entry_t *)VNAME(var, data)[0], \ + (debug_entry_t *)VNAME(var, data)[1], \ + (debug_entry_t *)VNAME(var, data)[2], \ + (debug_entry_t *)VNAME(var, data)[3], \ + (debug_entry_t *)VNAME(var, data)[4], \ + (debug_entry_t *)VNAME(var, data)[5], \ + (debug_entry_t *)VNAME(var, data)[6], \ + (debug_entry_t *)VNAME(var, data)[7], \ +}; \ +static debug_entry_t **VNAME(var, areas)[EARLY_AREAS] __initdata = { \ + (debug_entry_t **)VNAME(var, pages), \ +}; \ +static int VNAME(var, active_pages)[EARLY_AREAS] __initdata; \ +static int VNAME(var, active_entries)[EARLY_AREAS] __initdata + +#define __DEBUG_INFO_INIT(var, _name, _buf_size) { \ + .next = NULL, \ + .prev = NULL, \ + .ref_count = REFCOUNT_INIT(1), \ + .lock = __SPIN_LOCK_UNLOCKED(var.lock), \ + .level = DEBUG_DEFAULT_LEVEL, \ + .nr_areas = EARLY_AREAS, \ + .pages_per_area = EARLY_PAGES, \ + .buf_size = (_buf_size), \ + .entry_size = sizeof(debug_entry_t) + (_buf_size), \ + .areas = VNAME(var, areas), \ + .active_area = 0, \ + .active_pages = VNAME(var, active_pages), \ + .active_entries = VNAME(var, active_entries), \ + .debugfs_root_entry = NULL, \ + .debugfs_entries = { NULL }, \ + .views = { NULL }, \ + .name = (_name), \ + .mode = 0600, \ +} + +#define __REGISTER_STATIC_DEBUG_INFO(var, name, pages, areas, view) \ +static int __init VNAME(var, reg)(void) \ +{ \ + debug_register_static(&var, (pages), (areas)); \ + debug_register_view(&var, (view)); \ + return 0; \ +} \ +arch_initcall(VNAME(var, reg)) + +/** + * DEFINE_STATIC_DEBUG_INFO - Define static debug_info_t + * + * @var: Name of debug_info_t variable + * @name: Name of debug log (e.g. used for debugfs entry) + * @pages: Number of pages per area + * @nr_areas: Number of debug areas + * @buf_size: Size of data area in each debug entry + * @view: Pointer to debug view struct + * + * Define a static debug_info_t for early tracing. The associated debugfs log + * is automatically registered with the specified debug view. + * + * Important: Users of this macro must not call any of the + * debug_register/_unregister() functions for this debug_info_t! + * + * Note: Tracing will start with a fixed number of initial pages and areas. + * The debug area will be changed to use the specified numbers during + * arch_initcall. */ +#define DEFINE_STATIC_DEBUG_INFO(var, name, pages, nr_areas, buf_size, view) \ +__DEFINE_STATIC_AREA(var); \ +static debug_info_t __refdata var = \ + __DEBUG_INFO_INIT(var, (name), (buf_size)); \ +__REGISTER_STATIC_DEBUG_INFO(var, name, pages, nr_areas, view) + +void debug_register_static(debug_info_t *id, int pages_per_area, int nr_areas); + +#endif /* MODULE */ -#ifndef DEBUG_LEVEL -#define DEBUG_LEVEL 4 -#endif - -#define INTERNAL_ERRMSG(x,y...) "E" __FILE__ "%d: " x, __LINE__, y -#define INTERNAL_WRNMSG(x,y...) "W" __FILE__ "%d: " x, __LINE__, y -#define INTERNAL_INFMSG(x,y...) "I" __FILE__ "%d: " x, __LINE__, y -#define INTERNAL_DEBMSG(x,y...) "D" __FILE__ "%d: " x, __LINE__, y - -#if DEBUG_LEVEL > 0 -#define PRINT_DEBUG(x...) printk(KERN_DEBUG PRINTK_HEADER x) -#define PRINT_INFO(x...) printk(KERN_INFO PRINTK_HEADER x) -#define PRINT_WARN(x...) printk(KERN_WARNING PRINTK_HEADER x) -#define PRINT_ERR(x...) printk(KERN_ERR PRINTK_HEADER x) -#define PRINT_FATAL(x...) panic(PRINTK_HEADER x) -#else -#define PRINT_DEBUG(x...) printk(KERN_DEBUG PRINTK_HEADER x) -#define PRINT_INFO(x...) 
printk(KERN_DEBUG PRINTK_HEADER x) -#define PRINT_WARN(x...) printk(KERN_DEBUG PRINTK_HEADER x) -#define PRINT_ERR(x...) printk(KERN_DEBUG PRINTK_HEADER x) -#define PRINT_FATAL(x...) printk(KERN_DEBUG PRINTK_HEADER x) -#endif /* DASD_DEBUG */ - -#endif /* DEBUG_H */ +#endif /* _ASM_S390_DEBUG_H */ diff --git a/arch/s390/include/asm/delay.h b/arch/s390/include/asm/delay.h index 898323fd93d2..21a8fe18fe66 100644 --- a/arch/s390/include/asm/delay.h +++ b/arch/s390/include/asm/delay.h @@ -13,13 +13,12 @@ #ifndef _S390_DELAY_H #define _S390_DELAY_H -void __ndelay(unsigned long long nsecs); -void __udelay(unsigned long long usecs); -void udelay_simple(unsigned long long usecs); +void __ndelay(unsigned long nsecs); +void __udelay(unsigned long usecs); void __delay(unsigned long loops); -#define ndelay(n) __ndelay((unsigned long long) (n)) -#define udelay(n) __udelay((unsigned long long) (n)) -#define mdelay(n) __udelay((unsigned long long) (n) * 1000) +#define ndelay(n) __ndelay((unsigned long)(n)) +#define udelay(n) __udelay((unsigned long)(n)) +#define mdelay(n) __udelay((unsigned long)(n) * 1000) #endif /* defined(_S390_DELAY_H) */ diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 0036eab14391..bed804137537 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -11,6 +11,8 @@ #include <linux/if_ether.h> #include <linux/percpu.h> +#include <asm/asm-extable.h> +#include <asm/cio.h> enum diag_stat_enum { DIAG_STAT_X008, @@ -19,6 +21,7 @@ enum diag_stat_enum { DIAG_STAT_X014, DIAG_STAT_X044, DIAG_STAT_X064, + DIAG_STAT_X08C, DIAG_STAT_X09C, DIAG_STAT_X0DC, DIAG_STAT_X204, @@ -33,6 +36,7 @@ enum diag_stat_enum { DIAG_STAT_X304, DIAG_STAT_X308, DIAG_STAT_X318, + DIAG_STAT_X320, DIAG_STAT_X500, NR_DIAG_STAT }; @@ -47,8 +51,8 @@ static inline void diag10_range(unsigned long start_pfn, unsigned long num_pfn) { unsigned long start_addr, end_addr; - start_addr = start_pfn << PAGE_SHIFT; - end_addr = (start_pfn + num_pfn - 1) << PAGE_SHIFT; + start_addr = pfn_to_phys(start_pfn); + end_addr = pfn_to_phys(start_pfn + num_pfn - 1); diag_stat_inc(DIAG_STAT_X010); asm volatile( @@ -78,10 +82,20 @@ struct diag210 { u8 vrdccrty; /* real device type (output) */ u8 vrdccrmd; /* real device model (output) */ u8 vrdccrft; /* real device feature (output) */ -} __attribute__((packed, aligned(4))); +} __packed __aligned(4); extern int diag210(struct diag210 *addr); +struct diag8c { + u8 flags; + u8 num_partitions; + u16 width; + u16 height; + u8 data[]; +} __packed __aligned(4); + +extern int diag8c(struct diag8c *out, struct ccw_dev_id *devno); + /* bit is set in flags, when physical cpu info is included in diag 204 data */ #define DIAG204_LPAR_PHYS_FLG 0x80 #define DIAG204_LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */ @@ -95,6 +109,8 @@ enum diag204_sc { DIAG204_SUBC_STIB7 = 7 }; +#define DIAG204_SUBCODE_MASK 0xffff + /* The two available diag 204 data formats */ enum diag204_format { DIAG204_INFO_SIMPLE = 0, @@ -298,10 +314,8 @@ struct diag26c_mac_resp { union diag318_info { unsigned long val; struct { - unsigned int cpnc : 8; - unsigned int cpvc_linux : 24; - unsigned char cpvc_distro[3]; - unsigned char zero; + unsigned long cpnc : 8; + unsigned long cpvc : 56; }; }; @@ -311,14 +325,27 @@ int diag26c(void *req, void *resp, enum diag26c_sc subcode); struct hypfs_diag0c_entry; +/* + * This structure must contain only pointers/references into + * the AMODE31 text section. 
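+ *
+ * The code behind these function pointers must remain usable with 31-bit
+ * addressing, which is why callers dispatch through this ops table rather
+ * than calling the _amode31 helpers directly. A sketch (assuming the usual
+ * condition-code-0-on-success convention and a request block prepared by
+ * the caller):
+ *
+ *	struct diag210 info;
+ *
+ *	if (diag_amode31_ops.diag210(&info) == 0)
+ *		pr_debug("real device type 0x%x\n", info.vrdccrty);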
+ */ struct diag_ops { int (*diag210)(struct diag210 *addr); int (*diag26c)(void *req, void *resp, enum diag26c_sc subcode); int (*diag14)(unsigned long rx, unsigned long ry1, unsigned long subcode); + int (*diag8c)(struct diag8c *addr, struct ccw_dev_id *devno, size_t len); void (*diag0c)(struct hypfs_diag0c_entry *entry); void (*diag308_reset)(void); }; -extern struct diag_ops diag_dma_ops; -extern struct diag210 *__diag210_tmp_dma; +extern struct diag_ops diag_amode31_ops; +extern struct diag210 *__diag210_tmp_amode31; + +int _diag210_amode31(struct diag210 *addr); +int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode); +int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode); +void _diag0c_amode31(struct hypfs_diag0c_entry *entry); +void _diag308_reset_amode31(void); +int _diag8c_amode31(struct diag8c *addr, struct ccw_dev_id *devno, size_t len); + #endif /* _ASM_S390_DIAG_H */ diff --git a/arch/s390/include/asm/dma.h b/arch/s390/include/asm/dma.h index 6f26f35d4a71..7fe3e31956d7 100644 --- a/arch/s390/include/asm/dma.h +++ b/arch/s390/include/asm/dma.h @@ -2,19 +2,13 @@ #ifndef _ASM_S390_DMA_H #define _ASM_S390_DMA_H -#include <asm/io.h> +#include <linux/io.h> /* * MAX_DMA_ADDRESS is ambiguous because on s390 its completely unrelated * to DMA. It _is_ used for the s390 memory zone split at 2GB caused * by the 31 bit heritage. */ -#define MAX_DMA_ADDRESS 0x80000000 - -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif +#define MAX_DMA_ADDRESS __va(0x80000000) #endif /* _ASM_S390_DMA_H */ diff --git a/arch/s390/include/asm/eadm.h b/arch/s390/include/asm/eadm.h index bb63b2afdf6f..06f795855af7 100644 --- a/arch/s390/include/asm/eadm.h +++ b/arch/s390/include/asm/eadm.h @@ -78,7 +78,7 @@ struct aob { struct aob_rq_header { struct scm_device *scmdev; - char data[0]; + char data[]; }; struct scm_device { @@ -105,7 +105,7 @@ enum scm_event {SCM_CHANGE, SCM_AVAIL}; struct scm_driver { struct device_driver drv; int (*probe) (struct scm_device *scmdev); - int (*remove) (struct scm_device *scmdev); + void (*remove) (struct scm_device *scmdev); void (*notify) (struct scm_device *scmdev, enum scm_event event); void (*handler) (struct scm_device *scmdev, void *data, blk_status_t error); diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 5775fc22f410..70a30ae258b7 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -91,29 +91,57 @@ /* Keep this the last entry. */ #define R_390_NUM 61 -/* Bits present in AT_HWCAP. 
*/ -#define HWCAP_S390_ESAN3 1 -#define HWCAP_S390_ZARCH 2 -#define HWCAP_S390_STFLE 4 -#define HWCAP_S390_MSA 8 -#define HWCAP_S390_LDISP 16 -#define HWCAP_S390_EIMM 32 -#define HWCAP_S390_DFP 64 -#define HWCAP_S390_HPAGE 128 -#define HWCAP_S390_ETF3EH 256 -#define HWCAP_S390_HIGH_GPRS 512 -#define HWCAP_S390_TE 1024 -#define HWCAP_S390_VXRS 2048 -#define HWCAP_S390_VXRS_BCD 4096 -#define HWCAP_S390_VXRS_EXT 8192 -#define HWCAP_S390_GS 16384 -#define HWCAP_S390_VXRS_EXT2 32768 -#define HWCAP_S390_VXRS_PDE 65536 -#define HWCAP_S390_SORT 131072 -#define HWCAP_S390_DFLT 262144 +enum { + HWCAP_NR_ESAN3 = 0, + HWCAP_NR_ZARCH = 1, + HWCAP_NR_STFLE = 2, + HWCAP_NR_MSA = 3, + HWCAP_NR_LDISP = 4, + HWCAP_NR_EIMM = 5, + HWCAP_NR_DFP = 6, + HWCAP_NR_HPAGE = 7, + HWCAP_NR_ETF3EH = 8, + HWCAP_NR_HIGH_GPRS = 9, + HWCAP_NR_TE = 10, + HWCAP_NR_VXRS = 11, + HWCAP_NR_VXRS_BCD = 12, + HWCAP_NR_VXRS_EXT = 13, + HWCAP_NR_GS = 14, + HWCAP_NR_VXRS_EXT2 = 15, + HWCAP_NR_VXRS_PDE = 16, + HWCAP_NR_SORT = 17, + HWCAP_NR_DFLT = 18, + HWCAP_NR_VXRS_PDE2 = 19, + HWCAP_NR_NNPA = 20, + HWCAP_NR_PCI_MIO = 21, + HWCAP_NR_SIE = 22, + HWCAP_NR_MAX +}; -/* Internal bits, not exposed via elf */ -#define HWCAP_INT_SIE 1UL +/* Bits present in AT_HWCAP. */ +#define HWCAP_ESAN3 BIT(HWCAP_NR_ESAN3) +#define HWCAP_ZARCH BIT(HWCAP_NR_ZARCH) +#define HWCAP_STFLE BIT(HWCAP_NR_STFLE) +#define HWCAP_MSA BIT(HWCAP_NR_MSA) +#define HWCAP_LDISP BIT(HWCAP_NR_LDISP) +#define HWCAP_EIMM BIT(HWCAP_NR_EIMM) +#define HWCAP_DFP BIT(HWCAP_NR_DFP) +#define HWCAP_HPAGE BIT(HWCAP_NR_HPAGE) +#define HWCAP_ETF3EH BIT(HWCAP_NR_ETF3EH) +#define HWCAP_HIGH_GPRS BIT(HWCAP_NR_HIGH_GPRS) +#define HWCAP_TE BIT(HWCAP_NR_TE) +#define HWCAP_VXRS BIT(HWCAP_NR_VXRS) +#define HWCAP_VXRS_BCD BIT(HWCAP_NR_VXRS_BCD) +#define HWCAP_VXRS_EXT BIT(HWCAP_NR_VXRS_EXT) +#define HWCAP_GS BIT(HWCAP_NR_GS) +#define HWCAP_VXRS_EXT2 BIT(HWCAP_NR_VXRS_EXT2) +#define HWCAP_VXRS_PDE BIT(HWCAP_NR_VXRS_PDE) +#define HWCAP_SORT BIT(HWCAP_NR_SORT) +#define HWCAP_DFLT BIT(HWCAP_NR_DFLT) +#define HWCAP_VXRS_PDE2 BIT(HWCAP_NR_VXRS_PDE2) +#define HWCAP_NNPA BIT(HWCAP_NR_NNPA) +#define HWCAP_PCI_MIO BIT(HWCAP_NR_PCI_MIO) +#define HWCAP_SIE BIT(HWCAP_NR_SIE) /* * These are used to set parameters in the core dumps. @@ -144,10 +172,6 @@ typedef s390_compat_regs compat_elf_gregset_t; #include <linux/sched/mm.h> /* for task_struct */ #include <asm/mmu_context.h> -#include <asm/vdso.h> - -extern unsigned int vdso_enabled; - /* * This is used to ensure we don't load something for the wrong architecture. */ @@ -176,7 +200,7 @@ struct arch_elf_state { !current->mm->context.alloc_pgste) { \ set_thread_flag(TIF_PGSTE); \ set_pt_regs_flag(task_pt_regs(current), \ - PIF_SYSCALL_RESTART); \ + PIF_EXECVE_PGSTE_RESTART); \ _state->rc = -EAGAIN; \ } \ _state->rc; \ @@ -213,10 +237,6 @@ struct arch_elf_state { extern unsigned long elf_hwcap; #define ELF_HWCAP (elf_hwcap) -/* Internal hardware capabilities, not exposed via elf */ - -extern unsigned long int_hwcap; - /* This yields a string that ld.so will use to load implementation specific libraries for optimization. This is more specific in intent than poking at uname or /proc/cpuinfo. 
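The HWCAP rework above pairs every capability with both a bit number (HWCAP_NR_*) and a mask (HWCAP_*), replacing the old hard-coded powers of two. Userspace sees the same bits through AT_HWCAP; a minimal sketch of a consumer follows (the program is illustrative, only the bit number 11 for VXRS is taken from the enum above):

	#include <stdio.h>
	#include <sys/auxv.h>

	int main(void)
	{
		unsigned long hwcap = getauxval(AT_HWCAP);

		/* HWCAP_NR_VXRS == 11 in the enum above */
		if (hwcap & (1UL << 11))
			puts("vector facility (VXRS) advertised");
		return 0;
	}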
@@ -233,8 +253,7 @@ extern char elf_platform[]; do { \ set_personality(PER_LINUX | \ (current->personality & (~PER_MASK))); \ - current->thread.sys_call_table = \ - (unsigned long) &sys_call_table; \ + current->thread.sys_call_table = sys_call_table; \ } while (0) #else /* CONFIG_COMPAT */ #define SET_PERSONALITY(ex) \ @@ -245,11 +264,11 @@ do { \ if ((ex).e_ident[EI_CLASS] == ELFCLASS32) { \ set_thread_flag(TIF_31BIT); \ current->thread.sys_call_table = \ - (unsigned long) &sys_call_table_emu; \ + sys_call_table_emu; \ } else { \ clear_thread_flag(TIF_31BIT); \ current->thread.sys_call_table = \ - (unsigned long) &sys_call_table; \ + sys_call_table; \ } \ } while (0) #endif /* CONFIG_COMPAT */ @@ -269,11 +288,10 @@ do { \ #define STACK_RND_MASK MMAP_RND_MASK /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ -#define ARCH_DLINFO \ -do { \ - if (vdso_enabled) \ - NEW_AUX_ENT(AT_SYSINFO_EHDR, \ - (unsigned long)current->mm->context.vdso_base); \ +#define ARCH_DLINFO \ +do { \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, \ + (unsigned long)current->mm->context.vdso_base); \ } while (0) struct linux_binprm; diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h new file mode 100644 index 000000000000..fdd319a622b0 --- /dev/null +++ b/arch/s390/include/asm/entry-common.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ARCH_S390_ENTRY_COMMON_H +#define ARCH_S390_ENTRY_COMMON_H + +#include <linux/sched.h> +#include <linux/audit.h> +#include <linux/randomize_kstack.h> +#include <linux/processor.h> +#include <linux/uaccess.h> +#include <asm/timex.h> +#include <asm/fpu/api.h> +#include <asm/pai.h> + +#define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP) + +void do_per_trap(struct pt_regs *regs); + +static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + debug_user_asce(0); + + pai_kernel_enter(regs); +} + +#define arch_enter_from_user_mode arch_enter_from_user_mode + +static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs, + unsigned long ti_work) +{ + if (ti_work & _TIF_PER_TRAP) { + clear_thread_flag(TIF_PER_TRAP); + do_per_trap(regs); + } + + if (ti_work & _TIF_GUARDED_STORAGE) + gs_load_bc_cb(regs); +} + +#define arch_exit_to_user_mode_work arch_exit_to_user_mode_work + +static __always_inline void arch_exit_to_user_mode(void) +{ + if (test_cpu_flag(CIF_FPU)) + __load_fpu_regs(); + + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + debug_user_asce(1); + + pai_kernel_exit(current_pt_regs()); +} + +#define arch_exit_to_user_mode arch_exit_to_user_mode + +static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + unsigned long ti_work) +{ + choose_random_kstack_offset(get_tod_clock_fast() & 0xff); +} + +#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare + +#endif diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h index ae27f756b409..af6ba52743e9 100644 --- a/arch/s390/include/asm/extable.h +++ b/arch/s390/include/asm/extable.h @@ -1,12 +1,20 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __S390_EXTABLE_H #define __S390_EXTABLE_H + +#include <asm/ptrace.h> +#include <linux/compiler.h> + /* - * The exception table consists of pairs of addresses: the first is the - * address of an instruction that is allowed to fault, and the second is - * the address at which the program should continue. 
No registers are - * modified, so it is entirely up to the continuation code to figure out - * what to do. + * The exception table consists of three addresses: + * + * - Address of an instruction that is allowed to fault. + * - Address at which the program should continue. + * - Optional address of handler that takes pt_regs * argument and runs in + * interrupt context. + * + * No registers are modified, so it is entirely up to the continuation code + * to figure out what to do. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. This means when everything is well, @@ -17,10 +25,11 @@ struct exception_table_entry { int insn, fixup; + short type, data; }; -extern struct exception_table_entry *__start_dma_ex_table; -extern struct exception_table_entry *__stop_dma_ex_table; +extern struct exception_table_entry *__start_amode31_ex_table; +extern struct exception_table_entry *__stop_amode31_ex_table; const struct exception_table_entry *s390_search_extables(unsigned long addr); @@ -31,4 +40,33 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x) #define ARCH_HAS_RELATIVE_EXTABLE +static inline void swap_ex_entry_fixup(struct exception_table_entry *a, + struct exception_table_entry *b, + struct exception_table_entry tmp, + int delta) +{ + a->fixup = b->fixup + delta; + b->fixup = tmp.fixup - delta; + a->type = b->type; + b->type = tmp.type; + a->data = b->data; + b->data = tmp.data; +} +#define swap_ex_entry_fixup swap_ex_entry_fixup + +#ifdef CONFIG_BPF_JIT + +bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs); + +#else /* !CONFIG_BPF_JIT */ + +static inline bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + return false; +} + +#endif /* CONFIG_BPF_JIT */ + +bool fixup_exception(struct pt_regs *regs); + #endif diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h index 68c476b20b57..796007125dff 100644 --- a/arch/s390/include/asm/facility.h +++ b/arch/s390/include/asm/facility.h @@ -9,11 +9,18 @@ #define __ASM_FACILITY_H #include <asm/facility-defs.h> + +#include <linux/minmax.h> #include <linux/string.h> +#include <linux/types.h> #include <linux/preempt.h> + #include <asm/lowcore.h> -#define MAX_FACILITY_BIT (sizeof(((struct lowcore *)0)->stfle_fac_list) * 8) +#define MAX_FACILITY_BIT (sizeof(stfle_fac_list) * 8) + +extern u64 stfle_fac_list[16]; +extern u64 alt_stfle_fac_list[16]; static inline void __set_facility(unsigned long nr, void *facilities) { @@ -44,7 +51,7 @@ static inline int __test_facility(unsigned long nr, void *facilities) } /* - * The test_facility function uses the bit odering where the MSB is bit 0. + * The test_facility function uses the bit ordering where the MSB is bit 0. * That makes it easier to query facility bits with the bit number as * documented in the Principles of Operation. 
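Because the MSB-0 numbering noted above is the opposite of the LSB-0 convention used by most Linux bitops, a self-contained user-space illustration of the byte/mask arithmetic behind __test_facility may help:

#include <stdio.h>

/* MSB-0: bit 0 is the most significant bit of byte 0, so facility
 * number nr lives in byte nr / 8 under mask 0x80 >> (nr % 8) */
static int test_facility_bit(unsigned long nr, const unsigned char *list)
{
	return (list[nr >> 3] & (0x80U >> (nr & 7))) != 0;
}

int main(void)
{
	unsigned char fac[4] = { 0x40, 0x00, 0x00, 0x00 }; /* only bit 1 set */

	printf("facility 0: %d\n", test_facility_bit(0, fac)); /* prints 0 */
	printf("facility 1: %d\n", test_facility_bit(1, fac)); /* prints 1 */
	return 0;
}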
*/ @@ -56,18 +63,20 @@ static inline int test_facility(unsigned long nr) if (__test_facility(nr, &facilities_als)) return 1; } - return __test_facility(nr, &S390_lowcore.stfle_fac_list); + return __test_facility(nr, &stfle_fac_list); } static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size) { - register unsigned long reg0 asm("0") = size - 1; + unsigned long reg0 = size - 1; asm volatile( - ".insn s,0xb2b00000,0(%1)" /* stfle */ - : "+d" (reg0) - : "a" (stfle_fac_list) - : "memory", "cc"); + " lgr 0,%[reg0]\n" + " .insn s,0xb2b00000,%[list]\n" /* stfle */ + " lgr %[reg0],0\n" + : [reg0] "+&d" (reg0), [list] "+Q" (*stfle_fac_list) + : + : "memory", "cc", "0"); return reg0; } @@ -79,13 +88,15 @@ static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size) static inline void __stfle(u64 *stfle_fac_list, int size) { unsigned long nr; + u32 stfl_fac_list; asm volatile( " stfl 0(0)\n" : "=m" (S390_lowcore.stfl_fac_list)); + stfl_fac_list = S390_lowcore.stfl_fac_list; + memcpy(stfle_fac_list, &stfl_fac_list, 4); nr = 4; /* bytes stored by stfl */ - memcpy(stfle_fac_list, &S390_lowcore.stfl_fac_list, 4); - if (S390_lowcore.stfl_fac_list & 0x01000000) { + if (stfl_fac_list & 0x01000000) { /* More facility bits available with stfle */ nr = __stfle_asm(stfle_fac_list, size); nr = min_t(unsigned long, (nr + 1) * 8, size * 8); @@ -100,4 +111,10 @@ static inline void stfle(u64 *stfle_fac_list, int size) preempt_enable(); } +/** + * stfle_size - Actual size of the facility list as specified by stfle + * (number of double words) + */ +unsigned int stfle_size(void); + #endif /* __ASM_FACILITY_H */ diff --git a/arch/s390/include/asm/fault.h b/arch/s390/include/asm/fault.h new file mode 100644 index 000000000000..d326f56603d6 --- /dev/null +++ b/arch/s390/include/asm/fault.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 
1999, 2023 + */ +#ifndef _ASM_S390_FAULT_H +#define _ASM_S390_FAULT_H + +union teid { + unsigned long val; + struct { + unsigned long addr : 52; /* Translation-exception Address */ + unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */ + unsigned long : 2; + unsigned long b56 : 1; + unsigned long : 3; + unsigned long b60 : 1; + unsigned long b61 : 1; + unsigned long as : 2; /* ASCE Identifier */ + }; +}; + +enum { + TEID_FSI_UNKNOWN = 0, /* Unknown whether fetch or store */ + TEID_FSI_STORE = 1, /* Exception was due to store operation */ + TEID_FSI_FETCH = 2 /* Exception was due to fetch operation */ +}; + +#endif /* _ASM_S390_FAULT_H */ diff --git a/arch/s390/include/asm/fcx.h b/arch/s390/include/asm/fcx.h index cff0749e9657..29784b4b44f6 100644 --- a/arch/s390/include/asm/fcx.h +++ b/arch/s390/include/asm/fcx.h @@ -214,7 +214,7 @@ struct dcw_intrg_data { u32 :32; u64 time; u64 prog_id; - u8 prog_data[0]; + u8 prog_data[]; } __attribute__ ((packed)); #define DCW_FLAGS_CC (1 << (7 - 1)) @@ -241,7 +241,7 @@ struct dcw { u32 :8; u32 cd_count:8; u32 count; - u8 cd[0]; + u8 cd[]; } __attribute__ ((packed)); #define TCCB_FORMAT_DEFAULT 0x7f @@ -286,7 +286,7 @@ struct tccb_tcat { */ struct tccb { struct tccb_tcah tcah; - u8 tca[0]; + u8 tca[]; } __attribute__ ((packed, aligned(8))); struct tcw *tcw_get_intrg(struct tcw *tcw); diff --git a/arch/s390/include/asm/fpu/api.h b/arch/s390/include/asm/fpu/api.h index 34a7ae68485c..d6ca8bc6ca68 100644 --- a/arch/s390/include/asm/fpu/api.h +++ b/arch/s390/include/asm/fpu/api.h @@ -45,24 +45,34 @@ #define _ASM_S390_FPU_API_H #include <linux/preempt.h> +#include <asm/asm-extable.h> +#include <asm/fpu/internal.h> void save_fpu_regs(void); +void load_fpu_regs(void); +void __load_fpu_regs(void); -static inline int test_fp_ctl(u32 fpc) +/** + * sfpc_safe - Set floating point control register safely. + * @fpc: new value for floating point control register + * + * Set floating point control register. This may lead to an exception, + * since a saved value may have been modified by user space (ptrace, + * signal return, kvm registers) to an invalid value. In such a case + * set the floating point control register to zero. 
+ */ +static inline void sfpc_safe(u32 fpc) { - u32 orig_fpc; - int rc; - - asm volatile( - " efpc %1\n" - " sfpc %2\n" - "0: sfpc %1\n" - " la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "=d" (rc), "=&d" (orig_fpc) - : "d" (fpc), "0" (-EINVAL)); - return rc; + asm volatile("\n" + "0: sfpc %[fpc]\n" + "1: nopr %%r7\n" + ".pushsection .fixup, \"ax\"\n" + "2: lghi %[fpc],0\n" + " jg 0b\n" + ".popsection\n" + EX_TABLE(1b, 2b) + : [fpc] "+d" (fpc) + : : "memory"); } #define KERNEL_FPC 1 @@ -76,7 +86,7 @@ static inline int test_fp_ctl(u32 fpc) #define KERNEL_VXR_HIGH (KERNEL_VXR_V16V23|KERNEL_VXR_V24V31) #define KERNEL_VXR (KERNEL_VXR_LOW|KERNEL_VXR_HIGH) -#define KERNEL_FPR (KERNEL_FPC|KERNEL_VXR_V0V7) +#define KERNEL_FPR (KERNEL_FPC|KERNEL_VXR_LOW) struct kernel_fpu; diff --git a/arch/s390/include/asm/fpu/internal.h b/arch/s390/include/asm/fpu/internal.h index 4a71dbbf76fb..d511c4cf5afb 100644 --- a/arch/s390/include/asm/fpu/internal.h +++ b/arch/s390/include/asm/fpu/internal.h @@ -10,9 +10,14 @@ #define _ASM_S390_FPU_INTERNAL_H #include <linux/string.h> -#include <asm/ctl_reg.h> +#include <asm/facility.h> #include <asm/fpu/types.h> +static inline bool cpu_has_vx(void) +{ + return likely(test_facility(129)); +} + static inline void save_vx_regs(__vector128 *vxrs) { asm volatile( @@ -27,7 +32,7 @@ static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs) int i; for (i = 0; i < __NUM_FPRS; i++) - fprs[i] = *(freg_t *)(vxrs + i); + fprs[i].ui = vxrs[i].high; } static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs) @@ -35,14 +40,14 @@ static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs) int i; for (i = 0; i < __NUM_FPRS; i++) - *(freg_t *)(vxrs + i) = fprs[i]; + vxrs[i].high = fprs[i].ui; } static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu) { fpregs->pad = 0; fpregs->fpc = fpu->fpc; - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_vx_to_fp((freg_t *)&fpregs->fprs, fpu->vxrs); else memcpy((freg_t *)&fpregs->fprs, fpu->fprs, @@ -52,7 +57,7 @@ static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu) static inline void fpregs_load(_s390_fp_regs *fpregs, struct fpu *fpu) { fpu->fpc = fpregs->fpc; - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_fp_to_vx(fpu->vxrs, (freg_t *)&fpregs->fprs); else memcpy(fpu->fprs, (freg_t *)&fpregs->fprs, diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 68d362f8d6c1..5a82b08f03cd 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -2,16 +2,9 @@ #ifndef _ASM_S390_FTRACE_H #define _ASM_S390_FTRACE_H +#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR #define ARCH_SUPPORTS_FTRACE_OPS 1 - -#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT) #define MCOUNT_INSN_SIZE 6 -#else -#define MCOUNT_INSN_SIZE 24 -#define MCOUNT_RETURN_FIXUP 18 -#endif - -#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR #ifndef __ASSEMBLY__ @@ -22,72 +15,103 @@ #define ftrace_return_address(n) __builtin_return_address(n) #endif -void _mcount(void); void ftrace_caller(void); -extern char ftrace_graph_caller_end; -extern unsigned long ftrace_plt; +extern void *ftrace_func; struct dyn_arch_ftrace { }; -#define MCOUNT_ADDR ((unsigned long)_mcount) +#define MCOUNT_ADDR 0 #define FTRACE_ADDR ((unsigned long)ftrace_caller) #define KPROBE_ON_FTRACE_NOP 0 #define KPROBE_ON_FTRACE_CALL 1 +struct module; +struct dyn_ftrace; + +bool ftrace_need_init_nop(void); +#define ftrace_need_init_nop ftrace_need_init_nop + +int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec); 
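The ftrace_regs wrapper defined just below changes the contract for tracing callbacks: a full pt_regs snapshot is only available when explicitly requested. A hypothetical module-side sketch (the callback and init names are invented; the ftrace_ops API itself is generic kernel infrastructure):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/ftrace.h>

static void notrace trace_cb(unsigned long ip, unsigned long parent_ip,
			     struct ftrace_ops *op, struct ftrace_regs *fregs)
{
	/* ftrace_get_regs() wraps arch_ftrace_get_regs() and returns NULL
	 * unless a full register snapshot was saved for this ops */
	struct pt_regs *regs = ftrace_get_regs(fregs);

	if (regs)
		pr_debug("traced %pS\n", (void *)regs->psw.addr);
}

static struct ftrace_ops trace_ops = {
	.func	= trace_cb,
	.flags	= FTRACE_OPS_FL_SAVE_REGS,
};

static int __init trace_cb_init(void)
{
	return register_ftrace_function(&trace_ops);
}
module_init(trace_cb_init);
MODULE_LICENSE("GPL");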
+#define ftrace_init_nop ftrace_init_nop + static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; } -struct ftrace_insn { - u16 opc; - s32 disp; -} __packed; +struct ftrace_regs { + struct pt_regs regs; +}; -static inline void ftrace_generate_nop_insn(struct ftrace_insn *insn) +static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) { -#ifdef CONFIG_FUNCTION_TRACER -#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT) - /* brcl 0,0 */ - insn->opc = 0xc004; - insn->disp = 0; -#else - /* jg .+24 */ - insn->opc = 0xc0f4; - insn->disp = MCOUNT_INSN_SIZE / 2; -#endif -#endif + struct pt_regs *regs = &fregs->regs; + + if (test_pt_regs_flag(regs, PIF_FTRACE_FULL_REGS)) + return regs; + return NULL; } -static inline int is_ftrace_nop(struct ftrace_insn *insn) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +struct fgraph_ret_regs { + unsigned long gpr2; + unsigned long fp; +}; + +static __always_inline unsigned long fgraph_ret_regs_return_value(struct fgraph_ret_regs *ret_regs) { -#ifdef CONFIG_FUNCTION_TRACER -#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT) - if (insn->disp == 0) - return 1; -#else - if (insn->disp == MCOUNT_INSN_SIZE / 2) - return 1; -#endif -#endif - return 0; + return ret_regs->gpr2; } -static inline void ftrace_generate_call_insn(struct ftrace_insn *insn, - unsigned long ip) +static __always_inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph_ret_regs *ret_regs) { -#ifdef CONFIG_FUNCTION_TRACER - unsigned long target; + return ret_regs->fp; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - /* brasl r0,ftrace_caller */ - target = is_module_addr((void *) ip) ? ftrace_plt : FTRACE_ADDR; - insn->opc = 0xc005; - insn->disp = (target - ip) / 2; -#endif +static __always_inline unsigned long +ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs) +{ + return fregs->regs.psw.addr; +} + +static __always_inline void +ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, + unsigned long ip) +{ + fregs->regs.psw.addr = ip; } +#define ftrace_regs_get_argument(fregs, n) \ + regs_get_kernel_argument(&(fregs)->regs, n) +#define ftrace_regs_get_stack_pointer(fregs) \ + kernel_stack_pointer(&(fregs)->regs) +#define ftrace_regs_return_value(fregs) \ + regs_return_value(&(fregs)->regs) +#define ftrace_regs_set_return_value(fregs, ret) \ + regs_set_return_value(&(fregs)->regs, ret) +#define ftrace_override_function_with_return(fregs) \ + override_function_with_return(&(fregs)->regs) +#define ftrace_regs_query_register_offset(name) \ + regs_query_register_offset(name) + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * When an ftrace registered caller is tracing a function that is + * also set by a register_ftrace_direct() call, it needs to be + * differentiated in the ftrace_caller trampoline. To do this, + * place the direct caller in the ORIG_GPR2 part of pt_regs. This + * tells the ftrace_caller that there's a direct caller. + */ +static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr) +{ + struct pt_regs *regs = &fregs->regs; + regs->orig_gpr2 = addr; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + /* * Even though the system call numbers are identical for s390/s390x a * different system call table is used for compat tasks. 
This may lead @@ -114,4 +138,32 @@ static inline bool arch_syscall_match_sym_name(const char *sym, } #endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_FUNCTION_TRACER + +#define FTRACE_NOP_INSN .word 0xc004, 0x0000, 0x0000 /* brcl 0,0 */ + +#ifndef CC_USING_HOTPATCH + +#define FTRACE_GEN_MCOUNT_RECORD(name) \ + .section __mcount_loc, "a", @progbits; \ + .quad name; \ + .previous; + +#else /* !CC_USING_HOTPATCH */ + +#define FTRACE_GEN_MCOUNT_RECORD(name) + +#endif /* !CC_USING_HOTPATCH */ + +#define FTRACE_GEN_NOP_ASM(name) \ + FTRACE_GEN_MCOUNT_RECORD(name) \ + FTRACE_NOP_INSN + +#else /* CONFIG_FUNCTION_TRACER */ + +#define FTRACE_GEN_NOP_ASM(name) + +#endif /* CONFIG_FUNCTION_TRACER */ + #endif /* _ASM_S390_FTRACE_H */ diff --git a/arch/s390/include/asm/ftrace.lds.h b/arch/s390/include/asm/ftrace.lds.h new file mode 100644 index 000000000000..968adfd41240 --- /dev/null +++ b/arch/s390/include/asm/ftrace.lds.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +#define SIZEOF_MCOUNT_LOC_ENTRY 8 +#define SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE 24 +#define FTRACE_HOTPATCH_TRAMPOLINES_SIZE(n) \ + DIV_ROUND_UP(SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE * (n), \ + SIZEOF_MCOUNT_LOC_ENTRY) + +#ifdef CONFIG_FUNCTION_TRACER +#define FTRACE_HOTPATCH_TRAMPOLINES_TEXT \ + . = ALIGN(8); \ + __ftrace_hotpatch_trampolines_start = .; \ + . = . + FTRACE_HOTPATCH_TRAMPOLINES_SIZE(__stop_mcount_loc - \ + __start_mcount_loc); \ + __ftrace_hotpatch_trampolines_end = .; +#else +#define FTRACE_HOTPATCH_TRAMPOLINES_TEXT +#endif diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index 5e97a4353147..eaeaeb3ff0be 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -4,6 +4,7 @@ #include <linux/uaccess.h> #include <linux/futex.h> +#include <asm/asm-extable.h> #include <asm/mmu_context.h> #include <asm/errno.h> @@ -16,7 +17,8 @@ "3: jl 1b\n" \ " lhi %0,0\n" \ "4: sacf 768\n" \ - EX_TABLE(0b,4b) EX_TABLE(2b,4b) EX_TABLE(3b,4b) \ + EX_TABLE(0b,4b) EX_TABLE(1b,4b) \ + EX_TABLE(2b,4b) EX_TABLE(3b,4b) \ : "=d" (ret), "=&d" (oldval), "=&d" (newval), \ "=m" (*uaddr) \ : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ @@ -26,10 +28,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { int oldval = 0, newval, ret; - mm_segment_t old_fs; - old_fs = enable_sacf_uaccess(); - pagefault_disable(); switch (op) { case FUTEX_OP_SET: __futex_atomic_op("lr %2,%5\n", @@ -54,8 +53,6 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, default: ret = -ENOSYS; } - pagefault_enable(); - disable_sacf_uaccess(old_fs); if (!ret) *oval = oldval; @@ -66,10 +63,8 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { - mm_segment_t old_fs; int ret; - old_fs = enable_sacf_uaccess(); asm volatile( " sacf 256\n" "0: cs %1,%4,0(%5)\n" @@ -79,7 +74,6 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, : "=d" (ret), "+d" (oldval), "=m" (*uaddr) : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) : "cc", "memory"); - disable_sacf_uaccess(old_fs); *uval = oldval; return ret; } diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 37f96b6f0e61..5cc46e0dde62 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -9,6 +9,7 @@ #ifndef _ASM_S390_GMAP_H #define 
_ASM_S390_GMAP_H +#include <linux/radix-tree.h> #include <linux/refcount.h> /* Generic bits for GMAP notification on DAT table entry changes. */ @@ -31,6 +32,7 @@ * @table: pointer to the page directory * @asce: address space control element for gmap page table * @pfault_enabled: defines if pfaults are applicable for the guest + * @guest_handle: protected virtual machine handle for the ultravisor * @host_to_rmap: radix tree with gmap_rmap lists * @children: list of shadow gmap structures * @pt_list: list of all page tables used in the shadow guest address space @@ -54,6 +56,8 @@ struct gmap { unsigned long asce_end; void *private; bool pfault_enabled; + /* only set for protected virtual machines */ + unsigned long guest_handle; /* Additional data for shadow guest address spaces */ struct radix_tree_root host_to_rmap; struct list_head children; @@ -136,12 +140,49 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); void gmap_register_pte_notifier(struct gmap_notifier *); void gmap_unregister_pte_notifier(struct gmap_notifier *); -void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *, - unsigned long bits); int gmap_mprotect_notify(struct gmap *, unsigned long start, unsigned long len, int prot); void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], unsigned long gaddr, unsigned long vmaddr); +int gmap_mark_unmergeable(void); +void s390_unlist_old_asce(struct gmap *gmap); +int s390_replace_asce(struct gmap *gmap); +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible); + +/** + * s390_uv_destroy_range - Destroy a range of pages in the given mm. + * @mm: the mm on which to operate on + * @start: the start of the range + * @end: the end of the range + * + * This function will call cond_sched, so it should not generate stalls, but + * it will otherwise only return when it completed. + */ +static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + (void)__s390_uv_destroy_range(mm, start, end, false); +} + +/** + * s390_uv_destroy_range_interruptible - Destroy a range of pages in the + * given mm, but stop when a fatal signal is received. + * @mm: the mm on which to operate on + * @start: the start of the range + * @end: the end of the range + * + * This function will call cond_sched, so it should not generate stalls. If + * a fatal signal is received, it will return with -EINTR immediately, + * without finishing destroying the whole range. Upon successful + * completion, 0 is returned. 
+ */ +static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + return __s390_uv_destroy_range(mm, start, end, true); +} #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h index dfbc3c6c0674..58668ffb5488 100644 --- a/arch/s390/include/asm/hardirq.h +++ b/arch/s390/include/asm/hardirq.h @@ -18,7 +18,6 @@ #define or_softirq_pending(x) (S390_lowcore.softirq_pending |= (x)) #define __ARCH_IRQ_STAT -#define __ARCH_HAS_DO_SOFTIRQ #define __ARCH_IRQ_EXIT_IRQS_DISABLED static inline void ack_bad_irq(unsigned int irq) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index de8f0bf5f238..deb198a61039 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -9,25 +9,20 @@ #ifndef _ASM_S390_HUGETLB_H #define _ASM_S390_HUGETLB_H +#include <linux/pgtable.h> #include <asm/page.h> -#include <asm/pgtable.h> #define hugetlb_free_pgd_range free_pgd_range #define hugepages_supported() (MACHINE_HAS_EDAT1) void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz); +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); pte_t huge_ptep_get(pte_t *ptep); pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -static inline bool is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) -{ - return false; -} - /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. @@ -35,9 +30,11 @@ static inline bool is_hugepage_only_range(struct mm_struct *mm, static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { - if (len & ~HPAGE_MASK) + struct hstate *h = hstate_file(file); + + if (len & ~huge_page_mask(h)) return -EINVAL; - if (addr & ~HPAGE_MASK) + if (addr & ~huge_page_mask(h)) return -EINVAL; return 0; } @@ -46,20 +43,21 @@ static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_arch_1, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - pte_val(*ptep) = _REGION3_ENTRY_EMPTY; + set_pte(ptep, __pte(_REGION3_ENTRY_EMPTY)); else - pte_val(*ptep) = _SEGMENT_ENTRY_EMPTY; + set_pte(ptep, __pte(_SEGMENT_ENTRY_EMPTY)); } -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { - huge_ptep_get_and_clear(vma->vm_mm, address, ptep); + return huge_ptep_get_and_clear(vma->vm_mm, address, ptep); } static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, @@ -69,7 +67,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(huge_ptep_get(ptep), pte); if (changed) { huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } return changed; } @@ -78,7 +76,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = huge_ptep_get_and_clear(mm, addr, ptep); - set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); + 
__set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); } static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) @@ -91,6 +89,11 @@ static inline int huge_pte_none(pte_t pte) return pte_none(pte); } +static inline int huge_pte_none_mostly(pte_t pte) +{ + return huge_pte_none(pte); +} + static inline int huge_pte_write(pte_t pte) { return pte_write(pte); @@ -103,7 +106,7 @@ static inline int huge_pte_dirty(pte_t pte) static inline pte_t huge_pte_mkwrite(pte_t pte) { - return pte_mkwrite(pte); + return pte_mkwrite_novma(pte); } static inline pte_t huge_pte_mkdirty(pte_t pte) @@ -121,6 +124,21 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } +static inline pte_t huge_pte_mkuffd_wp(pte_t pte) +{ + return pte; +} + +static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) +{ + return pte; +} + +static inline int huge_pte_uffd_wp(pte_t pte) +{ + return 0; +} + static inline bool gigantic_page_runtime_supported(void) { return true; diff --git a/arch/s390/include/asm/hw_irq.h b/arch/s390/include/asm/hw_irq.h index adae176757ae..9078b5b6b837 100644 --- a/arch/s390/include/asm/hw_irq.h +++ b/arch/s390/include/asm/hw_irq.h @@ -7,6 +7,5 @@ void __init init_airq_interrupts(void); void __init init_cio_interrupts(void); -void __init init_ext_interrupts(void); #endif diff --git a/arch/s390/include/asm/idals.h b/arch/s390/include/asm/idals.h index 6fb7aced104a..59fcc3c72edf 100644 --- a/arch/s390/include/asm/idals.h +++ b/arch/s390/include/asm/idals.h @@ -23,6 +23,9 @@ #define IDA_SIZE_LOG 12 /* 11 for 2k , 12 for 4k */ #define IDA_BLOCK_SIZE (1L<<IDA_SIZE_LOG) +#define IDA_2K_SIZE_LOG 11 +#define IDA_2K_BLOCK_SIZE (1L << IDA_2K_SIZE_LOG) + /* * Test if an address/length pair needs an idal list. */ @@ -43,6 +46,15 @@ static inline unsigned int idal_nr_words(void *vaddr, unsigned int length) } /* + * Return the number of 2K IDA words needed for an address/length pair. + */ +static inline unsigned int idal_2k_nr_words(void *vaddr, unsigned int length) +{ + return ((__pa(vaddr) & (IDA_2K_BLOCK_SIZE - 1)) + length + + (IDA_2K_BLOCK_SIZE - 1)) >> IDA_2K_SIZE_LOG; +} + +/* * Create the list of idal words for an address/length pair. 
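To make the rounding in idal_2k_nr_words above concrete, here is a user-space replica of the formula (treating __pa() as the identity, an illustration-only assumption):

#include <stdio.h>

#define IDA_2K_SIZE_LOG		11
#define IDA_2K_BLOCK_SIZE	(1UL << IDA_2K_SIZE_LOG)

static unsigned int idal_2k_nr_words(unsigned long pa, unsigned int length)
{
	return ((pa & (IDA_2K_BLOCK_SIZE - 1)) + length +
		(IDA_2K_BLOCK_SIZE - 1)) >> IDA_2K_SIZE_LOG;
}

int main(void)
{
	/* a 4KiB buffer starting 0x300 into a 2KiB block straddles three
	 * 2KiB blocks, so three IDA words are needed */
	printf("%u\n", idal_2k_nr_words(0x10300, 0x1000)); /* prints 3 */
	return 0;
}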
*/ static inline unsigned long *idal_create_words(unsigned long *idaws, @@ -108,7 +120,7 @@ clear_normalized_cda(struct ccw1 * ccw) struct idal_buffer { size_t size; size_t page_order; - void *data[0]; + void *data[]; }; /* diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h index 6d4226dcf42a..09f763b9eb40 100644 --- a/arch/s390/include/asm/idle.h +++ b/arch/s390/include/asm/idle.h @@ -10,21 +10,18 @@ #include <linux/types.h> #include <linux/device.h> -#include <linux/seqlock.h> struct s390_idle_data { - seqcount_t seqcount; - unsigned long long idle_count; - unsigned long long idle_time; - unsigned long long clock_idle_enter; - unsigned long long clock_idle_exit; - unsigned long long timer_idle_enter; - unsigned long long timer_idle_exit; + unsigned long idle_count; + unsigned long idle_time; + unsigned long clock_idle_enter; + unsigned long timer_idle_enter; + unsigned long mt_cycles_enter[8]; }; extern struct device_attribute dev_attr_idle_count; extern struct device_attribute dev_attr_idle_time_us; -void psw_idle(struct s390_idle_data *, unsigned long); +void psw_idle(struct s390_idle_data *data, unsigned long psw_mask); #endif /* _S390_IDLE_H */ diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index 5a16f500515a..4453ad7c11ac 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -12,6 +12,7 @@ #include <linux/kernel.h> #include <asm/page.h> +#include <asm/pgtable.h> #include <asm/pci_io.h> #define xlate_dev_mem_ptr xlate_dev_mem_ptr @@ -19,15 +20,20 @@ void *xlate_dev_mem_ptr(phys_addr_t phys); #define unxlate_dev_mem_ptr unxlate_dev_mem_ptr void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); +#define IO_SPACE_LIMIT 0 + /* - * Convert a virtual cached pointer to an uncached pointer + * I/O memory mapping functions. 
*/ -#define xlate_dev_kmem_ptr(p) p +#define ioremap_prot ioremap_prot +#define iounmap iounmap -#define IO_SPACE_LIMIT 0 +#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL) -void __iomem *ioremap(unsigned long offset, unsigned long size); -void iounmap(volatile void __iomem *addr); +#define ioremap_wc(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(pgprot_writecombine(PAGE_KERNEL))) +#define ioremap_wt(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(pgprot_writethrough(PAGE_KERNEL))) static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 084e71b7272a..b0d00032479d 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -12,6 +12,7 @@ #include <asm/types.h> #include <asm/cio.h> #include <asm/setup.h> +#include <asm/page.h> #include <uapi/asm/ipl.h> struct ipl_parameter_block { @@ -21,6 +22,8 @@ struct ipl_parameter_block { struct ipl_pb0_common common; struct ipl_pb0_fcp fcp; struct ipl_pb0_ccw ccw; + struct ipl_pb0_eckd eckd; + struct ipl_pb0_nvme nvme; char raw[PAGE_SIZE - sizeof(struct ipl_pl_hdr)]; }; } __packed __aligned(PAGE_SIZE); @@ -30,10 +33,19 @@ struct ipl_parameter_block { #define IPL_BP_FCP_LEN (sizeof(struct ipl_pl_hdr) + \ sizeof(struct ipl_pb0_fcp)) #define IPL_BP0_FCP_LEN (sizeof(struct ipl_pb0_fcp)) + +#define IPL_BP_NVME_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_nvme)) +#define IPL_BP0_NVME_LEN (sizeof(struct ipl_pb0_nvme)) + #define IPL_BP_CCW_LEN (sizeof(struct ipl_pl_hdr) + \ sizeof(struct ipl_pb0_ccw)) #define IPL_BP0_CCW_LEN (sizeof(struct ipl_pb0_ccw)) +#define IPL_BP_ECKD_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_eckd)) +#define IPL_BP0_ECKD_LEN (sizeof(struct ipl_pb0_eckd)) + #define IPL_MAX_SUPPORTED_VERSION (0) #define IPL_RB_CERT_UNKNOWN ((unsigned short)-1) @@ -59,6 +71,10 @@ enum ipl_type { IPL_TYPE_FCP = 4, IPL_TYPE_FCP_DUMP = 8, IPL_TYPE_NSS = 16, + IPL_TYPE_NVME = 32, + IPL_TYPE_NVME_DUMP = 64, + IPL_TYPE_ECKD = 128, + IPL_TYPE_ECKD_DUMP = 256, }; struct ipl_info @@ -70,10 +86,17 @@ struct ipl_info } ccw; struct { struct ccw_dev_id dev_id; + } eckd; + struct { + struct ccw_dev_id dev_id; u64 wwpn; u64 lun; } fcp; struct { + u32 fid; + u32 nsid; + } nvme; + struct { char name[NSS_NAME_SIZE + 1]; } nss; } data; @@ -83,6 +106,13 @@ extern struct ipl_info ipl_info; extern void setup_ipl(void); extern void set_os_info_reipl_block(void); +static inline bool is_ipl_type_dump(void) +{ + return (ipl_info.type == IPL_TYPE_FCP_DUMP) || + (ipl_info.type == IPL_TYPE_ECKD_DUMP) || + (ipl_info.type == IPL_TYPE_NVME_DUMP); +} + struct ipl_report { struct ipl_parameter_block *ipib; struct list_head components; @@ -114,11 +144,18 @@ int ipl_report_add_certificate(struct ipl_report *report, void *key, * DIAG 308 support */ enum diag308_subcode { + DIAG308_CLEAR_RESET = 0, + DIAG308_LOAD_NORMAL_RESET = 1, DIAG308_REL_HSA = 2, DIAG308_LOAD_CLEAR = 3, DIAG308_LOAD_NORMAL_DUMP = 4, DIAG308_SET = 5, DIAG308_STORE = 6, + DIAG308_LOAD_NORMAL = 7, +}; + +enum diag308_subcode_flags { + DIAG308_FLAG_EI = 1UL << 16, }; enum diag308_rc { diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index 9f75d67b8c20..54b42817f70a 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -31,6 +31,7 @@ #include <linux/percpu.h> #include <linux/cache.h> #include <linux/types.h> +#include <asm/ctlreg.h> enum interruption_class { IRQEXT_CLK, @@ -81,8 +82,13 @@ static __always_inline void 
inc_irq_stat(enum interruption_class irq) } struct ext_code { - unsigned short subcode; - unsigned short code; + union { + struct { + unsigned short subcode; + unsigned short code; + }; + unsigned int int_code; + }; }; typedef void (*ext_int_handler_t)(struct ext_code, unsigned int, unsigned long); @@ -96,17 +102,17 @@ enum irq_subclass { }; #define CR0_IRQ_SUBCLASS_MASK \ - ((1UL << (63 - 30)) /* Warning Track */ | \ - (1UL << (63 - 48)) /* Malfunction Alert */ | \ - (1UL << (63 - 49)) /* Emergency Signal */ | \ - (1UL << (63 - 50)) /* External Call */ | \ - (1UL << (63 - 52)) /* Clock Comparator */ | \ - (1UL << (63 - 53)) /* CPU Timer */ | \ - (1UL << (63 - 54)) /* Service Signal */ | \ - (1UL << (63 - 57)) /* Interrupt Key */ | \ - (1UL << (63 - 58)) /* Measurement Alert */ | \ - (1UL << (63 - 59)) /* Timing Alert */ | \ - (1UL << (63 - 62))) /* IUCV */ + (CR0_WARNING_TRACK | \ + CR0_MALFUNCTION_ALERT_SUBMASK | \ + CR0_EMERGENCY_SIGNAL_SUBMASK | \ + CR0_EXTERNAL_CALL_SUBMASK | \ + CR0_CLOCK_COMPARATOR_SUBMASK | \ + CR0_CPU_TIMER_SUBMASK | \ + CR0_SERVICE_SIGNAL_SUBMASK | \ + CR0_INTERRUPT_KEY_SUBMASK | \ + CR0_MEASUREMENT_ALERT_SUBMASK | \ + CR0_ETR_SUBMASK | \ + CR0_IUCV) void irq_subclass_register(enum irq_subclass subclass); void irq_subclass_unregister(enum irq_subclass subclass); diff --git a/arch/s390/include/asm/irq_work.h b/arch/s390/include/asm/irq_work.h new file mode 100644 index 000000000000..f00c9f610d5a --- /dev/null +++ b/arch/s390/include/asm/irq_work.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_IRQ_WORK_H +#define _ASM_S390_IRQ_WORK_H + +static inline bool arch_irq_work_has_interrupt(void) +{ + return true; +} + +#endif /* _ASM_S390_IRQ_WORK_H */ diff --git a/arch/s390/include/asm/irqflags.h b/arch/s390/include/asm/irqflags.h index 586df4c9e2f2..02427b205c11 100644 --- a/arch/s390/include/asm/irqflags.h +++ b/arch/s390/include/asm/irqflags.h @@ -32,45 +32,45 @@ }) /* set system mask. 
*/ -static inline notrace void __arch_local_irq_ssm(unsigned long flags) +static __always_inline void __arch_local_irq_ssm(unsigned long flags) { asm volatile("ssm %0" : : "Q" (flags) : "memory"); } -static inline notrace unsigned long arch_local_save_flags(void) +static __always_inline unsigned long arch_local_save_flags(void) { return __arch_local_irq_stnsm(0xff); } -static inline notrace unsigned long arch_local_irq_save(void) +static __always_inline unsigned long arch_local_irq_save(void) { return __arch_local_irq_stnsm(0xfc); } -static inline notrace void arch_local_irq_disable(void) +static __always_inline void arch_local_irq_disable(void) { arch_local_irq_save(); } -static inline notrace void arch_local_irq_enable(void) +static __always_inline void arch_local_irq_enable(void) { __arch_local_irq_stosm(0x03); } /* This only restores external and I/O interrupt state */ -static inline notrace void arch_local_irq_restore(unsigned long flags) +static __always_inline void arch_local_irq_restore(unsigned long flags) { /* only disabled->disabled and disabled->enabled is valid */ if (flags & ARCH_IRQ_ENABLED) arch_local_irq_enable(); } -static inline notrace bool arch_irqs_disabled_flags(unsigned long flags) +static __always_inline bool arch_irqs_disabled_flags(unsigned long flags) { return !(flags & ARCH_IRQ_ENABLED); } -static inline notrace bool arch_irqs_disabled(void) +static __always_inline bool arch_irqs_disabled(void) { return arch_irqs_disabled_flags(arch_local_save_flags()); } diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index 39f747d63758..895f774bbcc5 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -2,27 +2,30 @@ #ifndef _ASM_S390_JUMP_LABEL_H #define _ASM_S390_JUMP_LABEL_H +#define HAVE_JUMP_LABEL_BATCH + #ifndef __ASSEMBLY__ #include <linux/types.h> #include <linux/stringify.h> #define JUMP_LABEL_NOP_SIZE 6 -#define JUMP_LABEL_NOP_OFFSET 2 -#if __GNUC__ < 9 +#ifdef CONFIG_CC_IS_CLANG +#define JUMP_LABEL_STATIC_KEY_CONSTRAINT "i" +#elif __GNUC__ < 9 #define JUMP_LABEL_STATIC_KEY_CONSTRAINT "X" #else #define JUMP_LABEL_STATIC_KEY_CONSTRAINT "jdd" #endif /* - * We use a brcl 0,2 instruction for jump labels at compile time so it + * We use a brcl 0,<offset> instruction for jump labels so it * can be easily distinguished from a hotpatch generated instruction. 
*/ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("0: brcl 0,"__stringify(JUMP_LABEL_NOP_OFFSET)"\n" + asm_volatile_goto("0: brcl 0,%l[label]\n" ".pushsection __jump_table,\"aw\"\n" ".balign 8\n" ".long 0b-.,%l[label]-.\n" diff --git a/arch/s390/include/asm/kasan.h b/arch/s390/include/asm/kasan.h index 70930fe5c496..0cffead0f2f2 100644 --- a/arch/s390/include/asm/kasan.h +++ b/arch/s390/include/asm/kasan.h @@ -2,29 +2,17 @@ #ifndef __ASM_KASAN_H #define __ASM_KASAN_H -#include <asm/pgtable.h> +#include <linux/const.h> #ifdef CONFIG_KASAN #define KASAN_SHADOW_SCALE_SHIFT 3 -#ifdef CONFIG_KASAN_S390_4_LEVEL_PAGING #define KASAN_SHADOW_SIZE \ (_AC(1, UL) << (_REGION1_SHIFT - KASAN_SHADOW_SCALE_SHIFT)) -#else -#define KASAN_SHADOW_SIZE \ - (_AC(1, UL) << (_REGION2_SHIFT - KASAN_SHADOW_SCALE_SHIFT)) -#endif #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) #define KASAN_SHADOW_START KASAN_SHADOW_OFFSET #define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) -extern void kasan_early_init(void); -extern void kasan_copy_shadow(pgd_t *dst); -extern void kasan_free_early_identity(void); -#else -static inline void kasan_early_init(void) { } -static inline void kasan_copy_shadow(pgd_t *dst) { } -static inline void kasan_free_early_identity(void) { } #endif #endif diff --git a/arch/s390/include/asm/kdebug.h b/arch/s390/include/asm/kdebug.h index d5327f064799..4377238e4752 100644 --- a/arch/s390/include/asm/kdebug.h +++ b/arch/s390/include/asm/kdebug.h @@ -23,6 +23,6 @@ enum die_val { DIE_NMI_IPI, }; -extern void die(struct pt_regs *, const char *); +extern void __noreturn die(struct pt_regs *, const char *); #endif diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index ea398a05f643..1bd08eb56d5f 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -9,6 +9,8 @@ #ifndef _S390_KEXEC_H #define _S390_KEXEC_H +#include <linux/module.h> + #include <asm/processor.h> #include <asm/page.h> #include <asm/setup.h> @@ -29,7 +31,7 @@ #define KEXEC_CONTROL_MEMORY_LIMIT (1UL<<31) /* Allocate control page with GFP_DMA */ -#define KEXEC_CONTROL_MEMORY_GFP GFP_DMA +#define KEXEC_CONTROL_MEMORY_GFP (GFP_DMA | __GFP_NORETRY) /* Maximum address we can use for the crash control pages */ #define KEXEC_CRASH_CONTROL_MEMORY_LIMIT (-1UL) @@ -74,7 +76,35 @@ void *kexec_file_add_components(struct kimage *image, int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, unsigned long addr); +#define ARCH_HAS_KIMAGE_ARCH + +struct kimage_arch { + void *ipl_buf; +}; + extern const struct kexec_file_ops s390_kexec_image_ops; extern const struct kexec_file_ops s390_kexec_elf_ops; +#ifdef CONFIG_CRASH_DUMP +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); +#define crash_free_reserved_phys_range crash_free_reserved_phys_range + +void arch_kexec_protect_crashkres(void); +#define arch_kexec_protect_crashkres arch_kexec_protect_crashkres + +void arch_kexec_unprotect_crashkres(void); +#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres +#endif + +#ifdef CONFIG_KEXEC_FILE +struct purgatory_info; +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); +#define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add + +int arch_kimage_file_post_load_cleanup(struct kimage *image); +#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup +#endif 
#endif /*_S390_KEXEC_H */ diff --git a/arch/s390/include/asm/kfence.h b/arch/s390/include/asm/kfence.h new file mode 100644 index 000000000000..e47fd8cbe701 --- /dev/null +++ b/arch/s390/include/asm/kfence.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_KFENCE_H +#define _ASM_S390_KFENCE_H + +#include <linux/mm.h> +#include <linux/kfence.h> +#include <asm/set_memory.h> +#include <asm/page.h> + +void __kernel_map_pages(struct page *page, int numpages, int enable); + +static __always_inline bool arch_kfence_init_pool(void) +{ + return true; +} + +#define arch_kfence_test_address(addr) ((addr) & PAGE_MASK) + +/* + * Do not split kfence pool to 4k mapping with arch_kfence_init_pool(), + * but earlier where page table allocations still happen with memblock. + * Reason is that arch_kfence_init_pool() gets called when the system + * is still in a limbo state - disabling and enabling bottom halves is + * not yet allowed, but that is what our page_table_alloc() would do. + */ +static __always_inline void kfence_split_mapping(void) +{ +#ifdef CONFIG_KFENCE + unsigned long pool_pages = KFENCE_POOL_SIZE >> PAGE_SHIFT; + + set_memory_4k((unsigned long)__kfence_pool, pool_pages); +#endif +} + +static inline bool kfence_protect_page(unsigned long addr, bool protect) +{ + __kernel_map_pages(virt_to_page((void *)addr), 1, !protect); + return true; +} + +#endif /* _ASM_S390_KFENCE_H */ diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index b106aa29bf55..01f1682a73b7 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -15,6 +15,7 @@ * <grundym@us.ibm.com> */ #include <linux/types.h> +#include <asm/ctlreg.h> #include <asm-generic/kprobes.h> #define BREAKPOINT_INSTRUCTION 0x0002 @@ -54,7 +55,6 @@ typedef u16 kprobe_opcode_t; struct arch_specific_insn { /* copy of original instruction */ kprobe_opcode_t *insn; - unsigned int is_ftrace_insn : 1; }; struct prev_kprobe { @@ -66,16 +66,13 @@ struct prev_kprobe { struct kprobe_ctlblk { unsigned long kprobe_status; unsigned long kprobe_saved_imask; - unsigned long kprobe_saved_ctl[3]; + struct ctlreg kprobe_saved_ctl[3]; struct prev_kprobe prev_kprobe; }; void arch_remove_kprobe(struct kprobe *p); -void kretprobe_trampoline(void); int kprobe_fault_handler(struct pt_regs *regs, int trapnr); -int kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data); #define flush_insn_slot(p) do { } while (0) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 02f4c21c57f6..52664105a473 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -19,6 +19,8 @@ #include <linux/kvm.h> #include <linux/seqlock.h> #include <linux/module.h> +#include <linux/pci.h> +#include <linux/mmu_notifier.h> #include <asm/debug.h> #include <asm/cpu.h> #include <asm/fpu/api.h> @@ -28,15 +30,14 @@ #define KVM_S390_BSCA_CPU_SLOTS 64 #define KVM_S390_ESCA_CPU_SLOTS 248 #define KVM_MAX_VCPUS 255 -#define KVM_USER_MEM_SLOTS 32 /* - * These seem to be used for allocating ->chip in the routing table, - * which we don't use. 4096 is an out-of-thin-air value. If we need - * to look at ->chip later on, we'll need to revisit this. + * These seem to be used for allocating ->chip in the routing table, which we + * don't use. 1 is as small as we can get to reduce the needed memory. If we + * need to look at ->chip later on, we'll need to revisit this. 
*/ #define KVM_NR_IRQCHIPS 1 -#define KVM_IRQCHIP_NUM_PINS 4096 +#define KVM_IRQCHIP_NUM_PINS 1 #define KVM_HALT_POLL_NS_DEFAULT 50000 /* s390-specific vcpu->requests bit members */ @@ -46,6 +47,8 @@ #define KVM_REQ_START_MIGRATION KVM_ARCH_REQ(3) #define KVM_REQ_STOP_MIGRATION KVM_ARCH_REQ(4) #define KVM_REQ_VSIE_RESTART KVM_ARCH_REQ(5) +#define KVM_REQ_REFRESH_GUEST_PREFIX \ + KVM_ARCH_REQ_FLAGS(6, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define SIGP_CTRL_C 0x80 #define SIGP_CTRL_SCN_MASK 0x3f @@ -92,19 +95,30 @@ union ipte_control { }; }; +union sca_utility { + __u16 val; + struct { + __u16 mtcr : 1; + __u16 reserved : 15; + }; +}; + struct bsca_block { union ipte_control ipte_control; __u64 reserved[5]; __u64 mcn; - __u64 reserved2; + union sca_utility utility; + __u8 reserved2[6]; struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS]; }; struct esca_block { union ipte_control ipte_control; - __u64 reserved1[7]; + __u64 reserved1[6]; + union sca_utility utility; + __u8 reserved2[6]; __u64 mcn[4]; - __u64 reserved2[20]; + __u64 reserved3[20]; struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; }; @@ -122,6 +136,16 @@ struct mcck_volatile_info { __u32 reserved; }; +#define CR0_INITIAL_MASK (CR0_UNUSED_56 | CR0_INTERRUPT_KEY_SUBMASK | \ + CR0_MEASUREMENT_ALERT_SUBMASK) +#define CR14_INITIAL_MASK (CR14_UNUSED_32 | CR14_UNUSED_33 | \ + CR14_EXTERNAL_DAMAGE_SUBMASK) + +#define SIDAD_SIZE_MASK 0xff +#define sida_addr(sie_block) phys_to_virt((sie_block)->sidad & PAGE_MASK) +#define sida_size(sie_block) \ + ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE) + #define CPUSTAT_STOPPED 0x80000000 #define CPUSTAT_WAIT 0x10000000 #define CPUSTAT_ECALL_PEND 0x08000000 @@ -155,7 +179,13 @@ struct kvm_s390_sie_block { __u8 reserved08[4]; /* 0x0008 */ #define PROG_IN_SIE (1<<0) __u32 prog0c; /* 0x000c */ - __u8 reserved10[16]; /* 0x0010 */ + union { + __u8 reserved10[16]; /* 0x0010 */ + struct { + __u64 pv_handle_cpu; + __u64 pv_handle_config; + }; + }; #define PROG_BLOCK_SIE (1<<0) #define PROG_REQUEST (1<<1) atomic_t prog20; /* 0x0020 */ @@ -204,10 +234,23 @@ struct kvm_s390_sie_block { #define ICPT_PARTEXEC 0x38 #define ICPT_IOINST 0x40 #define ICPT_KSS 0x5c +#define ICPT_MCHKREQ 0x60 +#define ICPT_INT_ENABLE 0x64 +#define ICPT_PV_INSTR 0x68 +#define ICPT_PV_NOTIFY 0x6c +#define ICPT_PV_PREF 0x70 __u8 icptcode; /* 0x0050 */ __u8 icptstatus; /* 0x0051 */ __u16 ihcpu; /* 0x0052 */ - __u8 reserved54[2]; /* 0x0054 */ + __u8 reserved54; /* 0x0054 */ +#define IICTL_CODE_NONE 0x00 +#define IICTL_CODE_MCHK 0x01 +#define IICTL_CODE_EXT 0x02 +#define IICTL_CODE_IO 0x03 +#define IICTL_CODE_RESTART 0x04 +#define IICTL_CODE_SPECIFICATION 0x10 +#define IICTL_CODE_OPERAND 0x11 + __u8 iictl; /* 0x0055 */ __u16 ipa; /* 0x0056 */ __u32 ipb; /* 0x0058 */ __u32 scaoh; /* 0x005c */ @@ -215,22 +258,29 @@ struct kvm_s390_sie_block { __u8 fpf; /* 0x0060 */ #define ECB_GS 0x40 #define ECB_TE 0x10 +#define ECB_SPECI 0x08 #define ECB_SRSI 0x04 #define ECB_HOSTPROTINT 0x02 +#define ECB_PTF 0x01 __u8 ecb; /* 0x0061 */ #define ECB2_CMMA 0x80 #define ECB2_IEP 0x20 #define ECB2_PFMFI 0x08 #define ECB2_ESCA 0x04 +#define ECB2_ZPCI_LSI 0x02 __u8 ecb2; /* 0x0062 */ +#define ECB3_AISI 0x20 +#define ECB3_AISII 0x10 #define ECB3_DEA 0x08 #define ECB3_AES 0x04 #define ECB3_RI 0x01 __u8 ecb3; /* 0x0063 */ +#define ESCA_SCAOL_MASK ~0x3fU __u32 scaol; /* 0x0064 */ - __u8 reserved68; /* 0x0068 */ + __u8 sdf; /* 0x0068 */ __u8 epdx; /* 0x0069 */ - __u8 reserved6a[2]; /* 0x006a */ + __u8 cpnc; /* 0x006a */ + __u8 reserved6b; /* 0x006b */ __u32 
todpr; /* 0x006c */ #define GISA_FORMAT1 0x00000001 __u32 gd; /* 0x0070 */ @@ -244,31 +294,58 @@ struct kvm_s390_sie_block { #define HPID_KVM 0x4 #define HPID_VSIE 0x5 __u8 hpid; /* 0x00b8 */ - __u8 reservedb9[11]; /* 0x00b9 */ - __u16 extcpuaddr; /* 0x00c4 */ - __u16 eic; /* 0x00c6 */ + __u8 reservedb9[7]; /* 0x00b9 */ + union { + struct { + __u32 eiparams; /* 0x00c0 */ + __u16 extcpuaddr; /* 0x00c4 */ + __u16 eic; /* 0x00c6 */ + }; + __u64 mcic; /* 0x00c0 */ + } __packed; __u32 reservedc8; /* 0x00c8 */ - __u16 pgmilc; /* 0x00cc */ - __u16 iprcc; /* 0x00ce */ - __u32 dxc; /* 0x00d0 */ - __u16 mcn; /* 0x00d4 */ - __u8 perc; /* 0x00d6 */ - __u8 peratmid; /* 0x00d7 */ + union { + struct { + __u16 pgmilc; /* 0x00cc */ + __u16 iprcc; /* 0x00ce */ + }; + __u32 edc; /* 0x00cc */ + } __packed; + union { + struct { + __u32 dxc; /* 0x00d0 */ + __u16 mcn; /* 0x00d4 */ + __u8 perc; /* 0x00d6 */ + __u8 peratmid; /* 0x00d7 */ + }; + __u64 faddr; /* 0x00d0 */ + } __packed; __u64 peraddr; /* 0x00d8 */ __u8 eai; /* 0x00e0 */ __u8 peraid; /* 0x00e1 */ __u8 oai; /* 0x00e2 */ __u8 armid; /* 0x00e3 */ __u8 reservede4[4]; /* 0x00e4 */ - __u64 tecmc; /* 0x00e8 */ - __u8 reservedf0[12]; /* 0x00f0 */ + union { + __u64 tecmc; /* 0x00e8 */ + struct { + __u16 subchannel_id; /* 0x00e8 */ + __u16 subchannel_nr; /* 0x00ea */ + __u32 io_int_parm; /* 0x00ec */ + __u32 io_int_word; /* 0x00f0 */ + }; + } __packed; + __u8 reservedf4[8]; /* 0x00f4 */ #define CRYCB_FORMAT_MASK 0x00000003 #define CRYCB_FORMAT0 0x00000000 #define CRYCB_FORMAT1 0x00000001 #define CRYCB_FORMAT2 0x00000003 __u32 crycbd; /* 0x00fc */ __u64 gcr[16]; /* 0x0100 */ - __u64 gbea; /* 0x0180 */ + union { + __u64 gbea; /* 0x0180 */ + __u64 sidad; + }; __u8 reserved188[8]; /* 0x0188 */ __u64 sdnxo; /* 0x0190 */ __u8 reserved198[8]; /* 0x0198 */ @@ -287,7 +364,7 @@ struct kvm_s390_sie_block { __u64 itdba; /* 0x01e8 */ __u64 riccbd; /* 0x01f0 */ __u64 gvrd; /* 0x01f8 */ -} __attribute__((packed)); +} __packed __aligned(512); struct kvm_s390_itdb { __u8 data[256]; @@ -296,12 +373,15 @@ struct kvm_s390_itdb { struct sie_page { struct kvm_s390_sie_block sie_block; struct mcck_volatile_info mcck_info; /* 0x0200 */ - __u8 reserved218[1000]; /* 0x0218 */ + __u8 reserved218[360]; /* 0x0218 */ + __u64 pv_grregs[16]; /* 0x0380 */ + __u8 reserved400[512]; /* 0x0400 */ struct kvm_s390_itdb itdb; /* 0x0600 */ __u8 reserved700[2304]; /* 0x0700 */ }; struct kvm_vcpu_stat { + struct kvm_vcpu_stat_generic generic; u64 exit_userspace; u64 exit_null; u64 exit_external_request; @@ -311,11 +391,7 @@ struct kvm_vcpu_stat { u64 exit_validity; u64 exit_instruction; u64 exit_pei; - u64 halt_successful_poll; - u64 halt_attempted_poll; - u64 halt_poll_invalid; u64 halt_no_poll_steal; - u64 halt_wakeup; u64 instruction_lctl; u64 instruction_lctlg; u64 instruction_stctl; @@ -389,14 +465,16 @@ struct kvm_vcpu_stat { u64 instruction_sigp_init_cpu_reset; u64 instruction_sigp_cpu_reset; u64 instruction_sigp_unknown; - u64 diagnose_10; - u64 diagnose_44; - u64 diagnose_9c; - u64 diagnose_9c_ignored; - u64 diagnose_258; - u64 diagnose_308; - u64 diagnose_500; - u64 diagnose_other; + u64 instruction_diagnose_10; + u64 instruction_diagnose_44; + u64 instruction_diagnose_9c; + u64 diag_9c_ignored; + u64 diag_9c_forward; + u64 instruction_diagnose_258; + u64 instruction_diagnose_308; + u64 instruction_diagnose_500; + u64 instruction_diagnose_other; + u64 pfault_sync; }; #define PGM_OPERATION 0x01 @@ -471,6 +549,7 @@ enum irq_types { IRQ_PEND_PFAULT_INIT, IRQ_PEND_EXT_HOST, IRQ_PEND_EXT_SERVICE, + 
IRQ_PEND_EXT_SERVICE_EV, IRQ_PEND_EXT_TIMING, IRQ_PEND_EXT_CPU_TIMER, IRQ_PEND_EXT_CLOCK_COMP, @@ -515,6 +594,7 @@ enum irq_types { (1UL << IRQ_PEND_EXT_TIMING) | \ (1UL << IRQ_PEND_EXT_HOST) | \ (1UL << IRQ_PEND_EXT_SERVICE) | \ + (1UL << IRQ_PEND_EXT_SERVICE_EV) | \ (1UL << IRQ_PEND_VIRTIO) | \ (1UL << IRQ_PEND_PFAULT_INIT) | \ (1UL << IRQ_PEND_PFAULT_DONE)) @@ -531,6 +611,13 @@ enum irq_types { #define IRQ_PEND_MCHK_MASK ((1UL << IRQ_PEND_MCHK_REP) | \ (1UL << IRQ_PEND_MCHK_EX)) +#define IRQ_PEND_EXT_II_MASK ((1UL << IRQ_PEND_EXT_CPU_TIMER) | \ + (1UL << IRQ_PEND_EXT_CLOCK_COMP) | \ + (1UL << IRQ_PEND_EXT_EMERGENCY) | \ + (1UL << IRQ_PEND_EXT_EXTERNAL) | \ + (1UL << IRQ_PEND_EXT_SERVICE) | \ + (1UL << IRQ_PEND_EXT_SERVICE_EV)) + struct kvm_s390_interrupt_info { struct list_head list; u64 type; @@ -589,6 +676,7 @@ struct kvm_s390_local_interrupt { struct kvm_s390_float_interrupt { unsigned long pending_irqs; + unsigned long masked_irqs; spinlock_t lock; struct list_head lists[FIRQ_LIST_COUNT]; int counters[FIRQ_MAX_COUNT]; @@ -628,6 +716,10 @@ struct kvm_hw_bp_info_arch { #define guestdbg_exit_pending(vcpu) (guestdbg_enabled(vcpu) && \ (vcpu->guest_debug & KVM_GUESTDBG_EXIT_PENDING)) +#define KVM_GUESTDBG_VALID_MASK \ + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |\ + KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_EXIT_PENDING) + struct kvm_guestdbg_info_arch { unsigned long cr0; unsigned long cr9; @@ -640,6 +732,11 @@ struct kvm_guestdbg_info_arch { unsigned long last_bp; }; +struct kvm_s390_pv_vcpu { + u64 handle; + unsigned long stor_base; +}; + struct kvm_vcpu_arch { struct kvm_s390_sie_block *sie_block; /* if vsie is active, currently executed shadow sie control block */ @@ -668,15 +765,25 @@ struct kvm_vcpu_arch { __u64 cputm_start; bool gs_enabled; bool skey_enabled; + struct kvm_s390_pv_vcpu pv; + union diag318_info diag318_info; }; struct kvm_vm_stat { + struct kvm_vm_stat_generic generic; u64 inject_io; u64 inject_float_mchk; u64 inject_pfault_done; u64 inject_service_signal; u64 inject_virtio; - u64 remote_tlb_flush; + u64 aen_forward; + u64 gmap_shadow_create; + u64 gmap_shadow_reuse; + u64 gmap_shadow_r1_entry; + u64 gmap_shadow_r2_entry; + u64 gmap_shadow_r3_entry; + u64 gmap_shadow_sg_entry; + u64 gmap_shadow_pg_entry; }; struct kvm_arch_memory_slot { @@ -696,9 +803,6 @@ struct s390_io_adapter { bool masked; bool swap; bool suppressible; - struct rw_semaphore maps_lock; - struct list_head maps; - atomic_t nr_maps; }; #define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8) @@ -714,22 +818,22 @@ struct s390_io_adapter { struct kvm_s390_cpu_model { /* facility mask supported by kvm & hosting machine */ - __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64]; + __u64 fac_mask[S390_ARCH_FAC_MASK_SIZE_U64]; struct kvm_s390_vm_cpu_subfunc subfuncs; /* facility list requested by guest (in dma page) */ __u64 *fac_list; u64 cpuid; unsigned short ibc; + /* subset of available UV-features for pv-guests enabled by user space */ + struct kvm_s390_vm_cpu_uv_feat uv_feat_guest; }; -struct kvm_s390_module_hook { - int (*hook)(struct kvm_vcpu *vcpu); - struct module *owner; -}; +typedef int (*crypto_hook)(struct kvm_vcpu *vcpu); struct kvm_s390_crypto { struct kvm_s390_crypto_cb *crycb; - struct kvm_s390_module_hook *pqap_hook; + struct rw_semaphore pqap_hook_rwsem; + crypto_hook *pqap_hook; __u32 crycbd; __u8 aes_kw; __u8 dea_kw; @@ -841,6 +945,17 @@ struct kvm_s390_gisa_interrupt { DECLARE_BITMAP(kicked_mask, KVM_MAX_VCPUS); }; +struct kvm_s390_pv { + u64 handle; + u64 guest_len; + unsigned long stor_base; + 
void *stor_var; + bool dumping; + void *set_aside; + struct list_head need_cleanup; + struct mmu_notifier mmu_notifier; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -855,6 +970,7 @@ struct kvm_arch{ int use_cmma; int use_pfmfi; int use_skf; + int use_zpci_interp; int user_cpu_state_ctrl; int user_sigp; int user_stsi; @@ -874,8 +990,12 @@ struct kvm_arch{ atomic64_t cmma_dirty_pages; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); + /* indexed by vcpu_idx */ DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct kvm_s390_gisa_interrupt gisa_int; + struct kvm_s390_pv pv; + struct list_head kzdev_list; + spinlock_t kzdev_list_lock; }; #define KVM_HVA_ERR_BAD (-1UL) @@ -891,33 +1011,42 @@ struct kvm_arch_async_pf { unsigned long pfault_token; }; -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu); void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, +bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); +static inline void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) {} + void kvm_arch_crypto_clear_masks(struct kvm *kvm); void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, unsigned long *aqm, unsigned long *adm); -extern int sie64a(struct kvm_s390_sie_block *, u64 *); +int __sie64a(phys_addr_t sie_block_phys, struct kvm_s390_sie_block *sie_block, u64 *rsa); + +static inline int sie64a(struct kvm_s390_sie_block *sie_block, u64 *rsa) +{ + return __sie64a(virt_to_phys(sie_block), sie_block, rsa); +} + extern char sie_exit; +bool kvm_s390_pv_is_protected(struct kvm *kvm); +bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu); + extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); -static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} -static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, - struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} + struct kvm_memory_slot *slot) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, @@ -925,6 +1054,14 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} -void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu); +#define __KVM_HAVE_ARCH_VM_FREE +void kvm_arch_free_vm(struct kvm *kvm); + +struct zpci_kvm_hook { + int (*kvm_register)(void *opaque, struct kvm *kvm); + void (*kvm_unregister)(void *opaque); +}; + +extern struct zpci_kvm_hook zpci_kvm_hook; #endif diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h index cbc7c3a68e4d..df73a052760c 100644 --- a/arch/s390/include/asm/kvm_para.h +++ b/arch/s390/include/asm/kvm_para.h @@ -24,162 +24,79 @@ #include <uapi/asm/kvm_para.h> #include <asm/diag.h> -static inline long __kvm_hypercall0(unsigned long nr) -{ - 
register unsigned long __nr asm("1") = nr; - register long __rc asm("2"); - - asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr): "memory", "cc"); - return __rc; -} - -static inline long kvm_hypercall0(unsigned long nr) -{ - diag_stat_inc(DIAG_STAT_X500); - return __kvm_hypercall0(nr); -} - -static inline long __kvm_hypercall1(unsigned long nr, unsigned long p1) -{ - register unsigned long __nr asm("1") = nr; - register unsigned long __p1 asm("2") = p1; - register long __rc asm("2"); - - asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr), "0" (__p1) : "memory", "cc"); - return __rc; -} - -static inline long kvm_hypercall1(unsigned long nr, unsigned long p1) -{ - diag_stat_inc(DIAG_STAT_X500); - return __kvm_hypercall1(nr, p1); -} - -static inline long __kvm_hypercall2(unsigned long nr, unsigned long p1, - unsigned long p2) -{ - register unsigned long __nr asm("1") = nr; - register unsigned long __p1 asm("2") = p1; - register unsigned long __p2 asm("3") = p2; - register long __rc asm("2"); - - asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2) - : "memory", "cc"); - return __rc; -} - -static inline long kvm_hypercall2(unsigned long nr, unsigned long p1, - unsigned long p2) -{ - diag_stat_inc(DIAG_STAT_X500); - return __kvm_hypercall2(nr, p1, p2); -} - -static inline long __kvm_hypercall3(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3) -{ - register unsigned long __nr asm("1") = nr; - register unsigned long __p1 asm("2") = p1; - register unsigned long __p2 asm("3") = p2; - register unsigned long __p3 asm("4") = p3; - register long __rc asm("2"); - - asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2), - "d" (__p3) : "memory", "cc"); - return __rc; -} - -static inline long kvm_hypercall3(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3) -{ - diag_stat_inc(DIAG_STAT_X500); - return __kvm_hypercall3(nr, p1, p2, p3); -} - -static inline long __kvm_hypercall4(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4) -{ - register unsigned long __nr asm("1") = nr; - register unsigned long __p1 asm("2") = p1; - register unsigned long __p2 asm("3") = p2; - register unsigned long __p3 asm("4") = p3; - register unsigned long __p4 asm("5") = p4; - register long __rc asm("2"); - - asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2), - "d" (__p3), "d" (__p4) : "memory", "cc"); - return __rc; -} - -static inline long kvm_hypercall4(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4) -{ - diag_stat_inc(DIAG_STAT_X500); - return __kvm_hypercall4(nr, p1, p2, p3, p4); -} - -static inline long __kvm_hypercall5(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4, unsigned long p5) -{ - register unsigned long __nr asm("1") = nr; - register unsigned long __p1 asm("2") = p1; - register unsigned long __p2 asm("3") = p2; - register unsigned long __p3 asm("4") = p3; - register unsigned long __p4 asm("5") = p4; - register unsigned long __p5 asm("6") = p5; - register long __rc asm("2"); - - asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2), - "d" (__p3), "d" (__p4), "d" (__p5) : "memory", "cc"); - return __rc; -} - -static inline long kvm_hypercall5(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4, unsigned long p5) -{ - diag_stat_inc(DIAG_STAT_X500); 
- return __kvm_hypercall5(nr, p1, p2, p3, p4, p5); -} - -static inline long __kvm_hypercall6(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4, unsigned long p5, - unsigned long p6) -{ - register unsigned long __nr asm("1") = nr; - register unsigned long __p1 asm("2") = p1; - register unsigned long __p2 asm("3") = p2; - register unsigned long __p3 asm("4") = p3; - register unsigned long __p4 asm("5") = p4; - register unsigned long __p5 asm("6") = p5; - register unsigned long __p6 asm("7") = p6; - register long __rc asm("2"); - - asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2), - "d" (__p3), "d" (__p4), "d" (__p5), "d" (__p6) - : "memory", "cc"); - return __rc; -} - -static inline long kvm_hypercall6(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4, unsigned long p5, - unsigned long p6) -{ - diag_stat_inc(DIAG_STAT_X500); - return __kvm_hypercall6(nr, p1, p2, p3, p4, p5, p6); -} +#define HYPERCALL_FMT_0 +#define HYPERCALL_FMT_1 , "0" (r2) +#define HYPERCALL_FMT_2 , "d" (r3) HYPERCALL_FMT_1 +#define HYPERCALL_FMT_3 , "d" (r4) HYPERCALL_FMT_2 +#define HYPERCALL_FMT_4 , "d" (r5) HYPERCALL_FMT_3 +#define HYPERCALL_FMT_5 , "d" (r6) HYPERCALL_FMT_4 +#define HYPERCALL_FMT_6 , "d" (r7) HYPERCALL_FMT_5 + +#define HYPERCALL_PARM_0 +#define HYPERCALL_PARM_1 , unsigned long arg1 +#define HYPERCALL_PARM_2 HYPERCALL_PARM_1, unsigned long arg2 +#define HYPERCALL_PARM_3 HYPERCALL_PARM_2, unsigned long arg3 +#define HYPERCALL_PARM_4 HYPERCALL_PARM_3, unsigned long arg4 +#define HYPERCALL_PARM_5 HYPERCALL_PARM_4, unsigned long arg5 +#define HYPERCALL_PARM_6 HYPERCALL_PARM_5, unsigned long arg6 + +#define HYPERCALL_REGS_0 +#define HYPERCALL_REGS_1 \ + register unsigned long r2 asm("2") = arg1 +#define HYPERCALL_REGS_2 \ + HYPERCALL_REGS_1; \ + register unsigned long r3 asm("3") = arg2 +#define HYPERCALL_REGS_3 \ + HYPERCALL_REGS_2; \ + register unsigned long r4 asm("4") = arg3 +#define HYPERCALL_REGS_4 \ + HYPERCALL_REGS_3; \ + register unsigned long r5 asm("5") = arg4 +#define HYPERCALL_REGS_5 \ + HYPERCALL_REGS_4; \ + register unsigned long r6 asm("6") = arg5 +#define HYPERCALL_REGS_6 \ + HYPERCALL_REGS_5; \ + register unsigned long r7 asm("7") = arg6 + +#define HYPERCALL_ARGS_0 +#define HYPERCALL_ARGS_1 , arg1 +#define HYPERCALL_ARGS_2 HYPERCALL_ARGS_1, arg2 +#define HYPERCALL_ARGS_3 HYPERCALL_ARGS_2, arg3 +#define HYPERCALL_ARGS_4 HYPERCALL_ARGS_3, arg4 +#define HYPERCALL_ARGS_5 HYPERCALL_ARGS_4, arg5 +#define HYPERCALL_ARGS_6 HYPERCALL_ARGS_5, arg6 + +#define GENERATE_KVM_HYPERCALL_FUNC(args) \ +static inline \ +long __kvm_hypercall##args(unsigned long nr HYPERCALL_PARM_##args) \ +{ \ + register unsigned long __nr asm("1") = nr; \ + register long __rc asm("2"); \ + HYPERCALL_REGS_##args; \ + \ + asm volatile ( \ + " diag 2,4,0x500\n" \ + : "=d" (__rc) \ + : "d" (__nr) HYPERCALL_FMT_##args \ + : "memory", "cc"); \ + return __rc; \ +} \ + \ +static inline \ +long kvm_hypercall##args(unsigned long nr HYPERCALL_PARM_##args) \ +{ \ + diag_stat_inc(DIAG_STAT_X500); \ + return __kvm_hypercall##args(nr HYPERCALL_ARGS_##args); \ +} + +GENERATE_KVM_HYPERCALL_FUNC(0) +GENERATE_KVM_HYPERCALL_FUNC(1) +GENERATE_KVM_HYPERCALL_FUNC(2) +GENERATE_KVM_HYPERCALL_FUNC(3) +GENERATE_KVM_HYPERCALL_FUNC(4) +GENERATE_KVM_HYPERCALL_FUNC(5) +GENERATE_KVM_HYPERCALL_FUNC(6) /* kvm on s390 is always paravirtualization enabled */ static inline int kvm_para_available(void) diff --git 
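For readability, a hand expansion of GENERATE_KVM_HYPERCALL_FUNC(2) under the macros above; this shows what the preprocessor emits and is not an additional definition:

	static inline long __kvm_hypercall2(unsigned long nr, unsigned long arg1,
					    unsigned long arg2)
	{
		register unsigned long __nr asm("1") = nr;
		register long __rc asm("2");
		register unsigned long r2 asm("2") = arg1;
		register unsigned long r3 asm("3") = arg2;

		asm volatile (
			"	diag	2,4,0x500\n"
			: "=d" (__rc)
			: "d" (__nr), "d" (r3), "0" (r2)
			: "memory", "cc");
		return __rc;
	}

	static inline long kvm_hypercall2(unsigned long nr, unsigned long arg1,
					  unsigned long arg2)
	{
		diag_stat_inc(DIAG_STAT_X500);
		return __kvm_hypercall2(nr, arg1, arg2);
	}

The "0" (r2) constraint ties the first argument to register 2, the same register the DIAG 0x500 convention uses for the return code.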
a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h index 7f22262b0e46..df3fb7d8227b 100644 --- a/arch/s390/include/asm/linkage.h +++ b/arch/s390/include/asm/linkage.h @@ -4,36 +4,7 @@ #include <linux/stringify.h> -#define __ALIGN .align 4, 0x07 +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x07 #define __ALIGN_STR __stringify(__ALIGN) -#ifndef __ASSEMBLY__ - -/* - * Helper macro for exception table entries - */ -#define EX_TABLE(_fault, _target) \ - ".section __ex_table,\"a\"\n" \ - ".align 4\n" \ - ".long (" #_fault ") - .\n" \ - ".long (" #_target ") - .\n" \ - ".previous\n" - -#else /* __ASSEMBLY__ */ - -#define EX_TABLE(_fault, _target) \ - .section __ex_table,"a" ; \ - .align 4 ; \ - .long (_fault) - . ; \ - .long (_target) - . ; \ - .previous - -#define EX_TABLE_DMA(_fault, _target) \ - .section .dma.ex_table, "a" ; \ - .align 4 ; \ - .long (_fault) - . ; \ - .long (_target) - . ; \ - .previous - -#endif /* __ASSEMBLY__ */ #endif diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h deleted file mode 100644 index 818612b784cd..000000000000 --- a/arch/s390/include/asm/livepatch.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * livepatch.h - s390-specific Kernel Live Patching Core - * - * Copyright (c) 2013-2015 SUSE - * Authors: Jiri Kosina - * Vojtech Pavlik - * Jiri Slaby - */ - -#ifndef ASM_LIVEPATCH_H -#define ASM_LIVEPATCH_H - -#include <asm/ptrace.h> - -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) -{ - regs->psw.addr = ip; -} - -#endif diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 237ee0c4169f..5dc1b6345006 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -11,27 +11,46 @@ #include <linux/types.h> #include <asm/ptrace.h> +#include <asm/ctlreg.h> #include <asm/cpu.h> #include <asm/types.h> #define LC_ORDER 1 #define LC_PAGES 2 +struct pgm_tdb { + u64 data[32]; +}; + struct lowcore { __u8 pad_0x0000[0x0014-0x0000]; /* 0x0000 */ __u32 ipl_parmblock_ptr; /* 0x0014 */ __u8 pad_0x0018[0x0080-0x0018]; /* 0x0018 */ __u32 ext_params; /* 0x0080 */ - __u16 ext_cpu_addr; /* 0x0084 */ - __u16 ext_int_code; /* 0x0086 */ - __u16 svc_ilc; /* 0x0088 */ - __u16 svc_code; /* 0x008a */ - __u16 pgm_ilc; /* 0x008c */ - __u16 pgm_code; /* 0x008e */ + union { + struct { + __u16 ext_cpu_addr; /* 0x0084 */ + __u16 ext_int_code; /* 0x0086 */ + }; + __u32 ext_int_code_addr; + }; + __u32 svc_int_code; /* 0x0088 */ + union { + struct { + __u16 pgm_ilc; /* 0x008c */ + __u16 pgm_code; /* 0x008e */ + }; + __u32 pgm_int_code; + }; __u32 data_exc_code; /* 0x0090 */ __u16 mon_class_num; /* 0x0094 */ - __u8 per_code; /* 0x0096 */ - __u8 per_atmid; /* 0x0097 */ + union { + struct { + __u8 per_code; /* 0x0096 */ + __u8 per_atmid; /* 0x0097 */ + }; + __u16 per_code_combined; + }; __u64 per_address; /* 0x0098 */ __u8 exc_access_id; /* 0x00a0 */ __u8 per_access_id; /* 0x00a1 */ @@ -40,10 +59,15 @@ struct lowcore { __u8 pad_0x00a4[0x00a8-0x00a4]; /* 0x00a4 */ __u64 trans_exc_code; /* 0x00a8 */ __u64 monitor_code; /* 0x00b0 */ - __u16 subchannel_id; /* 0x00b8 */ - __u16 subchannel_nr; /* 0x00ba */ - __u32 io_int_parm; /* 0x00bc */ - __u32 io_int_word; /* 0x00c0 */ + union { + struct { + __u16 subchannel_id; /* 0x00b8 */ + __u16 subchannel_nr; /* 0x00ba */ + __u32 io_int_parm; /* 0x00bc */ + __u32 io_int_word; /* 0x00c0 */ + }; + struct tpi_info tpi_info; /* 0x00b8 */ + }; __u8 pad_0x00c4[0x00c8-0x00c4]; /* 0x00c4 */ __u32 stfl_fac_list; 
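The anonymous unions added to struct lowcore above expose the interruption codes both as the traditional halfwords and as one combined word; a minimal sketch of the aliasing (the handler context is hypothetical):

	static void pgm_check_sketch(struct lowcore *lc)
	{
		u16 ilc  = lc->pgm_ilc;		/* halfword at 0x008c */
		u16 code = lc->pgm_code;	/* halfword at 0x008e */
		u32 both = lc->pgm_int_code;	/* the same word, one load */

		/* s390 is big-endian, so the two views agree like this: */
		WARN_ON(both != ((u32)ilc << 16 | code));
	}

Entry code can move the whole word with a single instruction while C consumers keep the named fields.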
/* 0x00c8 */ __u8 pad_0x00cc[0x00e8-0x00cc]; /* 0x00cc */ @@ -52,7 +76,7 @@ struct lowcore { __u32 external_damage_code; /* 0x00f4 */ __u64 failing_storage_address; /* 0x00f8 */ __u8 pad_0x0100[0x0110-0x0100]; /* 0x0100 */ - __u64 breaking_event_addr; /* 0x0110 */ + __u64 pgm_last_break; /* 0x0110 */ __u8 pad_0x0118[0x0120-0x0118]; /* 0x0118 */ psw_t restart_old_psw; /* 0x0120 */ psw_t external_old_psw; /* 0x0130 */ @@ -80,9 +104,10 @@ struct lowcore { psw_t return_psw; /* 0x0290 */ psw_t return_mcck_psw; /* 0x02a0 */ + __u64 last_break; /* 0x02b0 */ + /* CPU accounting and timing values. */ - __u64 sync_enter_timer; /* 0x02b0 */ - __u64 async_enter_timer; /* 0x02b8 */ + __u64 sys_enter_timer; /* 0x02b8 */ __u64 mcck_enter_timer; /* 0x02c0 */ __u64 exit_timer; /* 0x02c8 */ __u64 user_timer; /* 0x02d0 */ @@ -94,8 +119,8 @@ struct lowcore { __u64 avg_steal_timer; /* 0x0300 */ __u64 last_update_timer; /* 0x0308 */ __u64 last_update_clock; /* 0x0310 */ - __u64 int_clock; /* 0x0318*/ - __u64 mcck_clock; /* 0x0320 */ + __u64 int_clock; /* 0x0318 */ + __u8 pad_0x0320[0x0328-0x0320]; /* 0x0320 */ __u64 clock_comparator; /* 0x0328 */ __u64 boot_clock[2]; /* 0x0330 */ @@ -107,16 +132,16 @@ struct lowcore { __u64 async_stack; /* 0x0350 */ __u64 nodat_stack; /* 0x0358 */ __u64 restart_stack; /* 0x0360 */ - + __u64 mcck_stack; /* 0x0368 */ /* Restart function and parameter. */ - __u64 restart_fn; /* 0x0368 */ - __u64 restart_data; /* 0x0370 */ - __u64 restart_source; /* 0x0378 */ + __u64 restart_fn; /* 0x0370 */ + __u64 restart_data; /* 0x0378 */ + __u32 restart_source; /* 0x0380 */ + __u32 restart_flags; /* 0x0384 */ /* Address space pointer. */ - __u64 kernel_asce; /* 0x0380 */ - __u64 user_asce; /* 0x0388 */ - __u64 vdso_asce; /* 0x0390 */ + struct ctlreg kernel_asce; /* 0x0388 */ + struct ctlreg user_asce; /* 0x0390 */ /* * The lpp and current_pid fields form a @@ -134,14 +159,14 @@ struct lowcore { __u32 spinlock_index; /* 0x03b0 */ __u32 fpu_flags; /* 0x03b4 */ __u64 percpu_offset; /* 0x03b8 */ - __u64 vdso_per_cpu_data; /* 0x03c0 */ + __u8 pad_0x03c0[0x03c8-0x03c0]; /* 0x03c0 */ __u64 machine_flags; /* 0x03c8 */ __u64 gmap; /* 0x03d0 */ __u8 pad_0x03d8[0x0400-0x03d8]; /* 0x03d8 */ - /* br %r1 trampoline */ - __u16 br_r1_trampoline; /* 0x0400 */ - __u8 pad_0x0402[0x0e00-0x0402]; /* 0x0402 */ + __u32 return_lpswe; /* 0x0400 */ + __u32 return_mcck_lpswe; /* 0x0404 */ + __u8 pad_0x040a[0x0e00-0x0408]; /* 0x0408 */ /* * 0xe00 contains the address of the IPL Parameter Information @@ -153,12 +178,7 @@ struct lowcore { __u64 vmcore_info; /* 0x0e0c */ __u8 pad_0x0e14[0x0e18-0x0e14]; /* 0x0e14 */ __u64 os_info; /* 0x0e18 */ - __u8 pad_0x0e20[0x0f00-0x0e20]; /* 0x0e20 */ - - /* Extended facility list */ - __u64 stfle_fac_list[16]; /* 0x0f00 */ - __u64 alt_stfle_fac_list[16]; /* 0x0f80 */ - __u8 pad_0x1000[0x11b0-0x1000]; /* 0x1000 */ + __u8 pad_0x0e20[0x11b0-0x0e20]; /* 0x0e20 */ /* Pointer to the machine check extended save area */ __u64 mcesad; /* 0x11b0 */ @@ -178,13 +198,18 @@ struct lowcore { __u32 tod_progreg_save_area; /* 0x1324 */ __u32 cpu_timer_save_area[2]; /* 0x1328 */ __u32 clock_comp_save_area[2]; /* 0x1330 */ - __u8 pad_0x1338[0x1340-0x1338]; /* 0x1338 */ + __u64 last_break_save_area; /* 0x1338 */ __u32 access_regs_save_area[16]; /* 0x1340 */ - __u64 cregs_save_area[16]; /* 0x1380 */ - __u8 pad_0x1400[0x1800-0x1400]; /* 0x1400 */ + struct ctlreg cregs_save_area[16]; /* 0x1380 */ + __u8 pad_0x1400[0x1500-0x1400]; /* 0x1400 */ + /* Cryptography-counter designation */ + __u64 ccd; /* 0x1500 */ 
+ /* AI-extension counter designation */ + __u64 aicd; /* 0x1508 */ + __u8 pad_0x1510[0x1800-0x1510]; /* 0x1510 */ /* Transaction abort diagnostic block */ - __u8 pgm_tdb[256]; /* 0x1800 */ + struct pgm_tdb pgm_tdb; /* 0x1800 */ __u8 pad_0x1900[0x2000-0x1900]; /* 0x1900 */ } __packed __aligned(8192); @@ -197,12 +222,4 @@ static inline void set_prefix(__u32 address) asm volatile("spx %0" : : "Q" (address) : "memory"); } -static inline __u32 store_prefix(void) -{ - __u32 address; - - asm volatile("stpx %0" : "=Q" (address)); - return address; -} - #endif /* _ASM_S390_LOWCORE_H */ diff --git a/arch/s390/include/asm/maccess.h b/arch/s390/include/asm/maccess.h new file mode 100644 index 000000000000..50225940d971 --- /dev/null +++ b/arch/s390/include/asm/maccess.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_S390_MACCESS_H +#define __ASM_S390_MACCESS_H + +#include <linux/types.h> + +#define MEMCPY_REAL_SIZE PAGE_SIZE +#define MEMCPY_REAL_MASK PAGE_MASK + +struct iov_iter; + +extern unsigned long __memcpy_real_area; +extern pte_t *memcpy_real_ptep; +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count); +int memcpy_real(void *dest, unsigned long src, size_t count); +#ifdef CONFIG_CRASH_DUMP +int copy_oldmem_kernel(void *dst, unsigned long src, size_t count); +#endif + +#endif /* __ASM_S390_MACCESS_H */ diff --git a/arch/s390/include/asm/mem_detect.h b/arch/s390/include/asm/mem_detect.h deleted file mode 100644 index a7c922a69050..000000000000 --- a/arch/s390/include/asm/mem_detect.h +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390_MEM_DETECT_H -#define _ASM_S390_MEM_DETECT_H - -#include <linux/types.h> - -enum mem_info_source { - MEM_DETECT_NONE = 0, - MEM_DETECT_SCLP_STOR_INFO, - MEM_DETECT_DIAG260, - MEM_DETECT_SCLP_READ_INFO, - MEM_DETECT_BIN_SEARCH -}; - -struct mem_detect_block { - u64 start; - u64 end; -}; - -/* - * Storage element id is defined as 1 byte (up to 256 storage elements). - * In practise only storage element id 0 and 1 are used). - * According to architecture one storage element could have as much as - * 1020 subincrements. 255 mem_detect_blocks are embedded in mem_detect_info. - * If more mem_detect_blocks are required, a block of memory from already - * known mem_detect_block is taken (entries_extended points to it). - */ -#define MEM_INLINED_ENTRIES 255 /* (PAGE_SIZE - 16) / 16 */ - -struct mem_detect_info { - u32 count; - u8 info_source; - struct mem_detect_block entries[MEM_INLINED_ENTRIES]; - struct mem_detect_block *entries_extended; -}; -extern struct mem_detect_info mem_detect; - -void add_mem_detect_block(u64 start, u64 end); - -static inline int __get_mem_detect_block(u32 n, unsigned long *start, - unsigned long *end) -{ - if (n >= mem_detect.count) { - *start = 0; - *end = 0; - return -1; - } - - if (n < MEM_INLINED_ENTRIES) { - *start = (unsigned long)mem_detect.entries[n].start; - *end = (unsigned long)mem_detect.entries[n].end; - } else { - *start = (unsigned long)mem_detect.entries_extended[n - MEM_INLINED_ENTRIES].start; - *end = (unsigned long)mem_detect.entries_extended[n - MEM_INLINED_ENTRIES].end; - } - return 0; -} - -/** - * for_each_mem_detect_block - early online memory range iterator - * @i: an integer used as loop variable - * @p_start: ptr to unsigned long for start address of the range - * @p_end: ptr to unsigned long for end address of the range - * - * Walks over detected online memory ranges. 
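The new asm/maccess.h above is a small interface for copying from real (absolute) storage; a hedged usage sketch, with the buffer and source address chosen purely for illustration:

	/* Copy 512 bytes from real address 0x1000 into a kernel buffer;
	 * assumed to return 0 on success and a negative errno otherwise. */
	static int read_real_sketch(void *buf)
	{
		return memcpy_real(buf, 0x1000UL, 512);
	}

Larger or user-directed copies go through memcpy_real_iter(), which writes into an iov_iter destination instead of a flat buffer.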
- */ -#define for_each_mem_detect_block(i, p_start, p_end) \ - for (i = 0, __get_mem_detect_block(i, p_start, p_end); \ - i < mem_detect.count; \ - i++, __get_mem_detect_block(i, p_start, p_end)) - -static inline void get_mem_detect_reserved(unsigned long *start, - unsigned long *size) -{ - *start = (unsigned long)mem_detect.entries_extended; - if (mem_detect.count > MEM_INLINED_ENTRIES) - *size = (mem_detect.count - MEM_INLINED_ENTRIES) * sizeof(struct mem_detect_block); - else - *size = 0; -} - -static inline unsigned long get_mem_detect_end(void) -{ - unsigned long start; - unsigned long end; - - if (mem_detect.count) { - __get_mem_detect_block(mem_detect.count - 1, &start, &end); - return end; - } - return 0; -} - -#endif diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h index 2542cbf7e2d1..b85e13505a0f 100644 --- a/arch/s390/include/asm/mem_encrypt.h +++ b/arch/s390/include/asm/mem_encrypt.h @@ -4,10 +4,8 @@ #ifndef __ASSEMBLY__ -static inline bool mem_encrypt_active(void) { return false; } - -int set_memory_encrypted(unsigned long addr, int numpages); -int set_memory_decrypted(unsigned long addr, int numpages); +int set_memory_encrypted(unsigned long vaddr, int numpages); +int set_memory_decrypted(unsigned long vaddr, int numpages); #endif /* __ASSEMBLY__ */ diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index bcfb6371086f..bb1b4bef1878 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -4,18 +4,20 @@ #include <linux/cpumask.h> #include <linux/errno.h> +#include <asm/asm-extable.h> typedef struct { spinlock_t lock; cpumask_t cpu_attach_mask; atomic_t flush_count; unsigned int flush_mm; - struct list_head pgtable_list; struct list_head gmap_list; unsigned long gmap_asce; unsigned long asce; unsigned long asce_limit; unsigned long vdso_base; + /* The mmu context belongs to a secure guest. */ + atomic_t protected_count; /* * The following bitfields need a down_write on the mm * semaphore when they are written to. As they are only @@ -32,27 +34,10 @@ typedef struct { unsigned int uses_cmm:1; /* The gmaps associated with this context are allowed to use huge pages. 
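For context on the deletion above: for_each_mem_detect_block() walked the early-detected online ranges; callers used it roughly like this (the printout is illustrative):

	unsigned long start, end;
	int i;

	for_each_mem_detect_block(i, &start, &end)
		pr_info("online: %lx-%lx\n", start, end);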
*/ unsigned int allow_gmap_hpage_1m:1; - /* The mmu context is for compat task */ - unsigned int compat_mm:1; } mm_context_t; #define INIT_MM_CONTEXT(name) \ .context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \ - .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list), -static inline int tprot(unsigned long addr) -{ - int rc = -EFAULT; - - asm volatile( - " tprot 0(%1),0\n" - "0: ipm %0\n" - " srl %0,28\n" - "1:\n" - EX_TABLE(0b,1b) - : "+d" (rc) : "a" (addr) : "cc"); - return rc; -} - #endif diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 8d04e6f3f796..929af18b0908 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -12,20 +12,22 @@ #include <linux/uaccess.h> #include <linux/mm_types.h> #include <asm/tlbflush.h> -#include <asm/ctl_reg.h> +#include <asm/ctlreg.h> #include <asm-generic/mm_hooks.h> +#define init_new_context init_new_context static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + unsigned long asce_type, init_entry; + spin_lock_init(&mm->context.lock); - INIT_LIST_HEAD(&mm->context.pgtable_list); INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.flush_count, 0); + atomic_set(&mm->context.protected_count, 0); mm->context.gmap_asce = 0; mm->context.flush_mm = 0; - mm->context.compat_mm = test_thread_flag(TIF_31BIT); #ifdef CONFIG_PGSTE mm->context.alloc_pgste = page_table_allocate_pgste || test_thread_flag(TIF_PGSTE) || @@ -36,73 +38,62 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.allow_gmap_hpage_1m = 0; #endif switch (mm->context.asce_limit) { - case _REGION2_SIZE: + default: /* - * forked 3-level task, fall through to set new asce with new - * mm->pgd + * context created by exec, the value of asce_limit can + * only be zero in this case */ - case 0: - /* context created by exec, set asce limit to 4TB */ - mm->context.asce_limit = STACK_TOP_MAX; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION3; + VM_BUG_ON(mm->context.asce_limit); + /* continue as 3-level task */ + mm->context.asce_limit = _REGION2_SIZE; + fallthrough; + case _REGION2_SIZE: + /* forked 3-level task */ + init_entry = _REGION3_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION3; break; - case -PAGE_SIZE: - /* forked 5-level task, set new asce with new_mm->pgd */ - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION1; + case TASK_SIZE_MAX: + /* forked 5-level task */ + init_entry = _REGION1_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION1; break; case _REGION1_SIZE: - /* forked 4-level task, set new asce with new mm->pgd */ - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + /* forked 4-level task */ + init_entry = _REGION2_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION2; break; - case _REGION3_SIZE: - /* forked 2-level compat task, set new asce with new mm->pgd */ - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; } - crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | asce_type; + crst_table_init((unsigned long *) mm->pgd, init_entry); return 0; } -#define destroy_context(mm) do { } while (0) - -static inline void set_user_asce(struct mm_struct *mm) +static inline void 
switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) { - S390_lowcore.user_asce = mm->context.asce; - __ctl_load(S390_lowcore.user_asce, 1, 1); - clear_cpu_flag(CIF_ASCE_PRIMARY); -} + int cpu = smp_processor_id(); -static inline void clear_user_asce(void) -{ - S390_lowcore.user_asce = S390_lowcore.kernel_asce; - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - set_cpu_flag(CIF_ASCE_PRIMARY); + if (next == &init_mm) + S390_lowcore.user_asce = s390_invalid_asce; + else + S390_lowcore.user_asce.val = next->context.asce; + cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); + /* Clear previous user-ASCE from CR7 */ + local_ctl_load(7, &s390_invalid_asce); + if (prev != next) + cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask); } - -mm_segment_t enable_sacf_uaccess(void); -void disable_sacf_uaccess(mm_segment_t old_fs); +#define switch_mm_irqs_off switch_mm_irqs_off static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { - int cpu = smp_processor_id(); + unsigned long flags; - S390_lowcore.user_asce = next->context.asce; - cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); - /* Clear previous user-ASCE from CR1 and CR7 */ - if (!test_cpu_flag(CIF_ASCE_PRIMARY)) { - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - set_cpu_flag(CIF_ASCE_PRIMARY); - } - if (test_cpu_flag(CIF_ASCE_SECONDARY)) { - __ctl_load(S390_lowcore.vdso_asce, 7, 7); - clear_cpu_flag(CIF_ASCE_SECONDARY); - } - if (prev != next) - cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask); + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); } #define finish_arch_post_lock_switch finish_arch_post_lock_switch @@ -119,18 +110,18 @@ static inline void finish_arch_post_lock_switch(void) __tlb_flush_mm_lazy(mm); preempt_enable(); } - set_fs(current->thread.mm_segment); + local_ctl_load(7, &S390_lowcore.user_asce); } -#define enter_lazy_tlb(mm,tsk) do { } while (0) -#define deactivate_mm(tsk,mm) do { } while (0) - +#define activate_mm activate_mm static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) { switch_mm(prev, next, current); cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); - set_user_asce(next); + local_ctl_load(7, &S390_lowcore.user_asce); } +#include <asm-generic/mmu_context.h> + #endif /* __S390_MMU_CONTEXT_H */ diff --git a/arch/s390/include/asm/module.h b/arch/s390/include/asm/module.h index e0a6d29846e2..9f1eea15872c 100644 --- a/arch/s390/include/asm/module.h +++ b/arch/s390/include/asm/module.h @@ -8,16 +8,14 @@ * This file contains the s390 architecture specific module code. */ -struct mod_arch_syminfo -{ +struct mod_arch_syminfo { unsigned long got_offset; unsigned long plt_offset; int got_initialized; int plt_initialized; }; -struct mod_arch_specific -{ +struct mod_arch_specific { /* Starting offset of got in the module core memory. */ unsigned long got_offset; /* Starting offset of plt in the module core memory. */ @@ -30,6 +28,14 @@ struct mod_arch_specific int nsyms; /* Additional symbol information (got and plt offsets). */ struct mod_arch_syminfo *syminfo; +#ifdef CONFIG_FUNCTION_TRACER + /* Start of memory reserved for ftrace hotpatch trampolines. */ + struct ftrace_hotpatch_trampoline *trampolines_start; + /* End of memory reserved for ftrace hotpatch trampolines. */ + struct ftrace_hotpatch_trampoline *trampolines_end; + /* Next unused ftrace hotpatch trampoline slot. 
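The reworked init_new_context() above now composes the ASCE from a single (init_entry, asce_type) pair chosen by asce_limit; the switch reduces to this mapping (summarized from the code, not an addition to it):

	/* asce_limit          task              init_entry / asce_type
	 * 0 (fresh exec)  ->  3-level           _REGION3_ENTRY_EMPTY / _ASCE_TYPE_REGION3
	 * _REGION2_SIZE   ->  forked 3-level    _REGION3_ENTRY_EMPTY / _ASCE_TYPE_REGION3
	 * _REGION1_SIZE   ->  forked 4-level    _REGION2_ENTRY_EMPTY / _ASCE_TYPE_REGION2
	 * TASK_SIZE_MAX   ->  forked 5-level    _REGION1_ENTRY_EMPTY / _ASCE_TYPE_REGION1
	 */

The 2-level compat case is gone together with the compat_mm bit removed from mm_context_t.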
*/ + struct ftrace_hotpatch_trampoline *next_trampoline; +#endif /* CONFIG_FUNCTION_TRACER */ }; #endif /* _ASM_S390_MODULE_H */ diff --git a/arch/s390/include/asm/msi.h b/arch/s390/include/asm/msi.h new file mode 100644 index 000000000000..399343ed9ffb --- /dev/null +++ b/arch/s390/include/asm/msi.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_MSI_H +#define _ASM_S390_MSI_H +#include <asm-generic/msi.h> + +/* + * Work around S390 not using irq_domain at all so we can't set + * IRQ_DOMAIN_FLAG_ISOLATED_MSI. See for an explanation how it works: + * + * https://lore.kernel.org/r/31af8174-35e9-ebeb-b9ef-74c90d4bfd93@linux.ibm.com/ + * + * Note this is less isolated than the ARM/x86 versions as userspace can trigger + * MSI belonging to kernel devices within the same gisa. + */ +#define arch_is_isolated_msi() true + +#endif diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index b160da8fa14b..227466ce9e41 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -6,7 +6,6 @@ * Author(s): Ingo Adlung <adlung@de.ibm.com>, * Martin Schwidefsky <schwidefsky@de.ibm.com>, * Cornelia Huck <cornelia.huck@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, */ #ifndef _ASM_S390_NMI_H @@ -23,12 +22,16 @@ #define MCCK_CODE_SYSTEM_DAMAGE BIT(63) #define MCCK_CODE_EXT_DAMAGE BIT(63 - 5) #define MCCK_CODE_CP BIT(63 - 9) -#define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46) +#define MCCK_CODE_STG_ERROR BIT(63 - 16) +#define MCCK_CODE_STG_KEY_ERROR BIT(63 - 18) +#define MCCK_CODE_STG_DEGRAD BIT(63 - 19) #define MCCK_CODE_PSW_MWP_VALID BIT(63 - 20) #define MCCK_CODE_PSW_IA_VALID BIT(63 - 23) +#define MCCK_CODE_STG_FAIL_ADDR BIT(63 - 24) #define MCCK_CODE_CR_VALID BIT(63 - 29) #define MCCK_CODE_GS_VALID BIT(63 - 36) #define MCCK_CODE_FC_VALID BIT(63 - 43) +#define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46) #ifndef __ASSEMBLY__ @@ -94,9 +97,9 @@ struct mcesa { struct pt_regs; -void nmi_alloc_boot_cpu(struct lowcore *lc); -int nmi_alloc_per_cpu(struct lowcore *lc); -void nmi_free_per_cpu(struct lowcore *lc); +void nmi_alloc_mcesa_early(u64 *mcesad); +int nmi_alloc_mcesa(u64 *mcesad); +void nmi_free_mcesa(u64 *mcesad); void s390_handle_mcck(void); void s390_do_machine_check(struct pt_regs *regs); diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h index b4bd8c41e9d3..82725cf783c7 100644 --- a/arch/s390/include/asm/nospec-branch.h +++ b/arch/s390/include/asm/nospec-branch.h @@ -12,6 +12,11 @@ void nospec_init_branches(void); void nospec_auto_detect(void); void nospec_revert(s32 *start, s32 *end); +static inline bool nospec_uses_trampoline(void) +{ + return __is_defined(CC_USING_EXPOLINE) && !nospec_disable; +} + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_EXPOLINE_H */ diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h index 0033dcd663b1..7a946c42ad13 100644 --- a/arch/s390/include/asm/nospec-insn.h +++ b/arch/s390/include/asm/nospec-insn.h @@ -2,23 +2,25 @@ #ifndef _ASM_S390_NOSPEC_ASM_H #define _ASM_S390_NOSPEC_ASM_H -#include <asm/alternative-asm.h> -#include <asm/asm-offsets.h> +#include <linux/linkage.h> #include <asm/dwarf.h> #ifdef __ASSEMBLY__ #ifdef CC_USING_EXPOLINE -_LC_BR_R1 = __LC_BR_R1 - /* * The expoline macros are used to create thunks in the same format * as gcc generates them. The 'comdat' section flag makes sure that * the various thunks are merged into a single copy. 
*/ .macro __THUNK_PROLOG_NAME name +#ifdef CONFIG_EXPOLINE_EXTERN + .pushsection .text,"ax",@progbits + __ALIGN +#else .pushsection .text.\name,"axG",@progbits,\name,comdat +#endif .globl \name .hidden \name .type \name,@function @@ -26,167 +28,101 @@ _LC_BR_R1 = __LC_BR_R1 CFI_STARTPROC .endm - .macro __THUNK_EPILOG + .macro __THUNK_EPILOG_NAME name CFI_ENDPROC +#ifdef CONFIG_EXPOLINE_EXTERN + .size \name, .-\name +#endif .popsection .endm - .macro __THUNK_PROLOG_BR r1,r2 - __THUNK_PROLOG_NAME __s390_indirect_jump_r\r2\()use_r\r1 - .endm - - .macro __THUNK_PROLOG_BC d0,r1,r2 - __THUNK_PROLOG_NAME __s390_indirect_branch_\d0\()_\r2\()use_\r1 + .macro __THUNK_PROLOG_BR r1 + __THUNK_PROLOG_NAME __s390_indirect_jump_r\r1 .endm - .macro __THUNK_BR r1,r2 - jg __s390_indirect_jump_r\r2\()use_r\r1 + .macro __THUNK_EPILOG_BR r1 + __THUNK_EPILOG_NAME __s390_indirect_jump_r\r1 .endm - .macro __THUNK_BC d0,r1,r2 - jg __s390_indirect_branch_\d0\()_\r2\()use_\r1 + .macro __THUNK_BR r1 + jg __s390_indirect_jump_r\r1 .endm - .macro __THUNK_BRASL r1,r2,r3 - brasl \r1,__s390_indirect_jump_r\r3\()use_r\r2 + .macro __THUNK_BRASL r1,r2 + brasl \r1,__s390_indirect_jump_r\r2 .endm - .macro __DECODE_RR expand,reg,ruse - .set __decode_fail,1 + .macro __DECODE_R expand,reg + .set .L__decode_fail,1 .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .ifc \reg,%r\r1 - .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 - .ifc \ruse,%r\r2 - \expand \r1,\r2 - .set __decode_fail,0 - .endif - .endr + \expand \r1 + .set .L__decode_fail,0 .endif .endr - .if __decode_fail == 1 - .error "__DECODE_RR failed" + .if .L__decode_fail == 1 + .error "__DECODE_R failed" .endif .endm - .macro __DECODE_RRR expand,rsave,rtarget,ruse - .set __decode_fail,1 + .macro __DECODE_RR expand,rsave,rtarget + .set .L__decode_fail,1 .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .ifc \rsave,%r\r1 .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .ifc \rtarget,%r\r2 - .irp r3,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 - .ifc \ruse,%r\r3 - \expand \r1,\r2,\r3 - .set __decode_fail,0 - .endif - .endr - .endif - .endr - .endif - .endr - .if __decode_fail == 1 - .error "__DECODE_RRR failed" - .endif - .endm - - .macro __DECODE_DRR expand,disp,reg,ruse - .set __decode_fail,1 - .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 - .ifc \reg,%r\r1 - .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 - .ifc \ruse,%r\r2 - \expand \disp,\r1,\r2 - .set __decode_fail,0 + \expand \r1,\r2 + .set .L__decode_fail,0 .endif .endr .endif .endr - .if __decode_fail == 1 - .error "__DECODE_DRR failed" + .if .L__decode_fail == 1 + .error "__DECODE_RR failed" .endif .endm - .macro __THUNK_EX_BR reg,ruse - # Be very careful when adding instructions to this macro! - # The ALTERNATIVE replacement code has a .+10 which targets - # the "br \reg" after the code has been patched. -#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES + .macro __THUNK_EX_BR reg exrl 0,555f j . -#else - .ifc \reg,%r1 - ALTERNATIVE "ex %r0,_LC_BR_R1", ".insn ril,0xc60000000000,0,.+10", 35 - j . - .else - larl \ruse,555f - ex 0,0(\ruse) - j . - .endif -#endif 555: br \reg .endm - .macro __THUNK_EX_BC disp,reg,ruse -#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES - exrl 0,556f - j . +#ifdef CONFIG_EXPOLINE_EXTERN + .macro GEN_BR_THUNK reg + .endm + .macro GEN_BR_THUNK_EXTERN reg #else - larl \ruse,556f - ex 0,0(\ruse) - j . 
+ .macro GEN_BR_THUNK reg #endif -556: b \disp(\reg) - .endm - - .macro GEN_BR_THUNK reg,ruse=%r1 - __DECODE_RR __THUNK_PROLOG_BR,\reg,\ruse - __THUNK_EX_BR \reg,\ruse - __THUNK_EPILOG - .endm - - .macro GEN_B_THUNK disp,reg,ruse=%r1 - __DECODE_DRR __THUNK_PROLOG_BC,\disp,\reg,\ruse - __THUNK_EX_BC \disp,\reg,\ruse - __THUNK_EPILOG + __DECODE_R __THUNK_PROLOG_BR,\reg + __THUNK_EX_BR \reg + __DECODE_R __THUNK_EPILOG_BR,\reg .endm - .macro BR_EX reg,ruse=%r1 -557: __DECODE_RR __THUNK_BR,\reg,\ruse + .macro BR_EX reg +557: __DECODE_R __THUNK_BR,\reg .pushsection .s390_indirect_branches,"a",@progbits .long 557b-. .popsection .endm - .macro B_EX disp,reg,ruse=%r1 -558: __DECODE_DRR __THUNK_BC,\disp,\reg,\ruse - .pushsection .s390_indirect_branches,"a",@progbits - .long 558b-. - .popsection - .endm - - .macro BASR_EX rsave,rtarget,ruse=%r1 -559: __DECODE_RRR __THUNK_BRASL,\rsave,\rtarget,\ruse + .macro BASR_EX rsave,rtarget +559: __DECODE_RR __THUNK_BRASL,\rsave,\rtarget .pushsection .s390_indirect_branches,"a",@progbits .long 559b-. .popsection .endm #else - .macro GEN_BR_THUNK reg,ruse=%r1 - .endm - - .macro GEN_B_THUNK disp,reg,ruse=%r1 + .macro GEN_BR_THUNK reg .endm - .macro BR_EX reg,ruse=%r1 + .macro BR_EX reg br \reg .endm - .macro B_EX disp,reg,ruse=%r1 - b \disp(\reg) - .endm - - .macro BASR_EX rsave,rtarget,ruse=%r1 + .macro BASR_EX rsave,rtarget basr \rsave,\rtarget .endm #endif /* CC_USING_EXPOLINE */ diff --git a/arch/s390/include/asm/numa.h b/arch/s390/include/asm/numa.h index 35f8cbe7e5bb..23cd5d1b734b 100644 --- a/arch/s390/include/asm/numa.h +++ b/arch/s390/include/asm/numa.h @@ -13,24 +13,13 @@ #ifdef CONFIG_NUMA #include <linux/numa.h> -#include <linux/cpumask.h> void numa_setup(void); -int numa_pfn_to_nid(unsigned long pfn); -int __node_distance(int a, int b); -void numa_update_cpu_topology(void); - -extern cpumask_t node_to_cpumask_map[MAX_NUMNODES]; -extern int numa_debug_enabled; #else static inline void numa_setup(void) { } -static inline void numa_update_cpu_topology(void) { } -static inline int numa_pfn_to_nid(unsigned long pfn) -{ - return 0; -} #endif /* CONFIG_NUMA */ + #endif /* _ASM_S390_NUMA_H */ diff --git a/arch/s390/include/asm/os_info.h b/arch/s390/include/asm/os_info.h index 3c89279d2a4b..a4d2e103f116 100644 --- a/arch/s390/include/asm/os_info.h +++ b/arch/s390/include/asm/os_info.h @@ -8,12 +8,17 @@ #ifndef _ASM_S390_OS_INFO_H #define _ASM_S390_OS_INFO_H +#include <linux/uio.h> + #define OS_INFO_VERSION_MAJOR 1 #define OS_INFO_VERSION_MINOR 1 #define OS_INFO_MAGIC 0x4f53494e464f535aULL /* OSINFOSZ */ #define OS_INFO_VMCOREINFO 0 #define OS_INFO_REIPL_BLOCK 1 +#define OS_INFO_FLAGS_ENTRY 2 + +#define OS_INFO_FLAG_REIPL_CLEAR (1UL << 0) struct os_info_entry { u64 addr; @@ -28,8 +33,8 @@ struct os_info { u16 version_minor; u64 crashkernel_addr; u64 crashkernel_size; - struct os_info_entry entry[2]; - u8 reserved[4024]; + struct os_info_entry entry[3]; + u8 reserved[4004]; } __packed; void os_info_init(void); @@ -39,7 +44,6 @@ u32 os_info_csum(struct os_info *os_info); #ifdef CONFIG_CRASH_DUMP void *os_info_old_entry(int nr, unsigned long *size); -int copy_oldmem_kernel(void *dst, void *src, size_t count); #else static inline void *os_info_old_entry(int nr, unsigned long *size) { diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h index c33c4deb545f..08fcbd628120 100644 --- a/arch/s390/include/asm/page-states.h +++ b/arch/s390/include/asm/page-states.h @@ -7,6 +7,9 @@ #ifndef PAGE_STATES_H #define PAGE_STATES_H +#include 
<asm/sections.h> +#include <asm/page.h> + #define ESSA_GET_STATE 0 #define ESSA_SET_STABLE 1 #define ESSA_SET_UNUSED 2 @@ -18,4 +21,60 @@ #define ESSA_MAX ESSA_SET_STABLE_NODAT +extern int __bootdata_preserved(cmma_flag); + +static __always_inline unsigned long essa(unsigned long paddr, unsigned char cmd) +{ + unsigned long rc; + + asm volatile( + " .insn rrf,0xb9ab0000,%[rc],%[paddr],%[cmd],0" + : [rc] "=d" (rc) + : [paddr] "d" (paddr), + [cmd] "i" (cmd)); + return rc; +} + +static __always_inline void __set_page_state(void *addr, unsigned long num_pages, unsigned char cmd) +{ + unsigned long paddr = __pa(addr) & PAGE_MASK; + + while (num_pages--) { + essa(paddr, cmd); + paddr += PAGE_SIZE; + } +} + +static inline void __set_page_unused(void *addr, unsigned long num_pages) +{ + __set_page_state(addr, num_pages, ESSA_SET_UNUSED); +} + +static inline void __set_page_stable_dat(void *addr, unsigned long num_pages) +{ + __set_page_state(addr, num_pages, ESSA_SET_STABLE); +} + +static inline void __set_page_stable_nodat(void *addr, unsigned long num_pages) +{ + __set_page_state(addr, num_pages, ESSA_SET_STABLE_NODAT); +} + +static inline void __arch_set_page_nodat(void *addr, unsigned long num_pages) +{ + if (!cmma_flag) + return; + if (cmma_flag < 2) + __set_page_stable_dat(addr, num_pages); + else + __set_page_stable_nodat(addr, num_pages); +} + +static inline void __arch_set_page_dat(void *addr, unsigned long num_pages) +{ + if (!cmma_flag) + return; + __set_page_stable_dat(addr, num_pages); +} + #endif diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index a4d38092530a..73b9c3bf377f 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -19,7 +19,9 @@ #define PAGE_SHIFT _PAGE_SHIFT #define PAGE_SIZE _PAGE_SIZE #define PAGE_MASK _PAGE_MASK -#define PAGE_DEFAULT_ACC 0 +#define PAGE_DEFAULT_ACC _AC(0, UL) +/* storage-protection override */ +#define PAGE_SPO_ACC 9 #define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4) #define HPAGE_SHIFT 20 @@ -33,6 +35,8 @@ #define ARCH_HAS_PREPARE_HUGEPAGE #define ARCH_HAS_HUGEPAGE_CLEAR_FLUSH +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA + #include <asm/setup.h> #ifndef __ASSEMBLY__ @@ -40,7 +44,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end); static inline void storage_key_init_range(unsigned long start, unsigned long end) { - if (PAGE_DEFAULT_KEY) + if (PAGE_DEFAULT_KEY != 0) __storage_key_init_range(start, end); } @@ -53,22 +57,24 @@ static inline void storage_key_init_range(unsigned long start, unsigned long end */ static inline void copy_page(void *to, void *from) { - register void *reg2 asm ("2") = to; - register unsigned long reg3 asm ("3") = 0x1000; - register void *reg4 asm ("4") = from; - register unsigned long reg5 asm ("5") = 0xb0001000; + union register_pair dst, src; + + dst.even = (unsigned long) to; + dst.odd = 0x1000; + src.even = (unsigned long) from; + src.odd = 0xb0001000; + asm volatile( - " mvcl 2,4" - : "+d" (reg2), "+d" (reg3), "+d" (reg4), "+d" (reg5) + " mvcl %[dst],%[src]" + : [dst] "+&d" (dst.pair), [src] "+&d" (src.pair) : : "memory", "cc"); } #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, 
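The inline helpers above drive the ESSA instruction one 4K frame at a time; a hedged usage sketch for marking a freshly allocated page stable (the allocation side is illustrative):

	void *addr = (void *)__get_free_page(GFP_KERNEL);

	if (addr)
		__arch_set_page_dat(addr, 1);	/* no-op unless cmma_flag is set */

When cmma_flag is 2 or higher, __arch_set_page_nodat() selects the ESSA_SET_STABLE_NODAT state instead, as the helper above shows.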
false) /* * These are used to make use of C type-checking.. @@ -85,11 +91,31 @@ typedef pte_t *pgtable_t; #define pgprot_val(x) ((x).pgprot) #define pgste_val(x) ((x).pgste) -#define pte_val(x) ((x).pte) -#define pmd_val(x) ((x).pmd) -#define pud_val(x) ((x).pud) -#define p4d_val(x) ((x).p4d) -#define pgd_val(x) ((x).pgd) + +static inline unsigned long pte_val(pte_t pte) +{ + return pte.pte; +} + +static inline unsigned long pmd_val(pmd_t pmd) +{ + return pmd.pmd; +} + +static inline unsigned long pud_val(pud_t pud) +{ + return pud.pud; +} + +static inline unsigned long p4d_val(p4d_t p4d) +{ + return p4d.p4d; +} + +static inline unsigned long pgd_val(pgd_t pgd) +{ + return pgd.pgd; +} #define __pgste(x) ((pgste_t) { (x) } ) #define __pte(x) ((pte_t) { (x) } ) @@ -138,10 +164,6 @@ static inline int page_reset_referenced(unsigned long addr) struct page; void arch_free_page(struct page *page, int order); void arch_alloc_page(struct page *page, int order); -void arch_set_page_dat(struct page *page, int order); -void arch_set_page_nodat(struct page *page, int order); -int arch_test_page_nodat(struct page *page); -void arch_set_page_states(int make_stable); static inline int devmem_is_allowed(unsigned long pfn) { @@ -151,7 +173,10 @@ static inline int devmem_is_allowed(unsigned long pfn) #define HAVE_ARCH_FREE_PAGE #define HAVE_ARCH_ALLOC_PAGE -#endif /* !__ASSEMBLY__ */ +#if IS_ENABLED(CONFIG_PGSTE) +int arch_make_page_accessible(struct page *page); +#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE +#endif #define __PAGE_OFFSET 0x0UL #define PAGE_OFFSET 0x0UL @@ -159,23 +184,32 @@ static inline int devmem_is_allowed(unsigned long pfn) #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)(unsigned long)(x)) -#define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT) -#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT) +#define phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) +#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) + +#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys)) +#define page_to_phys(page) pfn_to_phys(page_to_pfn(page)) + +static inline void *pfn_to_virt(unsigned long pfn) +{ + return __va(pfn_to_phys(pfn)); +} + +static inline unsigned long virt_to_pfn(const void *kaddr) +{ + return phys_to_pfn(__pa(kaddr)); +} + #define pfn_to_kaddr(pfn) pfn_to_virt(pfn) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) -#define phys_to_pfn(kaddr) ((kaddr) >> PAGE_SHIFT) -#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) - -#define phys_to_page(kaddr) pfn_to_page(phys_to_pfn(kaddr)) -#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) +#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC -#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#endif /* !__ASSEMBLY__ */ #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> diff --git a/arch/s390/include/asm/pai.h b/arch/s390/include/asm/pai.h new file mode 100644 index 000000000000..7d1888e3dee6 --- /dev/null +++ b/arch/s390/include/asm/pai.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Processor Activity Instrumentation support for cryptography counters + * + * Copyright IBM Corp. 
2022 + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + */ +#ifndef _ASM_S390_PAI_H +#define _ASM_S390_PAI_H + +#include <linux/jump_label.h> +#include <asm/lowcore.h> +#include <asm/ptrace.h> + +struct qpaci_info_block { + u64 header; + struct { + u64 : 8; + u64 num_cc : 8; /* # of supported crypto counters */ + u64 : 9; + u64 num_nnpa : 7; /* # of supported NNPA counters */ + u64 : 32; + }; +}; + +static inline int qpaci(struct qpaci_info_block *info) +{ + /* Size of info (in double words minus one) */ + size_t size = sizeof(*info) / sizeof(u64) - 1; + int cc; + + asm volatile( + " lgr 0,%[size]\n" + " .insn s,0xb28f0000,%[info]\n" + " lgr %[size],0\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=d" (cc), [info] "=Q" (*info), [size] "+&d" (size) + : + : "0", "cc", "memory"); + return cc ? (size + 1) * sizeof(u64) : 0; +} + +#define PAI_CRYPTO_BASE 0x1000 /* First event number */ +#define PAI_CRYPTO_MAXCTR 256 /* Max # of event counters */ +#define PAI_CRYPTO_KERNEL_OFFSET 2048 +#define PAI_NNPA_BASE 0x1800 /* First event number */ +#define PAI_NNPA_MAXCTR 128 /* Max # of event counters */ + +DECLARE_STATIC_KEY_FALSE(pai_key); + +static __always_inline void pai_kernel_enter(struct pt_regs *regs) +{ + if (!IS_ENABLED(CONFIG_PERF_EVENTS)) + return; + if (!static_branch_unlikely(&pai_key)) + return; + if (!S390_lowcore.ccd) + return; + if (!user_mode(regs)) + return; + WRITE_ONCE(S390_lowcore.ccd, S390_lowcore.ccd | PAI_CRYPTO_KERNEL_OFFSET); +} + +static __always_inline void pai_kernel_exit(struct pt_regs *regs) +{ + if (!IS_ENABLED(CONFIG_PERF_EVENTS)) + return; + if (!static_branch_unlikely(&pai_key)) + return; + if (!S390_lowcore.ccd) + return; + if (!user_mode(regs)) + return; + WRITE_ONCE(S390_lowcore.ccd, S390_lowcore.ccd & ~PAI_CRYPTO_KERNEL_OFFSET); +} + +enum paievt_mode { + PAI_MODE_NONE, + PAI_MODE_SAMPLING, + PAI_MODE_COUNTING, +}; + +#endif diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 3a06c264ea53..e91cd6bbc330 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -5,9 +5,10 @@ #include <linux/pci.h> #include <linux/mutex.h> #include <linux/iommu.h> -#include <asm-generic/pci.h> +#include <linux/pci_hotplug.h> #include <asm/pci_clp.h> #include <asm/pci_debug.h> +#include <asm/pci_insn.h> #include <asm/sclp.h> #define PCIBIOS_MIN_IO 0x1000 @@ -21,10 +22,16 @@ int pci_domain_nr(struct pci_bus *); int pci_proc_domain(struct pci_bus *); #define ZPCI_BUS_NR 0 /* default bus number */ -#define ZPCI_DEVFN 0 /* default device number */ #define ZPCI_NR_DMA_SPACES 1 #define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS +#define ZPCI_DOMAIN_BITMAP_SIZE (1 << 16) + +#ifdef PCI +#if (ZPCI_NR_DEVICES > ZPCI_DOMAIN_BITMAP_SIZE) +# error ZPCI_NR_DEVICES can not be bigger than ZPCI_DOMAIN_BITMAP_SIZE +#endif +#endif /* PCI */ /* PCI Function Controls */ #define ZPCI_FC_FN_ENABLED 0x80 @@ -78,7 +85,6 @@ enum zpci_state { ZPCI_FN_STATE_STANDBY = 0, ZPCI_FN_STATE_CONFIGURED = 1, ZPCI_FN_STATE_RESERVED = 2, - ZPCI_FN_STATE_ONLINE = 3, }; struct zpci_bar_struct { @@ -91,20 +97,50 @@ struct zpci_bar_struct { }; struct s390_domain; +struct kvm_zdev; + +#define ZPCI_FUNCTIONS_PER_BUS 256 +struct zpci_bus { + struct kref kref; + struct pci_bus *bus; + struct zpci_dev *function[ZPCI_FUNCTIONS_PER_BUS]; + struct list_head resources; + struct list_head bus_next; + struct resource bus_resource; + int pchid; + int domain_nr; + bool multifunction; + enum pci_bus_speed max_bus_speed; +}; /* Private data per function */ struct zpci_dev { - struct pci_bus 
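pai_kernel_enter()/pai_kernel_exit() above steer crypto-counter increments into the kernel half of the counter set by ORing PAI_CRYPTO_KERNEL_OFFSET (2048) into the designation word in lowcore. A sketch of the intended call sites; the generic irqentry wrappers stand in here for the actual s390 entry code:

	irqentry_state_t state = irqentry_enter(regs);

	pai_kernel_enter(regs);		/* events now attribute to the kernel */
	/* ... handle the interrupt ... */
	pai_kernel_exit(regs);

	irqentry_exit(regs, state);

Both helpers bail out cheaply unless perf is built in, the pai_key static branch is enabled, a counter area is installed (S390_lowcore.ccd), and the interrupted context was user mode.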
*bus; + struct zpci_bus *zbus; struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */ + struct list_head iommu_list; + struct kref kref; + struct rcu_head rcu; + struct hotplug_slot hotplug_slot; enum zpci_state state; u32 fid; /* function ID, used by sclp */ u32 fh; /* function handle, used by insn's */ + u32 gisa; /* GISA designation for passthrough */ u16 vfn; /* virtual function number */ u16 pchid; /* physical channel ID */ + u16 maxstbl; /* Maximum store block size */ u8 pfgid; /* function group ID */ u8 pft; /* pci function type */ - u16 domain; + u8 port; + u8 dtsm; /* Supported DT mask */ + u8 rid_available : 1; + u8 has_hp_slot : 1; + u8 has_resources : 1; + u8 is_physfn : 1; + u8 util_str_avail : 1; + u8 irqs_registered : 1; + u8 reserved : 2; + unsigned int devfn; /* DEVFN part of the RID*/ struct mutex lock; u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ @@ -121,16 +157,8 @@ struct zpci_dev { /* DMA stuff */ unsigned long *dma_table; - spinlock_t dma_table_lock; int tlb_refresh; - spinlock_t iommu_bitmap_lock; - unsigned long *iommu_bitmap; - unsigned long *lazy_bitmap; - unsigned long iommu_size; - unsigned long iommu_pages; - unsigned int next_bit; - struct iommu_device iommu_dev; /* IOMMU core handle */ char res_name[16]; @@ -145,16 +173,16 @@ struct zpci_dev { struct zpci_fmb *fmb; u16 fmb_update; /* update interval */ u16 fmb_length; - /* software counters */ - atomic64_t allocated_pages; - atomic64_t mapped_pages; - atomic64_t unmapped_pages; + u8 version; enum pci_bus_speed max_bus_speed; struct dentry *debugfs_dev; + /* IOMMU and passthrough */ struct s390_domain *s390_domain; /* s390 IOMMU domain data */ + struct kvm_zdev *kzdev; + struct mutex kzdev_lock; }; static inline bool zdev_enabled(struct zpci_dev *zdev) @@ -164,27 +192,40 @@ static inline bool zdev_enabled(struct zpci_dev *zdev) extern const struct attribute_group *zpci_attr_groups[]; extern unsigned int s390_pci_force_floating __initdata; +extern unsigned int s390_pci_no_rid; + +extern union zpci_sic_iib *zpci_aipb; +extern struct airq_iv *zpci_aif_sbv; /* ----------------------------------------------------------------------------- Prototypes ----------------------------------------------------------------------------- */ /* Base stuff */ -int zpci_create_device(struct zpci_dev *); -void zpci_remove_device(struct zpci_dev *zdev); +struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state); int zpci_enable_device(struct zpci_dev *); int zpci_disable_device(struct zpci_dev *); -int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64); +int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh); +int zpci_deconfigure_device(struct zpci_dev *zdev); +void zpci_device_reserved(struct zpci_dev *zdev); +bool zpci_is_device_configured(struct zpci_dev *zdev); + +int zpci_hot_reset_device(struct zpci_dev *zdev); +int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64, u8 *); int zpci_unregister_ioat(struct zpci_dev *, u8); void zpci_remove_reserved_devices(void); +void zpci_update_fh(struct zpci_dev *zdev, u32 fh); /* CLP */ +int clp_setup_writeback_mio(void); int clp_scan_pci_devices(void); -int clp_rescan_pci_devices(void); -int clp_rescan_pci_devices_simple(void); -int clp_add_pci_device(u32, u32, int); -int clp_enable_fh(struct zpci_dev *, u8); -int clp_disable_fh(struct zpci_dev *); +int clp_query_pci_fn(struct zpci_dev *zdev); +int clp_enable_fh(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as); +int clp_disable_fh(struct zpci_dev *zdev, u32 
*fh); int clp_get_state(u32 fid, enum zpci_state *state); +int clp_refresh_fh(u32 fid, u32 *fh); + +/* UID */ +void update_uid_checking(bool new); /* IOMMU Interface */ int zpci_init_iommu(struct zpci_dev *zdev); @@ -199,12 +240,10 @@ static inline bool zpci_use_mio(struct zpci_dev *zdev) /* Error handling and recovery */ void zpci_event_error(void *); void zpci_event_availability(void *); -void zpci_rescan(void); bool zpci_is_enabled(void); #else /* CONFIG_PCI */ static inline void zpci_event_error(void *e) {} static inline void zpci_event_availability(void *e) {} -static inline void zpci_rescan(void) {} #endif /* CONFIG_PCI */ #ifdef CONFIG_HOTPLUG_PCI_S390 @@ -221,7 +260,14 @@ static inline void zpci_exit_slot(struct zpci_dev *zdev) {} /* Helpers */ static inline struct zpci_dev *to_zpci(struct pci_dev *pdev) { - return pdev->sysdata; + struct zpci_bus *zbus = pdev->sysdata; + + return zbus->function[pdev->devfn]; +} + +static inline struct zpci_dev *to_zpci_dev(struct device *dev) +{ + return to_zpci(to_pci_dev(dev)); } struct zpci_dev *get_zdev_by_fid(u32); @@ -229,7 +275,10 @@ struct zpci_dev *get_zdev_by_fid(u32); /* DMA */ int zpci_dma_init(void); void zpci_dma_exit(void); +int zpci_dma_init_device(struct zpci_dev *zdev); +int zpci_dma_exit_device(struct zpci_dev *zdev); +/* IRQ */ int __init zpci_irq_init(void); void __init zpci_irq_exit(void); @@ -242,10 +291,11 @@ int zpci_debug_init(void); void zpci_debug_exit(void); void zpci_debug_init_device(struct zpci_dev *, const char *); void zpci_debug_exit_device(struct zpci_dev *); -void zpci_debug_info(struct zpci_dev *, struct seq_file *); -/* Error reporting */ +/* Error handling */ int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *); +int zpci_clear_error_state(struct zpci_dev *zdev); +int zpci_reset_load_store_blocked(struct zpci_dev *zdev); #ifdef CONFIG_NUMA diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index bd2cb4ea7d93..f0c677ddd270 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -7,6 +7,7 @@ /* * Call Logical Processor - Command Codes */ +#define CLP_SLPC 0x0001 #define CLP_LIST_PCI 0x0002 #define CLP_QUERY_PCI_FN 0x0003 #define CLP_QUERY_PCI_FNGRP 0x0004 @@ -49,8 +50,24 @@ struct clp_fh_list_entry { #define CLP_UTIL_STR_LEN 64 #define CLP_PFIP_NR_SEGMENTS 4 +/* PCI function type numbers */ +#define PCI_FUNC_TYPE_ISM 0x5 /* ISM device */ + extern bool zpci_unique_uid; +struct clp_rsp_slpc_pci { + struct clp_rsp_hdr hdr; + u32 reserved2[4]; + u32 lpif[8]; + u32 reserved3[4]; + u32 vwb : 1; + u32 : 1; + u32 mio_wb : 6; + u32 : 24; + u32 reserved5[3]; + u32 lpic[8]; +} __packed; + /* List PCI functions request */ struct clp_req_list_pci { struct clp_req_hdr hdr; @@ -93,7 +110,10 @@ struct clp_req_query_pci { struct clp_rsp_query_pci { struct clp_rsp_hdr hdr; u16 vfn; /* virtual fn number */ - u16 : 6; + u16 : 3; + u16 rid_avail : 1; + u16 is_physfn : 1; + u16 reserved1 : 1; u16 mio_addr_avail : 1; u16 util_str_avail : 1; /* utility string available? 
*/ u16 pfgid : 8; /* pci function group id */ @@ -102,12 +122,16 @@ struct clp_rsp_query_pci { u16 pchid; __le32 bar[PCI_STD_NUM_BARS]; u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ - u32 : 16; + u16 : 12; + u16 port : 4; u8 fmb_len; u8 pft; /* pci function type */ u64 sdma; /* start dma as */ u64 edma; /* end dma as */ - u32 reserved[11]; +#define ZPCI_RID_MASK_DEVFN 0x00ff + u16 rid; /* BUS/DEVFN PCI address */ + u16 reserved0; + u32 reserved[10]; u32 uid; /* user defined id */ u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */ u32 reserved2[16]; @@ -132,9 +156,11 @@ struct clp_rsp_query_pci_grp { u8 : 6; u8 frame : 1; u8 refresh : 1; /* TLB refresh mode */ - u16 reserved2; + u16 : 3; + u16 maxstbl : 13; /* Maximum store block size */ u16 mui; - u16 : 16; + u8 dtsm; /* Supported DT mask */ + u8 reserved3; u16 maxfaal; u16 : 4; u16 dnoi : 12; @@ -152,7 +178,8 @@ struct clp_req_set_pci { u16 reserved2; u8 oc; /* operation controls */ u8 ndas; /* number of dma spaces */ - u64 reserved3; + u32 reserved3; + u32 gisa; /* GISA designation */ } __packed; /* Set PCI function response */ @@ -165,6 +192,11 @@ struct clp_rsp_set_pci { } __packed; /* Combined request/response block structures used by clp insn */ +struct clp_req_rsp_slpc_pci { + struct clp_req_slpc request; + struct clp_rsp_slpc_pci response; +} __packed; + struct clp_req_rsp_list_pci { struct clp_req_list_pci request; struct clp_rsp_list_pci response; diff --git a/arch/s390/include/asm/pci_debug.h b/arch/s390/include/asm/pci_debug.h index 5dfe47588277..3bb4e7e33a0e 100644 --- a/arch/s390/include/asm/pci_debug.h +++ b/arch/s390/include/asm/pci_debug.h @@ -17,9 +17,14 @@ extern debug_info_t *pci_debug_err_id; debug_text_event(pci_debug_err_id, 0, debug_buffer); \ } while (0) +static inline void zpci_err_hex_level(int level, void *addr, int len) +{ + debug_event(pci_debug_err_id, level, addr, len); +} + static inline void zpci_err_hex(void *addr, int len) { - debug_event(pci_debug_err_id, 0, addr, len); + zpci_err_hex_level(0, addr, len); } #endif diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h index 419fac7a62c0..42d7cc4262ca 100644 --- a/arch/s390/include/asm/pci_dma.h +++ b/arch/s390/include/asm/pci_dma.h @@ -82,126 +82,16 @@ enum zpci_ioat_dtype { #define ZPCI_TABLE_VALID_MASK 0x20 #define ZPCI_TABLE_PROT_MASK 0x200 -static inline unsigned int calc_rtx(dma_addr_t ptr) -{ - return ((unsigned long) ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK; -} - -static inline unsigned int calc_sx(dma_addr_t ptr) -{ - return ((unsigned long) ptr >> ZPCI_ST_SHIFT) & ZPCI_INDEX_MASK; -} - -static inline unsigned int calc_px(dma_addr_t ptr) -{ - return ((unsigned long) ptr >> PAGE_SHIFT) & ZPCI_PT_MASK; -} - -static inline void set_pt_pfaa(unsigned long *entry, void *pfaa) -{ - *entry &= ZPCI_PTE_FLAG_MASK; - *entry |= ((unsigned long) pfaa & ZPCI_PTE_ADDR_MASK); -} - -static inline void set_rt_sto(unsigned long *entry, void *sto) -{ - *entry &= ZPCI_RTE_FLAG_MASK; - *entry |= ((unsigned long) sto & ZPCI_RTE_ADDR_MASK); - *entry |= ZPCI_TABLE_TYPE_RTX; -} - -static inline void set_st_pto(unsigned long *entry, void *pto) -{ - *entry &= ZPCI_STE_FLAG_MASK; - *entry |= ((unsigned long) pto & ZPCI_STE_ADDR_MASK); - *entry |= ZPCI_TABLE_TYPE_SX; -} - -static inline void validate_rt_entry(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_VALID_MASK; - *entry &= ~ZPCI_TABLE_OFFSET_MASK; - *entry |= ZPCI_TABLE_VALID; - *entry |= ZPCI_TABLE_LEN_RTX; -} - -static inline void validate_st_entry(unsigned long *entry) -{ 
- *entry &= ~ZPCI_TABLE_VALID_MASK; - *entry |= ZPCI_TABLE_VALID; -} - -static inline void invalidate_table_entry(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_VALID_MASK; - *entry |= ZPCI_TABLE_INVALID; -} - -static inline void invalidate_pt_entry(unsigned long *entry) -{ - WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_INVALID); - *entry &= ~ZPCI_PTE_VALID_MASK; - *entry |= ZPCI_PTE_INVALID; -} - -static inline void validate_pt_entry(unsigned long *entry) -{ - WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID); - *entry &= ~ZPCI_PTE_VALID_MASK; - *entry |= ZPCI_PTE_VALID; -} - -static inline void entry_set_protected(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_PROT_MASK; - *entry |= ZPCI_TABLE_PROTECTED; -} - -static inline void entry_clr_protected(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_PROT_MASK; - *entry |= ZPCI_TABLE_UNPROTECTED; -} - -static inline int reg_entry_isvalid(unsigned long entry) -{ - return (entry & ZPCI_TABLE_VALID_MASK) == ZPCI_TABLE_VALID; -} - -static inline int pt_entry_isvalid(unsigned long entry) -{ - return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID; -} - -static inline int entry_isprotected(unsigned long entry) -{ - return (entry & ZPCI_TABLE_PROT_MASK) == ZPCI_TABLE_PROTECTED; -} - -static inline unsigned long *get_rt_sto(unsigned long entry) -{ - return ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX) - ? (unsigned long *) (entry & ZPCI_RTE_ADDR_MASK) - : NULL; -} - -static inline unsigned long *get_st_pto(unsigned long entry) -{ - return ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_SX) - ? (unsigned long *) (entry & ZPCI_STE_ADDR_MASK) - : NULL; -} - -/* Prototypes */ -int zpci_dma_init_device(struct zpci_dev *); -void zpci_dma_exit_device(struct zpci_dev *); -void dma_free_seg_table(unsigned long); -unsigned long *dma_alloc_cpu_table(void); -void dma_cleanup_tables(unsigned long *); -unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr); -void dma_update_cpu_trans(unsigned long *entry, void *page_addr, int flags); - -extern const struct dma_map_ops s390_pci_dma_ops; +struct zpci_iommu_ctrs { + atomic64_t mapped_pages; + atomic64_t unmapped_pages; + atomic64_t global_rpcits; + atomic64_t sync_map_rpcits; + atomic64_t sync_rpcits; +}; + +struct zpci_dev; +struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev); #endif diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index 61cf9531f68f..e5f57cfe1d45 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -98,6 +98,15 @@ struct zpci_fib { u32 gd; } __packed __aligned(8); +/* Set Interruption Controls Operation Controls */ +#define SIC_IRQ_MODE_ALL 0 +#define SIC_IRQ_MODE_SINGLE 1 +#define SIC_SET_AENI_CONTROLS 2 +#define SIC_IRQ_MODE_DIRECT 4 +#define SIC_IRQ_MODE_D_ALL 16 +#define SIC_IRQ_MODE_D_SINGLE 17 +#define SIC_IRQ_MODE_SET_CPU 18 + /* directed interruption information block */ struct zpci_diib { u32 : 1; @@ -119,9 +128,20 @@ struct zpci_cdiib { u64 : 64; } __packed __aligned(8); +/* adapter interruption parameters block */ +struct zpci_aipb { + u64 faisb; + u64 gait; + u16 : 13; + u16 afi : 3; + u32 : 32; + u16 faal; +} __packed __aligned(8); + union zpci_sic_iib { struct zpci_diib diib; struct zpci_cdiib cdiib; + struct zpci_aipb aipb; }; DECLARE_STATIC_KEY_FALSE(have_mio); @@ -134,13 +154,6 @@ int __zpci_store(u64 data, u64 req, u64 offset); int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len); int __zpci_store_block(const u64 
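With the homegrown DMA API removed, only per-device statistics remain in this header; a sketch of how a debug view might read them (the seq_file context is hypothetical):

	struct zpci_iommu_ctrs *ctrs = zpci_get_iommu_ctrs(zdev);

	if (ctrs)
		seq_printf(m, "mapped pages: %lld\n",
			   (long long)atomic64_read(&ctrs->mapped_pages));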
*data, u64 req, u64 offset); void zpci_barrier(void); -int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); - -static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc) -{ - union zpci_sic_iib iib = {{0}}; - - return __zpci_set_irq_ctrl(ctl, isc, &iib); -} +int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); #endif diff --git a/arch/s390/include/asm/pci_io.h b/arch/s390/include/asm/pci_io.h index cd060b5dd8fd..2686bee800e3 100644 --- a/arch/s390/include/asm/pci_io.h +++ b/arch/s390/include/asm/pci_io.h @@ -8,14 +8,21 @@ #include <linux/slab.h> #include <asm/pci_insn.h> +/* I/O size constraints */ +#define ZPCI_MAX_READ_SIZE 8 +#define ZPCI_MAX_WRITE_SIZE 128 +#define ZPCI_BOUNDARY_SIZE (1 << 12) +#define ZPCI_BOUNDARY_MASK (ZPCI_BOUNDARY_SIZE - 1) + /* I/O Map */ #define ZPCI_IOMAP_SHIFT 48 -#define ZPCI_IOMAP_ADDR_BASE 0x8000000000000000UL +#define ZPCI_IOMAP_ADDR_SHIFT 62 +#define ZPCI_IOMAP_ADDR_BASE (1UL << ZPCI_IOMAP_ADDR_SHIFT) #define ZPCI_IOMAP_ADDR_OFF_MASK ((1UL << ZPCI_IOMAP_SHIFT) - 1) #define ZPCI_IOMAP_MAX_ENTRIES \ - ((ULONG_MAX - ZPCI_IOMAP_ADDR_BASE + 1) / (1UL << ZPCI_IOMAP_SHIFT)) + (1UL << (ZPCI_IOMAP_ADDR_SHIFT - ZPCI_IOMAP_SHIFT)) #define ZPCI_IOMAP_ADDR_IDX_MASK \ - (~ZPCI_IOMAP_ADDR_OFF_MASK - ZPCI_IOMAP_ADDR_BASE) + ((ZPCI_IOMAP_ADDR_BASE - 1) & ~ZPCI_IOMAP_ADDR_OFF_MASK) struct zpci_iomap_entry { u32 fh; @@ -120,16 +127,18 @@ out: int zpci_write_block(volatile void __iomem *dst, const void *src, unsigned long len); -static inline u8 zpci_get_max_write_size(u64 src, u64 dst, int len, int max) +static inline int zpci_get_max_io_size(u64 src, u64 dst, int len, int max) { - int count = len > max ? max : len, size = 1; + int offset = dst & ZPCI_BOUNDARY_MASK; + int size; - while (!(src & 0x1) && !(dst & 0x1) && ((size << 1) <= count)) { - dst = dst >> 1; - src = src >> 1; - size = size << 1; - } - return size; + size = min3(len, ZPCI_BOUNDARY_SIZE - offset, max); + if (IS_ALIGNED(src, 8) && IS_ALIGNED(dst, 8) && IS_ALIGNED(size, 8)) + return size; + + if (size >= 8) + return 8; + return rounddown_pow_of_two(size); } static inline int zpci_memcpy_fromio(void *dst, @@ -139,8 +148,9 @@ static inline int zpci_memcpy_fromio(void *dst, int size, rc = 0; while (n > 0) { - size = zpci_get_max_write_size((u64 __force) src, - (u64) dst, n, 8); + size = zpci_get_max_io_size((u64 __force) src, + (u64) dst, n, + ZPCI_MAX_READ_SIZE); rc = zpci_read_single(dst, src, size); if (rc) break; @@ -160,8 +170,9 @@ static inline int zpci_memcpy_toio(volatile void __iomem *dst, return -EINVAL; while (n > 0) { - size = zpci_get_max_write_size((u64 __force) dst, - (u64) src, n, 128); + size = zpci_get_max_io_size((u64 __force) dst, + (u64) src, n, + ZPCI_MAX_WRITE_SIZE); if (size > 8) /* main path */ rc = zpci_write_block(dst, src, size); else diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h index 50b4ce8cddfd..264095dd84bc 100644 --- a/arch/s390/include/asm/percpu.h +++ b/arch/s390/include/asm/percpu.h @@ -29,15 +29,15 @@ typedef typeof(pcp) pcp_op_T__; \ pcp_op_T__ old__, new__, prev__; \ pcp_op_T__ *ptr__; \ - preempt_disable(); \ + preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ - prev__ = *ptr__; \ + prev__ = READ_ONCE(*ptr__); \ do { \ old__ = prev__; \ new__ = old__ op (val); \ prev__ = cmpxchg(ptr__, old__, new__); \ } while (prev__ != old__); \ - preempt_enable(); \ + preempt_enable_notrace(); \ new__; \ }) @@ -68,7 +68,7 @@ typedef typeof(pcp) pcp_op_T__; \ pcp_op_T__ val__ = (val); \ pcp_op_T__ old__, *ptr__; \ - 
preempt_disable(); \ + preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ if (__builtin_constant_p(val__) && \ ((szcast)val__ > -129) && ((szcast)val__ < 128)) { \ @@ -84,7 +84,7 @@ : [val__] "d" (val__) \ : "cc"); \ } \ - preempt_enable(); \ + preempt_enable_notrace(); \ } #define this_cpu_add_4(pcp, val) arch_this_cpu_add(pcp, val, "laa", "asi", int) @@ -95,14 +95,14 @@ typedef typeof(pcp) pcp_op_T__; \ pcp_op_T__ val__ = (val); \ pcp_op_T__ old__, *ptr__; \ - preempt_disable(); \ + preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ asm volatile( \ op " %[old__],%[val__],%[ptr__]\n" \ : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \ : [val__] "d" (val__) \ : "cc"); \ - preempt_enable(); \ + preempt_enable_notrace(); \ old__ + val__; \ }) @@ -114,14 +114,14 @@ typedef typeof(pcp) pcp_op_T__; \ pcp_op_T__ val__ = (val); \ pcp_op_T__ old__, *ptr__; \ - preempt_disable(); \ + preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ asm volatile( \ op " %[old__],%[val__],%[ptr__]\n" \ : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \ : [val__] "d" (val__) \ : "cc"); \ - preempt_enable(); \ + preempt_enable_notrace(); \ } #define this_cpu_and_4(pcp, val) arch_this_cpu_to_op(pcp, val, "lan") @@ -136,10 +136,10 @@ typedef typeof(pcp) pcp_op_T__; \ pcp_op_T__ ret__; \ pcp_op_T__ *ptr__; \ - preempt_disable(); \ + preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ ret__ = cmpxchg(ptr__, oval, nval); \ - preempt_enable(); \ + preempt_enable_notrace(); \ ret__; \ }) @@ -148,14 +148,30 @@ #define this_cpu_cmpxchg_4(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval) #define this_cpu_cmpxchg_8(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval) +#define this_cpu_cmpxchg64(pcp, o, n) this_cpu_cmpxchg_8(pcp, o, n) + +#define this_cpu_cmpxchg128(pcp, oval, nval) \ +({ \ + typedef typeof(pcp) pcp_op_T__; \ + u128 old__, new__, ret__; \ + pcp_op_T__ *ptr__; \ + old__ = oval; \ + new__ = nval; \ + preempt_disable_notrace(); \ + ptr__ = raw_cpu_ptr(&(pcp)); \ + ret__ = cmpxchg128((void *)ptr__, old__, new__); \ + preempt_enable_notrace(); \ + ret__; \ +}) + #define arch_this_cpu_xchg(pcp, nval) \ ({ \ typeof(pcp) *ptr__; \ typeof(pcp) ret__; \ - preempt_disable(); \ + preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ ret__ = xchg(ptr__, nval); \ - preempt_enable(); \ + preempt_enable_notrace(); \ ret__; \ }) @@ -164,23 +180,6 @@ #define this_cpu_xchg_4(pcp, nval) arch_this_cpu_xchg(pcp, nval) #define this_cpu_xchg_8(pcp, nval) arch_this_cpu_xchg(pcp, nval) -#define arch_this_cpu_cmpxchg_double(pcp1, pcp2, o1, o2, n1, n2) \ -({ \ - typeof(pcp1) o1__ = (o1), n1__ = (n1); \ - typeof(pcp2) o2__ = (o2), n2__ = (n2); \ - typeof(pcp1) *p1__; \ - typeof(pcp2) *p2__; \ - int ret__; \ - preempt_disable(); \ - p1__ = raw_cpu_ptr(&(pcp1)); \ - p2__ = raw_cpu_ptr(&(pcp2)); \ - ret__ = __cmpxchg_double(p1__, p2__, o1__, o2__, n1__, n2__); \ - preempt_enable(); \ - ret__; \ -}) - -#define this_cpu_cmpxchg_double_8 arch_this_cpu_cmpxchg_double - #include <asm-generic/percpu.h> #endif /* __ARCH_S390_PERCPU__ */ diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index b9da71632827..9917e2717b2b 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -60,7 +60,6 @@ struct perf_sf_sde_regs { #define PERF_CPUM_SF_DIAG_MODE 0x0002 /* Diagnostic-sampling flag */ #define PERF_CPUM_SF_MODE_MASK (PERF_CPUM_SF_BASIC_MODE| \ PERF_CPUM_SF_DIAG_MODE) -#define PERF_CPUM_SF_FULL_BLOCKS 0x0004 /* Process full SDBs only */ 
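The arch_this_cpu_* macros above switch to preempt_disable_notrace()/preempt_enable_notrace(), so the per-cpu primitives stay usable from tracing code without recursing into traced preemption helpers, and they implement the read-modify-write either as a single interlocked-access instruction (LAA and friends) or as a cmpxchg retry loop. A minimal user-space sketch of that retry loop, with GCC __atomic builtins standing in for the s390 cmpxchg and an ordinary variable standing in for the per-cpu one:

#include <stdio.h>

static unsigned long pcp;	/* stand-in for a real per-cpu variable */

/* the same load/compute/compare-and-swap retry loop the macros implement */
static unsigned long pcp_add_return(unsigned long *ptr, unsigned long val)
{
	unsigned long old, new;

	old = __atomic_load_n(ptr, __ATOMIC_RELAXED);
	do {
		new = old + val;
		/* on failure, 'old' is refreshed with the current value */
	} while (!__atomic_compare_exchange_n(ptr, &old, new, 0,
					      __ATOMIC_SEQ_CST,
					      __ATOMIC_SEQ_CST));
	return new;
}

int main(void)
{
	printf("%lu\n", pcp_add_return(&pcp, 42));	/* 42 */
	return 0;
}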
#define PERF_CPUM_SF_FREQ_MODE 0x0008 /* Sampling with frequency */ #define REG_NONE 0 @@ -71,7 +70,6 @@ struct perf_sf_sde_regs { #define SAMPL_RATE(hwc) ((hwc)->event_base) #define SAMPL_FLAGS(hwc) ((hwc)->config_base) #define SAMPL_DIAG_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE) -#define SDB_FULL_BLOCKS(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FULL_BLOCKS) #define SAMPLE_FREQ_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE) #define perf_arch_fetch_caller_regs(regs, __ip) do { \ diff --git a/arch/s390/include/asm/pfault.h b/arch/s390/include/asm/pfault.h new file mode 100644 index 000000000000..a1bee4a1e470 --- /dev/null +++ b/arch/s390/include/asm/pfault.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2023 + */ +#ifndef _ASM_S390_PFAULT_H +#define _ASM_S390_PFAULT_H + +#include <linux/errno.h> + +int __pfault_init(void); +void __pfault_fini(void); + +static inline int pfault_init(void) +{ + if (IS_ENABLED(CONFIG_PFAULT)) + return __pfault_init(); + return -EOPNOTSUPP; +} + +static inline void pfault_fini(void) +{ + if (IS_ENABLED(CONFIG_PFAULT)) + __pfault_fini(); +} + +#endif /* _ASM_S390_PFAULT_H */ diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 77606c4acd58..502d655fe6ae 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -25,7 +25,6 @@ void crst_table_free(struct mm_struct *, unsigned long *); unsigned long *page_table_alloc(struct mm_struct *); struct page *page_table_alloc_pgste(struct mm_struct *mm); void page_table_free(struct mm_struct *, unsigned long *); -void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); void page_table_free_pgste(struct page *page); extern int page_table_allocate_pgste; @@ -34,19 +33,21 @@ static inline void crst_table_init(unsigned long *crst, unsigned long entry) memset64((u64 *)crst, entry, _CRST_ENTRIES); } -static inline unsigned long pgd_entry_type(struct mm_struct *mm) +int crst_table_upgrade(struct mm_struct *mm, unsigned long limit); + +static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long addr, + unsigned long len) { - if (mm_pmd_folded(mm)) - return _SEGMENT_ENTRY_EMPTY; - if (mm_pud_folded(mm)) - return _REGION3_ENTRY_EMPTY; - if (mm_p4d_folded(mm)) - return _REGION2_ENTRY_EMPTY; - return _REGION1_ENTRY_EMPTY; -} + int rc; -int crst_table_upgrade(struct mm_struct *mm, unsigned long limit); -void crst_table_downgrade(struct mm_struct *); + if (addr + len > mm->context.asce_limit && + addr + len <= TASK_SIZE) { + rc = crst_table_upgrade(mm, addr + len); + if (rc) + return (unsigned long) rc; + } + return addr; +} static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) { @@ -84,7 +85,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) if (!table) return NULL; crst_table_init(table, _SEGMENT_ENTRY_EMPTY); - if (!pgtable_pmd_page_ctor(virt_to_page(table))) { + if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) { crst_table_free(mm, table); return NULL; } @@ -95,59 +96,43 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { if (mm_pmd_folded(mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); crst_table_free(mm, (unsigned long *) pmd); } static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) { - pgd_val(*pgd) = _REGION1_ENTRY | __pa(p4d); + set_pgd(pgd, __pgd(_REGION1_ENTRY | __pa(p4d))); } static inline void p4d_populate(struct 
mm_struct *mm, p4d_t *p4d, pud_t *pud) { - p4d_val(*p4d) = _REGION2_ENTRY | __pa(pud); + set_p4d(p4d, __p4d(_REGION2_ENTRY | __pa(pud))); } static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { - pud_val(*pud) = _REGION3_ENTRY | __pa(pmd); + set_pud(pud, __pud(_REGION3_ENTRY | __pa(pmd))); } static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - unsigned long *table = crst_table_alloc(mm); - - if (!table) - return NULL; - if (mm->context.asce_limit == _REGION3_SIZE) { - /* Forking a compat process with 2 page table levels */ - if (!pgtable_pmd_page_ctor(virt_to_page(table))) { - crst_table_free(mm, table); - return NULL; - } - } - return (pgd_t *) table; + return (pgd_t *) crst_table_alloc(mm); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - if (mm->context.asce_limit == _REGION3_SIZE) - pgtable_pmd_page_dtor(virt_to_page(pgd)); crst_table_free(mm, (unsigned long *) pgd); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte) { - pmd_val(*pmd) = _SEGMENT_ENTRY + __pa(pte); + set_pmd(pmd, __pmd(_SEGMENT_ENTRY | __pa(pte))); } #define pmd_populate_kernel(mm, pmd, pte) pmd_populate(mm, pmd, pte) -#define pmd_pgtable(pmd) \ - (pgtable_t)(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE) - /* * page table entry allocation/free routines. */ @@ -157,7 +142,9 @@ static inline void pmd_populate(struct mm_struct *mm, #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte) #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte) -extern void rcu_table_freelist_finish(void); +/* the arch uses the pte_free_defer() implementation in arch/s390/mm/pgalloc.c */ +#define pte_free_defer pte_free_defer +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); void vmem_map_init(void); void *vmem_crst_alloc(unsigned long val); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 7b03037a8475..1299b56e43f6 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -17,11 +17,16 @@ #include <linux/page-flags.h> #include <linux/radix-tree.h> #include <linux/atomic.h> +#include <asm/sections.h> +#include <asm/ctlreg.h> #include <asm/bug.h> #include <asm/page.h> +#include <asm/uv.h> extern pgd_t swapper_pg_dir[]; +extern pgd_t invalid_pg_dir[]; extern void paging_init(void); +extern struct ctlreg s390_invalid_asce; enum { PG_DIRECT_MAP_4K = 0, @@ -30,7 +35,7 @@ enum { PG_DIRECT_MAP_MAX }; -extern atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX]; +extern atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); static inline void update_page_count(int level, long count) { @@ -38,14 +43,12 @@ static inline void update_page_count(int level, long count) atomic_long_add(count, &direct_pages_count[level]); } -struct seq_file; -void arch_report_meminfo(struct seq_file *m); - /* * The S390 doesn't have any external MMU info: the kernel page * tables contain all the necessary information. */ #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_range(vmf, vma, addr, ptep, nr) do { } while (0) #define update_mmu_cache_pmd(vma, address, ptep) do { } while (0) /* @@ -63,36 +66,33 @@ extern unsigned long zero_page_mask; /* TODO: s390 cannot support io_remap_pfn_range...
*/ -#define FIRST_USER_ADDRESS 0UL - #define pte_ERROR(e) \ - printk("%s:%d: bad pte %p.\n", __FILE__, __LINE__, (void *) pte_val(e)) + pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %p.\n", __FILE__, __LINE__, (void *) pmd_val(e)) + pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_ERROR(e) \ - printk("%s:%d: bad pud %p.\n", __FILE__, __LINE__, (void *) pud_val(e)) + pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) #define p4d_ERROR(e) \ - printk("%s:%d: bad p4d %p.\n", __FILE__, __LINE__, (void *) p4d_val(e)) + pr_err("%s:%d: bad p4d %016lx.\n", __FILE__, __LINE__, p4d_val(e)) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %p.\n", __FILE__, __LINE__, (void *) pgd_val(e)) + pr_err("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e)) /* * The vmalloc and module area will always be on the topmost area of the - * kernel mapping. We reserve 128GB (64bit) for vmalloc and modules. - * On 64 bit kernels we have a 2GB area at the top of the vmalloc area where - * modules will reside. That makes sure that inter module branches always - * happen without trampolines and in addition the placement within a 2GB frame - * is branch prediction unit friendly. + * kernel mapping. 512GB are reserved for vmalloc by default. + * At the top of the vmalloc area a 2GB area is reserved where modules + * will reside. That makes sure that inter module branches always + * happen without trampolines and in addition the placement within a + * 2GB frame is branch prediction unit friendly. */ -extern unsigned long VMALLOC_START; -extern unsigned long VMALLOC_END; -#define VMALLOC_DEFAULT_SIZE ((128UL << 30) - MODULES_LEN) -extern struct page *vmemmap; - -#define VMEM_MAX_PHYS ((unsigned long) vmemmap) - -extern unsigned long MODULES_VADDR; -extern unsigned long MODULES_END; +extern unsigned long __bootdata_preserved(VMALLOC_START); +extern unsigned long __bootdata_preserved(VMALLOC_END); +#define VMALLOC_DEFAULT_SIZE ((512UL << 30) - MODULES_LEN) +extern struct page *__bootdata_preserved(vmemmap); +extern unsigned long __bootdata_preserved(vmemmap_size); + +extern unsigned long __bootdata_preserved(MODULES_VADDR); +extern unsigned long __bootdata_preserved(MODULES_END); #define MODULES_VADDR MODULES_VADDR #define MODULES_END MODULES_END #define MODULES_LEN (1UL << 31) @@ -179,11 +179,21 @@ static inline int is_module_addr(void *addr) #define _PAGE_SOFT_DIRTY 0x000 #endif +#define _PAGE_SW_BITS 0xffUL /* All SW bits */ + +#define _PAGE_SWP_EXCLUSIVE _PAGE_LARGE /* SW pte exclusive swap bit */ + /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_SPECIAL | _PAGE_DIRTY | \ _PAGE_YOUNG | _PAGE_SOFT_DIRTY) /* + * Mask of bits that must not be changed with RDP. Allow only _PAGE_PROTECT + * HW bit and all SW bits. + */ +#define _PAGE_RDP_MASK ~(_PAGE_PROTECT | _PAGE_SW_BITS) + +/* * handle_pte_fault uses pte_present and pte_none to find out the pte type * WITHOUT holding the page table lock. The _PAGE_PRESENT bit is used to * distinguish present from not-present ptes. It is changed only with the page @@ -341,8 +351,6 @@ static inline int is_module_addr(void *addr) #define PTRS_PER_P4D _CRST_ENTRIES #define PTRS_PER_PGD _CRST_ENTRIES -#define MAX_PTRS_PER_P4D PTRS_PER_P4D - /* * Segment table and region3 table entry encoding * (R = read-only, I = invalid, y = young bit): @@ -422,23 +430,6 @@ static inline int is_module_addr(void *addr) * implies read permission. 
*/ /*xwr*/ -#define __P000 PAGE_NONE -#define __P001 PAGE_RO -#define __P010 PAGE_RO -#define __P011 PAGE_RO -#define __P100 PAGE_RX -#define __P101 PAGE_RX -#define __P110 PAGE_RX -#define __P111 PAGE_RX - -#define __S000 PAGE_NONE -#define __S001 PAGE_RO -#define __S010 PAGE_RW -#define __S011 PAGE_RW -#define __S100 PAGE_RX -#define __S101 PAGE_RX -#define __S110 PAGE_RWX -#define __S111 PAGE_RWX /* * Segment entry (large page) protection definitions. @@ -492,6 +483,12 @@ static inline int is_module_addr(void *addr) _REGION3_ENTRY_YOUNG | \ _REGION_ENTRY_PROTECT | \ _REGION_ENTRY_NOEXEC) +#define REGION3_KERNEL_EXEC __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_LARGE | \ + _REGION3_ENTRY_READ | \ + _REGION3_ENTRY_WRITE | \ + _REGION3_ENTRY_YOUNG | \ + _REGION3_ENTRY_DIRTY) static inline bool mm_p4d_folded(struct mm_struct *mm) { @@ -520,6 +517,15 @@ static inline int mm_has_pgste(struct mm_struct *mm) return 0; } +static inline int mm_is_protected(struct mm_struct *mm) +{ +#ifdef CONFIG_PGSTE + if (unlikely(atomic_read(&mm->context.protected_count))) + return 1; +#endif + return 0; +} + static inline int mm_alloc_pgste(struct mm_struct *mm) { #ifdef CONFIG_PGSTE @@ -529,6 +535,36 @@ static inline int mm_alloc_pgste(struct mm_struct *mm) return 0; } +static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) +{ + return __pte(pte_val(pte) & ~pgprot_val(prot)); +} + +static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot) +{ + return __pte(pte_val(pte) | pgprot_val(prot)); +} + +static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot) +{ + return __pmd(pmd_val(pmd) & ~pgprot_val(prot)); +} + +static inline pmd_t set_pmd_bit(pmd_t pmd, pgprot_t prot) +{ + return __pmd(pmd_val(pmd) | pgprot_val(prot)); +} + +static inline pud_t clear_pud_bit(pud_t pud, pgprot_t prot) +{ + return __pud(pud_val(pud) & ~pgprot_val(prot)); +} + +static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot) +{ + return __pud(pud_val(pud) | pgprot_val(prot)); +} + /* * In the case that a guest uses storage keys * faults should no longer be backed by zero pages @@ -545,27 +581,25 @@ static inline int mm_uses_skeys(struct mm_struct *mm) static inline void csp(unsigned int *ptr, unsigned int old, unsigned int new) { - register unsigned long reg2 asm("2") = old; - register unsigned long reg3 asm("3") = new; + union register_pair r1 = { .even = old, .odd = new, }; unsigned long address = (unsigned long)ptr | 1; asm volatile( - " csp %0,%3" - : "+d" (reg2), "+m" (*ptr) - : "d" (reg3), "d" (address) + " csp %[r1],%[address]" + : [r1] "+&d" (r1.pair), "+m" (*ptr) + : [address] "d" (address) : "cc"); } static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new) { - register unsigned long reg2 asm("2") = old; - register unsigned long reg3 asm("3") = new; + union register_pair r1 = { .even = old, .odd = new, }; unsigned long address = (unsigned long)ptr | 1; asm volatile( - " .insn rre,0xb98a0000,%0,%3" - : "+d" (reg2), "+m" (*ptr) - : "d" (reg3), "d" (address) + " cspg %[r1],%[address]" + : [r1] "+&d" (r1.pair), "+m" (*ptr) + : [address] "d" (address) : "cc"); } @@ -576,17 +610,15 @@ static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new #define CRDTE_DTT_REGION1 0x1cUL static inline void crdte(unsigned long old, unsigned long new, - unsigned long table, unsigned long dtt, + unsigned long *table, unsigned long dtt, unsigned long address, unsigned long asce) { - register unsigned long reg2 asm("2") = old; - register unsigned long reg3 asm("3") = new; - register 
unsigned long reg4 asm("4") = table | dtt; - register unsigned long reg5 asm("5") = address; + union register_pair r1 = { .even = old, .odd = new, }; + union register_pair r2 = { .even = __pa(table) | dtt, .odd = address, }; - asm volatile(".insn rrf,0xb98f0000,%0,%2,%4,0" - : "+d" (reg2) - : "d" (reg3), "d" (reg4), "d" (reg5), "a" (asce) + asm volatile(".insn rrf,0xb98f0000,%[r1],%[r2],%[asce],0" + : [r1] "+&d" (r1.pair) + : [r2] "d" (r2.pair), [asce] "a" (asce) : "memory", "cc"); } @@ -673,6 +705,7 @@ static inline int pud_none(pud_t pud) return pud_val(pud) == _REGION3_ENTRY_EMPTY; } +#define pud_leaf pud_large static inline int pud_large(pud_t pud) { if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) != _REGION_ENTRY_TYPE_R3) @@ -680,16 +713,7 @@ static inline int pud_large(pud_t pud) return !!(pud_val(pud) & _REGION3_ENTRY_LARGE); } -static inline unsigned long pud_pfn(pud_t pud) -{ - unsigned long origin_mask; - - origin_mask = _REGION_ENTRY_ORIGIN; - if (pud_large(pud)) - origin_mask = _REGION3_ENTRY_ORIGIN_LARGE; - return (pud_val(pud) & origin_mask) >> PAGE_SHIFT; -} - +#define pmd_leaf pmd_large static inline int pmd_large(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0; @@ -734,27 +758,25 @@ static inline int pmd_none(pmd_t pmd) return pmd_val(pmd) == _SEGMENT_ENTRY_EMPTY; } -static inline unsigned long pmd_pfn(pmd_t pmd) -{ - unsigned long origin_mask; - - origin_mask = _SEGMENT_ENTRY_ORIGIN; - if (pmd_large(pmd)) - origin_mask = _SEGMENT_ENTRY_ORIGIN_LARGE; - return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT; -} - #define pmd_write pmd_write static inline int pmd_write(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) != 0; } +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ + return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0; +} + +#define pmd_dirty pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0; } +#define pmd_young pmd_young static inline int pmd_young(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0; @@ -803,6 +825,21 @@ static inline int pmd_protnone(pmd_t pmd) } #endif +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(_PAGE_SWP_EXCLUSIVE)); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(_PAGE_SWP_EXCLUSIVE)); +} + static inline int pte_soft_dirty(pte_t pte) { return pte_val(pte) & _PAGE_SOFT_DIRTY; @@ -811,15 +848,13 @@ static inline int pte_soft_dirty(pte_t pte) static inline pte_t pte_mksoft_dirty(pte_t pte) { - pte_val(pte) |= _PAGE_SOFT_DIRTY; - return pte; + return set_pte_bit(pte, __pgprot(_PAGE_SOFT_DIRTY)); } #define pte_swp_mksoft_dirty pte_mksoft_dirty static inline pte_t pte_clear_soft_dirty(pte_t pte) { - pte_val(pte) &= ~_PAGE_SOFT_DIRTY; - return pte; + return clear_pte_bit(pte, __pgprot(_PAGE_SOFT_DIRTY)); } #define pte_swp_clear_soft_dirty pte_clear_soft_dirty @@ -830,14 +865,12 @@ static inline int pmd_soft_dirty(pmd_t pmd) static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { - pmd_val(pmd) |= _SEGMENT_ENTRY_SOFT_DIRTY; - return pmd; + return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_SOFT_DIRTY)); } static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) { - pmd_val(pmd) &= ~_SEGMENT_ENTRY_SOFT_DIRTY; - return pmd; + return clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_SOFT_DIRTY)); } /* @@ -866,35 +899,79 @@ static inline int pte_unused(pte_t pte) } /* + * Extract the 
pgprot value from the given pte while at the same time making it + * usable for kernel address space mappings where fault driven dirty and + * young/old accounting is not supported, i.e _PAGE_PROTECT and _PAGE_INVALID + * must not be set. + */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pte_flags = pte_val(pte) & _PAGE_CHG_MASK; + + if (pte_write(pte)) + pte_flags |= pgprot_val(PAGE_KERNEL); + else + pte_flags |= pgprot_val(PAGE_KERNEL_RO); + pte_flags |= pte_val(pte) & mio_wb_bit_mask; + + return __pgprot(pte_flags); +} + +/* * pgd/pmd/pte modification functions */ +static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) +{ + WRITE_ONCE(*pgdp, pgd); +} + +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) +{ + WRITE_ONCE(*p4dp, p4d); +} + +static inline void set_pud(pud_t *pudp, pud_t pud) +{ + WRITE_ONCE(*pudp, pud); +} + +static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + WRITE_ONCE(*pmdp, pmd); +} + +static inline void set_pte(pte_t *ptep, pte_t pte) +{ + WRITE_ONCE(*ptep, pte); +} + static inline void pgd_clear(pgd_t *pgd) { if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1) - pgd_val(*pgd) = _REGION1_ENTRY_EMPTY; + set_pgd(pgd, __pgd(_REGION1_ENTRY_EMPTY)); } static inline void p4d_clear(p4d_t *p4d) { if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) - p4d_val(*p4d) = _REGION2_ENTRY_EMPTY; + set_p4d(p4d, __p4d(_REGION2_ENTRY_EMPTY)); } static inline void pud_clear(pud_t *pud) { if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - pud_val(*pud) = _REGION3_ENTRY_EMPTY; + set_pud(pud, __pud(_REGION3_ENTRY_EMPTY)); } static inline void pmd_clear(pmd_t *pmdp) { - pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_val(*ptep) = _PAGE_INVALID; + set_pte(ptep, __pte(_PAGE_INVALID)); } /* @@ -903,79 +980,74 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *pt */ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { - pte_val(pte) &= _PAGE_CHG_MASK; - pte_val(pte) |= pgprot_val(newprot); + pte = clear_pte_bit(pte, __pgprot(~_PAGE_CHG_MASK)); + pte = set_pte_bit(pte, newprot); /* * newprot for PAGE_NONE, PAGE_RO, PAGE_RX, PAGE_RW and PAGE_RWX * has the invalid bit set, clear it again for readable, young pages */ if ((pte_val(pte) & _PAGE_YOUNG) && (pte_val(pte) & _PAGE_READ)) - pte_val(pte) &= ~_PAGE_INVALID; + pte = clear_pte_bit(pte, __pgprot(_PAGE_INVALID)); /* * newprot for PAGE_RO, PAGE_RX, PAGE_RW and PAGE_RWX has the page * protection bit set, clear it again for writable, dirty pages */ if ((pte_val(pte) & _PAGE_DIRTY) && (pte_val(pte) & _PAGE_WRITE)) - pte_val(pte) &= ~_PAGE_PROTECT; + pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT)); return pte; } static inline pte_t pte_wrprotect(pte_t pte) { - pte_val(pte) &= ~_PAGE_WRITE; - pte_val(pte) |= _PAGE_PROTECT; - return pte; + pte = clear_pte_bit(pte, __pgprot(_PAGE_WRITE)); + return set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); } -static inline pte_t pte_mkwrite(pte_t pte) +static inline pte_t pte_mkwrite_novma(pte_t pte) { - pte_val(pte) |= _PAGE_WRITE; + pte = set_pte_bit(pte, __pgprot(_PAGE_WRITE)); if (pte_val(pte) & _PAGE_DIRTY) - pte_val(pte) &= ~_PAGE_PROTECT; + pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT)); return pte; } static inline pte_t pte_mkclean(pte_t pte) { - pte_val(pte) &= ~_PAGE_DIRTY; - pte_val(pte) |= _PAGE_PROTECT; - return pte; + pte = clear_pte_bit(pte, 
__pgprot(_PAGE_DIRTY)); + return set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); } static inline pte_t pte_mkdirty(pte_t pte) { - pte_val(pte) |= _PAGE_DIRTY | _PAGE_SOFT_DIRTY; + pte = set_pte_bit(pte, __pgprot(_PAGE_DIRTY | _PAGE_SOFT_DIRTY)); if (pte_val(pte) & _PAGE_WRITE) - pte_val(pte) &= ~_PAGE_PROTECT; + pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT)); return pte; } static inline pte_t pte_mkold(pte_t pte) { - pte_val(pte) &= ~_PAGE_YOUNG; - pte_val(pte) |= _PAGE_INVALID; - return pte; + pte = clear_pte_bit(pte, __pgprot(_PAGE_YOUNG)); + return set_pte_bit(pte, __pgprot(_PAGE_INVALID)); } static inline pte_t pte_mkyoung(pte_t pte) { - pte_val(pte) |= _PAGE_YOUNG; + pte = set_pte_bit(pte, __pgprot(_PAGE_YOUNG)); if (pte_val(pte) & _PAGE_READ) - pte_val(pte) &= ~_PAGE_INVALID; + pte = clear_pte_bit(pte, __pgprot(_PAGE_INVALID)); return pte; } static inline pte_t pte_mkspecial(pte_t pte) { - pte_val(pte) |= _PAGE_SPECIAL; - return pte; + return set_pte_bit(pte, __pgprot(_PAGE_SPECIAL)); } #ifdef CONFIG_HUGETLB_PAGE static inline pte_t pte_mkhuge(pte_t pte) { - pte_val(pte) |= _PAGE_LARGE; - return pte; + return set_pte_bit(pte, __pgprot(_PAGE_LARGE)); } #endif @@ -985,16 +1057,29 @@ static inline pte_t pte_mkhuge(pte_t pte) #define IPTE_NODAT 0x400 #define IPTE_GUEST_ASCE 0x800 +static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep, + unsigned long opt, unsigned long asce, + int local) +{ + unsigned long pto; + + pto = __pa(ptep) & ~(PTRS_PER_PTE * sizeof(pte_t) - 1); + asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%[asce],%[m4]" + : "+m" (*ptep) + : [r1] "a" (pto), [r2] "a" ((addr & PAGE_MASK) | opt), + [asce] "a" (asce), [m4] "i" (local)); +} + static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, unsigned long opt, unsigned long asce, int local) { - unsigned long pto = (unsigned long) ptep; + unsigned long pto = __pa(ptep); if (__builtin_constant_p(opt) && opt == 0) { /* Invalidation + TLB flush for the pte */ asm volatile( - " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]" + " ipte %[r1],%[r2],0,%[m4]" : "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address), [m4] "i" (local)); return; @@ -1003,7 +1088,7 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, /* Invalidate ptes with options + TLB flush of the ptes */ opt = opt | (asce & _ASCE_ORIGIN); asm volatile( - " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]" + " ipte %[r1],%[r2],%[r3],%[m4]" : [r2] "+a" (address), [r3] "+a" (opt) : [r1] "a" (pto), [m4] "i" (local) : "memory"); } @@ -1011,12 +1096,12 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, static __always_inline void __ptep_ipte_range(unsigned long address, int nr, pte_t *ptep, int local) { - unsigned long pto = (unsigned long) ptep; + unsigned long pto = __pa(ptep); /* Invalidate a range of ptes + TLB flush of the ptes */ do { asm volatile( - " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]" + " ipte %[r1],%[r2],%[r3],%[m4]" : [r2] "+a" (address), [r3] "+a" (nr) : [r1] "a" (pto), [m4] "i" (local) : "memory"); } while (nr != 255); @@ -1059,7 +1144,13 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_t res; + + res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + /* At this point the reference through the mapping is still present */ + if (mm_is_protected(mm) && pte_present(res)) + 
uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); + return res; } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -1071,7 +1162,13 @@ void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - return ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_t res; + + res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); + /* At this point the reference through the mapping is still present */ + if (mm_is_protected(vma->vm_mm) && pte_present(res)) + uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); + return res; } /* @@ -1086,12 +1183,31 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) { + pte_t res; + if (full) { - pte_t pte = *ptep; - *ptep = __pte(_PAGE_INVALID); - return pte; + res = *ptep; + set_pte(ptep, __pte(_PAGE_INVALID)); + } else { + res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); } - return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + /* Nothing to do */ + if (!mm_is_protected(mm) || !pte_present(res)) + return res; + /* + * At this point the reference through the mapping is still present. + * The notifier should have destroyed all protected vCPUs at this + * point, so the destroy should be successful. + */ + if (full && !uv_destroy_owned_page(pte_val(res) & PAGE_MASK)) + return res; + /* + * If something went wrong and the page could not be destroyed, or + * if this is not a mm teardown, the slower export is used as + * fallback instead. + */ + uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); + return res; } #define __HAVE_ARCH_PTEP_SET_WRPROTECT @@ -1104,6 +1220,44 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, ptep_xchg_lazy(mm, addr, ptep, pte_wrprotect(pte)); } +/* + * Check if PTEs only differ in _PAGE_PROTECT HW bit, but also allow SW PTE + * bits in the comparison. Those might change e.g. because of dirty and young + * tracking. + */ +static inline int pte_allow_rdp(pte_t old, pte_t new) +{ + /* + * Only allow changes from RO to RW + */ + if (!(pte_val(old) & _PAGE_PROTECT) || pte_val(new) & _PAGE_PROTECT) + return 0; + + return (pte_val(old) & _PAGE_RDP_MASK) == (pte_val(new) & _PAGE_RDP_MASK); +} + +static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + /* + * RDP might not have propagated the PTE protection reset to all CPUs, + * so there could be spurious TLB protection faults. + * NOTE: This will also be called when a racing pagetable update on + * another thread already installed the correct PTE. Both cases cannot + * really be distinguished. + * Therefore, only do the local TLB flush when RDP can be used, and the + * PTE does not have _PAGE_PROTECT set, to avoid unnecessary overhead. + * A local RDP can be used to do the flush. 
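To make the gating rule concrete: pte_allow_rdp() above accepts only RO->RW transitions in which nothing besides the _PAGE_PROTECT HW bit and the software bits differs. A self-contained model of that check, using 0xff for the software bits as in _PAGE_SW_BITS above and 0x200 as a stand-in for _PAGE_PROTECT (illustrative values, not taken from this patch):

#include <stdio.h>

/* stand-in mask values mirroring _PAGE_PROTECT and _PAGE_SW_BITS */
#define PAGE_PROTECT	0x200UL
#define PAGE_SW_BITS	0x0ffUL
#define PAGE_RDP_MASK	(~(PAGE_PROTECT | PAGE_SW_BITS))

/* models pte_allow_rdp(): RO -> RW only, nothing else may change */
static int allow_rdp(unsigned long old, unsigned long new)
{
	if (!(old & PAGE_PROTECT) || (new & PAGE_PROTECT))
		return 0;
	return (old & PAGE_RDP_MASK) == (new & PAGE_RDP_MASK);
}

int main(void)
{
	unsigned long ro = 0x5000UL | PAGE_PROTECT;
	unsigned long rw = 0x5000UL;

	printf("%d\n", allow_rdp(ro, rw));		/* 1: RO -> RW */
	printf("%d\n", allow_rdp(rw, ro));		/* 0: RW -> RO */
	printf("%d\n", allow_rdp(ro, 0x6000UL));	/* 0: frame changed too */
	return 0;
}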
+ */ + if (MACHINE_HAS_RDP && !(pte_val(*ptep) & _PAGE_PROTECT)) + __ptep_rdp(address, ptep, 0, 0, 1); +} +#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault + +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new); + #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS static inline int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, @@ -1111,7 +1265,10 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, { if (pte_same(*ptep, entry)) return 0; - ptep_xchg_direct(vma->vm_mm, addr, ptep, entry); + if (MACHINE_HAS_RDP && !mm_has_pgste(vma->vm_mm) && pte_allow_rdp(*ptep, entry)) + ptep_reset_dat_prot(vma->vm_mm, addr, ptep, entry); + else + ptep_xchg_direct(vma->vm_mm, addr, ptep, entry); return 1; } @@ -1153,21 +1310,41 @@ void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr); void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr); void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr); +#define pgprot_writecombine pgprot_writecombine +pgprot_t pgprot_writecombine(pgprot_t prot); + +#define pgprot_writethrough pgprot_writethrough +pgprot_t pgprot_writethrough(pgprot_t prot); + /* - * Certain architectures need to do special things when PTEs - * within a page table are directly modified. Thus, the following - * hook is made available. + * Set multiple PTEs to consecutive pages with a single call. All PTEs + * are within the same folio, PMD and VMA. */ -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry, unsigned int nr) { if (pte_present(entry)) - pte_val(entry) &= ~_PAGE_UNUSED; - if (mm_has_pgste(mm)) - ptep_set_pte_at(mm, addr, ptep, entry); - else - *ptep = entry; + entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED)); + if (mm_has_pgste(mm)) { + for (;;) { + ptep_set_pte_at(mm, addr, ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + addr += PAGE_SIZE; + } + } else { + for (;;) { + set_pte(ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + } + } } +#define set_ptes set_ptes /* * Conversion functions: convert a page and protection to a page entry, @@ -1176,9 +1353,10 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) { pte_t __pte; - pte_val(__pte) = physpage + pgprot_val(pgprot); + + __pte = __pte(physpage | pgprot_val(pgprot)); if (!MACHINE_HAS_NX) - pte_val(__pte) &= ~_PAGE_NOEXEC; + __pte = clear_pte_bit(__pte, __pgprot(_PAGE_NOEXEC)); return pte_mkyoung(__pte); } @@ -1196,12 +1374,39 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) #define p4d_index(address) (((address) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) -#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) -#define pmd_deref(pmd) (pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN) -#define pud_deref(pud) (pud_val(pud) & _REGION_ENTRY_ORIGIN) -#define p4d_deref(pud) (p4d_val(pud) & _REGION_ENTRY_ORIGIN) -#define pgd_deref(pgd) (pgd_val(pgd) & _REGION_ENTRY_ORIGIN) +#define p4d_deref(pud) ((unsigned long)__va(p4d_val(pud) & _REGION_ENTRY_ORIGIN)) +#define pgd_deref(pgd) ((unsigned 
long)__va(pgd_val(pgd) & _REGION_ENTRY_ORIGIN)) + +static inline unsigned long pmd_deref(pmd_t pmd) +{ + unsigned long origin_mask; + + origin_mask = _SEGMENT_ENTRY_ORIGIN; + if (pmd_large(pmd)) + origin_mask = _SEGMENT_ENTRY_ORIGIN_LARGE; + return (unsigned long)__va(pmd_val(pmd) & origin_mask); +} + +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return __pa(pmd_deref(pmd)) >> PAGE_SHIFT; +} + +static inline unsigned long pud_deref(pud_t pud) +{ + unsigned long origin_mask; + + origin_mask = _REGION_ENTRY_ORIGIN; + if (pud_large(pud)) + origin_mask = _REGION3_ENTRY_ORIGIN_LARGE; + return (unsigned long)__va(pud_val(pud) & origin_mask); +} + +static inline unsigned long pud_pfn(pud_t pud) +{ + return __pa(pud_deref(pud)) >> PAGE_SHIFT; +} /* * The pgd_offset function *always* adds the index for the top-level @@ -1227,38 +1432,52 @@ static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address) } #define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address) -#define pgd_offset_k(address) pgd_offset(&init_mm, address) -static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) +static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long address) { - if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1) - return (p4d_t *) pgd_deref(*pgd) + p4d_index(address); - return (p4d_t *) pgd; + if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1) + return (p4d_t *) pgd_deref(pgd) + p4d_index(address); + return (p4d_t *) pgdp; } +#define p4d_offset_lockless p4d_offset_lockless -static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address) { - if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2) - return (pud_t *) p4d_deref(*p4d) + pud_index(address); - return (pud_t *) p4d; + return p4d_offset_lockless(pgdp, *pgdp, address); } -static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) +static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long address) { - if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3) - return (pmd_t *) pud_deref(*pud) + pmd_index(address); - return (pmd_t *) pud; + if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2) + return (pud_t *) p4d_deref(p4d) + pud_index(address); + return (pud_t *) p4dp; } +#define pud_offset_lockless pud_offset_lockless -static inline pte_t *pte_offset(pmd_t *pmd, unsigned long address) +static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long address) { - return (pte_t *) pmd_deref(*pmd) + pte_index(address); + return pud_offset_lockless(p4dp, *p4dp, address); } +#define pud_offset pud_offset -#define pte_offset_kernel(pmd, address) pte_offset(pmd, address) -#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) +static inline pmd_t *pmd_offset_lockless(pud_t *pudp, pud_t pud, unsigned long address) +{ + if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3) + return (pmd_t *) pud_deref(pud) + pmd_index(address); + return (pmd_t *) pudp; +} +#define pmd_offset_lockless pmd_offset_lockless -static inline void pte_unmap(pte_t *pte) { } +static inline pmd_t *pmd_offset(pud_t *pudp, unsigned long address) +{ + return pmd_offset_lockless(pudp, *pudp, address); +} +#define pmd_offset pmd_offset + +static inline unsigned long pmd_page_vaddr(pmd_t pmd) +{ + return (unsigned long) pmd_deref(pmd); +} static inline bool gup_fast_permitted(unsigned long start, unsigned long end) { @@ 
-1266,7 +1485,7 @@ static inline bool gup_fast_permitted(unsigned long start, unsigned long end) } #define gup_fast_permitted gup_fast_permitted -#define pfn_pte(pfn,pgprot) mk_pte_phys(__pa((pfn) << PAGE_SHIFT),(pgprot)) +#define pfn_pte(pfn, pgprot) mk_pte_phys(((pfn) << PAGE_SHIFT), (pgprot)) #define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT) #define pte_page(x) pfn_to_page(pte_pfn(x)) @@ -1277,61 +1496,57 @@ static inline bool gup_fast_permitted(unsigned long start, unsigned long end) static inline pmd_t pmd_wrprotect(pmd_t pmd) { - pmd_val(pmd) &= ~_SEGMENT_ENTRY_WRITE; - pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT; - return pmd; + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_WRITE)); + return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); } -static inline pmd_t pmd_mkwrite(pmd_t pmd) +static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) { - pmd_val(pmd) |= _SEGMENT_ENTRY_WRITE; + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_WRITE)); if (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) - pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT; + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); return pmd; } static inline pmd_t pmd_mkclean(pmd_t pmd) { - pmd_val(pmd) &= ~_SEGMENT_ENTRY_DIRTY; - pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT; - return pmd; + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_DIRTY)); + return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); } static inline pmd_t pmd_mkdirty(pmd_t pmd) { - pmd_val(pmd) |= _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_SOFT_DIRTY; + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_SOFT_DIRTY)); if (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) - pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT; + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); return pmd; } static inline pud_t pud_wrprotect(pud_t pud) { - pud_val(pud) &= ~_REGION3_ENTRY_WRITE; - pud_val(pud) |= _REGION_ENTRY_PROTECT; - return pud; + pud = clear_pud_bit(pud, __pgprot(_REGION3_ENTRY_WRITE)); + return set_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT)); } static inline pud_t pud_mkwrite(pud_t pud) { - pud_val(pud) |= _REGION3_ENTRY_WRITE; + pud = set_pud_bit(pud, __pgprot(_REGION3_ENTRY_WRITE)); if (pud_val(pud) & _REGION3_ENTRY_DIRTY) - pud_val(pud) &= ~_REGION_ENTRY_PROTECT; + pud = clear_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT)); return pud; } static inline pud_t pud_mkclean(pud_t pud) { - pud_val(pud) &= ~_REGION3_ENTRY_DIRTY; - pud_val(pud) |= _REGION_ENTRY_PROTECT; - return pud; + pud = clear_pud_bit(pud, __pgprot(_REGION3_ENTRY_DIRTY)); + return set_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT)); } static inline pud_t pud_mkdirty(pud_t pud) { - pud_val(pud) |= _REGION3_ENTRY_DIRTY | _REGION3_ENTRY_SOFT_DIRTY; + pud = set_pud_bit(pud, __pgprot(_REGION3_ENTRY_DIRTY | _REGION3_ENTRY_SOFT_DIRTY)); if (pud_val(pud) & _REGION3_ENTRY_WRITE) - pud_val(pud) &= ~_REGION_ENTRY_PROTECT; + pud = clear_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT)); return pud; } @@ -1355,37 +1570,39 @@ static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot) static inline pmd_t pmd_mkyoung(pmd_t pmd) { - pmd_val(pmd) |= _SEGMENT_ENTRY_YOUNG; + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG)); if (pmd_val(pmd) & _SEGMENT_ENTRY_READ) - pmd_val(pmd) &= ~_SEGMENT_ENTRY_INVALID; + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_INVALID)); return pmd; } static inline pmd_t pmd_mkold(pmd_t pmd) { - pmd_val(pmd) &= ~_SEGMENT_ENTRY_YOUNG; - pmd_val(pmd) |= _SEGMENT_ENTRY_INVALID; - return pmd; + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG)); + return set_pmd_bit(pmd, 
__pgprot(_SEGMENT_ENTRY_INVALID)); } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { - pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE | - _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG | - _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SOFT_DIRTY; - pmd_val(pmd) |= massage_pgprot_pmd(newprot); + unsigned long mask; + + mask = _SEGMENT_ENTRY_ORIGIN_LARGE; + mask |= _SEGMENT_ENTRY_DIRTY; + mask |= _SEGMENT_ENTRY_YOUNG; + mask |= _SEGMENT_ENTRY_LARGE; + mask |= _SEGMENT_ENTRY_SOFT_DIRTY; + pmd = __pmd(pmd_val(pmd) & mask); + pmd = set_pmd_bit(pmd, __pgprot(massage_pgprot_pmd(newprot))); if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY)) - pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT; + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); if (!(pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG)) - pmd_val(pmd) |= _SEGMENT_ENTRY_INVALID; + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_INVALID)); return pmd; } static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot) { - pmd_t __pmd; - pmd_val(__pmd) = physpage + massage_pgprot_pmd(pgprot); - return __pmd; + return __pmd(physpage + massage_pgprot_pmd(pgprot)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */ @@ -1409,11 +1626,11 @@ static __always_inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, { unsigned long sto; - sto = (unsigned long) pmdp - pmd_index(addr) * sizeof(pmd_t); + sto = __pa(pmdp) - pmd_index(addr) * sizeof(pmd_t); if (__builtin_constant_p(opt) && opt == 0) { /* flush without guest asce */ asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + " idte %[r1],0,%[r2],%[m4]" : "+m" (*pmdp) : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK)), [m4] "i" (local) @@ -1421,7 +1638,7 @@ static __always_inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, } else { /* flush with guest asce */ asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]" + " idte %[r1],%[r3],%[r2],%[m4]" : "+m" (*pmdp) : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK) | opt), [r3] "a" (asce), [m4] "i" (local) @@ -1435,12 +1652,12 @@ static __always_inline void __pudp_idte(unsigned long addr, pud_t *pudp, { unsigned long r3o; - r3o = (unsigned long) pudp - pud_index(addr) * sizeof(pud_t); + r3o = __pa(pudp) - pud_index(addr) * sizeof(pud_t); r3o |= _ASCE_TYPE_REGION3; if (__builtin_constant_p(opt) && opt == 0) { /* flush without guest asce */ asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + " idte %[r1],0,%[r2],%[m4]" : "+m" (*pudp) : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK)), [m4] "i" (local) @@ -1448,7 +1665,7 @@ static __always_inline void __pudp_idte(unsigned long addr, pud_t *pudp, } else { /* flush with guest asce */ asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]" + " idte %[r1],%[r3],%[r2],%[m4]" : "+m" (*pudp) : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK) | opt), [r3] "a" (asce), [m4] "i" (local) @@ -1507,16 +1724,15 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t entry) { if (!MACHINE_HAS_NX) - pmd_val(entry) &= ~_SEGMENT_ENTRY_NOEXEC; - *pmdp = entry; + entry = clear_pmd_bit(entry, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + set_pmd(pmdp, entry); } static inline pmd_t pmd_mkhuge(pmd_t pmd) { - pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE; - pmd_val(pmd) |= _SEGMENT_ENTRY_YOUNG; - pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT; - return pmd; + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_LARGE)); + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG)); + return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); } #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR @@ -1527,16 
+1743,16 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, } #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL -static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm, +static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, int full) { if (full) { pmd_t pmd = *pmdp; - *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); return pmd; } - return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + return pmdp_xchg_lazy(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); } #define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH @@ -1573,7 +1789,7 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, } #define pmdp_collapse_flush pmdp_collapse_flush -#define pfn_pmd(pfn, pgprot) mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot)) +#define pfn_pmd(pfn, pgprot) mk_pmd_phys(((pfn) << PAGE_SHIFT), (pgprot)) #define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) static inline int pmd_trans_huge(pmd_t pmd) @@ -1591,18 +1807,18 @@ static inline int has_transparent_hugepage(void) /* * 64 bit swap entry format: * A page-table entry has some bits we have to treat in a special way. - * Bits 52 and bit 55 have to be zero, otherwise a specification - * exception will occur instead of a page translation exception. The - * specification exception has the bad habit not to store necessary - * information in the lowcore. - * Bits 54 and 63 are used to indicate the page type. + * Bits 54 and 63 are used to indicate the page type. Bit 53 marks the pte + * as invalid. * A swap pte is indicated by bit pattern (pte & 0x201) == 0x200 - * This leaves the bits 0-51 and bits 56-62 to store type and offset. - * We use the 5 bits from 57-61 for the type and the 52 bits from 0-51 - * for the offset. - * | offset |01100|type |00| + * | offset |E11XX|type |S0| * |0000000000111111111122222222223333333333444444444455|55555|55566|66| * |0123456789012345678901234567890123456789012345678901|23456|78901|23| + * + * Bits 0-51 store the offset. + * Bit 52 (E) is used to remember PG_anon_exclusive. + * Bits 57-61 store the type. + * Bit 62 (S) is used for softdirty tracking. + * Bits 55 and 56 (X) are unused. 
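A worked example of the swap-pte encoding described above, sketched as a user-space program. The offset mask matches the __SWP_OFFSET_MASK definition that follows; the shift amounts and the invalid/protect bit values (offset shifted left by 12, type by 2, 0x400 for bit 53, 0x200 for bit 54 in MSB-0 numbering) are inferred from the bit layout and should be treated as illustrative assumptions:

#include <assert.h>
#include <stdio.h>

#define SWP_PAGE_INVALID	0x400UL		/* bit 53 */
#define SWP_PAGE_PROTECT	0x200UL		/* bit 54 */
#define SWP_TYPE_SHIFT		2		/* type in bits 57-61 */
#define SWP_TYPE_MASK		0x1fUL
#define SWP_OFFSET_SHIFT	12		/* offset in bits 0-51 */
#define SWP_OFFSET_MASK		((1UL << 52) - 1)

static unsigned long mk_swap_pte_sketch(unsigned long type, unsigned long offset)
{
	unsigned long pteval = SWP_PAGE_INVALID | SWP_PAGE_PROTECT;

	pteval |= (offset & SWP_OFFSET_MASK) << SWP_OFFSET_SHIFT;
	pteval |= (type & SWP_TYPE_MASK) << SWP_TYPE_SHIFT;
	return pteval;
}

int main(void)
{
	unsigned long pte = mk_swap_pte_sketch(5, 0x1000);

	/* swap ptes match the (pte & 0x201) == 0x200 pattern */
	assert((pte & 0x201UL) == 0x200UL);
	/* decoding reverses the shifts, as __swp_type() below does */
	printf("type=%lu offset=%#lx\n",
	       (pte >> SWP_TYPE_SHIFT) & SWP_TYPE_MASK,
	       (pte >> SWP_OFFSET_SHIFT) & SWP_OFFSET_MASK);
	return 0;
}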
*/ #define __SWP_OFFSET_MASK ((1UL << 52) - 1) @@ -1612,12 +1828,12 @@ static inline int has_transparent_hugepage(void) static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) { - pte_t pte; + unsigned long pteval; - pte_val(pte) = _PAGE_INVALID | _PAGE_PROTECT; - pte_val(pte) |= (offset & __SWP_OFFSET_MASK) << __SWP_OFFSET_SHIFT; - pte_val(pte) |= (type & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT; - return pte; + pteval = _PAGE_INVALID | _PAGE_PROTECT; + pteval |= (offset & __SWP_OFFSET_MASK) << __SWP_OFFSET_SHIFT; + pteval |= (type & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT; + return __pte(pteval); } static inline unsigned long __swp_type(swp_entry_t entry) @@ -1638,10 +1854,12 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) - extern int vmem_add_mapping(unsigned long start, unsigned long size); -extern int vmem_remove_mapping(unsigned long start, unsigned long size); +extern void vmem_remove_mapping(unsigned long start, unsigned long size); +extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc); +extern int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot); +extern void vmem_unmap_4k_page(unsigned long addr); +extern pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc); extern int s390_enable_sie(void); extern int s390_enable_skey(void); extern void s390_reset_cmma(struct mm_struct *mm); @@ -1650,6 +1868,7 @@ extern void s390_reset_cmma(struct mm_struct *mm); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#include <asm-generic/pgtable.h> +#define pmd_pgtable(pmd) \ + ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)) #endif /* _S390_PAGE_H */ diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h new file mode 100644 index 000000000000..9e41a74fce9a --- /dev/null +++ b/arch/s390/include/asm/physmem_info.h @@ -0,0 +1,172 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_MEM_DETECT_H +#define _ASM_S390_MEM_DETECT_H + +#include <linux/types.h> +#include <asm/page.h> + +enum physmem_info_source { + MEM_DETECT_NONE = 0, + MEM_DETECT_SCLP_STOR_INFO, + MEM_DETECT_DIAG260, + MEM_DETECT_SCLP_READ_INFO, + MEM_DETECT_BIN_SEARCH +}; + +struct physmem_range { + u64 start; + u64 end; +}; + +enum reserved_range_type { + RR_DECOMPRESSOR, + RR_INITRD, + RR_VMLINUX, + RR_AMODE31, + RR_IPLREPORT, + RR_CERT_COMP_LIST, + RR_MEM_DETECT_EXTENDED, + RR_VMEM, + RR_MAX +}; + +struct reserved_range { + unsigned long start; + unsigned long end; + struct reserved_range *chain; +}; + +/* + * Storage element id is defined as 1 byte (up to 256 storage elements). + * In practice only storage element ids 0 and 1 are used. + * According to the architecture, one storage element can have as many as + * 1020 subincrements. 255 physmem_ranges are embedded in physmem_info. + * If more physmem_ranges are required, a block of memory from already + * known physmem_range is taken (online_extended points to it).
+ */ +#define MEM_INLINED_ENTRIES 255 /* (PAGE_SIZE - 16) / 16 */ + +struct physmem_info { + u32 range_count; + u8 info_source; + unsigned long usable; + struct reserved_range reserved[RR_MAX]; + struct physmem_range online[MEM_INLINED_ENTRIES]; + struct physmem_range *online_extended; +}; + +extern struct physmem_info physmem_info; + +void add_physmem_online_range(u64 start, u64 end); + +static inline int __get_physmem_range(u32 n, unsigned long *start, + unsigned long *end, bool respect_usable_limit) +{ + if (n >= physmem_info.range_count) { + *start = 0; + *end = 0; + return -1; + } + + if (n < MEM_INLINED_ENTRIES) { + *start = (unsigned long)physmem_info.online[n].start; + *end = (unsigned long)physmem_info.online[n].end; + } else { + *start = (unsigned long)physmem_info.online_extended[n - MEM_INLINED_ENTRIES].start; + *end = (unsigned long)physmem_info.online_extended[n - MEM_INLINED_ENTRIES].end; + } + + if (respect_usable_limit && physmem_info.usable) { + if (*start >= physmem_info.usable) + return -1; + if (*end > physmem_info.usable) + *end = physmem_info.usable; + } + return 0; +} + +/** + * for_each_physmem_usable_range - early online memory range iterator + * @i: an integer used as loop variable + * @p_start: ptr to unsigned long for start address of the range + * @p_end: ptr to unsigned long for end address of the range + * + * Walks over detected online memory ranges below usable limit. + */ +#define for_each_physmem_usable_range(i, p_start, p_end) \ + for (i = 0; !__get_physmem_range(i, p_start, p_end, true); i++) + +/* Walks over all detected online memory ranges disregarding usable limit. */ +#define for_each_physmem_online_range(i, p_start, p_end) \ + for (i = 0; !__get_physmem_range(i, p_start, p_end, false); i++) + +static inline const char *get_physmem_info_source(void) +{ + switch (physmem_info.info_source) { + case MEM_DETECT_SCLP_STOR_INFO: + return "sclp storage info"; + case MEM_DETECT_DIAG260: + return "diag260"; + case MEM_DETECT_SCLP_READ_INFO: + return "sclp read info"; + case MEM_DETECT_BIN_SEARCH: + return "binary search"; + } + return "none"; +} + +#define RR_TYPE_NAME(t) case RR_ ## t: return #t +static inline const char *get_rr_type_name(enum reserved_range_type t) +{ + switch (t) { + RR_TYPE_NAME(DECOMPRESSOR); + RR_TYPE_NAME(INITRD); + RR_TYPE_NAME(VMLINUX); + RR_TYPE_NAME(AMODE31); + RR_TYPE_NAME(IPLREPORT); + RR_TYPE_NAME(CERT_COMP_LIST); + RR_TYPE_NAME(MEM_DETECT_EXTENDED); + RR_TYPE_NAME(VMEM); + default: + return "UNKNOWN"; + } +} + +#define for_each_physmem_reserved_type_range(t, range, p_start, p_end) \ + for (range = &physmem_info.reserved[t], *p_start = range->start, *p_end = range->end; \ + range && range->end; range = range->chain ? __va(range->chain) : NULL, \ + *p_start = range ? range->start : 0, *p_end = range ? range->end : 0) + +static inline struct reserved_range *__physmem_reserved_next(enum reserved_range_type *t, + struct reserved_range *range) +{ + if (!range) { + range = &physmem_info.reserved[*t]; + if (range->end) + return range; + } + if (range->chain) + return __va(range->chain); + while (++*t < RR_MAX) { + range = &physmem_info.reserved[*t]; + if (range->end) + return range; + } + return NULL; +} + +#define for_each_physmem_reserved_range(t, range, p_start, p_end) \ + for (t = 0, range = __physmem_reserved_next(&t, NULL), \ + *p_start = range ? range->start : 0, *p_end = range ? range->end : 0; \ + range; range = __physmem_reserved_next(&t, range), \ + *p_start = range ? range->start : 0, *p_end = range ? 
range->end : 0) + +static inline unsigned long get_physmem_reserved(enum reserved_range_type type, + unsigned long *addr, unsigned long *size) +{ + *addr = physmem_info.reserved[type].start; + *size = physmem_info.reserved[type].end - physmem_info.reserved[type].start; + return *size; +} + +#endif diff --git a/arch/s390/include/asm/pkey.h b/arch/s390/include/asm/pkey.h index dd3d20c332ac..47d80a7451a6 100644 --- a/arch/s390/include/asm/pkey.h +++ b/arch/s390/include/asm/pkey.h @@ -2,7 +2,7 @@ /* * Kernelspace interface to the pkey device driver * - * Copyright IBM Corp. 2016,2019 + * Copyright IBM Corp. 2016, 2023 * * Author: Harald Freudenberger <freude@de.ibm.com> * @@ -23,6 +23,6 @@ * @return 0 on success, negative errno value on failure */ int pkey_keyblob2pkey(const u8 *key, u32 keylen, - struct pkey_protkey *protkey); + u8 *protkey, u32 *protkeylen, u32 *protkeytype); #endif /* _KAPI_PKEY_H */ diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h index b5ea9e14c017..bf15da0fedbc 100644 --- a/arch/s390/include/asm/preempt.h +++ b/arch/s390/include/asm/preempt.h @@ -29,12 +29,6 @@ static inline void preempt_count_set(int pc) old, new) != old); } -#define init_task_preempt_count(p) do { } while (0) - -#define init_idle_preempt_count(p, cpu) do { \ - S390_lowcore.preempt_count = PREEMPT_ENABLED; \ -} while (0) - static inline void set_preempt_need_resched(void) { __atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); @@ -52,10 +46,17 @@ static inline bool test_preempt_need_resched(void) static inline void __preempt_count_add(int val) { - if (__builtin_constant_p(val) && (val >= -128) && (val <= 127)) - __atomic_add_const(val, &S390_lowcore.preempt_count); - else - __atomic_add(val, &S390_lowcore.preempt_count); + /* + * With some obscure config options and CONFIG_PROFILE_ALL_BRANCHES + * enabled, gcc 12 fails to handle __builtin_constant_p(). 
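+ * The fallback is a performance issue only: in the common case, e.g.
+ * preempt_disable() ending up in __preempt_count_add(1) (an
+ * illustrative example), the constant fits into a signed byte and
+ * __atomic_add_const() turns into a single add-immediate on the
+ * lowcore preempt_count.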
+ */ + if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES)) { + if (__builtin_constant_p(val) && (val >= -128) && (val <= 127)) { + __atomic_add_const(val, &S390_lowcore.preempt_count); + return; + } + } + __atomic_add(val, &S390_lowcore.preempt_count); } static inline void __preempt_count_sub(int val) @@ -88,12 +89,6 @@ static inline void preempt_count_set(int pc) S390_lowcore.preempt_count = pc; } -#define init_task_preempt_count(p) do { } while (0) - -#define init_idle_preempt_count(p, cpu) do { \ - S390_lowcore.preempt_count = PREEMPT_ENABLED; \ -} while (0) - static inline void set_preempt_need_resched(void) { } @@ -130,11 +125,15 @@ static inline bool should_resched(int preempt_offset) #endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ -#ifdef CONFIG_PREEMPT -extern asmlinkage void preempt_schedule(void); +#define init_task_preempt_count(p) do { } while (0) +/* Deferred to CPU bringup time */ +#define init_idle_preempt_count(p, cpu) do { } while (0) + +#ifdef CONFIG_PREEMPTION +extern void preempt_schedule(void); #define __preempt_schedule() preempt_schedule() -extern asmlinkage void preempt_schedule_notrace(void); +extern void preempt_schedule_notrace(void); #define __preempt_schedule_notrace() preempt_schedule_notrace() -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ #endif /* __ASM_PREEMPT_H */ diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 361ef5eda468..c0b6e74d899a 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -14,26 +14,20 @@ #include <linux/bits.h> -#define CIF_MCCK_PENDING 0 /* machine check handling is pending */ -#define CIF_ASCE_PRIMARY 1 /* primary asce needs fixup / uaccess */ -#define CIF_ASCE_SECONDARY 2 /* secondary asce needs fixup / uaccess */ -#define CIF_NOHZ_DELAY 3 /* delay HZ disable for a tick */ -#define CIF_FPU 4 /* restore FPU registers */ -#define CIF_IGNORE_IRQ 5 /* ignore interrupt (for udelay) */ -#define CIF_ENABLED_WAIT 6 /* in enabled wait state */ -#define CIF_MCCK_GUEST 7 /* machine check happening in guest */ -#define CIF_DEDICATED_CPU 8 /* this CPU is dedicated */ - -#define _CIF_MCCK_PENDING BIT(CIF_MCCK_PENDING) -#define _CIF_ASCE_PRIMARY BIT(CIF_ASCE_PRIMARY) -#define _CIF_ASCE_SECONDARY BIT(CIF_ASCE_SECONDARY) +#define CIF_NOHZ_DELAY 2 /* delay HZ disable for a tick */ +#define CIF_FPU 3 /* restore FPU registers */ +#define CIF_ENABLED_WAIT 5 /* in enabled wait state */ +#define CIF_MCCK_GUEST 6 /* machine check happening in guest */ +#define CIF_DEDICATED_CPU 7 /* this CPU is dedicated */ + #define _CIF_NOHZ_DELAY BIT(CIF_NOHZ_DELAY) #define _CIF_FPU BIT(CIF_FPU) -#define _CIF_IGNORE_IRQ BIT(CIF_IGNORE_IRQ) #define _CIF_ENABLED_WAIT BIT(CIF_ENABLED_WAIT) #define _CIF_MCCK_GUEST BIT(CIF_MCCK_GUEST) #define _CIF_DEDICATED_CPU BIT(CIF_DEDICATED_CPU) +#define RESTART_FLAG_CTLREGS _AC(1 << 0, U) + #ifndef __ASSEMBLY__ #include <linux/cpumask.h> @@ -46,30 +40,50 @@ #include <asm/runtime_instr.h> #include <asm/fpu/types.h> #include <asm/fpu/internal.h> +#include <asm/irqflags.h> + +typedef long (*sys_call_ptr_t)(struct pt_regs *regs); -static inline void set_cpu_flag(int flag) +static __always_inline void set_cpu_flag(int flag) { S390_lowcore.cpu_flags |= (1UL << flag); } -static inline void clear_cpu_flag(int flag) +static __always_inline void clear_cpu_flag(int flag) { S390_lowcore.cpu_flags &= ~(1UL << flag); } -static inline int test_cpu_flag(int flag) +static __always_inline bool test_cpu_flag(int flag) +{ + return S390_lowcore.cpu_flags & (1UL << flag); +} + 
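+/*
+ * The test_and_set/test_and_clear variants below are plain
+ * read-modify-write sequences on the calling CPU's lowcore flags,
+ * not atomic operations. An illustrative (made-up) caller:
+ *
+ *	if (!test_and_set_cpu_flag(CIF_ENABLED_WAIT))
+ *		...flag was clear before and is set now...
+ */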
+static __always_inline bool test_and_set_cpu_flag(int flag) { - return !!(S390_lowcore.cpu_flags & (1UL << flag)); + if (test_cpu_flag(flag)) + return true; + set_cpu_flag(flag); + return false; +} + +static __always_inline bool test_and_clear_cpu_flag(int flag) +{ + if (!test_cpu_flag(flag)) + return false; + clear_cpu_flag(flag); + return true; } /* * Test CIF flag of another CPU. The caller needs to ensure that * CPU hotplug can not happen, e.g. by disabling preemption. */ -static inline int test_cpu_flag_of(int flag, int cpu) +static __always_inline bool test_cpu_flag_of(int flag, int cpu) { struct lowcore *lc = lowcore_ptr[cpu]; - return !!(lc->cpu_flags & (1UL << flag)); + + return lc->cpu_flags & (1UL << flag); } #define arch_needs_cpu() test_cpu_flag(CIF_NOHZ_DELAY) @@ -84,65 +98,93 @@ void s390_update_cpu_mhz(void); void cpu_detect_mhz_feature(void); extern const struct seq_operations cpuinfo_op; -extern int sysctl_ieee_emulation_warnings; extern void execve_tail(void); -extern void __bpon(void); +unsigned long vdso_size(void); /* * User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit. */ -#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_31BIT) ? \ - (1UL << 31) : -PAGE_SIZE) +#define TASK_SIZE (test_thread_flag(TIF_31BIT) ? \ + _REGION3_SIZE : TASK_SIZE_MAX) #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \ - (1UL << 30) : (1UL << 41)) -#define TASK_SIZE TASK_SIZE_OF(current) + (_REGION3_SIZE >> 1) : (_REGION2_SIZE >> 1)) #define TASK_SIZE_MAX (-PAGE_SIZE) -#define STACK_TOP (test_thread_flag(TIF_31BIT) ? \ - (1UL << 31) : (1UL << 42)) -#define STACK_TOP_MAX (1UL << 42) +#define VDSO_BASE (STACK_TOP + PAGE_SIZE) +#define VDSO_LIMIT (test_thread_flag(TIF_31BIT) ? _REGION3_SIZE : _REGION2_SIZE) +#define STACK_TOP (VDSO_LIMIT - vdso_size() - PAGE_SIZE) +#define STACK_TOP_MAX (_REGION2_SIZE - vdso_size() - PAGE_SIZE) #define HAVE_ARCH_PICK_MMAP_LAYOUT -typedef unsigned int mm_segment_t; +#define __stackleak_poison __stackleak_poison +static __always_inline void __stackleak_poison(unsigned long erase_low, + unsigned long erase_high, + unsigned long poison) +{ + unsigned long tmp, count; + + count = erase_high - erase_low; + if (!count) + return; + asm volatile( + " cghi %[count],8\n" + " je 2f\n" + " aghi %[count],-(8+1)\n" + " srlg %[tmp],%[count],8\n" + " ltgr %[tmp],%[tmp]\n" + " jz 1f\n" + "0: stg %[poison],0(%[addr])\n" + " mvc 8(256-8,%[addr]),0(%[addr])\n" + " la %[addr],256(%[addr])\n" + " brctg %[tmp],0b\n" + "1: stg %[poison],0(%[addr])\n" + " larl %[tmp],3f\n" + " ex %[count],0(%[tmp])\n" + " j 4f\n" + "2: stg %[poison],0(%[addr])\n" + " j 4f\n" + "3: mvc 8(1,%[addr]),0(%[addr])\n" + "4:\n" + : [addr] "+&a" (erase_low), [count] "+&d" (count), [tmp] "=&a" (tmp) + : [poison] "d" (poison) + : "memory", "cc" + ); +} /* * Thread structure */ struct thread_struct { unsigned int acrs[NUM_ACRS]; - unsigned long ksp; /* kernel stack pointer */ - unsigned long user_timer; /* task cputime in user space */ - unsigned long guest_timer; /* task cputime in kvm guest */ - unsigned long system_timer; /* task cputime in kernel space */ - unsigned long hardirq_timer; /* task cputime in hardirq context */ - unsigned long softirq_timer; /* task cputime in softirq context */ - unsigned long sys_call_table; /* system call table address */ - mm_segment_t mm_segment; - unsigned long gmap_addr; /* address of last gmap fault. 
*/ - unsigned int gmap_write_flag; /* gmap fault write indication */ - unsigned int gmap_int_code; /* int code of last gmap fault */ - unsigned int gmap_pfault; /* signal of a pending guest pfault */ + unsigned long ksp; /* kernel stack pointer */ + unsigned long user_timer; /* task cputime in user space */ + unsigned long guest_timer; /* task cputime in kvm guest */ + unsigned long system_timer; /* task cputime in kernel space */ + unsigned long hardirq_timer; /* task cputime in hardirq context */ + unsigned long softirq_timer; /* task cputime in softirq context */ + const sys_call_ptr_t *sys_call_table; /* system call table address */ + unsigned long gmap_addr; /* address of last gmap fault. */ + unsigned int gmap_write_flag; /* gmap fault write indication */ + unsigned int gmap_int_code; /* int code of last gmap fault */ + unsigned int gmap_pfault; /* signal of a pending guest pfault */ + /* Per-thread information related to debugging */ - struct per_regs per_user; /* User specified PER registers */ - struct per_event per_event; /* Cause of the last PER trap */ - unsigned long per_flags; /* Flags to control debug behavior */ - unsigned int system_call; /* system call number in signal */ - unsigned long last_break; /* last breaking-event-address. */ - /* pfault_wait is used to block the process on a pfault event */ + struct per_regs per_user; /* User specified PER registers */ + struct per_event per_event; /* Cause of the last PER trap */ + unsigned long per_flags; /* Flags to control debug behavior */ + unsigned int system_call; /* system call number in signal */ + unsigned long last_break; /* last breaking-event-address. */ + /* pfault_wait is used to block the process on a pfault event */ unsigned long pfault_wait; struct list_head list; /* cpu runtime instrumentation */ struct runtime_instr_cb *ri_cb; - struct gs_cb *gs_cb; /* Current guarded storage cb */ - struct gs_cb *gs_bc_cb; /* Broadcast guarded storage cb */ - unsigned char trap_tdb[256]; /* Transaction abort diagnose block */ - /* - * Warning: 'fpu' is dynamically-sized. It *MUST* be at - * the end. - */ - struct fpu fpu; /* FP and VX register save area */ + struct gs_cb *gs_cb; /* Current guarded storage cb */ + struct gs_cb *gs_bc_cb; /* Broadcast guarded storage cb */ + struct pgm_tdb trap_tdb; /* Transaction abort diagnose block */ + struct fpu fpu; /* FP and VX register save area */ }; /* Flag to disable transactions. */ @@ -162,6 +204,7 @@ typedef struct thread_struct thread_struct; #define INIT_THREAD { \ .ksp = sizeof(init_stack) + (unsigned long) &init_stack, \ .fpu.regs = (void *) init_task.thread.fpu.fprs, \ + .last_break = 1, \ } /* @@ -178,11 +221,9 @@ typedef struct thread_struct thread_struct; regs->psw.mask = PSW_USER_BITS | PSW_MASK_BA; \ regs->psw.addr = new_psw; \ regs->gprs[15] = new_stackp; \ - crst_table_downgrade(current->mm); \ execve_tail(); \ } while (0) -/* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; struct seq_file; @@ -191,13 +232,11 @@ struct pt_regs; void show_registers(struct pt_regs *regs); void show_cacheinfo(struct seq_file *m); -/* Free all resources held by a thread. 
*/ -static inline void release_thread(struct task_struct *tsk) { } - /* Free guarded storage control block */ void guarded_storage_release(struct task_struct *tsk); +void gs_load_bc_cb(struct pt_regs *regs); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(tsk) ((struct pt_regs *) \ (task_stack_page(tsk) + THREAD_SIZE) - 1) #define KSTK_EIP(tsk) (task_pt_regs(tsk)->psw.addr) @@ -206,15 +245,25 @@ unsigned long get_wchan(struct task_struct *p); /* Has task runtime instrumentation enabled ? */ #define is_ri_task(tsk) (!!(tsk)->thread.ri_cb) -static __always_inline unsigned long current_stack_pointer(void) +/* avoid using global register due to gcc bug in versions < 8.4 */ +#define current_stack_pointer (__current_stack_pointer()) + +static __always_inline unsigned long __current_stack_pointer(void) { unsigned long sp; - asm volatile("la %0,0(15)" : "=a" (sp)); + asm volatile("lgr %0,15" : "=d" (sp)); return sp; } -static __no_kasan_or_inline unsigned short stap(void) +static __always_inline bool on_thread_stack(void) +{ + unsigned long ksp = S390_lowcore.kernel_stack; + + return !((ksp ^ current_stack_pointer) & ~(THREAD_SIZE - 1)); +} + +static __always_inline unsigned short stap(void) { unsigned short cpu_address; @@ -231,8 +280,7 @@ static inline unsigned long __ecag(unsigned int asi, unsigned char parm) { unsigned long val; - asm volatile(".insn rsy,0xeb000000004c,%0,0,0(%1)" /* ecag */ - : "=d" (val) : "a" (asi << 8 | parm)); + asm volatile("ecag %0,0,0(%1)" : "=d" (val) : "a" (asi << 8 | parm)); return val; } @@ -253,7 +301,7 @@ static inline void __load_psw(psw_t psw) * Set PSW mask to specified value, while leaving the * PSW addr pointing to the next instruction. */ -static __no_kasan_or_inline void __load_psw_mask(unsigned long mask) +static __always_inline void __load_psw_mask(unsigned long mask) { unsigned long addr; psw_t psw; @@ -279,14 +327,36 @@ static inline unsigned long __extract_psw(void) return (((unsigned long) reg1) << 32) | ((unsigned long) reg2); } -static inline void local_mcck_enable(void) +static inline unsigned long __local_mcck_save(void) { - __load_psw_mask(__extract_psw() | PSW_MASK_MCHECK); + unsigned long mask = __extract_psw(); + + __load_psw_mask(mask & ~PSW_MASK_MCHECK); + return mask & PSW_MASK_MCHECK; +} + +#define local_mcck_save(mflags) \ +do { \ + typecheck(unsigned long, mflags); \ + mflags = __local_mcck_save(); \ +} while (0) + +static inline void local_mcck_restore(unsigned long mflags) +{ + unsigned long mask = __extract_psw(); + + mask &= ~PSW_MASK_MCHECK; + __load_psw_mask(mask | mflags); } static inline void local_mcck_disable(void) { - __load_psw_mask(__extract_psw() & ~PSW_MASK_MCHECK); + __local_mcck_save(); +} + +static inline void local_mcck_enable(void) +{ + __load_psw_mask(__extract_psw() | PSW_MASK_MCHECK); } /* @@ -303,11 +373,6 @@ static inline unsigned long __rewind_psw(psw_t psw, unsigned long ilc) } /* - * Function to stop a processor until the next interrupt occurs - */ -void enabled_wait(void); - -/* * Function to drop a processor into disabled wait state */ static __always_inline void __noreturn disabled_wait(void) @@ -320,30 +385,12 @@ static __always_inline void __noreturn disabled_wait(void) while (1); } -/* - * Basic Machine Check/Program Check Handler. 
- */ - -extern void s390_base_pgm_handler(void); -extern void s390_base_ext_handler(void); - -extern void (*s390_base_pgm_handler_fn)(void); -extern void (*s390_base_ext_handler_fn)(void); - #define ARCH_LOW_ADDRESS_LIMIT 0x7fffffffUL -extern int memcpy_real(void *, void *, size_t); -extern void memcpy_absolute(void *, void *, size_t); - -#define mem_assign_absolute(dest, val) do { \ - __typeof__(dest) __tmp = (val); \ - \ - BUILD_BUG_ON(sizeof(__tmp) != sizeof(val)); \ - memcpy_absolute(&(dest), &__tmp, sizeof(__tmp)); \ -} while (0) - -extern int s390_isolate_bp(void); -extern int s390_isolate_bp_guest(void); +static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) +{ + return arch_irqs_disabled_flags(regs->psw.mask); +} #endif /* __ASSEMBLY__ */ diff --git a/arch/s390/include/asm/ptdump.h b/arch/s390/include/asm/ptdump.h new file mode 100644 index 000000000000..f960b2896606 --- /dev/null +++ b/arch/s390/include/asm/ptdump.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_PTDUMP_H +#define _ASM_S390_PTDUMP_H + +void ptdump_check_wx(void); + +static inline void debug_checkwx(void) +{ + if (IS_ENABLED(CONFIG_DEBUG_WX)) + ptdump_check_wx(); +} + +#endif /* _ASM_S390_PTDUMP_H */ diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index f009a13afe71..d28bf8fb2799 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -9,25 +9,54 @@ #include <linux/bits.h> #include <uapi/asm/ptrace.h> +#include <asm/tpi.h> -#define PIF_SYSCALL 0 /* inside a system call */ -#define PIF_PER_TRAP 1 /* deliver sigtrap on return to user */ -#define PIF_SYSCALL_RESTART 2 /* restart the current system call */ -#define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */ +#define PIF_SYSCALL 0 /* inside a system call */ +#define PIF_EXECVE_PGSTE_RESTART 1 /* restart execve for PGSTE binaries */ +#define PIF_SYSCALL_RET_SET 2 /* return value was set via ptrace */ +#define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */ +#define PIF_FTRACE_FULL_REGS 4 /* all register contents valid (ftrace) */ -#define _PIF_SYSCALL BIT(PIF_SYSCALL) -#define _PIF_PER_TRAP BIT(PIF_PER_TRAP) -#define _PIF_SYSCALL_RESTART BIT(PIF_SYSCALL_RESTART) -#define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT) +#define _PIF_SYSCALL BIT(PIF_SYSCALL) +#define _PIF_EXECVE_PGSTE_RESTART BIT(PIF_EXECVE_PGSTE_RESTART) +#define _PIF_SYSCALL_RET_SET BIT(PIF_SYSCALL_RET_SET) +#define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT) +#define _PIF_FTRACE_FULL_REGS BIT(PIF_FTRACE_FULL_REGS) -#ifndef __ASSEMBLY__ +#define PSW32_MASK_PER _AC(0x40000000, UL) +#define PSW32_MASK_DAT _AC(0x04000000, UL) +#define PSW32_MASK_IO _AC(0x02000000, UL) +#define PSW32_MASK_EXT _AC(0x01000000, UL) +#define PSW32_MASK_KEY _AC(0x00F00000, UL) +#define PSW32_MASK_BASE _AC(0x00080000, UL) /* Always one */ +#define PSW32_MASK_MCHECK _AC(0x00040000, UL) +#define PSW32_MASK_WAIT _AC(0x00020000, UL) +#define PSW32_MASK_PSTATE _AC(0x00010000, UL) +#define PSW32_MASK_ASC _AC(0x0000C000, UL) +#define PSW32_MASK_CC _AC(0x00003000, UL) +#define PSW32_MASK_PM _AC(0x00000f00, UL) +#define PSW32_MASK_RI _AC(0x00000080, UL) + +#define PSW32_ADDR_AMODE _AC(0x80000000, UL) +#define PSW32_ADDR_INSN _AC(0x7FFFFFFF, UL) + +#define PSW32_DEFAULT_KEY ((PAGE_DEFAULT_ACC) << 20) + +#define PSW32_ASC_PRIMARY _AC(0x00000000, UL) +#define PSW32_ASC_ACCREG _AC(0x00004000, UL) +#define PSW32_ASC_SECONDARY _AC(0x00008000, UL) +#define PSW32_ASC_HOME _AC(0x0000C000, UL) + +#define PSW_DEFAULT_KEY ((PAGE_DEFAULT_ACC) 
<< 52) #define PSW_KERNEL_BITS (PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_HOME | \ - PSW_MASK_EA | PSW_MASK_BA) + PSW_MASK_EA | PSW_MASK_BA | PSW_MASK_DAT) #define PSW_USER_BITS (PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | \ PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_MASK_MCHECK | \ PSW_MASK_PSTATE | PSW_ASC_PRIMARY) +#ifndef __ASSEMBLY__ + struct psw_bits { unsigned long : 1; unsigned long per : 1; /* PER-Mask */ @@ -68,12 +97,19 @@ enum { &(*(struct psw_bits *)(&(__psw))); \ })) +typedef struct { + unsigned int mask; + unsigned int addr; +} psw_t32 __aligned(8); + +#define PGM_INT_CODE_MASK 0x7f +#define PGM_INT_CODE_PER 0x80 + /* * The pt_regs struct defines the way the registers are stored on * the stack during a system call. */ -struct pt_regs -{ +struct pt_regs { union { user_pt_regs user_regs; struct { @@ -83,10 +119,17 @@ struct pt_regs }; }; unsigned long orig_gpr2; - unsigned int int_code; - unsigned int int_parm; - unsigned long int_parm_long; + union { + struct { + unsigned int int_code; + unsigned int int_parm; + unsigned long int_parm_long; + }; + struct tpi_info tpi_info; + }; unsigned long flags; + unsigned long cr1; + unsigned long last_break; }; /* @@ -152,6 +195,14 @@ static inline int test_pt_regs_flag(struct pt_regs *regs, int flag) return !!(regs->flags & (1UL << flag)); } +static inline int test_and_clear_pt_regs_flag(struct pt_regs *regs, int flag) +{ + int ret = test_pt_regs_flag(regs, flag); + + clear_pt_regs_flag(regs, flag); + return ret; +} + /* * These are defined as per linux/ptrace.h, which see. */ @@ -179,10 +230,34 @@ const char *regs_query_register_name(unsigned int offset); unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n); +/** + * regs_get_kernel_argument() - get Nth function argument in kernel + * @regs: pt_regs of that context + * @n: function argument number (start from 0) + * + * regs_get_kernel_argument() returns @n th argument of the function call. 
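+ *
+ * Example (an illustrative sketch): with the s390 ABI the first five
+ * arguments arrive in %r2-%r6, so for a probed function
+ * long copy_range(void *dst, void *src, long count)
+ * regs_get_kernel_argument(regs, 2) returns the value of count, read
+ * from %r4, while arguments past the fifth are read from the stack.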
+ */ +static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, + unsigned int n) +{ + unsigned int argoffset = STACK_FRAME_OVERHEAD / sizeof(long); + +#define NR_REG_ARGUMENTS 5 + if (n < NR_REG_ARGUMENTS) + return regs_get_register(regs, 2 + n); + n -= NR_REG_ARGUMENTS; + return regs_get_kernel_stack_nth(regs, argoffset + n); +} + static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) { return regs->gprs[15]; } +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +{ + regs->gprs[2] = rc; +} + #endif /* __ASSEMBLY__ */ #endif /* _S390_PTRACE_H */ diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h index 71e3f0146cda..2f983e0b95e0 100644 --- a/arch/s390/include/asm/qdio.h +++ b/arch/s390/include/asm/qdio.h @@ -18,7 +18,6 @@ #define QDIO_MAX_BUFFERS_MASK (QDIO_MAX_BUFFERS_PER_Q - 1) #define QDIO_BUFNR(num) ((num) & QDIO_MAX_BUFFERS_MASK) #define QDIO_MAX_ELEMENTS_PER_BUFFER 16 -#define QDIO_SBAL_SIZE 256 #define QDIO_QETH_QFMT 0 #define QDIO_ZFCP_QFMT 1 @@ -26,9 +25,9 @@ /** * struct qdesfmt0 - queue descriptor, format 0 - * @sliba: storage list information block address - * @sla: storage list address - * @slsba: storage list state block address + * @sliba: absolute address of storage list information block + * @sla: absolute address of storage list + * @slsba: absolute address of storage list state block * @akey: access key for SLIB * @bkey: access key for SL * @ckey: access key for SBALs @@ -56,7 +55,7 @@ struct qdesfmt0 { * @oqdcnt: output queue descriptor count * @iqdsz: input queue descriptor size * @oqdsz: output queue descriptor size - * @qiba: queue information block address + * @qiba: absolute address of queue information block * @qkey: queue information block key * @qdf0: queue descriptions */ @@ -92,8 +91,8 @@ struct qdr { * @pfmt: implementation dependent parameter format * @rflags: QEBSM * @ac: adapter characteristics - * @isliba: absolute address of first input SLIB - * @osliba: absolute address of first output SLIB + * @isliba: logical address of first input SLIB + * @osliba: logical address of first output SLIB * @ebcnam: adapter identifier in EBCDIC * @parm: implementation dependent parameters */ @@ -134,10 +133,9 @@ struct slibe { * @sb_count: number of storage blocks * @sba: storage block element addresses * @dcount: size of storage block elements - * @user0: user defineable value - * @res4: reserved paramater - * @user1: user defineable value - * @user2: user defineable value + * @user0: user definable value + * @res4: reserved parameter + * @user1: user definable value */ struct qaob { u64 res0[6]; @@ -152,8 +150,7 @@ struct qaob { u16 dcount[QDIO_MAX_ELEMENTS_PER_BUFFER]; u64 user0; u64 res4[2]; - u64 user1; - u64 user2; + u8 user1[16]; } __attribute__ ((packed, aligned(256))); /** @@ -201,7 +198,7 @@ struct slib { * @scount: SBAL count * @sflags: whole SBAL flags * @length: length - * @addr: address + * @addr: absolute data address */ struct qdio_buffer_element { u8 eflags; @@ -211,7 +208,7 @@ struct qdio_buffer_element { u8 scount; u8 sflags; u32 length; - void *addr; + u64 addr; } __attribute__ ((packed, aligned(16))); /** @@ -227,7 +224,7 @@ struct qdio_buffer { * @sbal: absolute SBAL address */ struct sl_element { - unsigned long sbal; + u64 sbal; } __attribute__ ((packed)); /** @@ -246,25 +243,8 @@ struct slsb { u8 val[QDIO_MAX_BUFFERS_PER_Q]; } __attribute__ ((packed, aligned(256))); -/** - * struct qdio_outbuf_state - SBAL related asynchronous operation information - * (for 
communication with upper layer programs) - * (only required for use with completion queues) - * @flags: flags indicating state of buffer - * @user: pointer to upper layer program's state information related to SBAL - * (stored in user1 data of QAOB) - */ -struct qdio_outbuf_state { - u8 flags; - void *user; -}; - -#define QDIO_OUTBUF_STATE_FLAG_PENDING 0x01 - -#define CHSC_AC1_INITIATE_INPUTQ 0x80 - - /* qdio adapter-characteristics-1 flag */ +#define CHSC_AC1_INITIATE_INPUTQ 0x80 #define AC1_SIGA_INPUT_NEEDED 0x40 /* process input queues */ #define AC1_SIGA_OUTPUT_NEEDED 0x20 /* process output queues */ #define AC1_SIGA_SYNC_NEEDED 0x10 /* ask hypervisor to sync */ @@ -310,14 +290,14 @@ struct qdio_ssqd_desc { typedef void qdio_handler_t(struct ccw_device *, unsigned int, int, int, int, unsigned long); -/* qdio errors reported to the upper-layer program */ +/* qdio errors reported through the queue handlers: */ #define QDIO_ERROR_ACTIVATE 0x0001 #define QDIO_ERROR_GET_BUF_STATE 0x0002 #define QDIO_ERROR_SET_BUF_STATE 0x0004 -#define QDIO_ERROR_SLSB_STATE 0x0100 -#define QDIO_ERROR_FATAL 0x00ff -#define QDIO_ERROR_TEMPORARY 0xff00 +/* extra info for completed SBALs: */ +#define QDIO_ERROR_SLSB_STATE 0x0100 +#define QDIO_ERROR_SLSB_PENDING 0x0200 /* for qdio_cleanup */ #define QDIO_FLAG_CLEANUP_USING_CLEAR 0x01 @@ -325,109 +305,60 @@ typedef void qdio_handler_t(struct ccw_device *, unsigned int, int, /** * struct qdio_initialize - qdio initialization data - * @cdev: associated ccw device * @q_format: queue format * @qdr_ac: feature flags to set - * @adapter_name: name for the adapter * @qib_param_field_format: format for qib_parm_field * @qib_param_field: pointer to 128 bytes or NULL, if no param field * @qib_rflags: rflags to set - * @input_slib_elements: pointer to no_input_qs * 128 words of data or NULL - * @output_slib_elements: pointer to no_output_qs * 128 words of data or NULL * @no_input_qs: number of input queues * @no_output_qs: number of output queues - * @input_handler: handler to be called for input queues + * @input_handler: handler to be called for input queues, and device-wide errors * @output_handler: handler to be called for output queues - * @queue_start_poll_array: polling handlers (one per input queue or NULL) + * @irq_poll: Data IRQ polling handler * @scan_threshold: # of in-use buffers that triggers scan on output queue * @int_parm: interruption parameter - * @input_sbal_addr_array: address of no_input_qs * 128 pointers - * @output_sbal_addr_array: address of no_output_qs * 128 pointers - * @output_sbal_state_array: no_output_qs * 128 state info (for CQ or NULL) + * @input_sbal_addr_array: per-queue array, each element points to 128 SBALs + * @output_sbal_addr_array: per-queue array, each element points to 128 SBALs */ struct qdio_initialize { - struct ccw_device *cdev; unsigned char q_format; unsigned char qdr_ac; - unsigned char adapter_name[8]; unsigned int qib_param_field_format; unsigned char *qib_param_field; unsigned char qib_rflags; - unsigned long *input_slib_elements; - unsigned long *output_slib_elements; unsigned int no_input_qs; unsigned int no_output_qs; qdio_handler_t *input_handler; qdio_handler_t *output_handler; - void (**queue_start_poll_array) (struct ccw_device *, int, - unsigned long); - unsigned int scan_threshold; + void (*irq_poll)(struct ccw_device *cdev, unsigned long data); unsigned long int_parm; - struct qdio_buffer **input_sbal_addr_array; - struct qdio_buffer **output_sbal_addr_array; - struct qdio_outbuf_state *output_sbal_state_array; 
+ struct qdio_buffer ***input_sbal_addr_array; + struct qdio_buffer ***output_sbal_addr_array; }; -/** - * enum qdio_brinfo_entry_type - type of address entry for qdio_brinfo_desc() - * @l3_ipv6_addr: entry contains IPv6 address - * @l3_ipv4_addr: entry contains IPv4 address - * @l2_addr_lnid: entry contains MAC address and VLAN ID - */ -enum qdio_brinfo_entry_type {l3_ipv6_addr, l3_ipv4_addr, l2_addr_lnid}; - -/** - * struct qdio_brinfo_entry_XXX - Address entry for qdio_brinfo_desc() - * @nit: Network interface token - * @addr: Address of one of the three types - * - * The struct is passed to the callback function by qdio_brinfo_desc() - */ -struct qdio_brinfo_entry_l3_ipv6 { - u64 nit; - struct { unsigned char _s6_addr[16]; } addr; -} __packed; -struct qdio_brinfo_entry_l3_ipv4 { - u64 nit; - struct { uint32_t _s_addr; } addr; -} __packed; -struct qdio_brinfo_entry_l2 { - u64 nit; - struct { u8 mac[6]; u16 lnid; } addr_lnid; -} __packed; - -#define QDIO_STATE_INACTIVE 0x00000002 /* after qdio_cleanup */ -#define QDIO_STATE_ESTABLISHED 0x00000004 /* after qdio_establish */ -#define QDIO_STATE_ACTIVE 0x00000008 /* after qdio_activate */ -#define QDIO_STATE_STOPPED 0x00000010 /* after queues went down */ - -#define QDIO_FLAG_SYNC_INPUT 0x01 -#define QDIO_FLAG_SYNC_OUTPUT 0x02 -#define QDIO_FLAG_PCI_OUT 0x10 - int qdio_alloc_buffers(struct qdio_buffer **buf, unsigned int count); void qdio_free_buffers(struct qdio_buffer **buf, unsigned int count); void qdio_reset_buffers(struct qdio_buffer **buf, unsigned int count); -extern int qdio_allocate(struct qdio_initialize *); -extern int qdio_establish(struct qdio_initialize *); +extern int qdio_allocate(struct ccw_device *cdev, unsigned int no_input_qs, + unsigned int no_output_qs); +extern int qdio_establish(struct ccw_device *cdev, + struct qdio_initialize *init_data); extern int qdio_activate(struct ccw_device *); -extern void qdio_release_aob(struct qaob *); -extern int do_QDIO(struct ccw_device *, unsigned int, int, unsigned int, - unsigned int); -extern int qdio_start_irq(struct ccw_device *, int); -extern int qdio_stop_irq(struct ccw_device *, int); -extern int qdio_get_next_buffers(struct ccw_device *, int, int *, int *); -extern int qdio_inspect_queue(struct ccw_device *cdev, unsigned int nr, - bool is_input, unsigned int *bufnr, - unsigned int *error); +extern int qdio_start_irq(struct ccw_device *cdev); +extern int qdio_stop_irq(struct ccw_device *cdev); +extern int qdio_inspect_input_queue(struct ccw_device *cdev, unsigned int nr, + unsigned int *bufnr, unsigned int *error); +extern int qdio_inspect_output_queue(struct ccw_device *cdev, unsigned int nr, + unsigned int *bufnr, unsigned int *error); +extern int qdio_add_bufs_to_input_queue(struct ccw_device *cdev, + unsigned int q_nr, unsigned int bufnr, + unsigned int count); +extern int qdio_add_bufs_to_output_queue(struct ccw_device *cdev, + unsigned int q_nr, unsigned int bufnr, + unsigned int count, struct qaob *aob); extern int qdio_shutdown(struct ccw_device *, int); extern int qdio_free(struct ccw_device *); extern int qdio_get_ssqd_desc(struct ccw_device *, struct qdio_ssqd_desc *); -extern int qdio_pnso_brinfo(struct subchannel_id schid, - int cnc, u16 *response, - void (*cb)(void *priv, enum qdio_brinfo_entry_type type, - void *entry), - void *priv); #endif /* __QDIO_H__ */ diff --git a/arch/s390/include/asm/rwonce.h b/arch/s390/include/asm/rwonce.h new file mode 100644 index 000000000000..91fc24520e82 --- /dev/null +++ b/arch/s390/include/asm/rwonce.h @@ -0,0 +1,31 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_S390_RWONCE_H +#define __ASM_S390_RWONCE_H + +#include <linux/compiler_types.h> + +/* + * Use READ_ONCE_ALIGNED_128() for 128-bit block concurrent (atomic) read + * accesses. Note that x must be 128-bit aligned, otherwise a specification + * exception is generated. + */ +#define READ_ONCE_ALIGNED_128(x) \ +({ \ + union { \ + typeof(x) __x; \ + __uint128_t val; \ + } __u; \ + \ + BUILD_BUG_ON(sizeof(x) != 16); \ + asm volatile( \ + " lpq %[val],%[_x]\n" \ + : [val] "=d" (__u.val) \ + : [_x] "QS" (x) \ + : "memory"); \ + __u.__x; \ +}) + +#include <asm-generic/rwonce.h> + +#endif /* __ASM_S390_RWONCE_H */ diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index c563f8368b19..5742d23bba13 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -1,18 +1,25 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright IBM Corp. 2007 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ #ifndef _ASM_S390_SCLP_H #define _ASM_S390_SCLP_H #include <linux/types.h> -#include <asm/chpid.h> -#include <asm/cpu.h> #define SCLP_CHP_INFO_MASK_SIZE 32 -#define SCLP_MAX_CORES 256 +#define EARLY_SCCB_SIZE PAGE_SIZE +#define SCLP_MAX_CORES 512 +/* 144 + 16 * SCLP_MAX_CORES + 2 * (SCLP_MAX_CORES - 1) */ +#define EXT_SCCB_READ_SCP (3 * PAGE_SIZE) +/* 24 + 16 * SCLP_MAX_CORES */ +#define EXT_SCCB_READ_CPU (3 * PAGE_SIZE) + +#ifndef __ASSEMBLY__ +#include <linux/uio.h> +#include <asm/chpid.h> +#include <asm/cpu.h> struct sclp_chp_info { u8 recognized[SCLP_CHP_INFO_MASK_SIZE]; @@ -79,8 +86,15 @@ struct sclp_info { unsigned char has_kss : 1; unsigned char has_gisaf : 1; unsigned char has_diag318 : 1; + unsigned char has_diag320 : 1; unsigned char has_sipl : 1; + unsigned char has_sipl_eckd : 1; unsigned char has_dirq : 1; + unsigned char has_iplcc : 1; + unsigned char has_zpci_lsi : 1; + unsigned char has_aisii : 1; + unsigned char has_aeni : 1; + unsigned char has_aisi : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; @@ -105,17 +119,21 @@ struct zpci_report_error_header { * (OpenCrypto Successful Diagnostics Execution) */ u16 length; /* Length of Subsequent Data (up to 4K – SCLP header */ - u8 data[0]; /* Subsequent Data passed verbatim to SCLP ET 24 */ + u8 data[]; /* Subsequent Data passed verbatim to SCLP ET 24 */ } __packed; +extern char *sclp_early_sccb; + +void sclp_early_adjust_va(void); +void sclp_early_set_buffer(void *sccb); int sclp_early_read_info(void); int sclp_early_read_storage_info(void); int sclp_early_get_core_info(struct sclp_core_info *info); void sclp_early_get_ipl_info(struct sclp_ipl_info *info); void sclp_early_detect(void); void sclp_early_printk(const char *s); -void sclp_early_printk_force(const char *s); -void __sclp_early_printk(const char *s, unsigned int len, unsigned int force); +void __sclp_early_printk(const char *s, unsigned int len); +void sclp_emergency_printk(const char *s); int sclp_early_get_memsize(unsigned long *mem); int sclp_early_get_hsa_size(unsigned long *hsa_size); @@ -129,9 +147,10 @@ int sclp_chp_deconfigure(struct chp_id chpid); int sclp_chp_read_info(struct sclp_chp_info *info); int sclp_pci_configure(u32 fid); int sclp_pci_deconfigure(u32 fid); +int sclp_ap_configure(u32 apid); +int sclp_ap_deconfigure(u32 apid); int sclp_pci_report(struct zpci_report_error_header *report, u32 fh, u32 fid); -int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count); -int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count); +size_t 
memcpy_hsa_iter(struct iov_iter *iter, unsigned long src, size_t count); void sclp_ocf_cpc_name_copy(char *dst); static inline int sclp_get_core_info(struct sclp_core_info *info, int early) @@ -141,4 +160,5 @@ static inline int sclp_get_core_info(struct sclp_core_info *info, int early) return _sclp_get_core_info(info); } +#endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_SCLP_H */ diff --git a/arch/s390/include/asm/scsw.h b/arch/s390/include/asm/scsw.h index c00f7b031628..322bdcd4b616 100644 --- a/arch/s390/include/asm/scsw.h +++ b/arch/s390/include/asm/scsw.h @@ -215,6 +215,11 @@ union scsw { #define SNS2_ENV_DATA_PRESENT 0x10 #define SNS2_INPRECISE_END 0x04 +/* + * architectured values for PPRC errors + */ +#define SNS7_INVALID_ON_SEC 0x0e + /** * scsw_is_tm - check for transport mode scsw * @scsw: pointer to scsw @@ -508,9 +513,21 @@ static inline int scsw_cmd_is_valid_zcc(union scsw *scsw) */ static inline int scsw_cmd_is_valid_ectl(union scsw *scsw) { - return (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) && - !(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) && - (scsw->cmd.stctl & SCSW_STCTL_ALERT_STATUS); + /* Must be status pending. */ + if (!(scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Must have alert status. */ + if (!(scsw->cmd.stctl & SCSW_STCTL_ALERT_STATUS)) + return 0; + + /* Must be alone or together with primary, secondary or both, + * => no intermediate status. + */ + if (scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) + return 0; + + return 1; } /** @@ -522,11 +539,25 @@ static inline int scsw_cmd_is_valid_ectl(union scsw *scsw) */ static inline int scsw_cmd_is_valid_pno(union scsw *scsw) { - return (scsw->cmd.fctl != 0) && - (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) && - (!(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) || - ((scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) && - (scsw->cmd.actl & SCSW_ACTL_SUSPENDED))); + /* Must indicate at least one I/O function. */ + if (!scsw->cmd.fctl) + return 0; + + /* Must be status pending. */ + if (!(scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Can be status pending alone, or with any combination of primary, + * secondary and alert => no intermediate status. + */ + if (!(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS)) + return 1; + + /* If intermediate, must be suspended. */ + if (scsw->cmd.actl & SCSW_ACTL_SUSPENDED) + return 1; + + return 0; } /** @@ -676,9 +707,21 @@ static inline int scsw_tm_is_valid_q(union scsw *scsw) */ static inline int scsw_tm_is_valid_ectl(union scsw *scsw) { - return (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) && - !(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) && - (scsw->tm.stctl & SCSW_STCTL_ALERT_STATUS); + /* Must be status pending. */ + if (!(scsw->tm.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Must have alert status. */ + if (!(scsw->tm.stctl & SCSW_STCTL_ALERT_STATUS)) + return 0; + + /* Must be alone or together with primary, secondary or both, + * => no intermediate status. + */ + if (scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) + return 0; + + return 1; } /** @@ -690,11 +733,25 @@ static inline int scsw_tm_is_valid_ectl(union scsw *scsw) */ static inline int scsw_tm_is_valid_pno(union scsw *scsw) { - return (scsw->tm.fctl != 0) && - (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) && - (!(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) || - ((scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) && - (scsw->tm.actl & SCSW_ACTL_SUSPENDED))); + /* Must indicate at least one I/O function. */ + if (!scsw->tm.fctl) + return 0; + + /* Must be status pending. 
*/ + if (!(scsw->tm.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Can be status pending alone, or with any combination of primary, + * secondary and alert => no intermediate status. + */ + if (!(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS)) + return 1; + + /* If intermediate, must be suspended. */ + if (scsw->tm.actl & SCSW_ACTL_SUSPENDED) + return 1; + + return 0; } /** diff --git a/arch/s390/include/asm/seccomp.h b/arch/s390/include/asm/seccomp.h index 795bbe0d7ca6..71d46f0ba97b 100644 --- a/arch/s390/include/asm/seccomp.h +++ b/arch/s390/include/asm/seccomp.h @@ -16,4 +16,13 @@ #include <asm-generic/seccomp.h> +#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_S390X +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "s390x" +#ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_S390 +# define SECCOMP_ARCH_COMPAT_NR NR_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "s390" +#endif + #endif /* _ASM_S390_SECCOMP_H */ diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h index 42de04ad9c07..0486e6ef62bf 100644 --- a/arch/s390/include/asm/sections.h +++ b/arch/s390/include/asm/sections.h @@ -2,20 +2,8 @@ #ifndef _S390_SECTIONS_H #define _S390_SECTIONS_H -#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed - #include <asm-generic/sections.h> -extern bool initmem_freed; - -static inline int arch_is_kernel_initmem_freed(unsigned long addr) -{ - if (!initmem_freed) - return 0; - return addr >= (unsigned long)__init_begin && - addr < (unsigned long)__init_end; -} - /* * .boot.data section contains variables "shared" between the decompressor and * the decompressed kernel. The decompressor will store values in them, and @@ -26,16 +14,16 @@ static inline int arch_is_kernel_initmem_freed(unsigned long addr) * final .boot.data section, which should be identical in the decompressor and * the decompressed kernel (that is checked during the build). */ -#define __bootdata(var) __section(.boot.data.var) var +#define __bootdata(var) __section(".boot.data." #var) var /* * .boot.preserved.data is similar to .boot.data, but it is not part of the * .init section and thus will be preserved for later use in the decompressed * kernel. */ -#define __bootdata_preserved(var) __section(.boot.preserved.data.var) var +#define __bootdata_preserved(var) __section(".boot.preserved.data." 
#var) var -extern unsigned long __sdma, __edma; -extern unsigned long __stext_dma, __etext_dma; +extern char *__samode31, *__eamode31; +extern char *__stext_amode31, *__etext_amode31; #endif diff --git a/arch/s390/include/asm/serial.h b/arch/s390/include/asm/serial.h deleted file mode 100644 index aaf85a69061c..000000000000 --- a/arch/s390/include/asm/serial.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390_SERIAL_H -#define _ASM_S390_SERIAL_H - -#define BASE_BAUD 0 - -#endif /* _ASM_S390_SERIAL_H */ diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h index c59a83536c70..06fbabe2f66c 100644 --- a/arch/s390/include/asm/set_memory.h +++ b/arch/s390/include/asm/set_memory.h @@ -2,31 +2,65 @@ #ifndef _ASMS390_SET_MEMORY_H #define _ASMS390_SET_MEMORY_H -#define SET_MEMORY_RO 1UL -#define SET_MEMORY_RW 2UL -#define SET_MEMORY_NX 4UL -#define SET_MEMORY_X 8UL +#include <linux/mutex.h> -int __set_memory(unsigned long addr, int numpages, unsigned long flags); +extern struct mutex cpa_mutex; -static inline int set_memory_ro(unsigned long addr, int numpages) -{ - return __set_memory(addr, numpages, SET_MEMORY_RO); -} +enum { + _SET_MEMORY_RO_BIT, + _SET_MEMORY_RW_BIT, + _SET_MEMORY_NX_BIT, + _SET_MEMORY_X_BIT, + _SET_MEMORY_4K_BIT, + _SET_MEMORY_INV_BIT, + _SET_MEMORY_DEF_BIT, +}; -static inline int set_memory_rw(unsigned long addr, int numpages) -{ - return __set_memory(addr, numpages, SET_MEMORY_RW); -} +#define SET_MEMORY_RO BIT(_SET_MEMORY_RO_BIT) +#define SET_MEMORY_RW BIT(_SET_MEMORY_RW_BIT) +#define SET_MEMORY_NX BIT(_SET_MEMORY_NX_BIT) +#define SET_MEMORY_X BIT(_SET_MEMORY_X_BIT) +#define SET_MEMORY_4K BIT(_SET_MEMORY_4K_BIT) +#define SET_MEMORY_INV BIT(_SET_MEMORY_INV_BIT) +#define SET_MEMORY_DEF BIT(_SET_MEMORY_DEF_BIT) -static inline int set_memory_nx(unsigned long addr, int numpages) -{ - return __set_memory(addr, numpages, SET_MEMORY_NX); -} +int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags); -static inline int set_memory_x(unsigned long addr, int numpages) -{ - return __set_memory(addr, numpages, SET_MEMORY_X); +#define set_memory_rox set_memory_rox + +/* + * Generate two variants of each set_memory() function: + * + * set_memory_yy(unsigned long addr, int numpages); + * __set_memory_yy(void *start, void *end); + * + * The second variant exists both for convenience, to avoid the usual + * (unsigned long) casts, and because, unlike the first variant, it can + * also be used for areas larger than 8TB, which may happen at memory + * initialization.
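+ *
+ * For example (an illustrative sketch, page being a void * to a
+ * page-aligned buffer), both calls below make one page read-only:
+ *
+ *	set_memory_ro((unsigned long)page, 1);
+ *	__set_memory_ro(page, page + PAGE_SIZE);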
+ */ +#define __SET_MEMORY_FUNC(fname, flags) \ +static inline int fname(unsigned long addr, int numpages) \ +{ \ + return __set_memory(addr, numpages, (flags)); \ +} \ + \ +static inline int __##fname(void *start, void *end) \ +{ \ + unsigned long numpages; \ + \ + numpages = (end - start) >> PAGE_SHIFT; \ + return __set_memory((unsigned long)start, numpages, (flags)); \ } +__SET_MEMORY_FUNC(set_memory_ro, SET_MEMORY_RO) +__SET_MEMORY_FUNC(set_memory_rw, SET_MEMORY_RW) +__SET_MEMORY_FUNC(set_memory_nx, SET_MEMORY_NX) +__SET_MEMORY_FUNC(set_memory_x, SET_MEMORY_X) +__SET_MEMORY_FUNC(set_memory_rox, SET_MEMORY_RO | SET_MEMORY_X) +__SET_MEMORY_FUNC(set_memory_rwnx, SET_MEMORY_RW | SET_MEMORY_NX) +__SET_MEMORY_FUNC(set_memory_4k, SET_MEMORY_4K) + +int set_direct_map_invalid_noflush(struct page *page); +int set_direct_map_default_noflush(struct page *page); + #endif diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 6dc6c4fbc8e2..03bcaa8effb2 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -8,15 +8,11 @@ #include <linux/bits.h> #include <uapi/asm/setup.h> +#include <linux/build_bug.h> -#define EP_OFFSET 0x10008 -#define EP_STRING "S390EP" #define PARMAREA 0x10400 -#define EARLY_SCCB_OFFSET 0x11000 -#define HEAD_END 0x12000 - -#define EARLY_SCCB_SIZE PAGE_SIZE +#define COMMAND_LINE_SIZE CONFIG_COMMAND_LINE_SIZE /* * Machine features detected in early.c */ @@ -27,17 +23,17 @@ #define MACHINE_FLAG_DIAG9C BIT(3) #define MACHINE_FLAG_ESOP BIT(4) #define MACHINE_FLAG_IDTE BIT(5) -#define MACHINE_FLAG_DIAG44 BIT(6) #define MACHINE_FLAG_EDAT1 BIT(7) #define MACHINE_FLAG_EDAT2 BIT(8) #define MACHINE_FLAG_TOPOLOGY BIT(10) #define MACHINE_FLAG_TE BIT(11) #define MACHINE_FLAG_TLB_LC BIT(12) -#define MACHINE_FLAG_VX BIT(13) #define MACHINE_FLAG_TLB_GUEST BIT(14) #define MACHINE_FLAG_NX BIT(15) #define MACHINE_FLAG_GS BIT(16) #define MACHINE_FLAG_SCC BIT(17) +#define MACHINE_FLAG_PCI_MIO BIT(18) +#define MACHINE_FLAG_RDP BIT(19) #define LPP_MAGIC BIT(31) #define LPP_PID_MASK _AC(0xffffffff, UL) @@ -47,28 +43,13 @@ #define STARTUP_NORMAL_OFFSET 0x10000 #define STARTUP_KDUMP_OFFSET 0x10010 -/* Offsets to parameters in kernel/head.S */ - -#define IPL_DEVICE_OFFSET 0x10400 -#define INITRD_START_OFFSET 0x10408 -#define INITRD_SIZE_OFFSET 0x10410 -#define OLDMEM_BASE_OFFSET 0x10418 -#define OLDMEM_SIZE_OFFSET 0x10420 -#define KERNEL_VERSION_OFFSET 0x10428 -#define COMMAND_LINE_OFFSET 0x10480 +#define LEGACY_COMMAND_LINE_SIZE 896 #ifndef __ASSEMBLY__ #include <asm/lowcore.h> #include <asm/types.h> -#define IPL_DEVICE (*(unsigned long *) (IPL_DEVICE_OFFSET)) -#define INITRD_START (*(unsigned long *) (INITRD_START_OFFSET)) -#define INITRD_SIZE (*(unsigned long *) (INITRD_SIZE_OFFSET)) -#define OLDMEM_BASE (*(unsigned long *) (OLDMEM_BASE_OFFSET)) -#define OLDMEM_SIZE (*(unsigned long *) (OLDMEM_SIZE_OFFSET)) -#define COMMAND_LINE ((char *) (COMMAND_LINE_OFFSET)) - struct parmarea { unsigned long ipl_device; /* 0x10400 */ unsigned long initrd_start; /* 0x10408 */ @@ -76,16 +57,25 @@ struct parmarea { unsigned long oldmem_base; /* 0x10418 */ unsigned long oldmem_size; /* 0x10420 */ unsigned long kernel_version; /* 0x10428 */ - char pad1[0x10480 - 0x10430]; /* 0x10430 - 0x10480 */ - char command_line[ARCH_COMMAND_LINE_SIZE]; /* 0x10480 */ + unsigned long max_command_line_size; /* 0x10430 */ + char pad1[0x10480-0x10438]; /* 0x10438 - 0x10480 */ + char command_line[COMMAND_LINE_SIZE]; /* 0x10480 */ }; -extern int noexec_disabled; -extern int memory_end_set; 
-extern unsigned long memory_end; -extern unsigned long vmalloc_size; -extern unsigned long max_physmem_end; -extern unsigned long __swsusp_reset_dma; +extern struct parmarea parmarea; + +extern unsigned int zlib_dfltcc_support; +#define ZLIB_DFLTCC_DISABLED 0 +#define ZLIB_DFLTCC_FULL 1 +#define ZLIB_DFLTCC_DEFLATE_ONLY 2 +#define ZLIB_DFLTCC_INFLATE_ONLY 3 +#define ZLIB_DFLTCC_FULL_DEBUG 4 + +extern unsigned long ident_map_size; +extern unsigned long max_mappable; + +/* The Write Back bit position in the physaddr is given by the SLPC PCI */ +extern unsigned long mio_wb_bit_mask; #define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM) #define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM) @@ -94,17 +84,17 @@ extern unsigned long __swsusp_reset_dma; #define MACHINE_HAS_DIAG9C (S390_lowcore.machine_flags & MACHINE_FLAG_DIAG9C) #define MACHINE_HAS_ESOP (S390_lowcore.machine_flags & MACHINE_FLAG_ESOP) #define MACHINE_HAS_IDTE (S390_lowcore.machine_flags & MACHINE_FLAG_IDTE) -#define MACHINE_HAS_DIAG44 (S390_lowcore.machine_flags & MACHINE_FLAG_DIAG44) #define MACHINE_HAS_EDAT1 (S390_lowcore.machine_flags & MACHINE_FLAG_EDAT1) #define MACHINE_HAS_EDAT2 (S390_lowcore.machine_flags & MACHINE_FLAG_EDAT2) #define MACHINE_HAS_TOPOLOGY (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY) #define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) #define MACHINE_HAS_TLB_LC (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC) -#define MACHINE_HAS_VX (S390_lowcore.machine_flags & MACHINE_FLAG_VX) #define MACHINE_HAS_TLB_GUEST (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_GUEST) #define MACHINE_HAS_NX (S390_lowcore.machine_flags & MACHINE_FLAG_NX) #define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS) #define MACHINE_HAS_SCC (S390_lowcore.machine_flags & MACHINE_FLAG_SCC) +#define MACHINE_HAS_PCI_MIO (S390_lowcore.machine_flags & MACHINE_FLAG_PCI_MIO) +#define MACHINE_HAS_RDP (S390_lowcore.machine_flags & MACHINE_FLAG_RDP) /* * Console mode. 
Override with conmode= @@ -113,9 +103,6 @@ extern unsigned int console_mode; extern unsigned int console_devno; extern unsigned int console_irq; -extern char vmhalt_cmd[]; -extern char vmpoff_cmd[]; - #define CONSOLE_IS_UNDEFINED (console_mode == 0) #define CONSOLE_IS_SCLP (console_mode == 1) #define CONSOLE_IS_3215 (console_mode == 2) @@ -128,14 +115,6 @@ extern char vmpoff_cmd[]; #define SET_CONSOLE_VT220 do { console_mode = 4; } while (0) #define SET_CONSOLE_HVC do { console_mode = 5; } while (0) -#ifdef CONFIG_PFAULT -extern int pfault_init(void); -extern void pfault_fini(void); -#else /* CONFIG_PFAULT */ -#define pfault_init() ({-1;}) -#define pfault_fini() do { } while (0) -#endif /* CONFIG_PFAULT */ - #ifdef CONFIG_VMCP void vmcp_cma_reserve(void); #else @@ -144,9 +123,6 @@ static inline void vmcp_cma_reserve(void) { } void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault); -void cmma_init(void); -void cmma_init_nodat(void); - extern void (*_machine_restart)(char *command); extern void (*_machine_halt)(void); extern void (*_machine_power_off)(void); @@ -157,14 +133,24 @@ static inline unsigned long kaslr_offset(void) return __kaslr_offset; } -#else /* __ASSEMBLY__ */ +extern int __kaslr_enabled; +static inline int kaslr_enabled(void) +{ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return __kaslr_enabled; + return 0; +} -#define IPL_DEVICE (IPL_DEVICE_OFFSET) -#define INITRD_START (INITRD_START_OFFSET) -#define INITRD_SIZE (INITRD_SIZE_OFFSET) -#define OLDMEM_BASE (OLDMEM_BASE_OFFSET) -#define OLDMEM_SIZE (OLDMEM_SIZE_OFFSET) -#define COMMAND_LINE (COMMAND_LINE_OFFSET) +struct oldmem_data { + unsigned long start; + unsigned long size; +}; +extern struct oldmem_data oldmem_data; +static __always_inline u32 gen_lpswe(unsigned long addr) +{ + BUILD_BUG_ON(addr > 0xfff); + return 0xb2b20000 | addr; +} #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_SETUP_H */ diff --git a/arch/s390/include/asm/shmparam.h b/arch/s390/include/asm/shmparam.h deleted file mode 100644 index e75d45649c54..000000000000 --- a/arch/s390/include/asm/shmparam.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * S390 version - * - * Derived from "include/asm-i386/shmparam.h" - */ -#ifndef _ASM_S390_SHMPARAM_H -#define _ASM_S390_SHMPARAM_H - -#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */ - -#endif /* _ASM_S390_SHMPARAM_H */ diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h index 53ee795cd3d3..edee63da08e7 100644 --- a/arch/s390/include/asm/sigp.h +++ b/arch/s390/include/asm/sigp.h @@ -41,15 +41,17 @@ static inline int ____pcpu_sigp(u16 addr, u8 order, unsigned long parm, u32 *status) { - register unsigned long reg1 asm ("1") = parm; + union register_pair r1 = { .odd = parm, }; int cc; asm volatile( - " sigp %1,%2,0(%3)\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (cc), "+d" (reg1) : "d" (addr), "a" (order) : "cc"); - *status = reg1; + " sigp %[r1],%[addr],0(%[order])\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=&d" (cc), [r1] "+&d" (r1.pair) + : [addr] "d" (addr), [order] "a" (order) + : "cc"); + *status = r1.even; return cc; } diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index b157a81fb977..6e5b1b4b19a9 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -3,13 +3,13 @@ * Copyright IBM Corp. 
1999, 2012 * Author(s): Denis Joseph Barrow, * Martin Schwidefsky <schwidefsky@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, */ #ifndef __ASM_SMP_H #define __ASM_SMP_H #include <asm/sigp.h> #include <asm/lowcore.h> +#include <asm/processor.h> #define raw_smp_processor_id() (S390_lowcore.cpu_nr) @@ -17,6 +17,7 @@ extern struct mutex smp_cpu_state_mutex; extern unsigned int smp_cpu_mt_shift; extern unsigned int smp_cpu_mtid; extern __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS]; +extern cpumask_t cpu_setup_mask; extern int __cpu_up(unsigned int cpu, struct task_struct *tidle); @@ -29,11 +30,12 @@ extern void smp_emergency_stop(void); extern int smp_find_processor_id(u16 address); extern int smp_store_status(int cpu); -extern void smp_save_dump_cpus(void); -extern int smp_vcpu_scheduled(int cpu); +extern void smp_save_dump_ipl_cpu(void); +extern void smp_save_dump_secondary_cpus(void); extern void smp_yield_cpu(int cpu); extern void smp_cpu_set_polarization(int cpu, int val); extern int smp_cpu_get_polarization(int cpu); +extern int smp_cpu_get_cpu_address(int cpu); extern void smp_fill_possible_mask(void); extern void smp_detect_cpus(void); @@ -53,9 +55,15 @@ static inline int smp_get_base_cpu(int cpu) return cpu - (cpu % (smp_cpu_mtid + 1)); } +static inline void smp_cpus_done(unsigned int max_cpus) +{ +} + extern int smp_rescan_cpus(void); extern void __noreturn cpu_die(void); extern void __cpu_die(unsigned int cpu); extern int __cpu_disable(void); +extern void schedule_mcck_handler(void); +void notrace smp_yield_cpu(int cpu); #endif /* __ASM_SMP_H */ diff --git a/arch/s390/include/asm/softirq_stack.h b/arch/s390/include/asm/softirq_stack.h new file mode 100644 index 000000000000..1ac5115d3115 --- /dev/null +++ b/arch/s390/include/asm/softirq_stack.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __ASM_S390_SOFTIRQ_STACK_H +#define __ASM_S390_SOFTIRQ_STACK_H + +#include <asm/lowcore.h> +#include <asm/stacktrace.h> + +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK +static inline void do_softirq_own_stack(void) +{ + call_on_stack(0, S390_lowcore.async_stack, void, __do_softirq); +} +#endif +#endif /* __ASM_S390_SOFTIRQ_STACK_H */ diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 3a37172d5398..37127cd7749e 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -67,14 +67,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lp) arch_spin_lock_wait(lp); } -static inline void arch_spin_lock_flags(arch_spinlock_t *lp, - unsigned long flags) -{ - if (!arch_spin_trylock_once(lp)) - arch_spin_lock_wait(lp); -} -#define arch_spin_lock_flags arch_spin_lock_flags - static inline int arch_spin_trylock(arch_spinlock_t *lp) { if (!arch_spin_trylock_once(lp)) @@ -85,10 +77,11 @@ static inline int arch_spin_trylock(arch_spinlock_t *lp) static inline void arch_spin_unlock(arch_spinlock_t *lp) { typecheck(int, lp->lock); + kcsan_release(); asm_inline volatile( - ALTERNATIVE("", ".long 0xb2fa0070", 49) /* NIAI 7 */ + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", 49) /* NIAI 7 */ " sth %1,%0\n" - : "=Q" (((unsigned short *) &lp->lock)[1]) + : "=R" (((unsigned short *) &lp->lock)[1]) : "d" (0) : "cc", "memory"); } diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h index cfed272e4fd5..b69695e39957 100644 --- a/arch/s390/include/asm/spinlock_types.h +++ b/arch/s390/include/asm/spinlock_types.h @@ -2,13 +2,13 @@ #ifndef __ASM_SPINLOCK_TYPES_H #define 
__ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif typedef struct { int lock; -} __attribute__ ((aligned (4))) arch_spinlock_t; +} arch_spinlock_t; #define __ARCH_SPIN_LOCK_UNLOCKED { .lock = 0, } diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h index ee056f4a4fa3..31ec4f545e03 100644 --- a/arch/s390/include/asm/stacktrace.h +++ b/arch/s390/include/asm/stacktrace.h @@ -6,12 +6,20 @@ #include <linux/ptrace.h> #include <asm/switch_to.h> +struct stack_frame_user { + unsigned long back_chain; + unsigned long empty1[5]; + unsigned long gprs[10]; + unsigned long empty2[4]; +}; + enum stack_type { STACK_TYPE_UNKNOWN, STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_NODAT, STACK_TYPE_RESTART, + STACK_TYPE_MCCK, }; struct stack_info { @@ -33,37 +41,27 @@ static inline bool on_stack(struct stack_info *info, return addr >= info->begin && addr + len <= info->end; } -static __always_inline unsigned long get_stack_pointer(struct task_struct *task, - struct pt_regs *regs) -{ - if (regs) - return (unsigned long) kernel_stack_pointer(regs); - if (task == current) - return current_stack_pointer(); - return (unsigned long) task->thread.ksp; -} - /* * Stack layout of a C stack frame. + * Kernel uses the packed stack layout (-mpacked-stack). */ -#ifndef __PACK_STACK struct stack_frame { - unsigned long back_chain; - unsigned long empty1[5]; - unsigned long gprs[10]; - unsigned int empty2[8]; -}; -#else -struct stack_frame { - unsigned long empty1[5]; - unsigned int empty2[8]; + union { + unsigned long empty[9]; + struct { + unsigned long sie_control_block; + unsigned long sie_savearea; + unsigned long sie_reason; + unsigned long sie_flags; + unsigned long sie_control_block_phys; + }; + }; unsigned long gprs[10]; unsigned long back_chain; }; -#endif /* - * Unlike current_stack_pointer() which simply returns current value of %r15 + * Unlike current_stack_pointer, which simply contains the current value of %r15, * current_frame_address() returns function stack frame address, which matches * %r15 upon function invocation.
It may differ from %r15 later if function * allocates stack for local variables or new stack frame to call other @@ -73,29 +71,26 @@ struct stack_frame { ((unsigned long)__builtin_frame_address(0) - \ offsetof(struct stack_frame, back_chain)) -#define CALL_ARGS_0() \ - register unsigned long r2 asm("2") -#define CALL_ARGS_1(arg1) \ - register unsigned long r2 asm("2") = (unsigned long)(arg1) -#define CALL_ARGS_2(arg1, arg2) \ - CALL_ARGS_1(arg1); \ - register unsigned long r3 asm("3") = (unsigned long)(arg2) -#define CALL_ARGS_3(arg1, arg2, arg3) \ - CALL_ARGS_2(arg1, arg2); \ - register unsigned long r4 asm("4") = (unsigned long)(arg3) -#define CALL_ARGS_4(arg1, arg2, arg3, arg4) \ - CALL_ARGS_3(arg1, arg2, arg3); \ - register unsigned long r4 asm("5") = (unsigned long)(arg4) -#define CALL_ARGS_5(arg1, arg2, arg3, arg4, arg5) \ - CALL_ARGS_4(arg1, arg2, arg3, arg4); \ - register unsigned long r4 asm("6") = (unsigned long)(arg5) - -#define CALL_FMT_0 "=&d" (r2) : -#define CALL_FMT_1 "+&d" (r2) : -#define CALL_FMT_2 CALL_FMT_1 "d" (r3), -#define CALL_FMT_3 CALL_FMT_2 "d" (r4), -#define CALL_FMT_4 CALL_FMT_3 "d" (r5), -#define CALL_FMT_5 CALL_FMT_4 "d" (r6), +static __always_inline unsigned long get_stack_pointer(struct task_struct *task, + struct pt_regs *regs) +{ + if (regs) + return (unsigned long)kernel_stack_pointer(regs); + if (task == current) + return current_frame_address(); + return (unsigned long)task->thread.ksp; +} + +/* + * To keep this simple mark register 2-6 as being changed (volatile) + * by the called function, even though register 6 is saved/nonvolatile. + */ +#define CALL_FMT_0 "=&d" (r2) +#define CALL_FMT_1 "+&d" (r2) +#define CALL_FMT_2 CALL_FMT_1, "+&d" (r3) +#define CALL_FMT_3 CALL_FMT_2, "+&d" (r4) +#define CALL_FMT_4 CALL_FMT_3, "+&d" (r5) +#define CALL_FMT_5 CALL_FMT_4, "+&d" (r6) #define CALL_CLOBBER_5 "0", "1", "14", "cc", "memory" #define CALL_CLOBBER_4 CALL_CLOBBER_5 @@ -104,35 +99,150 @@ struct stack_frame { #define CALL_CLOBBER_1 CALL_CLOBBER_2, "3" #define CALL_CLOBBER_0 CALL_CLOBBER_1 -#define CALL_ON_STACK(fn, stack, nr, args...) \ +#define CALL_LARGS_0(...) \ + long dummy = 0 +#define CALL_LARGS_1(t1, a1) \ + long arg1 = (long)(t1)(a1) +#define CALL_LARGS_2(t1, a1, t2, a2) \ + CALL_LARGS_1(t1, a1); \ + long arg2 = (long)(t2)(a2) +#define CALL_LARGS_3(t1, a1, t2, a2, t3, a3) \ + CALL_LARGS_2(t1, a1, t2, a2); \ + long arg3 = (long)(t3)(a3) +#define CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4) \ + CALL_LARGS_3(t1, a1, t2, a2, t3, a3); \ + long arg4 = (long)(t4)(a4) +#define CALL_LARGS_5(t1, a1, t2, a2, t3, a3, t4, a4, t5, a5) \ + CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4); \ + long arg5 = (long)(t5)(a5) + +#define CALL_REGS_0 \ + register long r2 asm("2") = dummy +#define CALL_REGS_1 \ + register long r2 asm("2") = arg1 +#define CALL_REGS_2 \ + CALL_REGS_1; \ + register long r3 asm("3") = arg2 +#define CALL_REGS_3 \ + CALL_REGS_2; \ + register long r4 asm("4") = arg3 +#define CALL_REGS_4 \ + CALL_REGS_3; \ + register long r5 asm("5") = arg4 +#define CALL_REGS_5 \ + CALL_REGS_4; \ + register long r6 asm("6") = arg5 + +#define CALL_TYPECHECK_0(...) +#define CALL_TYPECHECK_1(t, a, ...) \ + typecheck(t, a) +#define CALL_TYPECHECK_2(t, a, ...) \ + CALL_TYPECHECK_1(__VA_ARGS__); \ + typecheck(t, a) +#define CALL_TYPECHECK_3(t, a, ...) \ + CALL_TYPECHECK_2(__VA_ARGS__); \ + typecheck(t, a) +#define CALL_TYPECHECK_4(t, a, ...) \ + CALL_TYPECHECK_3(__VA_ARGS__); \ + typecheck(t, a) +#define CALL_TYPECHECK_5(t, a, ...) 
\ + CALL_TYPECHECK_4(__VA_ARGS__); \ + typecheck(t, a) + +#define CALL_PARM_0(...) void +#define CALL_PARM_1(t, a, ...) t +#define CALL_PARM_2(t, a, ...) t, CALL_PARM_1(__VA_ARGS__) +#define CALL_PARM_3(t, a, ...) t, CALL_PARM_2(__VA_ARGS__) +#define CALL_PARM_4(t, a, ...) t, CALL_PARM_3(__VA_ARGS__) +#define CALL_PARM_5(t, a, ...) t, CALL_PARM_4(__VA_ARGS__) +#define CALL_PARM_6(t, a, ...) t, CALL_PARM_5(__VA_ARGS__) + +/* + * Use call_on_stack() to call a function switching to a specified + * stack. Proper sign and zero extension of function arguments is + * done. Usage: + * + * rc = call_on_stack(nr, stack, rettype, fn, t1, a1, t2, a2, ...) + * + * - nr specifies the number of function arguments of fn. + * - stack specifies the stack to be used. + * - fn is the function to be called. + * - rettype is the return type of fn. + * - t1, a1, ... are pairs, where t1 must match the type of the first + * argument of fn, t2 the second, etc. a1 is the corresponding + * first function argument (not name), etc. + */ +#define call_on_stack(nr, stack, rettype, fn, ...) \ ({ \ + rettype (*__fn)(CALL_PARM_##nr(__VA_ARGS__)) = fn; \ unsigned long frame = current_frame_address(); \ - CALL_ARGS_##nr(args); \ + unsigned long __stack = stack; \ unsigned long prev; \ + CALL_LARGS_##nr(__VA_ARGS__); \ + CALL_REGS_##nr; \ \ + CALL_TYPECHECK_##nr(__VA_ARGS__); \ asm volatile( \ - " la %[_prev],0(15)\n" \ + " lgr %[_prev],15\n" \ " lg 15,%[_stack]\n" \ " stg %[_frame],%[_bc](15)\n" \ " brasl 14,%[_fn]\n" \ - " la 15,0(%[_prev])\n" \ - : [_prev] "=&a" (prev), CALL_FMT_##nr \ - [_stack] "R" (stack), \ + " lgr 15,%[_prev]\n" \ + : [_prev] "=&d" (prev), CALL_FMT_##nr \ + : [_stack] "R" (__stack), \ [_bc] "i" (offsetof(struct stack_frame, back_chain)), \ [_frame] "d" (frame), \ - [_fn] "X" (fn) : CALL_CLOBBER_##nr); \ - r2; \ + [_fn] "X" (__fn) : CALL_CLOBBER_##nr); \ + (rettype)r2; \ }) -#define CALL_ON_STACK_NORETURN(fn, stack) \ +/* + * Use call_nodat() to call a function with DAT disabled. + * Proper sign and zero extension of function arguments is done. + * Usage: + * + * rc = call_nodat(nr, rettype, fn, t1, a1, t2, a2, ...) + * + * - nr specifies the number of function arguments of fn. + * - fn is the function to be called, where fn is a physical address. + * - rettype is the return type of fn. + * - t1, a1, ... are pairs, where t1 must match the type of the first + * argument of fn, t2 the second, etc. a1 is the corresponding + * first function argument (not name), etc. + * + * fn() is called with standard C function call ABI, with the exception + * that no useful stackframe or stackpointer is passed via register 15. + * Therefore the called function must not use r15 to access the stack. + */ +#define call_nodat(nr, rettype, fn, ...) 
\ ({ \ + rettype (*__fn)(CALL_PARM_##nr(__VA_ARGS__)) = (fn); \ + /* aligned since psw_leave must not cross page boundary */ \ + psw_t __aligned(16) psw_leave; \ + psw_t psw_enter; \ + CALL_LARGS_##nr(__VA_ARGS__); \ + CALL_REGS_##nr; \ + \ + CALL_TYPECHECK_##nr(__VA_ARGS__); \ + psw_enter.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; \ + psw_enter.addr = (unsigned long)__fn; \ asm volatile( \ - " la 15,0(%[_stack])\n" \ - " xc %[_bc](8,15),%[_bc](15)\n" \ - " brasl 14,%[_fn]\n" \ - ::[_bc] "i" (offsetof(struct stack_frame, back_chain)), \ - [_stack] "a" (stack), [_fn] "X" (fn)); \ - BUG(); \ + " epsw 0,1\n" \ + " risbg 1,0,0,31,32\n" \ + " larl 7,1f\n" \ + " stg 1,%[psw_leave]\n" \ + " stg 7,8+%[psw_leave]\n" \ + " la 7,%[psw_leave]\n" \ + " lra 7,0(7)\n" \ + " larl 1,0f\n" \ + " lra 14,0(1)\n" \ + " lpswe %[psw_enter]\n" \ + "0: lpswe 0(7)\n" \ + "1:\n" \ + : CALL_FMT_##nr, [psw_leave] "=Q" (psw_leave) \ + : [psw_enter] "Q" (psw_enter) \ + : "7", CALL_CLOBBER_##nr); \ + (rettype)r2; \ }) #endif /* _ASM_S390_STACKTRACE_H */ diff --git a/arch/s390/include/asm/stp.h b/arch/s390/include/asm/stp.h index f0ddefb06ec8..4d74d7e33340 100644 --- a/arch/s390/include/asm/stp.h +++ b/arch/s390/include/asm/stp.h @@ -6,43 +6,89 @@ #ifndef __S390_STP_H #define __S390_STP_H +#include <linux/compiler.h> + /* notifier for syncs */ extern struct atomic_notifier_head s390_epoch_delta_notifier; /* STP interruption parameter */ struct stp_irq_parm { - unsigned int _pad0 : 14; - unsigned int tsc : 1; /* Timing status change */ - unsigned int lac : 1; /* Link availability change */ - unsigned int tcpc : 1; /* Time control parameter change */ - unsigned int _pad2 : 15; -} __attribute__ ((packed)); + u32 : 14; + u32 tsc : 1; /* Timing status change */ + u32 lac : 1; /* Link availability change */ + u32 tcpc : 1; /* Time control parameter change */ + u32 : 15; +} __packed; #define STP_OP_SYNC 1 #define STP_OP_CTRL 3 struct stp_sstpi { - unsigned int rsvd0; - unsigned int rsvd1 : 8; - unsigned int stratum : 8; - unsigned int vbits : 16; - unsigned int leaps : 16; - unsigned int tmd : 4; - unsigned int ctn : 4; - unsigned int rsvd2 : 3; - unsigned int c : 1; - unsigned int tst : 4; - unsigned int tzo : 16; - unsigned int dsto : 16; - unsigned int ctrl : 16; - unsigned int rsvd3 : 16; - unsigned int tto; - unsigned int rsvd4; - unsigned int ctnid[3]; - unsigned int rsvd5; - unsigned int todoff[4]; - unsigned int rsvd6[48]; -} __attribute__ ((packed)); + u32 : 32; + u32 tu : 1; + u32 lu : 1; + u32 : 6; + u32 stratum : 8; + u32 vbits : 16; + u32 leaps : 16; + u32 tmd : 4; + u32 ctn : 4; + u32 : 3; + u32 c : 1; + u32 tst : 4; + u32 tzo : 16; + u32 dsto : 16; + u32 ctrl : 16; + u32 : 16; + u32 tto; + u32 : 32; + u32 ctnid[3]; + u32 : 32; + u64 todoff; + u32 rsvd[50]; +} __packed; + +struct stp_tzib { + u32 tzan : 16; + u32 : 16; + u32 tzo : 16; + u32 dsto : 16; + u32 stn; + u32 dstn; + u64 dst_on_alg; + u64 dst_off_alg; +} __packed; + +struct stp_tcpib { + u32 atcode : 4; + u32 ntcode : 4; + u32 d : 1; + u32 : 23; + s32 tto; + struct stp_tzib atzib; + struct stp_tzib ntzib; + s32 adst_offset : 16; + s32 ndst_offset : 16; + u32 rsvd1; + u64 ntzib_update; + u64 ndsto_update; +} __packed; + +struct stp_lsoib { + u32 p : 1; + u32 : 31; + s32 also : 16; + s32 nlso : 16; + u64 nlsout; +} __packed; + +struct stp_stzi { + u32 rsvd0[3]; + u64 data_ts; + u32 rsvd1[22]; + struct stp_tcpib tcpib; + struct stp_lsoib lsoib; +} __packed; /* Functions needed by the machine check handler */ int stp_sync_check(void); diff --git 
a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index 4c0690fc5167..351685de53d2 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -31,22 +31,18 @@ void *memmove(void *dest, const void *src, size_t n); #define __HAVE_ARCH_STRCMP /* arch function */ #define __HAVE_ARCH_STRCPY /* inline & arch function */ #define __HAVE_ARCH_STRLCAT /* arch function */ -#define __HAVE_ARCH_STRLCPY /* arch function */ #define __HAVE_ARCH_STRLEN /* inline & arch function */ #define __HAVE_ARCH_STRNCAT /* arch function */ #define __HAVE_ARCH_STRNCPY /* arch function */ #define __HAVE_ARCH_STRNLEN /* inline & arch function */ -#define __HAVE_ARCH_STRRCHR /* arch function */ #define __HAVE_ARCH_STRSTR /* arch function */ /* Prototypes for non-inlined arch strings functions. */ int memcmp(const void *s1, const void *s2, size_t n); int strcmp(const char *s1, const char *s2); size_t strlcat(char *dest, const char *src, size_t n); -size_t strlcpy(char *dest, const char *src, size_t size); char *strncat(char *dest, const char *src, size_t n); char *strncpy(char *dest, const char *src, size_t n); -char *strrchr(const char *s, int c); char *strstr(const char *s1, const char *s2); #endif /* !CONFIG_KASAN */ @@ -59,18 +55,6 @@ char *strstr(const char *s1, const char *s2); #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) -extern void *__memcpy(void *dest, const void *src, size_t n); -extern void *__memset(void *s, int c, size_t n); -extern void *__memmove(void *dest, const void *src, size_t n); - -/* - * For files that are not instrumented (e.g. mm/slub.c) we - * should use not instrumented version of mem* functions. - */ - -#define memcpy(dst, src, len) __memcpy(dst, src, len) -#define memmove(dst, src, len) __memmove(dst, src, len) -#define memset(s, c, n) __memset(s, c, n) #define strlen(s) __strlen(s) #define __no_sanitize_prefix_strfunc(x) __##x @@ -83,6 +67,9 @@ extern void *__memmove(void *dest, const void *src, size_t n); #define __no_sanitize_prefix_strfunc(x) x #endif /* defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) */ +void *__memcpy(void *dest, const void *src, size_t n); +void *__memset(void *s, int c, size_t n); +void *__memmove(void *dest, const void *src, size_t n); void *__memset16(uint16_t *s, uint16_t v, size_t count); void *__memset32(uint32_t *s, uint32_t v, size_t count); void *__memset64(uint64_t *s, uint64_t v, size_t count); @@ -107,16 +94,18 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t count) #ifdef __HAVE_ARCH_MEMCHR static inline void *memchr(const void * s, int c, size_t n) { - register int r0 asm("0") = (char) c; const void *ret = s + n; asm volatile( - "0: srst %0,%1\n" + " lgr 0,%[c]\n" + "0: srst %[ret],%[s]\n" " jo 0b\n" " jl 1f\n" - " la %0,0\n" + " la %[ret],0\n" "1:" - : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory"); + : [ret] "+&a" (ret), [s] "+&a" (s) + : [c] "d" (c) + : "cc", "memory", "0"); return (void *) ret; } #endif @@ -124,13 +113,15 @@ static inline void *memchr(const void * s, int c, size_t n) #ifdef __HAVE_ARCH_MEMSCAN static inline void *memscan(void *s, int c, size_t n) { - register int r0 asm("0") = (char) c; const void *ret = s + n; asm volatile( - "0: srst %0,%1\n" + " lgr 0,%[c]\n" + "0: srst %[ret],%[s]\n" " jo 0b\n" - : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory"); + : [ret] "+&a" (ret), [s] "+&a" (s) + : [c] "d" (c) + : "cc", "memory", "0"); return (void *) ret; } #endif @@ -138,17 +129,18 @@ static inline void *memscan(void *s, int c, size_t n) #ifdef 
__HAVE_ARCH_STRCAT static inline char *strcat(char *dst, const char *src) { - register int r0 asm("0") = 0; - unsigned long dummy; + unsigned long dummy = 0; char *ret = dst; asm volatile( - "0: srst %0,%1\n" + " lghi 0,0\n" + "0: srst %[dummy],%[dst]\n" " jo 0b\n" - "1: mvst %0,%2\n" + "1: mvst %[dummy],%[src]\n" " jo 1b" - : "=&a" (dummy), "+a" (dst), "+a" (src) - : "d" (r0), "0" (0) : "cc", "memory" ); + : [dummy] "+&a" (dummy), [dst] "+&a" (dst), [src] "+&a" (src) + : + : "cc", "memory", "0"); return ret; } #endif @@ -156,14 +148,15 @@ static inline char *strcat(char *dst, const char *src) #ifdef __HAVE_ARCH_STRCPY static inline char *strcpy(char *dst, const char *src) { - register int r0 asm("0") = 0; char *ret = dst; asm volatile( - "0: mvst %0,%1\n" + " lghi 0,0\n" + "0: mvst %[dst],%[src]\n" " jo 0b" - : "+&a" (dst), "+&a" (src) : "d" (r0) - : "cc", "memory"); + : [dst] "+&a" (dst), [src] "+&a" (src) + : + : "cc", "memory", "0"); return ret; } #endif @@ -171,28 +164,33 @@ static inline char *strcpy(char *dst, const char *src) #if defined(__HAVE_ARCH_STRLEN) || (defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)) static inline size_t __no_sanitize_prefix_strfunc(strlen)(const char *s) { - register unsigned long r0 asm("0") = 0; + unsigned long end = 0; const char *tmp = s; asm volatile( - "0: srst %0,%1\n" + " lghi 0,0\n" + "0: srst %[end],%[tmp]\n" " jo 0b" - : "+d" (r0), "+a" (tmp) : : "cc", "memory"); - return r0 - (unsigned long) s; + : [end] "+&a" (end), [tmp] "+&a" (tmp) + : + : "cc", "memory", "0"); + return end - (unsigned long)s; } #endif #ifdef __HAVE_ARCH_STRNLEN static inline size_t strnlen(const char * s, size_t n) { - register int r0 asm("0") = 0; const char *tmp = s; const char *end = s + n; asm volatile( - "0: srst %0,%1\n" + " lghi 0,0\n" + "0: srst %[end],%[tmp]\n" " jo 0b" - : "+a" (end), "+a" (tmp) : "d" (r0) : "cc", "memory"); + : [end] "+&a" (end), [tmp] "+&a" (tmp) + : + : "cc", "memory", "0"); return end - s; } #endif diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index f073292e9fdb..27e3d804b311 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -14,8 +14,8 @@ #include <linux/err.h> #include <asm/ptrace.h> -extern const unsigned long sys_call_table[]; -extern const unsigned long sys_call_table_emu[]; +extern const sys_call_ptr_t sys_call_table[]; +extern const sys_call_ptr_t sys_call_table_emu[]; static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) @@ -33,7 +33,17 @@ static inline void syscall_rollback(struct task_struct *task, static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { - return IS_ERR_VALUE(regs->gprs[2]) ? regs->gprs[2] : 0; + unsigned long error = regs->gprs[2]; +#ifdef CONFIG_COMPAT + if (test_tsk_thread_flag(task, TIF_31BIT)) { + /* + * Sign-extend the value so (int)-EFOO becomes (long)-EFOO + * and will match correctly in comparisons. + */ + error = (long)(int)error; + } +#endif + return IS_ERR_VALUE(error) ? error : 0; } static inline long syscall_get_return_value(struct task_struct *task, @@ -46,6 +56,7 @@ static inline void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val) { + set_pt_regs_flag(regs, PIF_SYSCALL_RET_SET); regs->gprs[2] = error ? 
error : val; } @@ -67,18 +78,6 @@ static inline void syscall_get_arguments(struct task_struct *task, args[0] = regs->orig_gpr2 & mask; } -static inline void syscall_set_arguments(struct task_struct *task, - struct pt_regs *regs, - const unsigned long *args) -{ - unsigned int n = 6; - - while (n-- > 0) - if (n > 0) - regs->gprs[2 + n] = args[n]; - regs->orig_gpr2 = args[0]; -} - static inline int syscall_get_arch(struct task_struct *task) { #ifdef CONFIG_COMPAT @@ -87,4 +86,69 @@ static inline int syscall_get_arch(struct task_struct *task) #endif return AUDIT_ARCH_S390X; } + +static inline bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) +{ + return false; +} + +#define SYSCALL_FMT_0 +#define SYSCALL_FMT_1 , "0" (r2) +#define SYSCALL_FMT_2 , "d" (r3) SYSCALL_FMT_1 +#define SYSCALL_FMT_3 , "d" (r4) SYSCALL_FMT_2 +#define SYSCALL_FMT_4 , "d" (r5) SYSCALL_FMT_3 +#define SYSCALL_FMT_5 , "d" (r6) SYSCALL_FMT_4 +#define SYSCALL_FMT_6 , "d" (r7) SYSCALL_FMT_5 + +#define SYSCALL_PARM_0 +#define SYSCALL_PARM_1 , long arg1 +#define SYSCALL_PARM_2 SYSCALL_PARM_1, long arg2 +#define SYSCALL_PARM_3 SYSCALL_PARM_2, long arg3 +#define SYSCALL_PARM_4 SYSCALL_PARM_3, long arg4 +#define SYSCALL_PARM_5 SYSCALL_PARM_4, long arg5 +#define SYSCALL_PARM_6 SYSCALL_PARM_5, long arg6 + +#define SYSCALL_REGS_0 +#define SYSCALL_REGS_1 \ + register long r2 asm("2") = arg1 +#define SYSCALL_REGS_2 \ + SYSCALL_REGS_1; \ + register long r3 asm("3") = arg2 +#define SYSCALL_REGS_3 \ + SYSCALL_REGS_2; \ + register long r4 asm("4") = arg3 +#define SYSCALL_REGS_4 \ + SYSCALL_REGS_3; \ + register long r5 asm("5") = arg4 +#define SYSCALL_REGS_5 \ + SYSCALL_REGS_4; \ + register long r6 asm("6") = arg5 +#define SYSCALL_REGS_6 \ + SYSCALL_REGS_5; \ + register long r7 asm("7") = arg6 + +#define GENERATE_SYSCALL_FUNC(nr) \ +static __always_inline \ +long syscall##nr(unsigned long syscall SYSCALL_PARM_##nr) \ +{ \ + register unsigned long r1 asm ("1") = syscall; \ + register long rc asm ("2"); \ + SYSCALL_REGS_##nr; \ + \ + asm volatile ( \ + " svc 0\n" \ + : "=d" (rc) \ + : "d" (r1) SYSCALL_FMT_##nr \ + : "memory"); \ + return rc; \ +} + +GENERATE_SYSCALL_FUNC(0) +GENERATE_SYSCALL_FUNC(1) +GENERATE_SYSCALL_FUNC(2) +GENERATE_SYSCALL_FUNC(3) +GENERATE_SYSCALL_FUNC(4) +GENERATE_SYSCALL_FUNC(5) +GENERATE_SYSCALL_FUNC(6) + #endif /* _ASM_SYSCALL_H */ diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h index 3c3d6fe8e2f0..35c1d1b860d8 100644 --- a/arch/s390/include/asm/syscall_wrapper.h +++ b/arch/s390/include/asm/syscall_wrapper.h @@ -7,9 +7,13 @@ #ifndef _ASM_S390_SYSCALL_WRAPPER_H #define _ASM_S390_SYSCALL_WRAPPER_H +/* Mapping of registers to parameters for syscalls */ +#define SC_S390_REGS_TO_ARGS(x, ...) \ + __MAP(x, __SC_ARGS \ + ,, regs->orig_gpr2,, regs->gprs[3],, regs->gprs[4] \ + ,, regs->gprs[5],, regs->gprs[6],, regs->gprs[7]) + #ifdef CONFIG_COMPAT -#define __SC_COMPAT_TYPE(t, a) \ - __typeof(__builtin_choose_expr(sizeof(t) > 4, 0L, (t)0)) a #define __SC_COMPAT_CAST(t, a) \ ({ \ @@ -29,107 +33,108 @@ (t)__ReS; \ }) -#define __S390_SYS_STUBx(x, name, ...) 
\ - asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ - ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ - asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ - { \ - long ret = __s390x_sys##name(__MAP(x,__SC_COMPAT_CAST,__VA_ARGS__));\ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ - } - /* * To keep the naming coherent, re-define SYSCALL_DEFINE0 to create an alias * named __s390x_sys_*() */ #define COMPAT_SYSCALL_DEFINE0(sname) \ - SYSCALL_METADATA(_##sname, 0); \ - asmlinkage long __s390_compat_sys_##sname(void); \ - ALLOW_ERROR_INJECTION(__s390_compat__sys_##sname, ERRNO); \ - asmlinkage long __s390_compat_sys_##sname(void) + long __s390_compat_sys_##sname(void); \ + ALLOW_ERROR_INJECTION(__s390_compat_sys_##sname, ERRNO); \ + long __s390_compat_sys_##sname(void) #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ - asmlinkage long __s390x_sys_##sname(void); \ + long __s390_sys_##sname(void); \ + ALLOW_ERROR_INJECTION(__s390_sys_##sname, ERRNO); \ + long __s390x_sys_##sname(void); \ ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \ - asmlinkage long __s390_sys_##sname(void) \ - __attribute__((alias(__stringify(__s390x_sys_##sname)))); \ - asmlinkage long __s390x_sys_##sname(void) + static inline long __do_sys_##sname(void); \ + long __s390_sys_##sname(void) \ + { \ + return __do_sys_##sname(); \ + } \ + long __s390x_sys_##sname(void) \ + { \ + return __do_sys_##sname(); \ + } \ + static inline long __do_sys_##sname(void) #define COND_SYSCALL(name) \ cond_syscall(__s390x_sys_##name); \ cond_syscall(__s390_sys_##name) -#define SYS_NI(name) \ - SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers); \ - SYSCALL_ALIAS(__s390_sys_##name, sys_ni_posix_timers) - -#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ - __diag_push(); \ - __diag_ignore(GCC, 8, "-Wattribute-alias", \ - "Type aliasing is used to sanitize syscall arguments");\ - asmlinkage long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ - asmlinkage long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ - __attribute__((alias(__stringify(__se_compat_sys##name)))); \ - ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \ - static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ - asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ - asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ - { \ - long ret = __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ - } \ - __diag_pop(); \ - static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) +#define COMPAT_SYSCALL_DEFINEx(x, name, ...) 
\ + long __s390_compat_sys##name(struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO); \ + static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \ + long __s390_compat_sys##name(struct pt_regs *regs) \ + { \ + return __se_compat_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ + } \ + static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ + { \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_compat_sys##name(__MAP(x, __SC_DELOUSE, __VA_ARGS__)); \ + } \ + static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)) /* * As some compat syscalls may not be implemented, we need to expand - * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in - * kernel/time/posix-stubs.c to cover this case as well. + * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well. */ #define COND_SYSCALL_COMPAT(name) \ cond_syscall(__s390_compat_sys_##name) -#define COMPAT_SYS_NI(name) \ - SYSCALL_ALIAS(__s390_compat_sys_##name, sys_ni_posix_timers) +#define __S390_SYS_STUBx(x, name, ...) \ + long __s390_sys##name(struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ + static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + long __s390_sys##name(struct pt_regs *regs) \ + { \ + return ___se_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ + } \ + static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ + { \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_sys##name(__MAP(x, __SC_COMPAT_CAST, __VA_ARGS__)); \ + } #else /* CONFIG_COMPAT */ -#define __S390_SYS_STUBx(x, fullname, name, ...) - #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ - asmlinkage long __s390x_sys_##sname(void); \ + long __s390x_sys_##sname(void); \ ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \ - asmlinkage long __s390x_sys_##sname(void) + static inline long __do_sys_##sname(void); \ + long __s390x_sys_##sname(void) \ + { \ + return __do_sys_##sname(); \ + } \ + static inline long __do_sys_##sname(void) #define COND_SYSCALL(name) \ cond_syscall(__s390x_sys_##name) -#define SYS_NI(name) \ - SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers); +#define __S390_SYS_STUBx(x, fullname, name, ...) #endif /* CONFIG_COMPAT */ #define __SYSCALL_DEFINEx(x, name, ...) 
\ - __diag_push(); \ - __diag_ignore(GCC, 8, "-Wattribute-alias", \ - "Type aliasing is used to sanitize syscall arguments");\ - asmlinkage long __s390x_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ - __attribute__((alias(__stringify(__se_sys##name)))); \ + long __s390x_sys##name(struct pt_regs *regs); \ ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ - long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ - static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ - __S390_SYS_STUBx(x, name, __VA_ARGS__) \ - asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + static inline long __se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + static inline long __do_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \ + __S390_SYS_STUBx(x, name, __VA_ARGS__); \ + long __s390x_sys##name(struct pt_regs *regs) \ + { \ + return __se_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ + } \ + static inline long __se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ { \ - long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_sys##name(__MAP(x, __SC_CAST, __VA_ARGS__)); \ } \ - __diag_pop(); \ - static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + static inline long __do_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)) -#endif /* _ASM_X86_SYSCALL_WRAPPER_H */ +#endif /* _ASM_S390_SYSCALL_WRAPPER_H */ diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h index fe7b3f8f0791..edca5a751df4 100644 --- a/arch/s390/include/asm/sysinfo.h +++ b/arch/s390/include/asm/sysinfo.h @@ -40,6 +40,10 @@ struct sysinfo_1_1_1 { unsigned int ncr; unsigned int npr; unsigned int ntr; + char reserved_3[4]; + char model_var_cap[16]; + unsigned int model_var_cap_rating; + unsigned int nvr; }; struct sysinfo_1_2_1 { @@ -67,12 +71,12 @@ struct sysinfo_1_2_2 { unsigned short cpus_configured; unsigned short cpus_standby; unsigned short cpus_reserved; - unsigned short adjustment[0]; + unsigned short adjustment[]; }; struct sysinfo_1_2_2_extension { unsigned int alt_capability; - unsigned short alt_adjustment[0]; + unsigned short alt_adjustment[]; }; struct sysinfo_2_2_1 { @@ -181,7 +185,7 @@ struct sysinfo_15_1_x { unsigned char reserved1; unsigned char mnest; unsigned char reserved2[4]; - union topology_entry tle[0]; + union topology_entry tle[]; }; int stsi(void *sysinfo, int fc, int sel1, int sel2); diff --git a/arch/s390/include/asm/termios.h b/arch/s390/include/asm/termios.h deleted file mode 100644 index 46fa3020b41e..000000000000 --- a/arch/s390/include/asm/termios.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * S390 version - * - * Derived from "include/asm-i386/termios.h" - */ -#ifndef _S390_TERMIOS_H -#define _S390_TERMIOS_H - -#include <uapi/asm/termios.h> - - -/* intr=^C quit=^\ erase=del kill=^U - eof=^D vtime=\0 vmin=\1 sxtc=\0 - start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V - eol2=\0 -*/ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" - -#define user_termios_to_kernel_termios(k, u) copy_from_user(k, u, sizeof(struct termios2)) -#define kernel_termios_to_user_termios(u, k) copy_to_user(u, k, sizeof(struct termios2)) - -#include <asm-generic/termios-base.h> - -#endif /* _S390_TERMIOS_H */ diff --git a/arch/s390/include/asm/text-patching.h b/arch/s390/include/asm/text-patching.h new file mode 100644 index 000000000000..b219056a8817 --- /dev/null +++ 
b/arch/s390/include/asm/text-patching.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_TEXT_PATCHING_H +#define _ASM_S390_TEXT_PATCHING_H + +#include <asm/barrier.h> + +static __always_inline void sync_core(void) +{ + bcr_serialize(); +} + +void text_poke_sync(void); +void text_poke_sync_lock(void); + +#endif /* _ASM_S390_TEXT_PATCHING_H */ diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index e582fbe59e20..a674c7d25da5 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -9,6 +9,9 @@ #define _ASM_THREAD_INFO_H #include <linux/bits.h> +#ifndef ASM_OFFSETS_C +#include <asm/asm-offsets.h> +#endif /* * General size of kernel stacks @@ -18,16 +21,14 @@ #else #define THREAD_SIZE_ORDER 2 #endif -#define BOOT_STACK_ORDER 2 +#define BOOT_STACK_SIZE (PAGE_SIZE << 2) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) +#define STACK_INIT_OFFSET (THREAD_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE) + #ifndef __ASSEMBLY__ #include <asm/lowcore.h> #include <asm/page.h> -#include <asm/processor.h> - -#define STACK_INIT_OFFSET \ - (THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs)) /* * low level task data that entry.S needs immediate access to @@ -37,6 +38,8 @@ */ struct thread_info { unsigned long flags; /* low level flags */ + unsigned long syscall_work; /* SYSCALL_WORK_ flags */ + unsigned int cpu; /* current CPU */ }; /* @@ -47,8 +50,7 @@ struct thread_info { .flags = 0, \ } -void arch_release_task_struct(struct task_struct *tsk); -int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); +struct task_struct; void arch_setup_new_exec(void); #define arch_setup_new_exec arch_setup_new_exec @@ -66,8 +68,9 @@ void arch_setup_new_exec(void); #define TIF_GUARDED_STORAGE 4 /* load guarded storage control block */ #define TIF_PATCH_PENDING 5 /* pending live patching update */ #define TIF_PGSTE 6 /* New mm's will use 4K page tables */ -#define TIF_ISOLATE_BP 8 /* Run process with isolated BP */ +#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ #define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ +#define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */ #define TIF_31BIT 16 /* 32bit process */ #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ @@ -83,13 +86,14 @@ void arch_setup_new_exec(void); #define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */ #define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) +#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) #define _TIF_SIGPENDING BIT(TIF_SIGPENDING) #define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) #define _TIF_UPROBE BIT(TIF_UPROBE) #define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) -#define _TIF_ISOLATE_BP BIT(TIF_ISOLATE_BP) #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) +#define _TIF_PER_TRAP BIT(TIF_PER_TRAP) #define _TIF_31BIT BIT(TIF_31BIT) #define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP) diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 6da8885251d6..4d646659a5f5 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -19,6 +19,25 @@ extern u64 clock_comparator_max; +union tod_clock { + __uint128_t val; + struct { + __uint128_t ei : 8; /* epoch index */ + __uint128_t tod : 64; /* bits 0-63 of tod clock */ + __uint128_t : 40; + __uint128_t pf : 16; /* programmable field */ + }; + struct { + __uint128_t eitod : 72; /* epoch index + bits 0-63 
tod clock */ + __uint128_t : 56; + }; + struct { + __uint128_t us : 60; /* micro-seconds */ + __uint128_t sus : 12; /* sub-microseconds */ + __uint128_t : 56; + }; +} __packed; + /* Inline functions for clock register access. */ static inline int set_tod_clock(__u64 time) { @@ -32,26 +51,36 @@ static inline int set_tod_clock(__u64 time) return cc; } -static inline int store_tod_clock(__u64 *time) +static inline int store_tod_clock_ext_cc(union tod_clock *clk) { int cc; asm volatile( - " stck %1\n" + " stcke %1\n" " ipm %0\n" " srl %0,28\n" - : "=d" (cc), "=Q" (*time) : : "cc"); + : "=d" (cc), "=Q" (*clk) : : "cc"); return cc; } +static __always_inline void store_tod_clock_ext(union tod_clock *tod) +{ + asm volatile("stcke %0" : "=Q" (*tod) : : "cc"); +} + static inline void set_clock_comparator(__u64 time) { asm volatile("sckc %0" : : "Q" (time)); } -static inline void store_clock_comparator(__u64 *time) +static inline void set_tod_programmable_field(u16 val) { - asm volatile("stckc %0" : "=Q" (*time)); + asm volatile( + " lgr 0,%[val]\n" + " sckpf\n" + : + : [val] "d" ((unsigned long)val) + : "0"); } void clock_comparator_work(void); @@ -72,10 +101,10 @@ extern unsigned char ptff_function_mask[16]; /* Query TOD offset result */ struct ptff_qto { - unsigned long long physical_clock; - unsigned long long tod_offset; - unsigned long long logical_tod_offset; - unsigned long long tod_epoch_difference; + unsigned long physical_clock; + unsigned long tod_offset; + unsigned long logical_tod_offset; + unsigned long tod_epoch_difference; } __packed; static inline int ptff_query(unsigned int nr) @@ -112,22 +141,25 @@ struct ptff_qui { #define ptff(ptff_block, len, func) \ ({ \ struct addrtype { char _[len]; }; \ - register unsigned int reg0 asm("0") = func; \ - register unsigned long reg1 asm("1") = (unsigned long) (ptff_block);\ + unsigned int reg0 = func; \ + unsigned long reg1 = (unsigned long)(ptff_block); \ int rc; \ \ asm volatile( \ - " .word 0x0104\n" \ - " ipm %0\n" \ - " srl %0,28\n" \ - : "=d" (rc), "+m" (*(struct addrtype *) reg1) \ - : "d" (reg0), "d" (reg1) : "cc"); \ + " lgr 0,%[reg0]\n" \ + " lgr 1,%[reg1]\n" \ + " ptff\n" \ + " ipm %[rc]\n" \ + " srl %[rc],28\n" \ + : [rc] "=&d" (rc), "+m" (*(struct addrtype *)reg1) \ + : [reg0] "d" (reg0), [reg1] "d" (reg1) \ + : "cc", "0", "1"); \ rc; \ }) -static inline unsigned long long local_tick_disable(void) +static inline unsigned long local_tick_disable(void) { - unsigned long long old; + unsigned long old; old = S390_lowcore.clock_comparator; S390_lowcore.clock_comparator = clock_comparator_max; @@ -135,53 +167,47 @@ static inline unsigned long long local_tick_disable(void) return old; } -static inline void local_tick_enable(unsigned long long comp) +static inline void local_tick_enable(unsigned long comp) { S390_lowcore.clock_comparator = comp; set_clock_comparator(S390_lowcore.clock_comparator); } #define CLOCK_TICK_RATE 1193180 /* Underlying HZ */ -#define STORE_CLOCK_EXT_SIZE 16 /* stcke writes 16 bytes */ - -typedef unsigned long long cycles_t; -static inline void get_tod_clock_ext(char *clk) -{ - typedef struct { char _[STORE_CLOCK_EXT_SIZE]; } addrtype; - - asm volatile("stcke %0" : "=Q" (*(addrtype *) clk) : : "cc"); -} +typedef unsigned long cycles_t; -static inline unsigned long long get_tod_clock(void) +static __always_inline unsigned long get_tod_clock(void) { - unsigned char clk[STORE_CLOCK_EXT_SIZE]; + union tod_clock clk; - get_tod_clock_ext(clk); - return *((unsigned long long *)&clk[1]); + store_tod_clock_ext(&clk); + 
return clk.tod; } -static inline unsigned long long get_tod_clock_fast(void) +static inline unsigned long get_tod_clock_fast(void) { -#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES - unsigned long long clk; + unsigned long clk; asm volatile("stckf %0" : "=Q" (clk) : : "cc"); return clk; -#else - return get_tod_clock(); -#endif } static inline cycles_t get_cycles(void) { return (cycles_t) get_tod_clock() >> 2; } +#define get_cycles get_cycles int get_phys_clock(unsigned long *clock); void init_cpu_timer(void); -extern unsigned char tod_clock_base[16] __aligned(8); +extern union tod_clock tod_clock_base; + +static __always_inline unsigned long __get_tod_clock_monotonic(void) +{ + return get_tod_clock() - tod_clock_base.tod; +} /** * get_clock_monotonic - returns current time in clock rate units @@ -190,13 +216,13 @@ extern unsigned char tod_clock_base[16] __aligned(8); * Therefore preemption must be disabled, otherwise the returned * value is not guaranteed to be monotonic. */ -static inline unsigned long long get_tod_clock_monotonic(void) +static inline unsigned long get_tod_clock_monotonic(void) { - unsigned long long tod; + unsigned long tod; - preempt_disable(); - tod = get_tod_clock() - *(unsigned long long *) &tod_clock_base[1]; - preempt_enable(); + preempt_disable_notrace(); + tod = __get_tod_clock_monotonic(); + preempt_enable_notrace(); return tod; } @@ -219,7 +245,7 @@ static inline unsigned long long get_tod_clock_monotonic(void) * -> ns = (th * 125) + ((tl * 125) >> 9); * */ -static inline unsigned long long tod_to_ns(unsigned long long todval) +static __always_inline unsigned long tod_to_ns(unsigned long todval) { return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9); } @@ -231,10 +257,10 @@ static inline unsigned long long tod_to_ns(unsigned long long todval) * * Returns: true if a is later than b */ -static inline int tod_after(unsigned long long a, unsigned long long b) +static inline int tod_after(unsigned long a, unsigned long b) { if (MACHINE_HAS_SCC) - return (long long) a > (long long) b; + return (long) a > (long) b; return a > b; } @@ -245,10 +271,10 @@ static inline int tod_after(unsigned long long a, unsigned long long b) * * Returns: true if a is later than b */ -static inline int tod_after_eq(unsigned long long a, unsigned long long b) +static inline int tod_after_eq(unsigned long a, unsigned long b) { if (MACHINE_HAS_SCC) - return (long long) a >= (long long) b; + return (long) a >= (long) b; return a >= b; } diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index aa406c05a350..d1455a601adc 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -25,10 +25,8 @@ void __tlb_remove_table(void *_table); static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size); - -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) + struct encoded_page *page, + int page_size); #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb @@ -36,7 +34,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, #define p4d_free_tlb p4d_free_tlb #define pud_free_tlb pud_free_tlb -#include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm-generic/tlb.h> @@ -44,11 +41,15 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, * Release the page cache reference for a pte removed by * tlb_ptep_clear_flush. 
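A quick sanity check of the tod_to_ns() split above: one TOD unit is 125/512 ns, so tod_to_ns(512) = (512 >> 9) * 125 + ((512 & 0x1ff) * 125 >> 9) = 125 + 0 = 125 ns, exactly 512 * 125/512. For a value exercising both halves, tod_to_ns(0x201) = 125 + (125 >> 9) = 125 ns; the sub-nanosecond remainder of 513 * 125/512 = 125.24 ns is truncated by the integer arithmetic, as intended.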
In both flush modes the tlb for a page cache page * has already been freed, so just do free_page_and_swap_cache. + * + * s390 doesn't delay rmap removal, so there is nothing encoded in + * the page pointer. */ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) + struct encoded_page *page, + int page_size) { - free_page_and_swap_cache(page); + free_page_and_swap_cache(encoded_page_ptr(page)); return false; } @@ -67,13 +68,10 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; - tlb->cleared_ptes = 1; - /* - * page_table_free_rcu takes care of the allocation bit masks - * of the 2K table fragments in the 4K page table page, - * then calls tlb_remove_table. - */ - page_table_free_rcu(tlb, (unsigned long *) pte, address); + tlb->cleared_pmds = 1; + if (mm_alloc_pgste(tlb->mm)) + gmap_unlink(tlb->mm, (unsigned long *)pte, address); + tlb_remove_ptdesc(tlb, pte); } /* @@ -88,12 +86,12 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, { if (mm_pmd_folded(tlb->mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_puds = 1; - tlb_remove_table(tlb, pmd); + tlb_remove_ptdesc(tlb, pmd); } /* @@ -111,8 +109,7 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; - tlb->cleared_p4ds = 1; - tlb_remove_table(tlb, p4d); + tlb_remove_ptdesc(tlb, p4d); } /* @@ -129,8 +126,8 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, return; tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; - tlb->cleared_puds = 1; - tlb_remove_table(tlb, pud); + tlb->cleared_p4ds = 1; + tlb_remove_ptdesc(tlb, pud); } diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 82703e03f35d..a6e2cd89b609 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -5,8 +5,6 @@ #include <linux/mm.h> #include <linux/sched.h> #include <asm/processor.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> /* * Flush all TLB entries on the local CPU. @@ -27,13 +25,9 @@ static inline void __tlb_flush_idte(unsigned long asce) if (MACHINE_HAS_TLB_GUEST) opt |= IDTE_GUEST_ASCE; /* Global TLB flush for the mm */ - asm volatile( - " .insn rrf,0xb98e0000,0,%0,%1,0" - : : "a" (opt), "a" (asce) : "cc"); + asm volatile("idte 0,%1,%0" : : "a" (opt), "a" (asce) : "cc"); } -void smp_ptlb_all(void); - /* * Flush all TLB entries on all CPUs. 
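A hedged sketch of how __tlb_flush_idte() above is typically selected; MACHINE_HAS_IDTE, __tlb_flush_global() and mm->context.asce come from elsewhere in arch/s390 and are assumptions here, not part of this hunk:

static inline void tlb_flush_mm_sketch(struct mm_struct *mm)
{
	unsigned long asce = mm->context.asce;

	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte(asce);		/* flush by ASCE on all CPUs */
	else
		__tlb_flush_global();		/* no IDTE: flush everything */
}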
*/ diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index cca406fdbe51..3a0ac0c7a9a3 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -16,8 +16,8 @@ struct cpu_topology_s390 { unsigned short socket_id; unsigned short book_id; unsigned short drawer_id; - unsigned short node_id; unsigned short dedicated : 1; + int booted_cores; cpumask_t thread_mask; cpumask_t core_mask; cpumask_t book_mask; @@ -25,7 +25,6 @@ struct cpu_topology_s390 { }; extern struct cpu_topology_s390 cpu_topology[NR_CPUS]; -extern cpumask_t cpus_with_topology; #define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id) #define topology_thread_id(cpu) (cpu_topology[cpu].thread_id) @@ -37,6 +36,7 @@ extern cpumask_t cpus_with_topology; #define topology_drawer_id(cpu) (cpu_topology[cpu].drawer_id) #define topology_drawer_cpumask(cpu) (&cpu_topology[cpu].drawer_mask) #define topology_cpu_dedicated(cpu) (cpu_topology[cpu].dedicated) +#define topology_booted_cores(cpu) (cpu_topology[cpu].booted_cores) #define mc_capable() 1 @@ -45,6 +45,7 @@ int topology_cpu_init(struct cpu *); int topology_set_cpu_management(int fc); void topology_schedule_update(void); void store_topology(struct sysinfo_15_1_x *info); +void update_cpu_masks(void); void topology_expect_change(void); const struct cpumask *cpu_coregroup_mask(int cpu); @@ -54,6 +55,8 @@ static inline void topology_init_early(void) { } static inline void topology_schedule_update(void) { } static inline int topology_cpu_init(struct cpu *cpu) { return 0; } static inline int topology_cpu_dedicated(int cpu_nr) { return 0; } +static inline int topology_booted_cores(int cpu_nr) { return 1; } +static inline void update_cpu_masks(void) { } static inline void topology_expect_change(void) { } #endif /* CONFIG_SCHED_TOPOLOGY */ @@ -71,20 +74,18 @@ static inline void topology_expect_change(void) { } #define cpu_to_node cpu_to_node static inline int cpu_to_node(int cpu) { - return cpu_topology[cpu].node_id; + return 0; } /* Returns a pointer to the cpumask of CPUs on node 'node'. */ #define cpumask_of_node cpumask_of_node static inline const struct cpumask *cpumask_of_node(int node) { - return &node_to_cpumask_map[node]; + return cpu_possible_mask; } #define pcibus_to_node(bus) __pcibus_to_node(bus) -#define node_distance(a, b) __node_distance(a, b) - #else /* !CONFIG_NUMA */ #define numa_node_id numa_node_id diff --git a/arch/s390/include/asm/tpi.h b/arch/s390/include/asm/tpi.h new file mode 100644 index 000000000000..f76e5fdff23a --- /dev/null +++ b/arch/s390/include/asm/tpi.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _ASM_S390_TPI_H +#define _ASM_S390_TPI_H + +#include <linux/types.h> +#include <uapi/asm/schid.h> + +#ifndef __ASSEMBLY__ + +/* I/O-Interruption Code as stored by TEST PENDING INTERRUPTION (TPI). 
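The tpi_info/tpi_adapter_info definitions that follow are two overlays of the same 12-byte interruption code; adapter_IO sits at the same bit position in both, so it can be tested before reinterpreting the data. A small illustrative helper (the name is hypothetical):

static inline struct tpi_adapter_info *tpi_adapter_view(struct tpi_info *info)
{
	/* adapter_IO is the first bit of the third word in both layouts */
	return info->adapter_IO ? (struct tpi_adapter_info *)info : NULL;
}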
*/ +struct tpi_info { + struct subchannel_id schid; + u32 intparm; + u32 adapter_IO:1; + u32 directed_irq:1; + u32 isc:3; + u32 :12; + u32 type:3; + u32 :12; +} __packed __aligned(4); + +/* I/O-Interruption Code as stored by TPI for an Adapter I/O */ +struct tpi_adapter_info { + u32 aism:8; + u32 :22; + u32 error:1; + u32 forward:1; + u32 reserved; + u32 adapter_IO:1; + u32 directed_irq:1; + u32 isc:3; + u32 :27; +} __packed __aligned(4); + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_S390_TPI_H */ diff --git a/arch/s390/include/asm/types.h b/arch/s390/include/asm/types.h new file mode 100644 index 000000000000..0b5d550a0478 --- /dev/null +++ b/arch/s390/include/asm/types.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _ASM_S390_TYPES_H +#define _ASM_S390_TYPES_H + +#include <uapi/asm/types.h> + +#ifndef __ASSEMBLY__ + +union register_pair { + unsigned __int128 pair; + struct { + unsigned long even; + unsigned long odd; + }; +}; + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_S390_TYPES_H */ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index a470f1fa9f2a..81ae8a98e7ec 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -3,7 +3,7 @@ * S390 version * Copyright IBM Corp. 1999, 2000 * Author(s): Hartmut Penner (hp@de.ibm.com), - * Martin Schwidefsky (schwidefsky@de.ibm.com) + * Martin Schwidefsky (schwidefsky@de.ibm.com) * * Derived from "include/asm-i386/uaccess.h" */ @@ -13,41 +13,13 @@ /* * User space memory access functions */ +#include <asm/asm-extable.h> #include <asm/processor.h> -#include <asm/ctl_reg.h> #include <asm/extable.h> #include <asm/facility.h> +#include <asm-generic/access_ok.h> -/* - * The fs value determines whether argument validity checking should be - * performed or not. If get_fs() == USER_DS, checking is performed, with - * get_fs() == KERNEL_DS, checking is bypassed. - * - * For historical reasons, these macros are grossly misnamed. 
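union register_pair above targets instructions that operate on an even/odd register pair: binding the __int128 member to a "d" constraint makes the compiler allocate such a pair. A purely illustrative MVCL-based copy, assuming lengths below 2^24 (MVCL's 24-bit length field) and a zero padding byte:

static inline void mvcl_copy(void *dest, const void *src, unsigned long n)
{
	union register_pair dst = { .even = (unsigned long)dest, .odd = n };
	union register_pair s = { .even = (unsigned long)src, .odd = n };

	asm volatile(
		"0:	mvcl	%[dst],%[src]\n"
		"	jo	0b\n"	/* cc 3: interrupted, resume */
		: [dst] "+&d" (dst.pair), [src] "+&d" (s.pair)
		:
		: "cc", "memory");
}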
- */ - -#define KERNEL_DS (0) -#define KERNEL_DS_SACF (1) -#define USER_DS (2) -#define USER_DS_SACF (3) - -#define get_fs() (current->thread.mm_segment) -#define segment_eq(a,b) (((a) & 2) == ((b) & 2)) - -void set_fs(mm_segment_t fs); - -static inline int __range_ok(unsigned long addr, unsigned long size) -{ - return 1; -} - -#define __access_ok(addr, size) \ -({ \ - __chk_user_ptr(addr); \ - __range_ok((unsigned long)(addr), (size)); \ -}) - -#define access_ok(addr, size) __access_ok(addr, size) +void debug_user_asce(int exit); unsigned long __must_check raw_copy_from_user(void *to, const void __user *from, unsigned long n); @@ -60,209 +32,246 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n); #define INLINE_COPY_TO_USER #endif -#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES - -#define __put_get_user_asm(to, from, size, spec) \ -({ \ - register unsigned long __reg0 asm("0") = spec; \ - int __rc; \ - \ - asm volatile( \ - "0: mvcos %1,%3,%2\n" \ - "1: xr %0,%0\n" \ - "2:\n" \ - ".pushsection .fixup, \"ax\"\n" \ - "3: lhi %0,%5\n" \ - " jg 2b\n" \ - ".popsection\n" \ - EX_TABLE(0b,3b) EX_TABLE(1b,3b) \ - : "=d" (__rc), "+Q" (*(to)) \ - : "d" (size), "Q" (*(from)), \ - "d" (__reg0), "K" (-EFAULT) \ - : "cc"); \ - __rc; \ +unsigned long __must_check +_copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key); + +static __always_inline unsigned long __must_check +copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key) +{ + if (check_copy_size(to, n, false)) + n = _copy_from_user_key(to, from, n, key); + return n; +} + +unsigned long __must_check +_copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key); + +static __always_inline unsigned long __must_check +copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key) +{ + if (check_copy_size(from, n, true)) + n = _copy_to_user_key(to, from, n, key); + return n; +} + +union oac { + unsigned int val; + struct { + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac1; + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac2; + }; +}; + +int __noreturn __put_user_bad(void); + +#define __put_user_asm(to, from, size) \ +({ \ + union oac __oac_spec = { \ + .oac1.as = PSW_BITS_AS_SECONDARY, \ + .oac1.a = 1, \ + }; \ + int __rc; \ + \ + asm volatile( \ + " lr 0,%[spec]\n" \ + "0: mvcos %[_to],%[_from],%[_size]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + EX_TABLE_UA_STORE(0b, 2b, %[rc]) \ + EX_TABLE_UA_STORE(1b, 2b, %[rc]) \ + : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \ + : [_size] "d" (size), [_from] "Q" (*(from)), \ + [spec] "d" (__oac_spec.val) \ + : "cc", "0"); \ + __rc; \ }) static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) { - unsigned long spec = 0x010000UL; int rc; switch (size) { case 1: - rc = __put_get_user_asm((unsigned char __user *)ptr, - (unsigned char *)x, - size, spec); + rc = __put_user_asm((unsigned char __user *)ptr, + (unsigned char *)x, + size); break; case 2: - rc = __put_get_user_asm((unsigned short __user *)ptr, - (unsigned short *)x, - size, spec); + rc = __put_user_asm((unsigned short __user *)ptr, + (unsigned short *)x, + size); break; case 4: - rc = __put_get_user_asm((unsigned int __user *)ptr, - (unsigned int *)x, - size, spec); + rc = 
__put_user_asm((unsigned int __user *)ptr, + (unsigned int *)x, + size); break; case 8: - rc = __put_get_user_asm((unsigned long __user *)ptr, - (unsigned long *)x, - size, spec); + rc = __put_user_asm((unsigned long __user *)ptr, + (unsigned long *)x, + size); + break; + default: + __put_user_bad(); break; } return rc; } +int __noreturn __get_user_bad(void); + +#define __get_user_asm(to, from, size) \ +({ \ + union oac __oac_spec = { \ + .oac2.as = PSW_BITS_AS_SECONDARY, \ + .oac2.a = 1, \ + }; \ + int __rc; \ + \ + asm volatile( \ + " lr 0,%[spec]\n" \ + "0: mvcos 0(%[_to]),%[_from],%[_size]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + EX_TABLE_UA_LOAD_MEM(0b, 2b, %[rc], %[_to], %[_ksize]) \ + EX_TABLE_UA_LOAD_MEM(1b, 2b, %[rc], %[_to], %[_ksize]) \ + : [rc] "=&d" (__rc), "=Q" (*(to)) \ + : [_size] "d" (size), [_from] "Q" (*(from)), \ + [spec] "d" (__oac_spec.val), [_to] "a" (to), \ + [_ksize] "K" (size) \ + : "cc", "0"); \ + __rc; \ +}) + static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size) { - unsigned long spec = 0x01UL; int rc; switch (size) { case 1: - rc = __put_get_user_asm((unsigned char *)x, - (unsigned char __user *)ptr, - size, spec); + rc = __get_user_asm((unsigned char *)x, + (unsigned char __user *)ptr, + size); break; case 2: - rc = __put_get_user_asm((unsigned short *)x, - (unsigned short __user *)ptr, - size, spec); + rc = __get_user_asm((unsigned short *)x, + (unsigned short __user *)ptr, + size); break; case 4: - rc = __put_get_user_asm((unsigned int *)x, - (unsigned int __user *)ptr, - size, spec); + rc = __get_user_asm((unsigned int *)x, + (unsigned int __user *)ptr, + size); break; case 8: - rc = __put_get_user_asm((unsigned long *)x, - (unsigned long __user *)ptr, - size, spec); + rc = __get_user_asm((unsigned long *)x, + (unsigned long __user *)ptr, + size); + break; + default: + __get_user_bad(); break; } return rc; } -#else /* CONFIG_HAVE_MARCH_Z10_FEATURES */ - -static inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) -{ - size = raw_copy_to_user(ptr, x, size); - return size ? -EFAULT : 0; -} - -static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size) -{ - size = raw_copy_from_user(x, ptr, size); - return size ? -EFAULT : 0; -} - -#endif /* CONFIG_HAVE_MARCH_Z10_FEATURES */ - /* * These are the main single-value transfer routines. They automatically * use the right size if we just have the right pointer type. 
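For reference, the intended use of the single-value transfer macros reworked below follows the generic kernel pattern:

static long increment_user_int(int __user *uptr)
{
	int val;

	if (get_user(val, uptr))	/* 0 on success, -EFAULT on fault */
		return -EFAULT;
	if (put_user(val + 1, uptr))
		return -EFAULT;
	return 0;
}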
*/ -#define __put_user(x, ptr) \ -({ \ - __typeof__(*(ptr)) __x = (x); \ - int __pu_err = -EFAULT; \ - __chk_user_ptr(ptr); \ - switch (sizeof (*(ptr))) { \ - case 1: \ - case 2: \ - case 4: \ - case 8: \ - __pu_err = __put_user_fn(&__x, ptr, \ - sizeof(*(ptr))); \ - break; \ - default: \ - __put_user_bad(); \ - break; \ - } \ - __builtin_expect(__pu_err, 0); \ +#define __put_user(x, ptr) \ +({ \ + __typeof__(*(ptr)) __x = (x); \ + int __pu_err = -EFAULT; \ + \ + __chk_user_ptr(ptr); \ + switch (sizeof(*(ptr))) { \ + case 1: \ + case 2: \ + case 4: \ + case 8: \ + __pu_err = __put_user_fn(&__x, ptr, sizeof(*(ptr))); \ + break; \ + default: \ + __put_user_bad(); \ + break; \ + } \ + __builtin_expect(__pu_err, 0); \ }) -#define put_user(x, ptr) \ -({ \ - might_fault(); \ - __put_user(x, ptr); \ +#define put_user(x, ptr) \ +({ \ + might_fault(); \ + __put_user(x, ptr); \ }) - -int __put_user_bad(void) __attribute__((noreturn)); - -#define __get_user(x, ptr) \ -({ \ - int __gu_err = -EFAULT; \ - __chk_user_ptr(ptr); \ - switch (sizeof(*(ptr))) { \ - case 1: { \ - unsigned char __x = 0; \ - __gu_err = __get_user_fn(&__x, ptr, \ - sizeof(*(ptr))); \ - (x) = *(__force __typeof__(*(ptr)) *) &__x; \ - break; \ - }; \ - case 2: { \ - unsigned short __x = 0; \ - __gu_err = __get_user_fn(&__x, ptr, \ - sizeof(*(ptr))); \ - (x) = *(__force __typeof__(*(ptr)) *) &__x; \ - break; \ - }; \ - case 4: { \ - unsigned int __x = 0; \ - __gu_err = __get_user_fn(&__x, ptr, \ - sizeof(*(ptr))); \ - (x) = *(__force __typeof__(*(ptr)) *) &__x; \ - break; \ - }; \ - case 8: { \ - unsigned long long __x = 0; \ - __gu_err = __get_user_fn(&__x, ptr, \ - sizeof(*(ptr))); \ - (x) = *(__force __typeof__(*(ptr)) *) &__x; \ - break; \ - }; \ - default: \ - __get_user_bad(); \ - break; \ - } \ - __builtin_expect(__gu_err, 0); \ +#define __get_user(x, ptr) \ +({ \ + int __gu_err = -EFAULT; \ + \ + __chk_user_ptr(ptr); \ + switch (sizeof(*(ptr))) { \ + case 1: { \ + unsigned char __x; \ + \ + __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + (x) = *(__force __typeof__(*(ptr)) *)&__x; \ + break; \ + }; \ + case 2: { \ + unsigned short __x; \ + \ + __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + (x) = *(__force __typeof__(*(ptr)) *)&__x; \ + break; \ + }; \ + case 4: { \ + unsigned int __x; \ + \ + __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + (x) = *(__force __typeof__(*(ptr)) *)&__x; \ + break; \ + }; \ + case 8: { \ + unsigned long __x; \ + \ + __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + (x) = *(__force __typeof__(*(ptr)) *)&__x; \ + break; \ + }; \ + default: \ + __get_user_bad(); \ + break; \ + } \ + __builtin_expect(__gu_err, 0); \ }) -#define get_user(x, ptr) \ -({ \ - might_fault(); \ - __get_user(x, ptr); \ +#define get_user(x, ptr) \ +({ \ + might_fault(); \ + __get_user(x, ptr); \ }) -int __get_user_bad(void) __attribute__((noreturn)); - -unsigned long __must_check -raw_copy_in_user(void __user *to, const void __user *from, unsigned long n); - /* * Copy a null terminated string from userspace. 
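The strncpy_from_user() declaration that follows keeps the generic kernel contract: it returns the string length excluding the NUL, the full count when the buffer was too small (in which case dst is not NUL-terminated), or -EFAULT. A typical caller, for illustration:

static long copy_name_sketch(char *dst, long dst_size, const char __user *src)
{
	long len = strncpy_from_user(dst, src, dst_size);

	if (len < 0)
		return len;			/* -EFAULT */
	if (len == dst_size)
		return -ENAMETOOLONG;		/* truncated */
	return len;
}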
*/ +long __must_check strncpy_from_user(char *dst, const char __user *src, long count); -long __strncpy_from_user(char *dst, const char __user *src, long count); - -static inline long __must_check -strncpy_from_user(char *dst, const char __user *src, long count) -{ - might_fault(); - return __strncpy_from_user(dst, src, count); -} - -unsigned long __must_check __strnlen_user(const char __user *src, unsigned long count); - -static inline unsigned long strnlen_user(const char __user *src, unsigned long n) -{ - might_fault(); - return __strnlen_user(src, n); -} +long __must_check strnlen_user(const char __user *src, long count); /* * Zero Userspace @@ -275,7 +284,317 @@ static inline unsigned long __must_check clear_user(void __user *to, unsigned lo return __clear_user(to, n); } -int copy_to_user_real(void __user *dest, void *src, unsigned long count); -void s390_kernel_write(void *dst, const void *src, size_t size); +void *s390_kernel_write(void *dst, const void *src, size_t size); + +int __noreturn __put_kernel_bad(void); + +#define __put_kernel_asm(val, to, insn) \ +({ \ + int __rc; \ + \ + asm volatile( \ + "0: " insn " %[_val],%[_to]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + EX_TABLE_UA_STORE(0b, 2b, %[rc]) \ + EX_TABLE_UA_STORE(1b, 2b, %[rc]) \ + : [rc] "=d" (__rc), [_to] "+Q" (*(to)) \ + : [_val] "d" (val) \ + : "cc"); \ + __rc; \ +}) + +#define __put_kernel_nofault(dst, src, type, err_label) \ +do { \ + unsigned long __x = (unsigned long)(*((type *)(src))); \ + int __pk_err; \ + \ + switch (sizeof(type)) { \ + case 1: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "stc"); \ + break; \ + case 2: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "sth"); \ + break; \ + case 4: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "st"); \ + break; \ + case 8: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "stg"); \ + break; \ + default: \ + __pk_err = __put_kernel_bad(); \ + break; \ + } \ + if (unlikely(__pk_err)) \ + goto err_label; \ +} while (0) + +int __noreturn __get_kernel_bad(void); + +#define __get_kernel_asm(val, from, insn) \ +({ \ + int __rc; \ + \ + asm volatile( \ + "0: " insn " %[_val],%[_from]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + EX_TABLE_UA_LOAD_REG(0b, 2b, %[rc], %[_val]) \ + EX_TABLE_UA_LOAD_REG(1b, 2b, %[rc], %[_val]) \ + : [rc] "=d" (__rc), [_val] "=d" (val) \ + : [_from] "Q" (*(from)) \ + : "cc"); \ + __rc; \ +}) + +#define __get_kernel_nofault(dst, src, type, err_label) \ +do { \ + int __gk_err; \ + \ + switch (sizeof(type)) { \ + case 1: { \ + unsigned char __x; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "ic"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + case 2: { \ + unsigned short __x; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "lh"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + case 4: { \ + unsigned int __x; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "l"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + case 8: { \ + unsigned long __x; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "lg"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + default: \ + __gk_err = __get_kernel_bad(); \ + break; \ + } \ + if (unlikely(__gk_err)) \ + goto err_label; \ +} while (0) + +void __cmpxchg_user_key_called_with_bad_pointer(void); + +#define CMPXCHG_USER_KEY_MAX_LOOPS 128 + +static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval, + __uint128_t old, __uint128_t new, + unsigned long key, int size) +{ + int rc = 0; + + switch (size) { + case 1: { + 
unsigned int prev, shift, mask, _old, _new; + unsigned long count; + + shift = (3 ^ (address & 3)) << 3; + address ^= address & 3; + _old = ((unsigned int)old & 0xff) << shift; + _new = ((unsigned int)new & 0xff) << shift; + mask = ~(0xff << shift); + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + " llill %[count],%[max_loops]\n" + "0: l %[prev],%[address]\n" + "1: nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "2: lr %[tmp],%[prev]\n" + "3: cs %[prev],%[new],%[address]\n" + "4: jnl 5f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jnz 5f\n" + " brct %[count],2b\n" + "5: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (_old), + [new] "+&d" (_new), + [mask] "+&d" (mask), + [count] "=a" (count) + : [key] "%[count]" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY), + [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) + : "memory", "cc"); + *(unsigned char *)uval = prev >> shift; + if (!count) + rc = -EAGAIN; + return rc; + } + case 2: { + unsigned int prev, shift, mask, _old, _new; + unsigned long count; + + shift = (2 ^ (address & 2)) << 3; + address ^= address & 2; + _old = ((unsigned int)old & 0xffff) << shift; + _new = ((unsigned int)new & 0xffff) << shift; + mask = ~(0xffff << shift); + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + " llill %[count],%[max_loops]\n" + "0: l %[prev],%[address]\n" + "1: nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "2: lr %[tmp],%[prev]\n" + "3: cs %[prev],%[new],%[address]\n" + "4: jnl 5f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jnz 5f\n" + " brct %[count],2b\n" + "5: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (_old), + [new] "+&d" (_new), + [mask] "+&d" (mask), + [count] "=a" (count) + : [key] "%[count]" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY), + [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) + : "memory", "cc"); + *(unsigned short *)uval = prev >> shift; + if (!count) + rc = -EAGAIN; + return rc; + } + case 4: { + unsigned int prev = old; + + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: cs %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+Q" (*(int *)address) + : [new] "d" ((unsigned int)new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *(unsigned int *)uval = prev; + return rc; + } + case 8: { + unsigned long prev = old; + + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: csg %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+QS" (*(long *)address) + : [new] "d" ((unsigned long)new), + [key] "a" 
(key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *(unsigned long *)uval = prev; + return rc; + } + case 16: { + __uint128_t prev = old; + + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: cdsg %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+QS" (*(__int128_t *)address) + : [new] "d" (new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *(__uint128_t *)uval = prev; + return rc; + } + } + __cmpxchg_user_key_called_with_bad_pointer(); + return rc; +} + +/** + * cmpxchg_user_key() - cmpxchg with user space target, honoring storage keys + * @ptr: User space address of value to compare to @old and exchange with + * @new. Must be aligned to sizeof(*@ptr). + * @uval: Address where the old value of *@ptr is written to. + * @old: Old value. Compared to the content pointed to by @ptr in order to + * determine if the exchange occurs. The old value read from *@ptr is + * written to *@uval. + * @new: New value to place at *@ptr. + * @key: Access key to use for checking storage key protection. + * + * Perform a cmpxchg on a user space target, honoring storage key protection. + * @key alone determines how key checking is performed, neither + * storage-protection-override nor fetch-protection-override apply. + * The caller must compare *@uval and @old to determine if values have been + * exchanged. In case of an exception *@uval is set to zero. + * + * Return: 0: cmpxchg executed + * -EFAULT: an exception happened when trying to access *@ptr + * -EAGAIN: maxed out number of retries (byte and short only) + */ +#define cmpxchg_user_key(ptr, uval, old, new, key) \ +({ \ + __typeof__(ptr) __ptr = (ptr); \ + __typeof__(uval) __uval = (uval); \ + \ + BUILD_BUG_ON(sizeof(*(__ptr)) != sizeof(*(__uval))); \ + might_fault(); \ + __chk_user_ptr(__ptr); \ + __cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval), \ + (old), (new), (key), sizeof(*(__ptr))); \ +}) #endif /* __S390_UACCESS_H */ diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index 9e9f75ef046a..4260bc5ce7f8 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -28,6 +28,7 @@ #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK # ifdef CONFIG_COMPAT +# define __ARCH_WANT_COMPAT_STAT # define __ARCH_WANT_SYS_TIME32 # define __ARCH_WANT_SYS_UTIME32 # endif diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h index de9006b0cfeb..b8ecf04e3468 100644 --- a/arch/s390/include/asm/unwind.h +++ b/arch/s390/include/asm/unwind.h @@ -4,6 +4,8 @@ #include <linux/sched.h> #include <linux/ftrace.h> +#include <linux/rethook.h> +#include <linux/llist.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> @@ -36,10 +38,23 @@ struct unwind_state { struct pt_regs *regs; unsigned long sp, ip; int graph_idx; + struct llist_node *kr_cur; bool reliable; bool error; }; +/* Recover the return address modified by rethook and ftrace_graph. 
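+ * When the function graph tracer or a rethook (kretprobe) is armed, the return address on the stack is replaced by a trampoline; without this helper an unwinder would report return_to_handler or the rethook trampoline instead of the real caller.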
*/ +static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, + unsigned long ip) +{ + ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *)state->sp); +#ifdef CONFIG_RETHOOK + if (is_rethook_trampoline(ip)) + ip = rethook_find_ret_addr(state->task, state->sp, &state->kr_cur); +#endif + return ip; +} + void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long first_frame); bool unwind_next_frame(struct unwind_state *state); @@ -55,10 +70,10 @@ static inline bool unwind_error(struct unwind_state *state) return state->error; } -static inline void unwind_start(struct unwind_state *state, - struct task_struct *task, - struct pt_regs *regs, - unsigned long first_frame) +static __always_inline void unwind_start(struct unwind_state *state, + struct task_struct *task, + struct pt_regs *regs, + unsigned long first_frame) { task = task ?: current; first_frame = first_frame ?: get_stack_pointer(task, regs); diff --git a/arch/s390/include/asm/user.h b/arch/s390/include/asm/user.h index 0ca572ced21b..8e8aaf48582e 100644 --- a/arch/s390/include/asm/user.h +++ b/arch/s390/include/asm/user.h @@ -67,9 +67,5 @@ struct user { unsigned long magic; /* To uniquely identify a core file */ char u_comm[32]; /* User command that was responsible */ }; -#define NBPG PAGE_SIZE -#define UPAGES 1 -#define HOST_TEXT_START_ADDR (u.start_code) -#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG) #endif /* _S390_USER_H */ diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index ef3c00b049ab..0e7bd3873907 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -2,7 +2,7 @@ /* * Ultravisor Interfaces * - * Copyright IBM Corp. 2019 + * Copyright IBM Corp. 
2019, 2022 * * Author(s): * Vasily Gorbik <gor@linux.ibm.com> @@ -14,23 +14,93 @@ #include <linux/types.h> #include <linux/errno.h> #include <linux/bug.h> +#include <linux/sched.h> #include <asm/page.h> +#include <asm/gmap.h> + +#define UVC_CC_OK 0 +#define UVC_CC_ERROR 1 +#define UVC_CC_BUSY 2 +#define UVC_CC_PARTIAL 3 #define UVC_RC_EXECUTED 0x0001 #define UVC_RC_INV_CMD 0x0002 #define UVC_RC_INV_STATE 0x0003 #define UVC_RC_INV_LEN 0x0005 #define UVC_RC_NO_RESUME 0x0007 +#define UVC_RC_NEED_DESTROY 0x8000 #define UVC_CMD_QUI 0x0001 +#define UVC_CMD_INIT_UV 0x000f +#define UVC_CMD_CREATE_SEC_CONF 0x0100 +#define UVC_CMD_DESTROY_SEC_CONF 0x0101 +#define UVC_CMD_DESTROY_SEC_CONF_FAST 0x0102 +#define UVC_CMD_CREATE_SEC_CPU 0x0120 +#define UVC_CMD_DESTROY_SEC_CPU 0x0121 +#define UVC_CMD_CONV_TO_SEC_STOR 0x0200 +#define UVC_CMD_CONV_FROM_SEC_STOR 0x0201 +#define UVC_CMD_DESTR_SEC_STOR 0x0202 +#define UVC_CMD_SET_SEC_CONF_PARAMS 0x0300 +#define UVC_CMD_UNPACK_IMG 0x0301 +#define UVC_CMD_VERIFY_IMG 0x0302 +#define UVC_CMD_CPU_RESET 0x0310 +#define UVC_CMD_CPU_RESET_INITIAL 0x0311 +#define UVC_CMD_PREPARE_RESET 0x0320 +#define UVC_CMD_CPU_RESET_CLEAR 0x0321 +#define UVC_CMD_CPU_SET_STATE 0x0330 +#define UVC_CMD_SET_UNSHARE_ALL 0x0340 +#define UVC_CMD_PIN_PAGE_SHARED 0x0341 +#define UVC_CMD_UNPIN_PAGE_SHARED 0x0342 +#define UVC_CMD_DUMP_INIT 0x0400 +#define UVC_CMD_DUMP_CONF_STOR_STATE 0x0401 +#define UVC_CMD_DUMP_CPU 0x0402 +#define UVC_CMD_DUMP_COMPLETE 0x0403 #define UVC_CMD_SET_SHARED_ACCESS 0x1000 #define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001 +#define UVC_CMD_RETR_ATTEST 0x1020 +#define UVC_CMD_ADD_SECRET 0x1031 +#define UVC_CMD_LIST_SECRETS 0x1033 +#define UVC_CMD_LOCK_SECRETS 0x1034 /* Bits in installed uv calls */ enum uv_cmds_inst { BIT_UVC_CMD_QUI = 0, + BIT_UVC_CMD_INIT_UV = 1, + BIT_UVC_CMD_CREATE_SEC_CONF = 2, + BIT_UVC_CMD_DESTROY_SEC_CONF = 3, + BIT_UVC_CMD_CREATE_SEC_CPU = 4, + BIT_UVC_CMD_DESTROY_SEC_CPU = 5, + BIT_UVC_CMD_CONV_TO_SEC_STOR = 6, + BIT_UVC_CMD_CONV_FROM_SEC_STOR = 7, BIT_UVC_CMD_SET_SHARED_ACCESS = 8, BIT_UVC_CMD_REMOVE_SHARED_ACCESS = 9, + BIT_UVC_CMD_SET_SEC_PARMS = 11, + BIT_UVC_CMD_UNPACK_IMG = 13, + BIT_UVC_CMD_VERIFY_IMG = 14, + BIT_UVC_CMD_CPU_RESET = 15, + BIT_UVC_CMD_CPU_RESET_INITIAL = 16, + BIT_UVC_CMD_CPU_SET_STATE = 17, + BIT_UVC_CMD_PREPARE_RESET = 18, + BIT_UVC_CMD_CPU_PERFORM_CLEAR_RESET = 19, + BIT_UVC_CMD_UNSHARE_ALL = 20, + BIT_UVC_CMD_PIN_PAGE_SHARED = 21, + BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22, + BIT_UVC_CMD_DESTROY_SEC_CONF_FAST = 23, + BIT_UVC_CMD_DUMP_INIT = 24, + BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE = 25, + BIT_UVC_CMD_DUMP_CPU = 26, + BIT_UVC_CMD_DUMP_COMPLETE = 27, + BIT_UVC_CMD_RETR_ATTEST = 28, + BIT_UVC_CMD_ADD_SECRET = 29, + BIT_UVC_CMD_LIST_SECRETS = 30, + BIT_UVC_CMD_LOCK_SECRETS = 31, +}; + +enum uv_feat_ind { + BIT_UV_FEAT_MISC = 0, + BIT_UV_FEAT_AIV = 1, + BIT_UV_FEAT_AP = 4, + BIT_UV_FEAT_AP_INTR = 5, }; struct uv_cb_header { @@ -40,13 +110,158 @@ struct uv_cb_header { u16 rrc; /* Return Reason Code */ } __packed __aligned(8); +/* Query Ultravisor Information */ struct uv_cb_qui { + struct uv_cb_header header; /* 0x0000 */ + u64 reserved08; /* 0x0008 */ + u64 inst_calls_list[4]; /* 0x0010 */ + u64 reserved30[2]; /* 0x0030 */ + u64 uv_base_stor_len; /* 0x0040 */ + u64 reserved48; /* 0x0048 */ + u64 conf_base_phys_stor_len; /* 0x0050 */ + u64 conf_base_virt_stor_len; /* 0x0058 */ + u64 conf_virt_var_stor_len; /* 0x0060 */ + u64 cpu_stor_len; /* 0x0068 */ + u32 reserved70[3]; /* 0x0070 */ + u32 max_num_sec_conf; /* 0x007c */ + u64 
max_guest_stor_addr; /* 0x0080 */ + u8 reserved88[0x9e - 0x88]; /* 0x0088 */ + u16 max_guest_cpu_id; /* 0x009e */ + u64 uv_feature_indications; /* 0x00a0 */ + u64 reserveda8; /* 0x00a8 */ + u64 supp_se_hdr_versions; /* 0x00b0 */ + u64 supp_se_hdr_pcf; /* 0x00b8 */ + u64 reservedc0; /* 0x00c0 */ + u64 conf_dump_storage_state_len; /* 0x00c8 */ + u64 conf_dump_finalize_len; /* 0x00d0 */ + u64 reservedd8; /* 0x00d8 */ + u64 supp_att_req_hdr_ver; /* 0x00e0 */ + u64 supp_att_pflags; /* 0x00e8 */ + u64 reservedf0; /* 0x00f0 */ + u64 supp_add_secret_req_ver; /* 0x00f8 */ + u64 supp_add_secret_pcf; /* 0x0100 */ + u64 supp_secret_types; /* 0x0108 */ + u16 max_secrets; /* 0x0110 */ + u8 reserved112[0x120 - 0x112]; /* 0x0112 */ +} __packed __aligned(8); + +/* Initialize Ultravisor */ +struct uv_cb_init { + struct uv_cb_header header; + u64 reserved08[2]; + u64 stor_origin; + u64 stor_len; + u64 reserved28[4]; +} __packed __aligned(8); + +/* Create Guest Configuration */ +struct uv_cb_cgc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 conf_base_stor_origin; + u64 conf_virt_stor_origin; + u8 reserved30[6]; + union { + struct { + u16 : 14; + u16 ap_instr_intr : 1; + u16 ap_allow_instr : 1; + }; + u16 raw; + } flags; + u64 guest_stor_origin; + u64 guest_stor_len; + u64 guest_sca; + u64 guest_asce; + u64 reserved58[5]; +} __packed __aligned(8); + +/* Create Secure CPU */ +struct uv_cb_csc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u64 guest_handle; + u64 stor_origin; + u8 reserved30[6]; + u16 num; + u64 state_origin; + u64 reserved40[4]; +} __packed __aligned(8); + +/* Convert to Secure */ +struct uv_cb_cts { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 gaddr; +} __packed __aligned(8); + +/* Convert from Secure / Pin Page Shared */ +struct uv_cb_cfs { + struct uv_cb_header header; + u64 reserved08[2]; + u64 paddr; +} __packed __aligned(8); + +/* Set Secure Config Parameter */ +struct uv_cb_ssc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 sec_header_origin; + u32 sec_header_len; + u32 reserved2c; + u64 reserved30[4]; +} __packed __aligned(8); + +/* Unpack */ +struct uv_cb_unp { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 gaddr; + u64 tweak[2]; + u64 reserved38[3]; +} __packed __aligned(8); + +#define PV_CPU_STATE_OPR 1 +#define PV_CPU_STATE_STP 2 +#define PV_CPU_STATE_CHKSTP 3 +#define PV_CPU_STATE_OPR_LOAD 5 + +struct uv_cb_cpu_set_state { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u8 reserved20[7]; + u8 state; + u64 reserved28[5]; +}; + +/* + * A common UV call struct for calls that take no payload + * Examples: + * Destroy cpu/config + * Verify + */ +struct uv_cb_nodata { struct uv_cb_header header; - u64 reserved08; - u64 inst_calls_list[4]; - u64 reserved30[15]; + u64 reserved08[2]; + u64 handle; + u64 reserved20[4]; } __packed __aligned(8); +/* Destroy Configuration Fast */ +struct uv_cb_destroy_fast { + struct uv_cb_header header; + u64 reserved08[2]; + u64 handle; + u64 reserved20[5]; +} __packed __aligned(8); + +/* Set Shared Access */ struct uv_cb_share { struct uv_cb_header header; u64 reserved08[3]; @@ -54,21 +269,151 @@ struct uv_cb_share { u64 reserved28; } __packed __aligned(8); -static inline int uv_call(unsigned long r1, unsigned long r2) +/* Retrieve Attestation Measurement */ +struct uv_cb_attest { + struct uv_cb_header header; /* 0x0000 */ + u64 reserved08[2]; /* 0x0008 */ + u64 arcb_addr; /* 0x0018 */
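+ /* arcb_addr: address of the attestation request control block (ARCB) */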
+ u64 cont_token; /* 0x0020 */ + u8 reserved28[6]; /* 0x0028 */ + u16 user_data_len; /* 0x002e */ + u8 user_data[256]; /* 0x0030 */ + u32 reserved130[3]; /* 0x0130 */ + u32 meas_len; /* 0x013c */ + u64 meas_addr; /* 0x0140 */ + u8 config_uid[16]; /* 0x0148 */ + u32 reserved158; /* 0x0158 */ + u32 add_data_len; /* 0x015c */ + u64 add_data_addr; /* 0x0160 */ + u64 reserved168[4]; /* 0x0168 */ +} __packed __aligned(8); + +struct uv_cb_dump_cpu { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u64 dump_area_origin; + u64 reserved28[5]; +} __packed __aligned(8); + +struct uv_cb_dump_stor_state { + struct uv_cb_header header; + u64 reserved08[2]; + u64 config_handle; + u64 dump_area_origin; + u64 gaddr; + u64 reserved28[4]; +} __packed __aligned(8); + +struct uv_cb_dump_complete { + struct uv_cb_header header; + u64 reserved08[2]; + u64 config_handle; + u64 dump_area_origin; + u64 reserved30[5]; +} __packed __aligned(8); + +/* + * A common UV call struct for pv guests that contains a single address + * Examples: + * Add Secret + * List Secrets + */ +struct uv_cb_guest_addr { + struct uv_cb_header header; + u64 reserved08[3]; + u64 addr; + u64 reserved28[4]; +} __packed __aligned(8); + +static inline int __uv_call(unsigned long r1, unsigned long r2) { int cc; asm volatile( - "0: .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n" - " brc 3,0b\n" - " ipm %[cc]\n" - " srl %[cc],28\n" + " .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n" + " ipm %[cc]\n" + " srl %[cc],28\n" : [cc] "=d" (cc) : [r1] "a" (r1), [r2] "a" (r2) : "memory", "cc"); return cc; } +static inline int uv_call(unsigned long r1, unsigned long r2) +{ + int cc; + + do { + cc = __uv_call(r1, r2); + } while (cc > 1); + return cc; +} + +/* Low level uv_call that avoids stalls for long running busy conditions */ +static inline int uv_call_sched(unsigned long r1, unsigned long r2) +{ + int cc; + + do { + cc = __uv_call(r1, r2); + cond_resched(); + } while (cc > 1); + return cc; +} + +/* + * special variant of uv_call that only transports the cpu or guest + * handle and the command, like destroy or verify. + */ +static inline int uv_cmd_nodata(u64 handle, u16 cmd, u16 *rc, u16 *rrc) +{ + struct uv_cb_nodata uvcb = { + .header.cmd = cmd, + .header.len = sizeof(uvcb), + .handle = handle, + }; + int cc; + + WARN(!handle, "No handle provided to Ultravisor call cmd %x\n", cmd); + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + return cc ? 
-EINVAL : 0; +} + +struct uv_info { + unsigned long inst_calls_list[4]; + unsigned long uv_base_stor_len; + unsigned long guest_base_stor_len; + unsigned long guest_virt_base_stor_len; + unsigned long guest_virt_var_stor_len; + unsigned long guest_cpu_stor_len; + unsigned long max_sec_stor_addr; + unsigned int max_num_sec_conf; + unsigned short max_guest_cpu_id; + unsigned long uv_feature_indications; + unsigned long supp_se_hdr_ver; + unsigned long supp_se_hdr_pcf; + unsigned long conf_dump_storage_state_len; + unsigned long conf_dump_finalize_len; + unsigned long supp_att_req_hdr_ver; + unsigned long supp_att_pflags; + unsigned long supp_add_secret_req_ver; + unsigned long supp_add_secret_pcf; + unsigned long supp_secret_types; + unsigned short max_secrets; +}; + +extern struct uv_info uv_info; + +static inline bool uv_has_feature(u8 feature_bit) +{ + if (feature_bit >= sizeof(uv_info.uv_feature_indications) * 8) + return false; + return test_bit_inv(feature_bit, &uv_info.uv_feature_indications); +} + #ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST extern int prot_virt_guest; @@ -86,7 +431,7 @@ static inline int share(unsigned long addr, u16 cmd) }; if (!is_prot_virt_guest()) - return -ENOTSUPP; + return -EOPNOTSUPP; /* * Sharing is page wise, if we encounter addresses that are * not page aligned, we assume something went wrong. If @@ -121,12 +466,52 @@ static inline int uv_remove_shared(unsigned long addr) return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS); } -void uv_query_info(void); #else #define is_prot_virt_guest() 0 static inline int uv_set_shared(unsigned long addr) { return 0; } static inline int uv_remove_shared(unsigned long addr) { return 0; } -static inline void uv_query_info(void) {} +#endif + +#if IS_ENABLED(CONFIG_KVM) +extern int prot_virt_host; + +static inline int is_prot_virt_host(void) +{ + return prot_virt_host; +} + +int uv_pin_shared(unsigned long paddr); +int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); +int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); +int uv_destroy_owned_page(unsigned long paddr); +int uv_convert_from_secure(unsigned long paddr); +int uv_convert_owned_from_secure(unsigned long paddr); +int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); + +void setup_uv(void); +#else +#define is_prot_virt_host() 0 +static inline void setup_uv(void) {} + +static inline int uv_pin_shared(unsigned long paddr) +{ + return 0; +} + +static inline int uv_destroy_owned_page(unsigned long paddr) +{ + return 0; +} + +static inline int uv_convert_from_secure(unsigned long paddr) +{ + return 0; +} + +static inline int uv_convert_owned_from_secure(unsigned long paddr) +{ + return 0; +} #endif #endif /* _ASM_S390_UV_H */ diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index 3bcfdeb01395..53165aa7813a 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -2,64 +2,33 @@ #ifndef __S390_VDSO_H__ #define __S390_VDSO_H__ -/* Default link addresses for the vDSOs */ -#define VDSO32_LBASE 0 -#define VDSO64_LBASE 0 - -#define VDSO_VERSION_STRING LINUX_2.6.29 +#include <vdso/datapage.h> #ifndef __ASSEMBLY__ -/* - * Note about the vdso_data and vdso_per_cpu_data structures: - * - * NEVER USE THEM IN USERSPACE CODE DIRECTLY. The layout of the - * structure is supposed to be known only to the function in the vdso - * itself and may change without notice. 
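Stepping back to uv.h: a sketch of how a protected virtualization guest might use the sharing interface. The function name is invented; uv_set_shared() and the page-granularity rule are from the header above.

	/* Illustrative only, not part of the patch. */
	static int example_share_page(unsigned long addr)
	{
		if (!IS_ALIGNED(addr, PAGE_SIZE))
			return -EINVAL;		/* sharing is page granular */
		if (!is_prot_virt_guest())
			return 0;		/* no ultravisor, nothing to share */
		return uv_set_shared(addr);	/* issues UVC_CMD_SET_SHARED_ACCESS */
	}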
- */ +#include <generated/vdso64-offsets.h> +#ifdef CONFIG_COMPAT +#include <generated/vdso32-offsets.h> +#endif -struct vdso_data { - __u64 tb_update_count; /* Timebase atomicity ctr 0x00 */ - __u64 xtime_tod_stamp; /* TOD clock for xtime 0x08 */ - __u64 xtime_clock_sec; /* Kernel time 0x10 */ - __u64 xtime_clock_nsec; /* 0x18 */ - __u64 xtime_coarse_sec; /* Coarse kernel time 0x20 */ - __u64 xtime_coarse_nsec; /* 0x28 */ - __u64 wtom_clock_sec; /* Wall to monotonic clock 0x30 */ - __u64 wtom_clock_nsec; /* 0x38 */ - __u64 wtom_coarse_sec; /* Coarse wall to monotonic 0x40 */ - __u64 wtom_coarse_nsec; /* 0x48 */ - __u32 tz_minuteswest; /* Minutes west of Greenwich 0x50 */ - __u32 tz_dsttime; /* Type of dst correction 0x54 */ - __u32 ectg_available; /* ECTG instruction present 0x58 */ - __u32 tk_mult; /* Mult. used for xtime_nsec 0x5c */ - __u32 tk_shift; /* Shift used for xtime_nsec 0x60 */ - __u32 ts_dir; /* TOD steering direction 0x64 */ - __u64 ts_end; /* TOD steering end 0x68 */ -}; - -struct vdso_per_cpu_data { - __u64 ectg_timer_base; - __u64 ectg_user_time; - /* - * Note: node_id and cpu_nr must be at adjacent memory locations. - * VDSO userspace must read both values with a single instruction. - */ - union { - __u64 getcpu_val; - struct { - __u32 node_id; - __u32 cpu_nr; - }; - }; -}; +#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name)) +#ifdef CONFIG_COMPAT +#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name)) +#else +#define VDSO32_SYMBOL(tsk, name) (-1UL) +#endif extern struct vdso_data *vdso_data; -extern struct vdso_data boot_vdso_data; -void vdso_alloc_boot_cpu(struct lowcore *lowcore); -int vdso_alloc_per_cpu(struct lowcore *lowcore); -void vdso_free_per_cpu(struct lowcore *lowcore); +int vdso_getcpu_init(void); #endif /* __ASSEMBLY__ */ + +/* Default link address for the vDSO */ +#define VDSO_LBASE 0 + +#define __VVAR_PAGES 2 + +#define VDSO_VERSION_STRING LINUX_2.6.29 + #endif /* __S390_VDSO_H__ */ diff --git a/arch/s390/include/asm/vdso/clocksource.h b/arch/s390/include/asm/vdso/clocksource.h new file mode 100644 index 000000000000..a93eda0ce7bb --- /dev/null +++ b/arch/s390/include/asm/vdso/clocksource.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_VDSO_CLOCKSOURCE_H +#define __ASM_VDSO_CLOCKSOURCE_H + +#define VDSO_ARCH_CLOCKMODES \ + VDSO_CLOCKMODE_TOD + +#endif /* __ASM_VDSO_CLOCKSOURCE_H */ diff --git a/arch/s390/include/asm/vdso/data.h b/arch/s390/include/asm/vdso/data.h new file mode 100644 index 000000000000..73ee89142666 --- /dev/null +++ b/arch/s390/include/asm/vdso/data.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __S390_ASM_VDSO_DATA_H +#define __S390_ASM_VDSO_DATA_H + +#include <linux/types.h> +#include <vdso/datapage.h> + +struct arch_vdso_data { + __s64 tod_steering_delta; + __u64 tod_steering_end; +}; + +#endif /* __S390_ASM_VDSO_DATA_H */ diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h new file mode 100644 index 000000000000..db84942eb78f --- /dev/null +++ b/arch/s390/include/asm/vdso/gettimeofday.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_VDSO_GETTIMEOFDAY_H +#define ASM_VDSO_GETTIMEOFDAY_H + +#define VDSO_HAS_TIME 1 + +#define VDSO_HAS_CLOCK_GETRES 1 + +#include <asm/syscall.h> +#include <asm/timex.h> +#include <asm/unistd.h> +#include <linux/compiler.h> + +#define vdso_calc_delta __arch_vdso_calc_delta +static __always_inline u64 
__arch_vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) +{ + return (cycles - last) * mult; +} + +static __always_inline const struct vdso_data *__arch_get_vdso_data(void) +{ + return _vdso_data; +} + +static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_data *vd) +{ + u64 adj, now; + + now = get_tod_clock(); + adj = vd->arch_data.tod_steering_end - now; + if (unlikely((s64) adj > 0)) + now += (vd->arch_data.tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15); + return now; +} + +static __always_inline +long clock_gettime_fallback(clockid_t clkid, struct __kernel_timespec *ts) +{ + return syscall2(__NR_clock_gettime, (long)clkid, (long)ts); +} + +static __always_inline +long gettimeofday_fallback(register struct __kernel_old_timeval *tv, + register struct timezone *tz) +{ + return syscall2(__NR_gettimeofday, (long)tv, (long)tz); +} + +static __always_inline +long clock_getres_fallback(clockid_t clkid, struct __kernel_timespec *ts) +{ + return syscall2(__NR_clock_getres, (long)clkid, (long)ts); +} + +#ifdef CONFIG_TIME_NS +static __always_inline +const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) +{ + return _timens_data; +} +#endif + +#endif diff --git a/arch/s390/include/asm/vdso/processor.h b/arch/s390/include/asm/vdso/processor.h new file mode 100644 index 000000000000..cfcc3e117c4c --- /dev/null +++ b/arch/s390/include/asm/vdso/processor.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_VDSO_PROCESSOR_H +#define __ASM_VDSO_PROCESSOR_H + +#define cpu_relax() barrier() + +#endif /* __ASM_VDSO_PROCESSOR_H */ diff --git a/arch/s390/include/asm/vdso/vsyscall.h b/arch/s390/include/asm/vdso/vsyscall.h new file mode 100644 index 000000000000..6c67c08cefdd --- /dev/null +++ b/arch/s390/include/asm/vdso/vsyscall.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_VDSO_VSYSCALL_H +#define __ASM_VDSO_VSYSCALL_H + +#ifndef __ASSEMBLY__ + +#include <linux/hrtimer.h> +#include <linux/timekeeper_internal.h> +#include <vdso/datapage.h> +#include <asm/vdso.h> +/* + * Update the vDSO data page to keep in sync with kernel timekeeping. 
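Back in gettimeofday.h, the TOD steering in __arch_get_hw_counter() is dense; the same arithmetic restated in plain C, illustrative only and with invented names:

	/* Editorial restatement of the steering applied above; not in the patch. */
	static inline u64 example_steer_tod(u64 now, u64 steering_end, s64 steering_delta)
	{
		s64 remaining = steering_end - now;	/* > 0 while steering is active */

		if (remaining > 0) {
			/* apply 1/2^15 of the remaining interval per read */
			if (steering_delta < 0)
				now += remaining >> 15;
			else
				now -= remaining >> 15;
		}
		return now;
	}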
+ */ + +static __always_inline struct vdso_data *__s390_get_k_vdso_data(void) +{ + return vdso_data; +} +#define __arch_get_k_vdso_data __s390_get_k_vdso_data + +/* The asm-generic header needs to be included after the definitions above */ +#include <asm-generic/vdso/vsyscall.h> + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_VSYSCALL_H */ diff --git a/arch/s390/include/asm/vga.h b/arch/s390/include/asm/vga.h deleted file mode 100644 index 605dc46bac5e..000000000000 --- a/arch/s390/include/asm/vga.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390_VGA_H -#define _ASM_S390_VGA_H - -/* Avoid compile errors due to missing asm/vga.h */ - -#endif /* _ASM_S390_VGA_H */ diff --git a/arch/s390/include/asm/vmalloc.h b/arch/s390/include/asm/vmalloc.h new file mode 100644 index 000000000000..3ba3a6bdca25 --- /dev/null +++ b/arch/s390/include/asm/vmalloc.h @@ -0,0 +1,4 @@ +#ifndef _ASM_S390_VMALLOC_H +#define _ASM_S390_VMALLOC_H + +#endif /* _ASM_S390_VMALLOC_H */ diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h index 3622d4ebc73a..fe17e448c0c5 100644 --- a/arch/s390/include/asm/vtime.h +++ b/arch/s390/include/asm/vtime.h @@ -2,7 +2,20 @@ #ifndef _S390_VTIME_H #define _S390_VTIME_H -#define __ARCH_HAS_VTIME_ACCOUNT #define __ARCH_HAS_VTIME_TASK_SWITCH +static inline void update_timer_sys(void) +{ + S390_lowcore.system_timer += S390_lowcore.last_update_timer - S390_lowcore.exit_timer; + S390_lowcore.user_timer += S390_lowcore.exit_timer - S390_lowcore.sys_enter_timer; + S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer; +} + +static inline void update_timer_mcck(void) +{ + S390_lowcore.system_timer += S390_lowcore.last_update_timer - S390_lowcore.exit_timer; + S390_lowcore.user_timer += S390_lowcore.exit_timer - S390_lowcore.mcck_enter_timer; + S390_lowcore.last_update_timer = S390_lowcore.mcck_enter_timer; +} + #endif /* _S390_VTIME_H */ diff --git a/arch/s390/include/asm/vtimer.h b/arch/s390/include/asm/vtimer.h index 42f707d1c1e8..e601adaa6320 100644 --- a/arch/s390/include/asm/vtimer.h +++ b/arch/s390/include/asm/vtimer.h @@ -25,8 +25,6 @@ extern void add_virt_timer_periodic(struct vtimer_list *timer); extern int mod_virt_timer(struct vtimer_list *timer, u64 expires); extern int mod_virt_timer_periodic(struct vtimer_list *timer, u64 expires); extern int del_virt_timer(struct vtimer_list *timer); - -extern void init_cpu_vtimer(void); extern void vtime_init(void); #endif /* _ASM_S390_TIMER_H */ diff --git a/arch/s390/include/asm/vx-insn-asm.h b/arch/s390/include/asm/vx-insn-asm.h new file mode 100644 index 000000000000..360f8b36d962 --- /dev/null +++ b/arch/s390/include/asm/vx-insn-asm.h @@ -0,0 +1,681 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Support for Vector Instructions + * + * Assembler macros to generate .byte/.word code for particular + * vector instructions that are supported by recent binutils (>= 2.26) only. + * + * Copyright IBM Corp. 
2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#ifndef __ASM_S390_VX_INSN_INTERNAL_H +#define __ASM_S390_VX_INSN_INTERNAL_H + +#ifndef __ASM_S390_VX_INSN_H +#error only <asm/vx-insn.h> can be included directly +#endif + +#ifdef __ASSEMBLY__ + +/* Macros to generate vector instruction byte code */ + +/* GR_NUM - Retrieve general-purpose register number + * + * @opd: Operand to store register number + * @gr: String designating the register in the format "%rN" + */ +.macro GR_NUM opd gr + \opd = 255 + .ifc \gr,%r0 + \opd = 0 + .endif + .ifc \gr,%r1 + \opd = 1 + .endif + .ifc \gr,%r2 + \opd = 2 + .endif + .ifc \gr,%r3 + \opd = 3 + .endif + .ifc \gr,%r4 + \opd = 4 + .endif + .ifc \gr,%r5 + \opd = 5 + .endif + .ifc \gr,%r6 + \opd = 6 + .endif + .ifc \gr,%r7 + \opd = 7 + .endif + .ifc \gr,%r8 + \opd = 8 + .endif + .ifc \gr,%r9 + \opd = 9 + .endif + .ifc \gr,%r10 + \opd = 10 + .endif + .ifc \gr,%r11 + \opd = 11 + .endif + .ifc \gr,%r12 + \opd = 12 + .endif + .ifc \gr,%r13 + \opd = 13 + .endif + .ifc \gr,%r14 + \opd = 14 + .endif + .ifc \gr,%r15 + \opd = 15 + .endif + .if \opd == 255 + \opd = \gr + .endif +.endm + +/* VX_NUM - Retrieve vector register number + * + * @opd: Operand to store register number + * @vxr: String designating the register in the format "%vN" + * + * The vector register number is used as input to the + * instruction, as well as to compute the RXB field of the + * instruction. + */ +.macro VX_NUM opd vxr + \opd = 255 + .ifc \vxr,%v0 + \opd = 0 + .endif + .ifc \vxr,%v1 + \opd = 1 + .endif + .ifc \vxr,%v2 + \opd = 2 + .endif + .ifc \vxr,%v3 + \opd = 3 + .endif + .ifc \vxr,%v4 + \opd = 4 + .endif + .ifc \vxr,%v5 + \opd = 5 + .endif + .ifc \vxr,%v6 + \opd = 6 + .endif + .ifc \vxr,%v7 + \opd = 7 + .endif + .ifc \vxr,%v8 + \opd = 8 + .endif + .ifc \vxr,%v9 + \opd = 9 + .endif + .ifc \vxr,%v10 + \opd = 10 + .endif + .ifc \vxr,%v11 + \opd = 11 + .endif + .ifc \vxr,%v12 + \opd = 12 + .endif + .ifc \vxr,%v13 + \opd = 13 + .endif + .ifc \vxr,%v14 + \opd = 14 + .endif + .ifc \vxr,%v15 + \opd = 15 + .endif + .ifc \vxr,%v16 + \opd = 16 + .endif + .ifc \vxr,%v17 + \opd = 17 + .endif + .ifc \vxr,%v18 + \opd = 18 + .endif + .ifc \vxr,%v19 + \opd = 19 + .endif + .ifc \vxr,%v20 + \opd = 20 + .endif + .ifc \vxr,%v21 + \opd = 21 + .endif + .ifc \vxr,%v22 + \opd = 22 + .endif + .ifc \vxr,%v23 + \opd = 23 + .endif + .ifc \vxr,%v24 + \opd = 24 + .endif + .ifc \vxr,%v25 + \opd = 25 + .endif + .ifc \vxr,%v26 + \opd = 26 + .endif + .ifc \vxr,%v27 + \opd = 27 + .endif + .ifc \vxr,%v28 + \opd = 28 + .endif + .ifc \vxr,%v29 + \opd = 29 + .endif + .ifc \vxr,%v30 + \opd = 30 + .endif + .ifc \vxr,%v31 + \opd = 31 + .endif + .if \opd == 255 + \opd = \vxr + .endif +.endm + +/* RXB - Compute RXB value from the most significant bits of the used vector registers + * + * @rxb: Operand to store computed RXB value + * @v1: First vector register designated operand + * @v2: Second vector register designated operand + * @v3: Third vector register designated operand + * @v4: Fourth vector register designated operand + */ +.macro RXB rxb v1 v2=0 v3=0 v4=0 + \rxb = 0 + .if \v1 & 0x10 + \rxb = \rxb | 0x08 + .endif + .if \v2 & 0x10 + \rxb = \rxb | 0x04 + .endif + .if \v3 & 0x10 + \rxb = \rxb | 0x02 + .endif + .if \v4 & 0x10 + \rxb = \rxb | 0x01 + .endif +.endm + +/* MRXB - Generate Element Size Control and RXB value + * + * @m: Element size control + * @v1: First vector register designated operand (for RXB) + * @v2: Second vector register designated operand (for RXB) + * @v3: Third vector register designated
operand (for RXB) + * @v4: Fourth vector register designated operand (for RXB) + */ +.macro MRXB m v1 v2=0 v3=0 v4=0 + rxb = 0 + RXB rxb, \v1, \v2, \v3, \v4 + .byte (\m << 4) | rxb +.endm + +/* MRXBOPC - Generate Element Size Control, RXB, and final Opcode fields + * + * @m: Element size control + * @opc: Opcode + * @v1: First vector register designated operand (for RXB) + * @v2: Second vector register designated operand (for RXB) + * @v3: Third vector register designated operand (for RXB) + * @v4: Fourth vector register designated operand (for RXB) + */ +.macro MRXBOPC m opc v1 v2=0 v3=0 v4=0 + MRXB \m, \v1, \v2, \v3, \v4 + .byte \opc +.endm + +/* Vector support instructions */ + +/* VECTOR GENERATE BYTE MASK */ +.macro VGBM vr imm2 + VX_NUM v1, \vr + .word (0xE700 | ((v1&15) << 4)) + .word \imm2 + MRXBOPC 0, 0x44, v1 +.endm +.macro VZERO vxr + VGBM \vxr, 0 +.endm +.macro VONE vxr + VGBM \vxr, 0xFFFF +.endm + +/* VECTOR LOAD VR ELEMENT FROM GR */ +.macro VLVG v, gr, disp, m + VX_NUM v1, \v + GR_NUM b2, "%r0" + GR_NUM r3, \gr + .word 0xE700 | ((v1&15) << 4) | r3 + .word (b2 << 12) | (\disp) + MRXBOPC \m, 0x22, v1 +.endm +.macro VLVGB v, gr, index + VLVG \v, \gr, \index, 0 +.endm +.macro VLVGH v, gr, index + VLVG \v, \gr, \index, 1 +.endm +.macro VLVGF v, gr, index + VLVG \v, \gr, \index, 2 +.endm +.macro VLVGG v, gr, index + VLVG \v, \gr, \index, 3 +.endm + +/* VECTOR LOAD REGISTER */ +.macro VLR v1, v2 + VX_NUM v1, \v1 + VX_NUM v2, \v2 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word 0 + MRXBOPC 0, 0x56, v1, v2 +.endm + +/* VECTOR LOAD */ +.macro VL v, disp, index="%r0", base + VX_NUM v1, \v + GR_NUM x2, \index + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | x2 + .word (b2 << 12) | (\disp) + MRXBOPC 0, 0x06, v1 +.endm + +/* VECTOR LOAD ELEMENT */ +.macro VLEx vr1, disp, index="%r0", base, m3, opc + VX_NUM v1, \vr1 + GR_NUM x2, \index + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | x2 + .word (b2 << 12) | (\disp) + MRXBOPC \m3, \opc, v1 +.endm +.macro VLEB vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x00 +.endm +.macro VLEH vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x01 +.endm +.macro VLEF vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x03 +.endm +.macro VLEG vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x02 +.endm + +/* VECTOR LOAD ELEMENT IMMEDIATE */ +.macro VLEIx vr1, imm2, m3, opc + VX_NUM v1, \vr1 + .word 0xE700 | ((v1&15) << 4) + .word \imm2 + MRXBOPC \m3, \opc, v1 +.endm +.macro VLEIB vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x40 +.endm +.macro VLEIH vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x41 +.endm +.macro VLEIF vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x43 +.endm +.macro VLEIG vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x42 +.endm + +/* VECTOR LOAD GR FROM VR ELEMENT */ +.macro VLGV gr, vr, disp, base="%r0", m + GR_NUM r1, \gr + GR_NUM b2, \base + VX_NUM v3, \vr + .word 0xE700 | (r1 << 4) | (v3&15) + .word (b2 << 12) | (\disp) + MRXBOPC \m, 0x21, v3 +.endm +.macro VLGVB gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 0 +.endm +.macro VLGVH gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 1 +.endm +.macro VLGVF gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 2 +.endm +.macro VLGVG gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 3 +.endm + +/* VECTOR LOAD MULTIPLE */ +.macro VLM vfrom, vto, disp, base, hint=3 + VX_NUM v1, \vfrom + VX_NUM v3, \vto + GR_NUM b2, \base + .word 0xE700 | ((v1&15) <<
4) | (v3&15) + .word (b2 << 12) | (\disp) + MRXBOPC \hint, 0x36, v1, v3 +.endm + +/* VECTOR STORE */ +.macro VST vr1, disp, index="%r0", base + VX_NUM v1, \vr1 + GR_NUM x2, \index + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | (x2&15) + .word (b2 << 12) | (\disp) + MRXBOPC 0, 0x0E, v1 +.endm + +/* VECTOR STORE MULTIPLE */ +.macro VSTM vfrom, vto, disp, base, hint=3 + VX_NUM v1, \vfrom + VX_NUM v3, \vto + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | (v3&15) + .word (b2 << 12) | (\disp) + MRXBOPC \hint, 0x3E, v1, v3 +.endm + +/* VECTOR PERMUTE */ +.macro VPERM vr1, vr2, vr3, vr4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + VX_NUM v4, \vr4 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC (v4&15), 0x8C, v1, v2, v3, v4 +.endm + +/* VECTOR UNPACK LOGICAL LOW */ +.macro VUPLL vr1, vr2, m3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word 0x0000 + MRXBOPC \m3, 0xD4, v1, v2 +.endm +.macro VUPLLB vr1, vr2 + VUPLL \vr1, \vr2, 0 +.endm +.macro VUPLLH vr1, vr2 + VUPLL \vr1, \vr2, 1 +.endm +.macro VUPLLF vr1, vr2 + VUPLL \vr1, \vr2, 2 +.endm + +/* VECTOR PERMUTE DOUBLEWORD IMMEDIATE */ +.macro VPDI vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0x84, v1, v2, v3 +.endm + +/* VECTOR REPLICATE */ +.macro VREP vr1, vr3, imm2, m4 + VX_NUM v1, \vr1 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v3&15) + .word \imm2 + MRXBOPC \m4, 0x4D, v1, v3 +.endm +.macro VREPB vr1, vr3, imm2 + VREP \vr1, \vr3, \imm2, 0 +.endm +.macro VREPH vr1, vr3, imm2 + VREP \vr1, \vr3, \imm2, 1 +.endm +.macro VREPF vr1, vr3, imm2 + VREP \vr1, \vr3, \imm2, 2 +.endm +.macro VREPG vr1, vr3, imm2 + VREP \vr1, \vr3, \imm2, 3 +.endm + +/* VECTOR MERGE HIGH */ +.macro VMRH vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0x61, v1, v2, v3 +.endm +.macro VMRHB vr1, vr2, vr3 + VMRH \vr1, \vr2, \vr3, 0 +.endm +.macro VMRHH vr1, vr2, vr3 + VMRH \vr1, \vr2, \vr3, 1 +.endm +.macro VMRHF vr1, vr2, vr3 + VMRH \vr1, \vr2, \vr3, 2 +.endm +.macro VMRHG vr1, vr2, vr3 + VMRH \vr1, \vr2, \vr3, 3 +.endm + +/* VECTOR MERGE LOW */ +.macro VMRL vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0x60, v1, v2, v3 +.endm +.macro VMRLB vr1, vr2, vr3 + VMRL \vr1, \vr2, \vr3, 0 +.endm +.macro VMRLH vr1, vr2, vr3 + VMRL \vr1, \vr2, \vr3, 1 +.endm +.macro VMRLF vr1, vr2, vr3 + VMRL \vr1, \vr2, \vr3, 2 +.endm +.macro VMRLG vr1, vr2, vr3 + VMRL \vr1, \vr2, \vr3, 3 +.endm + + +/* Vector integer instructions */ + +/* VECTOR AND */ +.macro VN vr1, vr2, vr3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC 0, 0x68, v1, v2, v3 +.endm + +/* VECTOR EXCLUSIVE OR */ +.macro VX vr1, vr2, vr3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC 0, 0x6D, v1, v2, v3 +.endm + +/* VECTOR GALOIS FIELD MULTIPLY SUM */ +.macro VGFM vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0xB4, v1, v2, v3 +.endm +.macro VGFMB vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 0 +.endm +.macro VGFMH vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 1 +.endm +.macro 
VGFMF vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 2 +.endm +.macro VGFMG vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 3 +.endm + +/* VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE */ +.macro VGFMA vr1, vr2, vr3, vr4, m5 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + VX_NUM v4, \vr4 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) | (\m5 << 8) + MRXBOPC (v4&15), 0xBC, v1, v2, v3, v4 +.endm +.macro VGFMAB vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 0 +.endm +.macro VGFMAH vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 1 +.endm +.macro VGFMAF vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 2 +.endm +.macro VGFMAG vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 3 +.endm + +/* VECTOR SHIFT RIGHT LOGICAL BY BYTE */ +.macro VSRLB vr1, vr2, vr3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC 0, 0x7D, v1, v2, v3 +.endm + +/* VECTOR REPLICATE IMMEDIATE */ +.macro VREPI vr1, imm2, m3 + VX_NUM v1, \vr1 + .word 0xE700 | ((v1&15) << 4) + .word \imm2 + MRXBOPC \m3, 0x45, v1 +.endm +.macro VREPIB vr1, imm2 + VREPI \vr1, \imm2, 0 +.endm +.macro VREPIH vr1, imm2 + VREPI \vr1, \imm2, 1 +.endm +.macro VREPIF vr1, imm2 + VREPI \vr1, \imm2, 2 +.endm +.macro VREPIG vr1, imm2 + VREPI \vr1, \imm2, 3 +.endm + +/* VECTOR ADD */ +.macro VA vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0xF3, v1, v2, v3 +.endm +.macro VAB vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 0 +.endm +.macro VAH vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 1 +.endm +.macro VAF vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 2 +.endm +.macro VAG vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 3 +.endm +.macro VAQ vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 4 +.endm + +/* VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */ +.macro VESRAV vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0x7A, v1, v2, v3 +.endm + +.macro VESRAVB vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 0 +.endm +.macro VESRAVH vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 1 +.endm +.macro VESRAVF vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 2 +.endm +.macro VESRAVG vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 3 +.endm + +/* VECTOR ELEMENT ROTATE LEFT LOGICAL */ +.macro VERLL vr1, vr3, disp, base="%r0", m4 + VX_NUM v1, \vr1 + VX_NUM v3, \vr3 + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | (v3&15) + .word (b2 << 12) | (\disp) + MRXBOPC \m4, 0x33, v1, v3 +.endm +.macro VERLLB vr1, vr3, disp, base="%r0" + VERLL \vr1, \vr3, \disp, \base, 0 +.endm +.macro VERLLH vr1, vr3, disp, base="%r0" + VERLL \vr1, \vr3, \disp, \base, 1 +.endm +.macro VERLLF vr1, vr3, disp, base="%r0" + VERLL \vr1, \vr3, \disp, \base, 2 +.endm +.macro VERLLG vr1, vr3, disp, base="%r0" + VERLL \vr1, \vr3, \disp, \base, 3 +.endm + +/* VECTOR SHIFT LEFT DOUBLE BY BYTE */ +.macro VSLDB vr1, vr2, vr3, imm4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) | (\imm4) + MRXBOPC 0, 0x77, v1, v2, v3 +.endm + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_S390_VX_INSN_INTERNAL_H */ diff --git a/arch/s390/include/asm/vx-insn.h b/arch/s390/include/asm/vx-insn.h index 0c05a673811c..8c188f1c6d27 100644 --- a/arch/s390/include/asm/vx-insn.h +++ b/arch/s390/include/asm/vx-insn.h @@ -2,560 +2,18 @@ /* * Support for Vector Instructions * - * Assembler macros to generate .byte/.word code for particular - * vector
instructions that are supported by recent binutils (>= 2.26) only. - * - * Copyright IBM Corp. 2015 - * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + * This wrapper header file allows to use the vector instruction macros in + * both assembler files as well as in inline assemblies in C files. */ #ifndef __ASM_S390_VX_INSN_H #define __ASM_S390_VX_INSN_H -#ifdef __ASSEMBLY__ - - -/* Macros to generate vector instruction byte code */ - -/* GR_NUM - Retrieve general-purpose register number - * - * @opd: Operand to store register number - * @r64: String designation register in the format "%rN" - */ -.macro GR_NUM opd gr - \opd = 255 - .ifc \gr,%r0 - \opd = 0 - .endif - .ifc \gr,%r1 - \opd = 1 - .endif - .ifc \gr,%r2 - \opd = 2 - .endif - .ifc \gr,%r3 - \opd = 3 - .endif - .ifc \gr,%r4 - \opd = 4 - .endif - .ifc \gr,%r5 - \opd = 5 - .endif - .ifc \gr,%r6 - \opd = 6 - .endif - .ifc \gr,%r7 - \opd = 7 - .endif - .ifc \gr,%r8 - \opd = 8 - .endif - .ifc \gr,%r9 - \opd = 9 - .endif - .ifc \gr,%r10 - \opd = 10 - .endif - .ifc \gr,%r11 - \opd = 11 - .endif - .ifc \gr,%r12 - \opd = 12 - .endif - .ifc \gr,%r13 - \opd = 13 - .endif - .ifc \gr,%r14 - \opd = 14 - .endif - .ifc \gr,%r15 - \opd = 15 - .endif - .if \opd == 255 - \opd = \gr - .endif -.endm - -/* VX_NUM - Retrieve vector register number - * - * @opd: Operand to store register number - * @vxr: String designation register in the format "%vN" - * - * The vector register number is used for as input number to the - * instruction and, as well as, to compute the RXB field of the - * instruction. - */ -.macro VX_NUM opd vxr - \opd = 255 - .ifc \vxr,%v0 - \opd = 0 - .endif - .ifc \vxr,%v1 - \opd = 1 - .endif - .ifc \vxr,%v2 - \opd = 2 - .endif - .ifc \vxr,%v3 - \opd = 3 - .endif - .ifc \vxr,%v4 - \opd = 4 - .endif - .ifc \vxr,%v5 - \opd = 5 - .endif - .ifc \vxr,%v6 - \opd = 6 - .endif - .ifc \vxr,%v7 - \opd = 7 - .endif - .ifc \vxr,%v8 - \opd = 8 - .endif - .ifc \vxr,%v9 - \opd = 9 - .endif - .ifc \vxr,%v10 - \opd = 10 - .endif - .ifc \vxr,%v11 - \opd = 11 - .endif - .ifc \vxr,%v12 - \opd = 12 - .endif - .ifc \vxr,%v13 - \opd = 13 - .endif - .ifc \vxr,%v14 - \opd = 14 - .endif - .ifc \vxr,%v15 - \opd = 15 - .endif - .ifc \vxr,%v16 - \opd = 16 - .endif - .ifc \vxr,%v17 - \opd = 17 - .endif - .ifc \vxr,%v18 - \opd = 18 - .endif - .ifc \vxr,%v19 - \opd = 19 - .endif - .ifc \vxr,%v20 - \opd = 20 - .endif - .ifc \vxr,%v21 - \opd = 21 - .endif - .ifc \vxr,%v22 - \opd = 22 - .endif - .ifc \vxr,%v23 - \opd = 23 - .endif - .ifc \vxr,%v24 - \opd = 24 - .endif - .ifc \vxr,%v25 - \opd = 25 - .endif - .ifc \vxr,%v26 - \opd = 26 - .endif - .ifc \vxr,%v27 - \opd = 27 - .endif - .ifc \vxr,%v28 - \opd = 28 - .endif - .ifc \vxr,%v29 - \opd = 29 - .endif - .ifc \vxr,%v30 - \opd = 30 - .endif - .ifc \vxr,%v31 - \opd = 31 - .endif - .if \opd == 255 - \opd = \vxr - .endif -.endm - -/* RXB - Compute most significant bit used vector registers - * - * @rxb: Operand to store computed RXB value - * @v1: First vector register designated operand - * @v2: Second vector register designated operand - * @v3: Third vector register designated operand - * @v4: Fourth vector register designated operand - */ -.macro RXB rxb v1 v2=0 v3=0 v4=0 - \rxb = 0 - .if \v1 & 0x10 - \rxb = \rxb | 0x08 - .endif - .if \v2 & 0x10 - \rxb = \rxb | 0x04 - .endif - .if \v3 & 0x10 - \rxb = \rxb | 0x02 - .endif - .if \v4 & 0x10 - \rxb = \rxb | 0x01 - .endif -.endm - -/* MRXB - Generate Element Size Control and RXB value - * - * @m: Element size control - * @v1: First vector register designated 
operand (for RXB) - * @v2: Second vector register designated operand (for RXB) - * @v3: Third vector register designated operand (for RXB) - * @v4: Fourth vector register designated operand (for RXB) - */ -.macro MRXB m v1 v2=0 v3=0 v4=0 - rxb = 0 - RXB rxb, \v1, \v2, \v3, \v4 - .byte (\m << 4) | rxb -.endm - -/* MRXBOPC - Generate Element Size Control, RXB, and final Opcode fields - * - * @m: Element size control - * @opc: Opcode - * @v1: First vector register designated operand (for RXB) - * @v2: Second vector register designated operand (for RXB) - * @v3: Third vector register designated operand (for RXB) - * @v4: Fourth vector register designated operand (for RXB) - */ -.macro MRXBOPC m opc v1 v2=0 v3=0 v4=0 - MRXB \m, \v1, \v2, \v3, \v4 - .byte \opc -.endm - -/* Vector support instructions */ - -/* VECTOR GENERATE BYTE MASK */ -.macro VGBM vr imm2 - VX_NUM v1, \vr - .word (0xE700 | ((v1&15) << 4)) - .word \imm2 - MRXBOPC 0, 0x44, v1 -.endm -.macro VZERO vxr - VGBM \vxr, 0 -.endm -.macro VONE vxr - VGBM \vxr, 0xFFFF -.endm - -/* VECTOR LOAD VR ELEMENT FROM GR */ -.macro VLVG v, gr, disp, m - VX_NUM v1, \v - GR_NUM b2, "%r0" - GR_NUM r3, \gr - .word 0xE700 | ((v1&15) << 4) | r3 - .word (b2 << 12) | (\disp) - MRXBOPC \m, 0x22, v1 -.endm -.macro VLVGB v, gr, index, base - VLVG \v, \gr, \index, \base, 0 -.endm -.macro VLVGH v, gr, index - VLVG \v, \gr, \index, 1 -.endm -.macro VLVGF v, gr, index - VLVG \v, \gr, \index, 2 -.endm -.macro VLVGG v, gr, index - VLVG \v, \gr, \index, 3 -.endm - -/* VECTOR LOAD REGISTER */ -.macro VLR v1, v2 - VX_NUM v1, \v1 - VX_NUM v2, \v2 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word 0 - MRXBOPC 0, 0x56, v1, v2 -.endm - -/* VECTOR LOAD */ -.macro VL v, disp, index="%r0", base - VX_NUM v1, \v - GR_NUM x2, \index - GR_NUM b2, \base - .word 0xE700 | ((v1&15) << 4) | x2 - .word (b2 << 12) | (\disp) - MRXBOPC 0, 0x06, v1 -.endm - -/* VECTOR LOAD ELEMENT */ -.macro VLEx vr1, disp, index="%r0", base, m3, opc - VX_NUM v1, \vr1 - GR_NUM x2, \index - GR_NUM b2, \base - .word 0xE700 | ((v1&15) << 4) | x2 - .word (b2 << 12) | (\disp) - MRXBOPC \m3, \opc, v1 -.endm -.macro VLEB vr1, disp, index="%r0", base, m3 - VLEx \vr1, \disp, \index, \base, \m3, 0x00 -.endm -.macro VLEH vr1, disp, index="%r0", base, m3 - VLEx \vr1, \disp, \index, \base, \m3, 0x01 -.endm -.macro VLEF vr1, disp, index="%r0", base, m3 - VLEx \vr1, \disp, \index, \base, \m3, 0x03 -.endm -.macro VLEG vr1, disp, index="%r0", base, m3 - VLEx \vr1, \disp, \index, \base, \m3, 0x02 -.endm - -/* VECTOR LOAD ELEMENT IMMEDIATE */ -.macro VLEIx vr1, imm2, m3, opc - VX_NUM v1, \vr1 - .word 0xE700 | ((v1&15) << 4) - .word \imm2 - MRXBOPC \m3, \opc, v1 -.endm -.macro VLEIB vr1, imm2, index - VLEIx \vr1, \imm2, \index, 0x40 -.endm -.macro VLEIH vr1, imm2, index - VLEIx \vr1, \imm2, \index, 0x41 -.endm -.macro VLEIF vr1, imm2, index - VLEIx \vr1, \imm2, \index, 0x43 -.endm -.macro VLEIG vr1, imm2, index - VLEIx \vr1, \imm2, \index, 0x42 -.endm - -/* VECTOR LOAD GR FROM VR ELEMENT */ -.macro VLGV gr, vr, disp, base="%r0", m - GR_NUM r1, \gr - GR_NUM b2, \base - VX_NUM v3, \vr - .word 0xE700 | (r1 << 4) | (v3&15) - .word (b2 << 12) | (\disp) - MRXBOPC \m, 0x21, v3 -.endm -.macro VLGVB gr, vr, disp, base="%r0" - VLGV \gr, \vr, \disp, \base, 0 -.endm -.macro VLGVH gr, vr, disp, base="%r0" - VLGV \gr, \vr, \disp, \base, 1 -.endm -.macro VLGVF gr, vr, disp, base="%r0" - VLGV \gr, \vr, \disp, \base, 2 -.endm -.macro VLGVG gr, vr, disp, base="%r0" - VLGV \gr, \vr, \disp, \base, 3 -.endm - -/* VECTOR LOAD MULTIPLE */ -.macro 
VLM vfrom, vto, disp, base, hint=3 - VX_NUM v1, \vfrom - VX_NUM v3, \vto - GR_NUM b2, \base /* Base register */ - .word 0xE700 | ((v1&15) << 4) | (v3&15) - .word (b2 << 12) | (\disp) - MRXBOPC \hint, 0x36, v1, v3 -.endm - -/* VECTOR STORE MULTIPLE */ -.macro VSTM vfrom, vto, disp, base, hint=3 - VX_NUM v1, \vfrom - VX_NUM v3, \vto - GR_NUM b2, \base /* Base register */ - .word 0xE700 | ((v1&15) << 4) | (v3&15) - .word (b2 << 12) | (\disp) - MRXBOPC \hint, 0x3E, v1, v3 -.endm - -/* VECTOR PERMUTE */ -.macro VPERM vr1, vr2, vr3, vr4 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - VX_NUM v4, \vr4 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) - MRXBOPC (v4&15), 0x8C, v1, v2, v3, v4 -.endm - -/* VECTOR UNPACK LOGICAL LOW */ -.macro VUPLL vr1, vr2, m3 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word 0x0000 - MRXBOPC \m3, 0xD4, v1, v2 -.endm -.macro VUPLLB vr1, vr2 - VUPLL \vr1, \vr2, 0 -.endm -.macro VUPLLH vr1, vr2 - VUPLL \vr1, \vr2, 1 -.endm -.macro VUPLLF vr1, vr2 - VUPLL \vr1, \vr2, 2 -.endm - - -/* Vector integer instructions */ - -/* VECTOR AND */ -.macro VN vr1, vr2, vr3 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) - MRXBOPC 0, 0x68, v1, v2, v3 -.endm - -/* VECTOR EXCLUSIVE OR */ -.macro VX vr1, vr2, vr3 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) - MRXBOPC 0, 0x6D, v1, v2, v3 -.endm - -/* VECTOR GALOIS FIELD MULTIPLY SUM */ -.macro VGFM vr1, vr2, vr3, m4 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) - MRXBOPC \m4, 0xB4, v1, v2, v3 -.endm -.macro VGFMB vr1, vr2, vr3 - VGFM \vr1, \vr2, \vr3, 0 -.endm -.macro VGFMH vr1, vr2, vr3 - VGFM \vr1, \vr2, \vr3, 1 -.endm -.macro VGFMF vr1, vr2, vr3 - VGFM \vr1, \vr2, \vr3, 2 -.endm -.macro VGFMG vr1, vr2, vr3 - VGFM \vr1, \vr2, \vr3, 3 -.endm - -/* VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE */ -.macro VGFMA vr1, vr2, vr3, vr4, m5 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - VX_NUM v4, \vr4 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) | (\m5 << 8) - MRXBOPC (v4&15), 0xBC, v1, v2, v3, v4 -.endm -.macro VGFMAB vr1, vr2, vr3, vr4 - VGFMA \vr1, \vr2, \vr3, \vr4, 0 -.endm -.macro VGFMAH vr1, vr2, vr3, vr4 - VGFMA \vr1, \vr2, \vr3, \vr4, 1 -.endm -.macro VGFMAF vr1, vr2, vr3, vr4 - VGFMA \vr1, \vr2, \vr3, \vr4, 2 -.endm -.macro VGFMAG vr1, vr2, vr3, vr4 - VGFMA \vr1, \vr2, \vr3, \vr4, 3 -.endm - -/* VECTOR SHIFT RIGHT LOGICAL BY BYTE */ -.macro VSRLB vr1, vr2, vr3 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) - MRXBOPC 0, 0x7D, v1, v2, v3 -.endm - -/* VECTOR REPLICATE IMMEDIATE */ -.macro VREPI vr1, imm2, m3 - VX_NUM v1, \vr1 - .word 0xE700 | ((v1&15) << 4) - .word \imm2 - MRXBOPC \m3, 0x45, v1 -.endm -.macro VREPIB vr1, imm2 - VREPI \vr1, \imm2, 0 -.endm -.macro VREPIH vr1, imm2 - VREPI \vr1, \imm2, 1 -.endm -.macro VREPIF vr1, imm2 - VREPI \vr1, \imm2, 2 -.endm -.macro VREPIG vr1, imm2 - VREP \vr1, \imm2, 3 -.endm - -/* VECTOR ADD */ -.macro VA vr1, vr2, vr3, m4 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) - MRXBOPC \m4, 0xF3, v1, v2, v3 -.endm -.macro VAB vr1, vr2, vr3 - VA \vr1, \vr2, \vr3, 0 -.endm -.macro VAH vr1, vr2, vr3 - VA \vr1, \vr2, \vr3, 1 -.endm -.macro VAF vr1, vr2, vr3 - 
VA \vr1, \vr2, \vr3, 2 -.endm -.macro VAG vr1, vr2, vr3 - VA \vr1, \vr2, \vr3, 3 -.endm -.macro VAQ vr1, vr2, vr3 - VA \vr1, \vr2, \vr3, 4 -.endm +#include <asm/vx-insn-asm.h> -/* VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */ -.macro VESRAV vr1, vr2, vr3, m4 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) - MRXBOPC \m4, 0x7A, v1, v2, v3 -.endm +#ifndef __ASSEMBLY__ -.macro VESRAVB vr1, vr2, vr3 - VESRAV \vr1, \vr2, \vr3, 0 -.endm -.macro VESRAVH vr1, vr2, vr3 - VESRAV \vr1, \vr2, \vr3, 1 -.endm -.macro VESRAVF vr1, vr2, vr3 - VESRAV \vr1, \vr2, \vr3, 2 -.endm -.macro VESRAVG vr1, vr2, vr3 - VESRAV \vr1, \vr2, \vr3, 3 -.endm +asm(".include \"asm/vx-insn-asm.h\"\n"); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLY__ */ #endif /* __ASM_S390_VX_INSN_H */ diff --git a/arch/s390/include/asm/word-at-a-time.h b/arch/s390/include/asm/word-at-a-time.h new file mode 100644 index 000000000000..2579f1694b82 --- /dev/null +++ b/arch/s390/include/asm/word-at-a-time.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_WORD_AT_A_TIME_H +#define _ASM_WORD_AT_A_TIME_H + +#include <linux/kernel.h> +#include <asm/asm-extable.h> +#include <asm/bitsperlong.h> + +struct word_at_a_time { + const unsigned long bits; +}; + +#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x7f) } + +static inline unsigned long prep_zero_mask(unsigned long val, unsigned long data, const struct word_at_a_time *c) +{ + return data; +} + +static inline unsigned long create_zero_mask(unsigned long data) +{ + return __fls(data); +} + +static inline unsigned long find_zero(unsigned long data) +{ + return (data ^ (BITS_PER_LONG - 1)) >> 3; +} + +static inline unsigned long has_zero(unsigned long val, unsigned long *data, const struct word_at_a_time *c) +{ + unsigned long mask = (val & c->bits) + c->bits; + + *data = ~(mask | val | c->bits); + return *data; +} + +static inline unsigned long zero_bytemask(unsigned long data) +{ + return ~1UL << data; +} + +/* + * Load an unaligned word from kernel space. + * + * In the (very unlikely) case of the word being a page-crosser + * and the next page not being mapped, take the exception and + * return zeroes in the non-existing part. + */ +static inline unsigned long load_unaligned_zeropad(const void *addr) +{ + unsigned long data; + + asm volatile( + "0: lg %[data],0(%[addr])\n" + "1: nopr %%r7\n" + EX_TABLE_ZEROPAD(0b, 1b, %[data], %[addr]) + EX_TABLE_ZEROPAD(1b, 1b, %[data], %[addr]) + : [data] "=d" (data) + : [addr] "a" (addr), "m" (*(unsigned long *)addr)); + return data; +} + +#endif /* _ASM_WORD_AT_A_TIME_H */ diff --git a/arch/s390/include/uapi/asm/cmb.h b/arch/s390/include/uapi/asm/cmb.h index ecbe94941403..115434ab98fb 100644 --- a/arch/s390/include/uapi/asm/cmb.h +++ b/arch/s390/include/uapi/asm/cmb.h @@ -31,7 +31,7 @@ struct cmbdata { __u64 size; __u64 elapsed_time; - /* basic and exended format: */ + /* basic and extended format: */ __u64 ssch_rsch_count; __u64 sample_count; __u64 device_connect_time; diff --git a/arch/s390/include/uapi/asm/dasd.h b/arch/s390/include/uapi/asm/dasd.h index 9ec86fae9980..b11d98800458 100644 --- a/arch/s390/include/uapi/asm/dasd.h +++ b/arch/s390/include/uapi/asm/dasd.h @@ -24,7 +24,7 @@ /* * struct dasd_information2_t * represents any data about the device, which is visible to userspace. - * including foramt and featueres. + * including format and featueres. 
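About the word-at-a-time helpers added above: for orientation, the canonical way generic string code chains them, sketched here under the assumption of word-aligned input (function name invented, not part of the patch):

	/* Illustrative only: counting a string's length a word at a time. */
	static inline size_t example_strlen_word(const char *s)
	{
		const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
		const unsigned long *p = (const unsigned long *)s;
		unsigned long val, data;
		size_t len = 0;

		for (;;) {
			val = *p++;
			if (has_zero(val, &data, &constants)) {
				data = prep_zero_mask(val, data, &constants);
				data = create_zero_mask(data);
				return len + find_zero(data);	/* byte index of NUL */
			}
			len += sizeof(unsigned long);
		}
	}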
*/ typedef struct dasd_information2_t { unsigned int devno; /* S/390 devno */ @@ -78,6 +78,7 @@ typedef struct dasd_information2_t { * 0x040: give access to raw eckd data * 0x080: enable discard support * 0x100: enable autodisable for IFCC errors (default) + * 0x200: enable requeue of all requests on autoquiesce */ #define DASD_FEATURE_READONLY 0x001 #define DASD_FEATURE_USEDIAG 0x002 @@ -88,6 +89,7 @@ typedef struct dasd_information2_t { #define DASD_FEATURE_USERAW 0x040 #define DASD_FEATURE_DISCARD 0x080 #define DASD_FEATURE_PATH_AUTODISABLE 0x100 +#define DASD_FEATURE_REQUEUEQUIESCE 0x200 #define DASD_FEATURE_DEFAULT DASD_FEATURE_PATH_AUTODISABLE #define DASD_PARTN_BITS 2 @@ -183,6 +185,18 @@ typedef struct format_data_t { } format_data_t; /* + * struct dasd_copypair_swap_data_t + * represents all data necessary to issue a swap of the copy pair relation + */ +struct dasd_copypair_swap_data_t { + char primary[20]; /* BUSID of primary */ + char secondary[20]; /* BUSID of secondary */ + + /* Reserved for future updates. */ + __u8 reserved[64]; +}; + +/* * values to be used for format_data_t.intensity * 0/8: normal format * 1/9: also write record zero @@ -326,6 +340,8 @@ struct dasd_snid_ioctl_data { #define BIODASDSATTR _IOW(DASD_IOCTL_LETTER,2,attrib_data_t) /* Release Allocated Space */ #define BIODASDRAS _IOW(DASD_IOCTL_LETTER, 3, format_data_t) +/* Swap copy pair relation */ +#define BIODASDCOPYPAIRSWAP _IOW(DASD_IOCTL_LETTER, 4, struct dasd_copypair_swap_data_t) /* Get Sense Path Group ID (SNID) data */ #define BIODASDSNID _IOWR(DASD_IOCTL_LETTER, 1, struct dasd_snid_ioctl_data) diff --git a/arch/s390/include/uapi/asm/debug.h b/arch/s390/include/uapi/asm/debug.h deleted file mode 100644 index c7c564d9aea4..000000000000 --- a/arch/s390/include/uapi/asm/debug.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * S/390 debug facility - * - * Copyright IBM Corp. 1999, 2000 - */ - -#ifndef _UAPIDEBUG_H -#define _UAPIDEBUG_H - -#include <linux/fs.h> - -/* Note: - * struct __debug_entry must be defined outside of #ifdef __KERNEL__ - * in order to allow a user program to analyze the 'raw'-view. - */ - -struct __debug_entry{ - union { - struct { - unsigned long long clock:52; - unsigned long long exception:1; - unsigned long long level:3; - unsigned long long cpuid:8; - } fields; - - unsigned long long stck; - } id; - void* caller; -} __attribute__((packed)); - - -#define __DEBUG_FEATURE_VERSION 2 /* version of debug feature */ - -#endif /* _UAPIDEBUG_H */ diff --git a/arch/s390/include/uapi/asm/fs3270.h b/arch/s390/include/uapi/asm/fs3270.h new file mode 100644 index 000000000000..c4bc1108af6a --- /dev/null +++ b/arch/s390/include/uapi/asm/fs3270.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_S390_UAPI_FS3270_H +#define __ASM_S390_UAPI_FS3270_H + +#include <linux/types.h> +#include <asm/ioctl.h> + +/* ioctls for fullscreen 3270 */ +#define TUBICMD _IO('3', 3) /* set ccw command for fs reads. */ +#define TUBOCMD _IO('3', 4) /* set ccw command for fs writes. */ +#define TUBGETI _IO('3', 7) /* get ccw command for fs reads. */ +#define TUBGETO _IO('3', 8) /* get ccw command for fs writes. 
*/ +#define TUBGETMOD _IO('3', 13) /* get characteristics like model, cols, rows */ + +/* For TUBGETMOD */ +struct raw3270_iocb { + __u16 model; + __u16 line_cnt; + __u16 col_cnt; + __u16 pf_cnt; + __u16 re_cnt; + __u16 map; +}; + +#endif /* __ASM_S390_UAPI_FS3270_H */ diff --git a/arch/s390/include/uapi/asm/hwctrset.h b/arch/s390/include/uapi/asm/hwctrset.h new file mode 100644 index 000000000000..e56b9dd23a4b --- /dev/null +++ b/arch/s390/include/uapi/asm/hwctrset.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright IBM Corp. 2021 + * Interface implementation for communication with the CPU Measurement + * counter facility device driver. + * + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + * + * Define for ioctl() commands to communicate with the CPU Measurement + * counter facility device driver. + */ + +#ifndef _PERF_CPUM_CF_DIAG_H +#define _PERF_CPUM_CF_DIAG_H + +#include <linux/ioctl.h> +#include <linux/types.h> + +#define S390_HWCTR_DEVICE "hwctr" +#define S390_HWCTR_START_VERSION 1 + +struct s390_ctrset_start { /* Set CPUs to operate on */ + __u64 version; /* Version of interface */ + __u64 data_bytes; /* # of bytes required */ + __u64 cpumask_len; /* Length of CPU mask in bytes */ + __u64 *cpumask; /* Pointer to CPU mask */ + __u64 counter_sets; /* Bit mask of counter sets to get */ +}; + +struct s390_ctrset_setdata { /* Counter set data */ + __u32 set; /* Counter set number */ + __u32 no_cnts; /* # of counters stored in cv[] */ + __u64 cv[]; /* Counter values (variable length) */ +}; + +struct s390_ctrset_cpudata { /* Counter set data per CPU */ + __u32 cpu_nr; /* CPU number */ + __u32 no_sets; /* # of counters sets in data[] */ + struct s390_ctrset_setdata data[]; +}; + +struct s390_ctrset_read { /* Structure to get all ctr sets */ + __u64 no_cpus; /* Total # of CPUs data taken from */ + struct s390_ctrset_cpudata data[]; +}; + +#define S390_HWCTR_MAGIC 'C' /* Random magic # for ioctls */ +#define S390_HWCTR_START _IOWR(S390_HWCTR_MAGIC, 1, struct s390_ctrset_start) +#define S390_HWCTR_STOP _IO(S390_HWCTR_MAGIC, 2) +#define S390_HWCTR_READ _IOWR(S390_HWCTR_MAGIC, 3, struct s390_ctrset_read) +#endif diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h index 451ba7d08905..2cd28af50dd4 100644 --- a/arch/s390/include/uapi/asm/ipl.h +++ b/arch/s390/include/uapi/asm/ipl.h @@ -27,6 +27,8 @@ enum ipl_pbt { IPL_PBT_FCP = 0, IPL_PBT_SCP_DATA = 1, IPL_PBT_CCW = 2, + IPL_PBT_ECKD = 3, + IPL_PBT_NVME = 4, }; /* IPL Parameter Block 0 with common fields */ @@ -67,6 +69,30 @@ struct ipl_pb0_fcp { #define IPL_PB0_FCP_OPT_IPL 0x10 #define IPL_PB0_FCP_OPT_DUMP 0x20 +/* IPL Parameter Block 0 for NVMe */ +struct ipl_pb0_nvme { + __u32 len; + __u8 pbt; + __u8 reserved1[3]; + __u8 loadparm[8]; + __u8 reserved2[304]; + __u8 opt; + __u8 reserved3[3]; + __u32 fid; + __u8 reserved4[12]; + __u32 nsid; + __u8 reserved5[4]; + __u32 bootprog; + __u8 reserved6[12]; + __u64 br_lba; + __u32 scp_data_len; + __u8 reserved7[260]; + __u8 scp_data[]; +} __packed; + +#define IPL_PB0_NVME_OPT_IPL 0x10 +#define IPL_PB0_NVME_OPT_DUMP 0x20 + /* IPL Parameter Block 0 for CCW */ struct ipl_pb0_ccw { __u32 len; @@ -86,6 +112,34 @@ struct ipl_pb0_ccw { __u8 reserved5[8]; } __packed; +/* IPL Parameter Block 0 for ECKD */ +struct ipl_pb0_eckd { + __u32 len; + __u8 pbt; + __u8 reserved1[3]; + __u32 reserved2[78]; + __u8 opt; + __u8 reserved4[4]; + __u8 reserved5:5; + __u8 ssid:3; + __u16 devno; + __u32 reserved6[5]; + __u32 bootprog; + __u8 
reserved7[12]; + struct { + __u16 cyl; + __u8 head; + __u8 record; + __u32 reserved; + } br_chr __packed; + __u32 scp_data_len; + __u8 reserved8[260]; + __u8 scp_data[]; +} __packed; + +#define IPL_PB0_ECKD_OPT_IPL 0x10 +#define IPL_PB0_ECKD_OPT_DUMP 0x20 + #define IPL_PB0_CCW_VM_FLAG_NSS 0x80 #define IPL_PB0_CCW_VM_FLAG_VP 0x40 diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 436ec7636927..abe926d43cbe 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -74,6 +74,7 @@ struct kvm_s390_io_adapter_req { #define KVM_S390_VM_CRYPTO 2 #define KVM_S390_VM_CPU_MODEL 3 #define KVM_S390_VM_MIGRATION 4 +#define KVM_S390_VM_CPU_TOPOLOGY 5 /* kvm attributes for mem_ctrl */ #define KVM_S390_VM_MEM_ENABLE_CMMA 0 @@ -158,6 +159,22 @@ struct kvm_s390_vm_cpu_subfunc { __u8 reserved[1728]; }; +#define KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST 6 +#define KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST 7 + +#define KVM_S390_VM_CPU_UV_FEAT_NR_BITS 64 +struct kvm_s390_vm_cpu_uv_feat { + union { + struct { + __u64 : 4; + __u64 ap : 1; /* bit 4 */ + __u64 ap_intr : 1; /* bit 5 */ + __u64 : 58; + }; + __u64 feat; + }; +}; + /* kvm attributes for crypto */ #define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0 #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1 @@ -231,11 +248,13 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_GSCB (1UL << 9) #define KVM_SYNC_BPBC (1UL << 10) #define KVM_SYNC_ETOKEN (1UL << 11) +#define KVM_SYNC_DIAG318 (1UL << 12) #define KVM_SYNC_S390_VALID_FIELDS \ (KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | \ KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_VRS | KVM_SYNC_RICCB | \ - KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN) + KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN | \ + KVM_SYNC_DIAG318) /* length and alignment of the sdnx as a power of two */ #define SDNXC 8 @@ -264,7 +283,8 @@ struct kvm_sync_regs { __u8 reserved2 : 7; __u8 padding1[51]; /* riccb needs to be 64byte aligned */ __u8 riccb[64]; /* runtime instrumentation controls block */ - __u8 padding2[192]; /* sdnx needs to be 256byte aligned */ + __u64 diag318; /* diagnose 0x318 info */ + __u8 padding2[184]; /* sdnx needs to be 256byte aligned */ union { __u8 sdnx[SDNXL]; /* state description annex */ struct { diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h index e22f0720bbb8..5ad76471e73f 100644 --- a/arch/s390/include/uapi/asm/pkey.h +++ b/arch/s390/include/uapi/asm/pkey.h @@ -2,7 +2,7 @@ /* * Userspace interface to the pkey device driver * - * Copyright IBM Corp. 2017, 2019 + * Copyright IBM Corp. 
2017, 2023 * * Author: Harald Freudenberger <freude@de.ibm.com> * @@ -25,20 +25,31 @@ #define MAXPROTKEYSIZE 64 /* a protected key blob may be up to 64 bytes */ #define MAXCLRKEYSIZE 32 /* a clear key value may be up to 32 bytes */ #define MAXAESCIPHERKEYSIZE 136 /* our aes cipher keys have always 136 bytes */ +#define MINEP11AESKEYBLOBSIZE 256 /* min EP11 AES key blob size */ +#define MAXEP11AESKEYBLOBSIZE 336 /* max EP11 AES key blob size */ -/* Minimum and maximum size of a key blob */ +/* Minimum size of a key blob */ #define MINKEYBLOBSIZE SECKEYBLOBSIZE -#define MAXKEYBLOBSIZE MAXAESCIPHERKEYSIZE /* defines for the type field within the pkey_protkey struct */ -#define PKEY_KEYTYPE_AES_128 1 -#define PKEY_KEYTYPE_AES_192 2 -#define PKEY_KEYTYPE_AES_256 3 +#define PKEY_KEYTYPE_AES_128 1 +#define PKEY_KEYTYPE_AES_192 2 +#define PKEY_KEYTYPE_AES_256 3 +#define PKEY_KEYTYPE_ECC 4 +#define PKEY_KEYTYPE_ECC_P256 5 +#define PKEY_KEYTYPE_ECC_P384 6 +#define PKEY_KEYTYPE_ECC_P521 7 +#define PKEY_KEYTYPE_ECC_ED25519 8 +#define PKEY_KEYTYPE_ECC_ED448 9 /* the newer ioctls use a pkey_key_type enum for type information */ enum pkey_key_type { PKEY_TYPE_CCA_DATA = (__u32) 1, PKEY_TYPE_CCA_CIPHER = (__u32) 2, + PKEY_TYPE_EP11 = (__u32) 3, + PKEY_TYPE_CCA_ECC = (__u32) 0x1f, + PKEY_TYPE_EP11_AES = (__u32) 6, + PKEY_TYPE_EP11_ECC = (__u32) 7, }; /* the newer ioctls use a pkey_key_size enum for key size information */ @@ -87,6 +98,20 @@ struct pkey_clrkey { }; /* + * EP11 key blobs of type PKEY_TYPE_EP11_AES and PKEY_TYPE_EP11_ECC + * are ep11 blobs prepended by this header: + */ +struct ep11kblob_header { + __u8 type; /* always 0x00 */ + __u8 hver; /* header version, currently needs to be 0x00 */ + __u16 len; /* total length in bytes (including this header) */ + __u8 version; /* PKEY_TYPE_EP11_AES or PKEY_TYPE_EP11_ECC */ + __u8 res0; /* unused */ + __u16 bitlen; /* clear key bit len, 0 for unknown */ + __u8 res1[8]; /* unused */ +} __packed; + +/* * Generate CCA AES secure key. */ struct pkey_genseck { @@ -151,7 +176,7 @@ struct pkey_skey2pkey { #define PKEY_SKEY2PKEY _IOWR(PKEY_IOCTL_MAGIC, 0x06, struct pkey_skey2pkey) /* - * Verify the given CCA AES secure key for being able to be useable with + * Verify the given CCA AES secure key for being able to be usable with * the pkey module. Check for correct key type and check for having at * least one crypto card being able to handle this key (master key * or old master key verification pattern matches). @@ -200,7 +225,7 @@ struct pkey_kblob2pkey { /* * Generate secure key, version 2. - * Generate either a CCA AES secure key or a CCA AES cipher key. + * Generate CCA AES secure key, CCA AES cipher key or EP11 AES secure key. * There needs to be a list of apqns given with at least one entry in there. * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain * is not supported. The implementation walks through the list of apqns and @@ -210,10 +235,13 @@ struct pkey_kblob2pkey { * (return -1 with errno ENODEV). You may use the PKEY_APQNS4KT ioctl to * generate a list of apqns based on the key type to generate. * The keygenflags argument is passed to the low level generation functions - * individual for the key type and has a key type specific meaning. Currently - * only CCA AES cipher keys react to this parameter: Use one or more of the - * PKEY_KEYGEN_* flags to widen the export possibilities. By default a cipher - * key is only exportable for CPACF (PKEY_KEYGEN_XPRT_CPAC). 
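Pulling this hunk together: a userspace sketch of generating an EP11 AES key through the pkey_genseck2 structure defined just below and its corresponding PKEY_GENSECK2 ioctl. The APQN card/domain values are placeholders, and the structure members not visible in this hunk (type, size, keygenflags, key, keylen) are assumed from the same header:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/pkey.h>

int main(void)
{
	struct pkey_apqn apqn = { .card = 0x0a, .domain = 0x002b };	/* placeholder APQN */
	unsigned char blob[MAXEP11AESKEYBLOBSIZE];
	struct pkey_genseck2 gs;
	int fd;

	memset(&gs, 0, sizeof(gs));
	gs.apqns = &apqn;		/* exact APQN; 0xFFFF wildcards are rejected */
	gs.apqn_entries = 1;
	gs.type = PKEY_TYPE_EP11_AES;
	gs.size = PKEY_SIZE_AES_256;
	gs.keygenflags = 0;		/* 0 = the XCP_BLOB_* defaults described above */
	gs.key = blob;
	gs.keylen = sizeof(blob);	/* in: buffer size, out: actual blob size */

	fd = open("/dev/pkey", O_RDWR);
	if (fd < 0 || ioctl(fd, PKEY_GENSECK2, &gs) < 0) {
		perror("PKEY_GENSECK2");
		return 1;
	}
	printf("generated %u byte EP11 AES-256 key blob\n", gs.keylen);
	close(fd);
	return 0;
}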
+ * individual for the key type and has a key type specific meaning. When + * generating CCA cipher keys you can use one or more of the PKEY_KEYGEN_* + * flags to widen the export possibilities. By default a cipher key is + * only exportable for CPACF (PKEY_KEYGEN_XPRT_CPAC). + * The keygenflag argument for generating an EP11 AES key should either be 0 + * to use the defaults which are XCP_BLOB_ENCRYPT, XCP_BLOB_DECRYPT and + * XCP_BLOB_PROTKEY_EXTRACTABLE or a valid combination of XCP_BLOB_* flags. */ struct pkey_genseck2 { struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets*/ @@ -229,8 +257,8 @@ struct pkey_genseck2 { /* * Generate secure key from clear key value, version 2. - * Construct a CCA AES secure key or CCA AES cipher key from a given clear key - * value. + * Construct an CCA AES secure key, CCA AES cipher key or EP11 AES secure + * key from a given clear key value. * There needs to be a list of apqns given with at least one entry in there. * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain * is not supported. The implementation walks through the list of apqns and @@ -240,10 +268,13 @@ struct pkey_genseck2 { * (return -1 with errno ENODEV). You may use the PKEY_APQNS4KT ioctl to * generate a list of apqns based on the key type to generate. * The keygenflags argument is passed to the low level generation functions - * individual for the key type and has a key type specific meaning. Currently - * only CCA AES cipher keys react to this parameter: Use one or more of the - * PKEY_KEYGEN_* flags to widen the export possibilities. By default a cipher - * key is only exportable for CPACF (PKEY_KEYGEN_XPRT_CPAC). + * individual for the key type and has a key type specific meaning. When + * generating CCA cipher keys you can use one or more of the PKEY_KEYGEN_* + * flags to widen the export possibilities. By default a cipher key is + * only exportable for CPACF (PKEY_KEYGEN_XPRT_CPAC). + * The keygenflag argument for generating an EP11 AES key should either be 0 + * to use the defaults which are XCP_BLOB_ENCRYPT, XCP_BLOB_DECRYPT and + * XCP_BLOB_PROTKEY_EXTRACTABLE or a valid combination of XCP_BLOB_* flags. */ struct pkey_clr2seck2 { struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets */ @@ -266,14 +297,19 @@ struct pkey_clr2seck2 { * with one apqn able to handle this key. * The function also checks for the master key verification patterns * of the key matching to the current or alternate mkvp of the apqn. - * Currently CCA AES secure keys and CCA AES cipher keys are supported. - * The flags field is updated with some additional info about the apqn mkvp + * For CCA AES secure keys and CCA AES cipher keys this means to check + * the key's mkvp against the current or old mkvp of the apqns. The flags + * field is updated with some additional info about the apqn mkvp * match: If the current mkvp matches to the key's mkvp then the * PKEY_FLAGS_MATCH_CUR_MKVP bit is set, if the alternate mkvp matches to * the key's mkvp the PKEY_FLAGS_MATCH_ALT_MKVP is set. For CCA keys the * alternate mkvp is the old master key verification pattern. * CCA AES secure keys are also checked to have the CPACF export allowed * bit enabled (XPRTCPAC) in the kmf1 field. + * EP11 keys are also supported and the wkvp of the key is checked against + * the current wkvp of the apqns. There is no alternate for this type of + * key and so on a match the flag PKEY_FLAGS_MATCH_CUR_MKVP always is set. 
+ * EP11 keys are also checked to have XCP_BLOB_PROTKEY_EXTRACTABLE set. * The ioctl returns 0 as long as the given or found apqn matches to * matches with the current or alternate mkvp to the key's mkvp. If the given * apqn does not match or there is no such apqn found, -1 with errno @@ -291,7 +327,7 @@ struct pkey_verifykey2 { #define PKEY_VERIFYKEY2 _IOWR(PKEY_IOCTL_MAGIC, 0x17, struct pkey_verifykey2) /* - * Transform a key blob (of any type) into a protected key, version 2. + * Transform a key blob into a protected key, version 2. * There needs to be a list of apqns given with at least one entry in there. * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain * is not supported. The implementation walks through the list of apqns and @@ -300,6 +336,8 @@ struct pkey_verifykey2 { * list is tried until success (return 0) or the end of the list is reached * (return -1 with errno ENODEV). You may use the PKEY_APQNS4K ioctl to * generate a list of apqns based on the key. + * Deriving ECC protected keys from ECC secure keys is not supported with + * this ioctl, use PKEY_KBLOB2PROTK3 for this purpose. */ struct pkey_kblob2pkey2 { __u8 __user *key; /* in: pointer to key blob */ @@ -313,22 +351,26 @@ struct pkey_kblob2pkey2 { /* * Build a list of APQNs based on a key blob given. * Is able to find out which type of secure key is given (CCA AES secure - * key or CCA AES cipher key) and tries to find all matching crypto cards - * based on the MKVP and maybe other criterias (like CCA AES cipher keys - * need a CEX5C or higher). The list of APQNs is further filtered by the key's - * mkvp which needs to match to either the current mkvp or the alternate mkvp - * (which is the old mkvp on CCA adapters) of the apqns. The flags argument may - * be used to limit the matching apqns. If the PKEY_FLAGS_MATCH_CUR_MKVP is - * given, only the current mkvp of each apqn is compared. Likewise with the - * PKEY_FLAGS_MATCH_ALT_MKVP. If both are given, it is assumed to - * return apqns where either the current or the alternate mkvp - * matches. At least one of the matching flags needs to be given. + * key, CCA AES cipher key, CCA ECC private key, EP11 AES key, EP11 ECC private + * key) and tries to find all matching crypto cards based on the MKVP and maybe + * other criteria (like CCA AES cipher keys need a CEX5C or higher, EP11 keys + * with BLOB_PKEY_EXTRACTABLE need a CEX7 and EP11 api version 4). The list of + * APQNs is further filtered by the key's mkvp which needs to match to either + * the current mkvp (CCA and EP11) or the alternate mkvp (old mkvp, CCA adapters + * only) of the apqns. The flags argument may be used to limit the matching + * apqns. If the PKEY_FLAGS_MATCH_CUR_MKVP is given, only the current mkvp of + * each apqn is compared. Likewise with the PKEY_FLAGS_MATCH_ALT_MKVP. If both + * are given, it is assumed to return apqns where either the current or the + * alternate mkvp matches. At least one of the matching flags needs to be given. + * The flags argument for EP11 keys has no further action and is currently + * ignored (but needs to be given as PKEY_FLAGS_MATCH_CUR_MKVP) as there is only + * the wkvp from the key to match against the apqn's wkvp. * The list of matching apqns is stored into the space given by the apqns * argument and the number of stored entries goes into apqn_entries. If the list * is empty (apqn_entries is 0) the apqn_entries field is updated to the number * of apqn targets found and the ioctl returns with 0. 
If apqn_entries is > 0 * but the number of apqn targets does not fit into the list, the apqn_targets - * field is updatedd with the number of reqired entries but there are no apqn + * field is updated with the number of required entries but there are no apqn * values stored in the list and the ioctl returns with ENOSPC. If no matching * APQN is found, the ioctl returns with 0 but the apqn_entries value is 0. */ @@ -348,20 +390,25 @@ struct pkey_apqns4key { * restrict the list by given master key verification patterns. * For different key types there may be different ways to match the * master key verification patterns. For CCA keys (CCA data key and CCA - * cipher key) the first 8 bytes of cur_mkvp refer to the current mkvp value - * of the apqn and the first 8 bytes of the alt_mkvp refer to the old mkvp. - * The flags argument controls if the apqns current and/or alternate mkvp + * cipher key) the first 8 bytes of cur_mkvp refer to the current AES mkvp value + * of the apqn and the first 8 bytes of the alt_mkvp refer to the old AES mkvp. + * For CCA ECC keys it is similar but the match is against the APKA current/old + * mkvp. The flags argument controls if the apqns current and/or alternate mkvp * should match. If the PKEY_FLAGS_MATCH_CUR_MKVP is given, only the current * mkvp of each apqn is compared. Likewise with the PKEY_FLAGS_MATCH_ALT_MKVP. * If both are given, it is assumed to return apqns where either the * current or the alternate mkvp matches. If no match flag is given * (flags is 0) the mkvp values are ignored for the match process. + * For EP11 keys there is only the current wkvp. So if the apqns should also + * match to a given wkvp, then the PKEY_FLAGS_MATCH_CUR_MKVP flag should be + * set. The wkvp value is 32 bytes but only the leftmost 16 bytes are compared + * against the leftmost 16 byte of the wkvp of the apqn. * The list of matching apqns is stored into the space given by the apqns * argument and the number of stored entries goes into apqn_entries. If the list * is empty (apqn_entries is 0) the apqn_entries field is updated to the number * of apqn targets found and the ioctl returns with 0. If apqn_entries is > 0 * but the number of apqn targets does not fit into the list, the apqn_targets - * field is updatedd with the number of reqired entries but there are no apqn + * field is updated with the number of required entries but there are no apqn * values stored in the list and the ioctl returns with ENOSPC. If no matching * APQN is found, the ioctl returns with 0 but the apqn_entries value is 0. */ @@ -376,4 +423,30 @@ struct pkey_apqns4keytype { }; #define PKEY_APQNS4KT _IOWR(PKEY_IOCTL_MAGIC, 0x1C, struct pkey_apqns4keytype) +/* + * Transform a key blob into a protected key, version 3. + * The difference to version 2 of this ioctl is that the protected key + * buffer is now explicitly and not within a struct pkey_protkey any more. + * So this ioctl is also able to handle EP11 and CCA ECC secure keys and + * provide ECC protected keys. + * There needs to be a list of apqns given with at least one entry in there. + * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain + * is not supported. The implementation walks through the list of apqns and + * tries to send the request to each apqn without any further checking (like + * card type or online state). If the apqn fails, simple the next one in the + * list is tried until success (return 0) or the end of the list is reached + * (return -1 with errno ENODEV). 
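The apqn_entries convention described above (a first call with an empty list just reports the required count) leads to the usual two-call pattern. A sketch against the PKEY_APQNS4K ioctl; the pkey_apqns4key member layout is assumed from this header, and fd is an already-open /dev/pkey descriptor:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <asm/pkey.h>

struct pkey_apqn *apqns_for_key(int fd, unsigned char *key,
				unsigned int keylen, unsigned int *nr)
{
	struct pkey_apqns4key a4k = {
		.key = key,
		.keylen = keylen,
		.flags = PKEY_FLAGS_MATCH_CUR_MKVP | PKEY_FLAGS_MATCH_ALT_MKVP,
		.apqns = NULL,
		.apqn_entries = 0,	/* probe call: just report the count */
	};
	struct pkey_apqn *list;

	if (ioctl(fd, PKEY_APQNS4K, &a4k) < 0 || !a4k.apqn_entries)
		return NULL;		/* error, or no matching APQN at all */
	list = calloc(a4k.apqn_entries, sizeof(*list));
	if (!list)
		return NULL;
	a4k.apqns = list;		/* second call fills the list */
	if (ioctl(fd, PKEY_APQNS4K, &a4k) < 0) {
		free(list);
		return NULL;
	}
	*nr = a4k.apqn_entries;
	return list;
}

Between the two calls the set of matching APQNs can of course change, so callers should be prepared for the second call to return ENOSPC and retry.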
You may use the PKEY_APQNS4K ioctl to + * generate a list of apqns based on the key. + */ +struct pkey_kblob2pkey3 { + __u8 __user *key; /* in: pointer to key blob */ + __u32 keylen; /* in: key blob size */ + struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets */ + __u32 apqn_entries; /* in: # of apqn target list entries */ + __u32 pkeytype; /* out: prot key type (enum pkey_key_type) */ + __u32 pkeylen; /* in/out: size of pkey buffer/actual len of pkey */ + __u8 __user *pkey; /* in: pkey blob buffer space ptr */ +}; +#define PKEY_KBLOB2PROTK3 _IOWR(PKEY_IOCTL_MAGIC, 0x1D, struct pkey_kblob2pkey3) + #endif /* _UAPI_PKEY_H */ diff --git a/arch/s390/include/uapi/asm/ptrace.h b/arch/s390/include/uapi/asm/ptrace.h index 543dd70e12c8..bb0826024bb9 100644 --- a/arch/s390/include/uapi/asm/ptrace.h +++ b/arch/s390/include/uapi/asm/ptrace.h @@ -8,6 +8,8 @@ #ifndef _UAPI_S390_PTRACE_H #define _UAPI_S390_PTRACE_H +#include <linux/const.h> + /* * Offsets in the user_regs_struct. They are used for the ptrace * system call and in entry.S @@ -166,6 +168,64 @@ #endif /* __s390x__ */ +#ifndef __s390x__ + +#define PSW_MASK_PER _AC(0x40000000, UL) +#define PSW_MASK_DAT _AC(0x04000000, UL) +#define PSW_MASK_IO _AC(0x02000000, UL) +#define PSW_MASK_EXT _AC(0x01000000, UL) +#define PSW_MASK_KEY _AC(0x00F00000, UL) +#define PSW_MASK_BASE _AC(0x00080000, UL) /* always one */ +#define PSW_MASK_MCHECK _AC(0x00040000, UL) +#define PSW_MASK_WAIT _AC(0x00020000, UL) +#define PSW_MASK_PSTATE _AC(0x00010000, UL) +#define PSW_MASK_ASC _AC(0x0000C000, UL) +#define PSW_MASK_CC _AC(0x00003000, UL) +#define PSW_MASK_PM _AC(0x00000F00, UL) +#define PSW_MASK_RI _AC(0x00000000, UL) +#define PSW_MASK_EA _AC(0x00000000, UL) +#define PSW_MASK_BA _AC(0x00000000, UL) + +#define PSW_MASK_USER _AC(0x0000FF00, UL) + +#define PSW_ADDR_AMODE _AC(0x80000000, UL) +#define PSW_ADDR_INSN _AC(0x7FFFFFFF, UL) + +#define PSW_ASC_PRIMARY _AC(0x00000000, UL) +#define PSW_ASC_ACCREG _AC(0x00004000, UL) +#define PSW_ASC_SECONDARY _AC(0x00008000, UL) +#define PSW_ASC_HOME _AC(0x0000C000, UL) + +#else /* __s390x__ */ + +#define PSW_MASK_PER _AC(0x4000000000000000, UL) +#define PSW_MASK_DAT _AC(0x0400000000000000, UL) +#define PSW_MASK_IO _AC(0x0200000000000000, UL) +#define PSW_MASK_EXT _AC(0x0100000000000000, UL) +#define PSW_MASK_BASE _AC(0x0000000000000000, UL) +#define PSW_MASK_KEY _AC(0x00F0000000000000, UL) +#define PSW_MASK_MCHECK _AC(0x0004000000000000, UL) +#define PSW_MASK_WAIT _AC(0x0002000000000000, UL) +#define PSW_MASK_PSTATE _AC(0x0001000000000000, UL) +#define PSW_MASK_ASC _AC(0x0000C00000000000, UL) +#define PSW_MASK_CC _AC(0x0000300000000000, UL) +#define PSW_MASK_PM _AC(0x00000F0000000000, UL) +#define PSW_MASK_RI _AC(0x0000008000000000, UL) +#define PSW_MASK_EA _AC(0x0000000100000000, UL) +#define PSW_MASK_BA _AC(0x0000000080000000, UL) + +#define PSW_MASK_USER _AC(0x0000FF0180000000, UL) + +#define PSW_ADDR_AMODE _AC(0x0000000000000000, UL) +#define PSW_ADDR_INSN _AC(0xFFFFFFFFFFFFFFFF, UL) + +#define PSW_ASC_PRIMARY _AC(0x0000000000000000, UL) +#define PSW_ASC_ACCREG _AC(0x0000400000000000, UL) +#define PSW_ASC_SECONDARY _AC(0x0000800000000000, UL) +#define PSW_ASC_HOME _AC(0x0000C00000000000, UL) + +#endif /* __s390x__ */ + #define NUM_GPRS 16 #define NUM_FPRS 16 #define NUM_CRS 16 @@ -179,8 +239,9 @@ #define ACR_SIZE 4 -#define PTRACE_OLDSETOPTIONS 21 - +#define PTRACE_OLDSETOPTIONS 21 +#define PTRACE_SYSEMU 31 +#define PTRACE_SYSEMU_SINGLESTEP 32 #ifndef __ASSEMBLY__ #include <linux/stddef.h> #include 
<linux/types.h> @@ -213,69 +274,6 @@ typedef struct { unsigned long addr; } __attribute__ ((aligned(8))) psw_t; -#ifndef __s390x__ - -#define PSW_MASK_PER 0x40000000UL -#define PSW_MASK_DAT 0x04000000UL -#define PSW_MASK_IO 0x02000000UL -#define PSW_MASK_EXT 0x01000000UL -#define PSW_MASK_KEY 0x00F00000UL -#define PSW_MASK_BASE 0x00080000UL /* always one */ -#define PSW_MASK_MCHECK 0x00040000UL -#define PSW_MASK_WAIT 0x00020000UL -#define PSW_MASK_PSTATE 0x00010000UL -#define PSW_MASK_ASC 0x0000C000UL -#define PSW_MASK_CC 0x00003000UL -#define PSW_MASK_PM 0x00000F00UL -#define PSW_MASK_RI 0x00000000UL -#define PSW_MASK_EA 0x00000000UL -#define PSW_MASK_BA 0x00000000UL - -#define PSW_MASK_USER 0x0000FF00UL - -#define PSW_ADDR_AMODE 0x80000000UL -#define PSW_ADDR_INSN 0x7FFFFFFFUL - -#define PSW_DEFAULT_KEY (((unsigned long) PAGE_DEFAULT_ACC) << 20) - -#define PSW_ASC_PRIMARY 0x00000000UL -#define PSW_ASC_ACCREG 0x00004000UL -#define PSW_ASC_SECONDARY 0x00008000UL -#define PSW_ASC_HOME 0x0000C000UL - -#else /* __s390x__ */ - -#define PSW_MASK_PER 0x4000000000000000UL -#define PSW_MASK_DAT 0x0400000000000000UL -#define PSW_MASK_IO 0x0200000000000000UL -#define PSW_MASK_EXT 0x0100000000000000UL -#define PSW_MASK_BASE 0x0000000000000000UL -#define PSW_MASK_KEY 0x00F0000000000000UL -#define PSW_MASK_MCHECK 0x0004000000000000UL -#define PSW_MASK_WAIT 0x0002000000000000UL -#define PSW_MASK_PSTATE 0x0001000000000000UL -#define PSW_MASK_ASC 0x0000C00000000000UL -#define PSW_MASK_CC 0x0000300000000000UL -#define PSW_MASK_PM 0x00000F0000000000UL -#define PSW_MASK_RI 0x0000008000000000UL -#define PSW_MASK_EA 0x0000000100000000UL -#define PSW_MASK_BA 0x0000000080000000UL - -#define PSW_MASK_USER 0x0000FF0180000000UL - -#define PSW_ADDR_AMODE 0x0000000000000000UL -#define PSW_ADDR_INSN 0xFFFFFFFFFFFFFFFFUL - -#define PSW_DEFAULT_KEY (((unsigned long) PAGE_DEFAULT_ACC) << 52) - -#define PSW_ASC_PRIMARY 0x0000000000000000UL -#define PSW_ASC_ACCREG 0x0000400000000000UL -#define PSW_ASC_SECONDARY 0x0000800000000000UL -#define PSW_ASC_HOME 0x0000C00000000000UL - -#endif /* __s390x__ */ - - /* * The s390_regs structure is used to define the elf_gregset_t. 
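The switch from plain 0x...UL literals to _AC() in the block above is what allows these PSW constants to be shared between C and assembler sources: with __ASSEMBLY__ defined, the UL suffix (a syntax error for the assembler) is dropped. Simplified from <uapi/linux/const.h>:

#ifdef __ASSEMBLY__
#define _AC(X, Y)	X
#else
#define __AC(X, Y)	(X##Y)
#define _AC(X, Y)	__AC(X, Y)
#endif

#define PSW_MASK_DAT	_AC(0x0400000000000000, UL)

/* C:         PSW_MASK_DAT -> (0x0400000000000000UL)
 * assembler: PSW_MASK_DAT -> 0x0400000000000000
 */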
*/ diff --git a/arch/s390/include/uapi/asm/raw3270.h b/arch/s390/include/uapi/asm/raw3270.h new file mode 100644 index 000000000000..6676f102bd50 --- /dev/null +++ b/arch/s390/include/uapi/asm/raw3270.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_S390_UAPI_RAW3270_H +#define __ASM_S390_UAPI_RAW3270_H + +/* Local Channel Commands */ +#define TC_WRITE 0x01 /* Write */ +#define TC_RDBUF 0x02 /* Read Buffer */ +#define TC_EWRITE 0x05 /* Erase write */ +#define TC_READMOD 0x06 /* Read modified */ +#define TC_EWRITEA 0x0d /* Erase write alternate */ +#define TC_WRITESF 0x11 /* Write structured field */ + +/* Buffer Control Orders */ +#define TO_GE 0x08 /* Graphics Escape */ +#define TO_SF 0x1d /* Start field */ +#define TO_SBA 0x11 /* Set buffer address */ +#define TO_IC 0x13 /* Insert cursor */ +#define TO_PT 0x05 /* Program tab */ +#define TO_RA 0x3c /* Repeat to address */ +#define TO_SFE 0x29 /* Start field extended */ +#define TO_EUA 0x12 /* Erase unprotected to address */ +#define TO_MF 0x2c /* Modify field */ +#define TO_SA 0x28 /* Set attribute */ + +/* Field Attribute Bytes */ +#define TF_INPUT 0x40 /* Visible input */ +#define TF_INPUTN 0x4c /* Invisible input */ +#define TF_INMDT 0xc1 /* Visible, Set-MDT */ +#define TF_LOG 0x60 + +/* Character Attribute Bytes */ +#define TAT_RESET 0x00 +#define TAT_FIELD 0xc0 +#define TAT_EXTHI 0x41 +#define TAT_FGCOLOR 0x42 +#define TAT_CHARS 0x43 +#define TAT_BGCOLOR 0x45 +#define TAT_TRANS 0x46 + +/* Extended-Highlighting Bytes */ +#define TAX_RESET 0x00 +#define TAX_BLINK 0xf1 +#define TAX_REVER 0xf2 +#define TAX_UNDER 0xf4 + +/* Reset value */ +#define TAR_RESET 0x00 + +/* Color values */ +#define TAC_RESET 0x00 +#define TAC_BLUE 0xf1 +#define TAC_RED 0xf2 +#define TAC_PINK 0xf3 +#define TAC_GREEN 0xf4 +#define TAC_TURQ 0xf5 +#define TAC_YELLOW 0xf6 +#define TAC_WHITE 0xf7 +#define TAC_DEFAULT 0x00 + +/* Write Control Characters */ +#define TW_NONE 0x40 /* No particular action */ +#define TW_KR 0xc2 /* Keyboard restore */ +#define TW_PLUSALARM 0x04 /* Add this bit for alarm */ + +#define RAW3270_FIRSTMINOR 1 /* First minor number */ +#define RAW3270_MAXDEVS 255 /* Max number of 3270 devices */ + +#define AID_CLEAR 0x6d +#define AID_ENTER 0x7d +#define AID_PF3 0xf3 +#define AID_PF7 0xf7 +#define AID_PF8 0xf8 +#define AID_READ_PARTITION 0x88 + +#endif /* __ASM_S390_UAPI_RAW3270_H */ diff --git a/arch/s390/include/uapi/asm/schid.h b/arch/s390/include/uapi/asm/schid.h index 58fca6f48410..a3e1cf168553 100644 --- a/arch/s390/include/uapi/asm/schid.h +++ b/arch/s390/include/uapi/asm/schid.h @@ -4,6 +4,8 @@ #include <linux/types.h> +#ifndef __ASSEMBLY__ + struct subchannel_id { __u32 cssid : 8; __u32 : 4; @@ -13,5 +15,6 @@ struct subchannel_id { __u32 sch_no : 16; } __attribute__ ((packed, aligned(4))); +#endif /* __ASSEMBLY__ */ #endif /* _UAPIASM_SCHID_H */ diff --git a/arch/s390/include/uapi/asm/setup.h b/arch/s390/include/uapi/asm/setup.h index 1f8803a31079..598d769e76df 100644 --- a/arch/s390/include/uapi/asm/setup.h +++ b/arch/s390/include/uapi/asm/setup.h @@ -1,14 +1 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * S390 version - * Copyright IBM Corp. 
1999, 2010 - */ - -#ifndef _UAPI_ASM_S390_SETUP_H -#define _UAPI_ASM_S390_SETUP_H - -#define COMMAND_LINE_SIZE 4096 - -#define ARCH_COMMAND_LINE_SIZE 896 - -#endif /* _UAPI_ASM_S390_SETUP_H */ diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h index 6ca1e68d7103..ede318653c87 100644 --- a/arch/s390/include/uapi/asm/sie.h +++ b/arch/s390/include/uapi/asm/sie.h @@ -29,7 +29,7 @@ { 0x13, "SIGP conditional emergency signal" }, \ { 0x15, "SIGP sense running" }, \ { 0x16, "SIGP set multithreading"}, \ - { 0x17, "SIGP store additional status ait address"} + { 0x17, "SIGP store additional status at address"} #define icpt_prog_codes \ { 0x0001, "Prog Operation" }, \ diff --git a/arch/s390/include/uapi/asm/signal.h b/arch/s390/include/uapi/asm/signal.h index 9a14a611ed82..e74d6ba1bd3b 100644 --- a/arch/s390/include/uapi/asm/signal.h +++ b/arch/s390/include/uapi/asm/signal.h @@ -65,30 +65,6 @@ typedef unsigned long sigset_t; #define SIGRTMIN 32 #define SIGRTMAX _NSIG -/* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - */ -#define SA_NOCLDSTOP 0x00000001 -#define SA_NOCLDWAIT 0x00000002 -#define SA_SIGINFO 0x00000004 -#define SA_ONSTACK 0x08000000 -#define SA_RESTART 0x10000000 -#define SA_NODEFER 0x40000000 -#define SA_RESETHAND 0x80000000 - -#define SA_NOMASK SA_NODEFER -#define SA_ONESHOT SA_RESETHAND - #define SA_RESTORER 0x04000000 #define MINSIGSTKSZ 2048 @@ -132,7 +108,7 @@ struct sigaction { typedef struct sigaltstack { void __user *ss_sp; int ss_flags; - size_t ss_size; + __kernel_size_t ss_size; } stack_t; diff --git a/arch/s390/include/uapi/asm/statfs.h b/arch/s390/include/uapi/asm/statfs.h index 72604f7792c3..f85b50723dd3 100644 --- a/arch/s390/include/uapi/asm/statfs.h +++ b/arch/s390/include/uapi/asm/statfs.h @@ -30,7 +30,7 @@ struct statfs { unsigned int f_namelen; unsigned int f_frsize; unsigned int f_flags; - unsigned int f_spare[4]; + unsigned int f_spare[5]; }; struct statfs64 { @@ -45,7 +45,7 @@ struct statfs64 { unsigned int f_namelen; unsigned int f_frsize; unsigned int f_flags; - unsigned int f_spare[4]; + unsigned int f_spare[5]; }; #endif diff --git a/arch/s390/include/uapi/asm/termios.h b/arch/s390/include/uapi/asm/termios.h deleted file mode 100644 index 54223169c806..000000000000 --- a/arch/s390/include/uapi/asm/termios.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * S390 version - * - * Derived from "include/asm-i386/termios.h" - */ - -#ifndef _UAPI_S390_TERMIOS_H -#define _UAPI_S390_TERMIOS_H - -#include <asm/termbits.h> -#include <asm/ioctls.h> - -struct winsize { - unsigned short ws_row; - unsigned short ws_col; - unsigned short ws_xpixel; - unsigned short ws_ypixel; -}; - -#define NCC 8 -struct termio { - unsigned short c_iflag; /* input mode flags */ - unsigned short c_oflag; /* output mode flags */ - unsigned short c_cflag; /* control mode flags */ - unsigned short c_lflag; /* local mode flags */ - unsigned char c_line; /* line discipline */ - unsigned char c_cc[NCC]; /* control 
characters */ -}; - -/* modem lines */ -#define TIOCM_LE 0x001 -#define TIOCM_DTR 0x002 -#define TIOCM_RTS 0x004 -#define TIOCM_ST 0x008 -#define TIOCM_SR 0x010 -#define TIOCM_CTS 0x020 -#define TIOCM_CAR 0x040 -#define TIOCM_RNG 0x080 -#define TIOCM_DSR 0x100 -#define TIOCM_CD TIOCM_CAR -#define TIOCM_RI TIOCM_RNG -#define TIOCM_OUT1 0x2000 -#define TIOCM_OUT2 0x4000 -#define TIOCM_LOOP 0x8000 - -/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ - - -#endif /* _UAPI_S390_TERMIOS_H */ diff --git a/arch/s390/include/uapi/asm/types.h b/arch/s390/include/uapi/asm/types.h index da034c606314..84457dbb26b4 100644 --- a/arch/s390/include/uapi/asm/types.h +++ b/arch/s390/include/uapi/asm/types.h @@ -12,15 +12,18 @@ #ifndef __ASSEMBLY__ -/* A address type so that arithmetic can be done on it & it can be upgraded to - 64 bit when necessary -*/ -typedef unsigned long addr_t; +typedef unsigned long addr_t; typedef __signed__ long saddr_t; typedef struct { - __u32 u[4]; -} __vector128; + union { + struct { + __u64 high; + __u64 low; + }; + __u32 u[4]; + }; +} __attribute__((packed, aligned(4))) __vector128; #endif /* __ASSEMBLY__ */ diff --git a/arch/s390/include/uapi/asm/uvdevice.h b/arch/s390/include/uapi/asm/uvdevice.h new file mode 100644 index 000000000000..b9c2f14a6af3 --- /dev/null +++ b/arch/s390/include/uapi/asm/uvdevice.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright IBM Corp. 2022 + * Author(s): Steffen Eiden <seiden@linux.ibm.com> + */ +#ifndef __S390_ASM_UVDEVICE_H +#define __S390_ASM_UVDEVICE_H + +#include <linux/types.h> + +struct uvio_ioctl_cb { + __u32 flags; + __u16 uv_rc; /* UV header rc value */ + __u16 uv_rrc; /* UV header rrc value */ + __u64 argument_addr; /* Userspace address of uvio argument */ + __u32 argument_len; + __u8 reserved14[0x40 - 0x14]; /* must be zero */ +}; + +#define UVIO_ATT_USER_DATA_LEN 0x100 +#define UVIO_ATT_UID_LEN 0x10 +struct uvio_attest { + __u64 arcb_addr; /* 0x0000 */ + __u64 meas_addr; /* 0x0008 */ + __u64 add_data_addr; /* 0x0010 */ + __u8 user_data[UVIO_ATT_USER_DATA_LEN]; /* 0x0018 */ + __u8 config_uid[UVIO_ATT_UID_LEN]; /* 0x0118 */ + __u32 arcb_len; /* 0x0128 */ + __u32 meas_len; /* 0x012c */ + __u32 add_data_len; /* 0x0130 */ + __u16 user_data_len; /* 0x0134 */ + __u16 reserved136; /* 0x0136 */ +}; + +/** + * uvio_uvdev_info - Information of supported functions + * @supp_uvio_cmds - supported IOCTLs by this device + * @supp_uv_cmds - supported UVCs corresponding to the IOCTL + * + * UVIO request to get information about supported request types by this + * uvdevice and the Ultravisor. Everything is output. Bits are in LSB0 + * ordering. If the bit is set in both, @supp_uvio_cmds and @supp_uv_cmds, the + * uvdevice and the Ultravisor support that call. + * + * Note that bit 0 (UVIO_IOCTL_UVDEV_INFO_NR) is always zero for `supp_uv_cmds` + * as there is no corresponding UV-call. + */ +struct uvio_uvdev_info { + /* + * If bit `n` is set, this device supports the IOCTL with nr `n`. + */ + __u64 supp_uvio_cmds; + /* + * If bit `n` is set, the Ultravisor(UV) supports the UV-call + * corresponding to the IOCTL with nr `n` in the calling contextx (host + * or guest). The value is only valid if the corresponding bit in + * @supp_uvio_cmds is set as well. + */ + __u64 supp_uv_cmds; +}; + +/* + * The following max values define an upper length for the IOCTL in/out buffers. + * However, they do not represent the maximum the Ultravisor allows which is + * often way smaller. 
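A sketch of how userspace drives the uvio_ioctl_cb / uvio_uvdev_info pair documented above, using the UVIO_IOCTL_UVDEV_INFO request defined further below. The /dev/uv node name is an assumption following UVIO_DEVICE_NAME, and error handling is trimmed:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/uvdevice.h>

int main(void)
{
	struct uvio_uvdev_info info;
	struct uvio_ioctl_cb cb;
	int fd = open("/dev/uv", O_RDONLY);

	if (fd < 0)
		return 1;
	memset(&cb, 0, sizeof(cb));	/* reserved fields must be zero */
	cb.argument_addr = (__u64)(unsigned long)&info;
	cb.argument_len = sizeof(info);
	if (ioctl(fd, UVIO_IOCTL_UVDEV_INFO, &cb) < 0) {
		perror("UVIO_IOCTL_UVDEV_INFO");
		return 1;
	}
	printf("uvio cmds 0x%016llx, uv cmds 0x%016llx (uv_rc 0x%04x)\n",
	       (unsigned long long)info.supp_uvio_cmds,
	       (unsigned long long)info.supp_uv_cmds, cb.uv_rc);
	close(fd);
	return 0;
}

A call is only worth issuing when its bit is set in both supp_uvio_cmds and supp_uv_cmds, as the structure's documentation above spells out.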
By allowing larger buffer sizes we hopefully do not need + * to update the code with every machine update. It is therefore possible for + * userspace to request more memory than actually used by kernel/UV. + */ +#define UVIO_ATT_ARCB_MAX_LEN 0x100000 +#define UVIO_ATT_MEASUREMENT_MAX_LEN 0x8000 +#define UVIO_ATT_ADDITIONAL_MAX_LEN 0x8000 +#define UVIO_ADD_SECRET_MAX_LEN 0x100000 +#define UVIO_LIST_SECRETS_LEN 0x1000 + +#define UVIO_DEVICE_NAME "uv" +#define UVIO_TYPE_UVC 'u' + +enum UVIO_IOCTL_NR { + UVIO_IOCTL_UVDEV_INFO_NR = 0x00, + UVIO_IOCTL_ATT_NR, + UVIO_IOCTL_ADD_SECRET_NR, + UVIO_IOCTL_LIST_SECRETS_NR, + UVIO_IOCTL_LOCK_SECRETS_NR, + /* must be the last entry */ + UVIO_IOCTL_NUM_IOCTLS +}; + +#define UVIO_IOCTL(nr) _IOWR(UVIO_TYPE_UVC, nr, struct uvio_ioctl_cb) +#define UVIO_IOCTL_UVDEV_INFO UVIO_IOCTL(UVIO_IOCTL_UVDEV_INFO_NR) +#define UVIO_IOCTL_ATT UVIO_IOCTL(UVIO_IOCTL_ATT_NR) +#define UVIO_IOCTL_ADD_SECRET UVIO_IOCTL(UVIO_IOCTL_ADD_SECRET_NR) +#define UVIO_IOCTL_LIST_SECRETS UVIO_IOCTL(UVIO_IOCTL_LIST_SECRETS_NR) +#define UVIO_IOCTL_LOCK_SECRETS UVIO_IOCTL(UVIO_IOCTL_LOCK_SECRETS_NR) + +#define UVIO_SUPP_CALL(nr) (1ULL << (nr)) +#define UVIO_SUPP_UDEV_INFO UVIO_SUPP_CALL(UVIO_IOCTL_UDEV_INFO_NR) +#define UVIO_SUPP_ATT UVIO_SUPP_CALL(UVIO_IOCTL_ATT_NR) +#define UVIO_SUPP_ADD_SECRET UVIO_SUPP_CALL(UVIO_IOCTL_ADD_SECRET_NR) +#define UVIO_SUPP_LIST_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LIST_SECRETS_NR) +#define UVIO_SUPP_LOCK_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LOCK_SECRETS_NR) + +#endif /* __S390_ASM_UVDEVICE_H */ diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h index f9e5e1f0821d..f4785abe1b9f 100644 --- a/arch/s390/include/uapi/asm/zcrypt.h +++ b/arch/s390/include/uapi/asm/zcrypt.h @@ -4,7 +4,7 @@ * * zcrypt 2.2.1 (user-visible header) * - * Copyright IBM Corp. 2001, 2019 + * Copyright IBM Corp. 2001, 2022 * Author(s): Robert Burroughs * Eric Rossman (edrossma@us.ibm.com) * @@ -36,12 +36,12 @@ * - length(n_modulus) = inputdatalength */ struct ica_rsa_modexpo { - char __user *inputdata; - unsigned int inputdatalength; - char __user *outputdata; - unsigned int outputdatalength; - char __user *b_key; - char __user *n_modulus; + __u8 __user *inputdata; + __u32 inputdatalength; + __u8 __user *outputdata; + __u32 outputdatalength; + __u8 __user *b_key; + __u8 __user *n_modulus; }; /** @@ -59,15 +59,15 @@ struct ica_rsa_modexpo { * - length(u_mult_inv) = inputdatalength/2 + 8 */ struct ica_rsa_modexpo_crt { - char __user *inputdata; - unsigned int inputdatalength; - char __user *outputdata; - unsigned int outputdatalength; - char __user *bp_key; - char __user *bq_key; - char __user *np_prime; - char __user *nq_prime; - char __user *u_mult_inv; + __u8 __user *inputdata; + __u32 inputdatalength; + __u8 __user *outputdata; + __u32 outputdatalength; + __u8 __user *bp_key; + __u8 __user *bq_key; + __u8 __user *np_prime; + __u8 __user *nq_prime; + __u8 __user *u_mult_inv; }; /** @@ -83,67 +83,66 @@ struct ica_rsa_modexpo_crt { * key block */ struct CPRBX { - unsigned short cprb_len; /* CPRB length 220 */ - unsigned char cprb_ver_id; /* CPRB version id. 
0x02 */ - unsigned char pad_000[3]; /* Alignment pad bytes */ - unsigned char func_id[2]; /* function id 0x5432 */ - unsigned char cprb_flags[4]; /* Flags */ - unsigned int req_parml; /* request parameter buffer len */ - unsigned int req_datal; /* request data buffer */ - unsigned int rpl_msgbl; /* reply message block length */ - unsigned int rpld_parml; /* replied parameter block len */ - unsigned int rpl_datal; /* reply data block len */ - unsigned int rpld_datal; /* replied data block len */ - unsigned int req_extbl; /* request extension block len */ - unsigned char pad_001[4]; /* reserved */ - unsigned int rpld_extbl; /* replied extension block len */ - unsigned char padx000[16 - sizeof(char *)]; - unsigned char *req_parmb; /* request parm block 'address' */ - unsigned char padx001[16 - sizeof(char *)]; - unsigned char *req_datab; /* request data block 'address' */ - unsigned char padx002[16 - sizeof(char *)]; - unsigned char *rpl_parmb; /* reply parm block 'address' */ - unsigned char padx003[16 - sizeof(char *)]; - unsigned char *rpl_datab; /* reply data block 'address' */ - unsigned char padx004[16 - sizeof(char *)]; - unsigned char *req_extb; /* request extension block 'addr'*/ - unsigned char padx005[16 - sizeof(char *)]; - unsigned char *rpl_extb; /* reply extension block 'address'*/ - unsigned short ccp_rtcode; /* server return code */ - unsigned short ccp_rscode; /* server reason code */ - unsigned int mac_data_len; /* Mac Data Length */ - unsigned char logon_id[8]; /* Logon Identifier */ - unsigned char mac_value[8]; /* Mac Value */ - unsigned char mac_content_flgs;/* Mac content flag byte */ - unsigned char pad_002; /* Alignment */ - unsigned short domain; /* Domain */ - unsigned char usage_domain[4];/* Usage domain */ - unsigned char cntrl_domain[4];/* Control domain */ - unsigned char S390enf_mask[4];/* S/390 enforcement mask */ - unsigned char pad_004[36]; /* reserved */ + __u16 cprb_len; /* CPRB length 220 */ + __u8 cprb_ver_id; /* CPRB version id. 
0x02 */ + __u8 ctfm; /* Command Type Filtering Mask */ + __u8 pad_000[2]; /* Alignment pad bytes */ + __u8 func_id[2]; /* function id 0x5432 */ + __u8 cprb_flags[4]; /* Flags */ + __u32 req_parml; /* request parameter buffer len */ + __u32 req_datal; /* request data buffer */ + __u32 rpl_msgbl; /* reply message block length */ + __u32 rpld_parml; /* replied parameter block len */ + __u32 rpl_datal; /* reply data block len */ + __u32 rpld_datal; /* replied data block len */ + __u32 req_extbl; /* request extension block len */ + __u8 _pad_001[4]; /* reserved */ + __u32 rpld_extbl; /* replied extension block len */ + __u8 _pad_002[16 - sizeof(__u8 *)]; + __u8 __user *req_parmb; /* request parm block 'address' */ + __u8 _pad_003[16 - sizeof(__u8 *)]; + __u8 __user *req_datab; /* request data block 'address' */ + __u8 _pad_004[16 - sizeof(__u8 *)]; + __u8 __user *rpl_parmb; /* reply parm block 'address' */ + __u8 _pad_005[16 - sizeof(__u8 *)]; + __u8 __user *rpl_datab; /* reply data block 'address' */ + __u8 _pad_006[16 - sizeof(__u8 *)]; + __u8 __user *req_extb; /* request extension block 'addr'*/ + __u8 _pad_007[16 - sizeof(__u8 *)]; + __u8 __user *rpl_extb; /* reply extension block 'address'*/ + __u16 ccp_rtcode; /* server return code */ + __u16 ccp_rscode; /* server reason code */ + __u32 mac_data_len; /* Mac Data Length */ + __u8 logon_id[8]; /* Logon Identifier */ + __u8 mac_value[8]; /* Mac Value */ + __u8 mac_content_flgs; /* Mac content flag byte */ + __u8 _pad_008; /* Alignment */ + __u16 domain; /* Domain */ + __u8 _pad_009[12]; /* reserved, checked for zeros */ + __u8 _pad_010[36]; /* reserved */ } __attribute__((packed)); /** * xcRB */ struct ica_xcRB { - unsigned short agent_ID; - unsigned int user_defined; - unsigned short request_ID; - unsigned int request_control_blk_length; - unsigned char padding1[16 - sizeof(char *)]; - char __user *request_control_blk_addr; - unsigned int request_data_length; - char padding2[16 - sizeof(char *)]; - char __user *request_data_address; - unsigned int reply_control_blk_length; - char padding3[16 - sizeof(char *)]; - char __user *reply_control_blk_addr; - unsigned int reply_data_length; - char padding4[16 - sizeof(char *)]; - char __user *reply_data_addr; - unsigned short priority_window; - unsigned int status; + __u16 agent_ID; + __u32 user_defined; + __u16 request_ID; + __u32 request_control_blk_length; + __u8 _padding1[16 - sizeof(__u8 *)]; + __u8 __user *request_control_blk_addr; + __u32 request_data_length; + __u8 _padding2[16 - sizeof(__u8 *)]; + __u8 __user *request_data_address; + __u32 reply_control_blk_length; + __u8 _padding3[16 - sizeof(__u8 *)]; + __u8 __user *reply_control_blk_addr; + __u32 reply_data_length; + __u8 __padding4[16 - sizeof(__u8 *)]; + __u8 __user *reply_data_addr; + __u16 priority_window; + __u32 status; } __attribute__((packed)); /** @@ -161,17 +160,17 @@ struct ica_xcRB { * @payload_len: Payload length */ struct ep11_cprb { - __u16 cprb_len; - unsigned char cprb_ver_id; - unsigned char pad_000[2]; - unsigned char flags; - unsigned char func_id[2]; - __u32 source_id; - __u32 target_id; - __u32 ret_code; - __u32 reserved1; - __u32 reserved2; - __u32 payload_len; + __u16 cprb_len; + __u8 cprb_ver_id; + __u8 pad_000[2]; + __u8 flags; + __u8 func_id[2]; + __u32 source_id; + __u32 target_id; + __u32 ret_code; + __u32 reserved1; + __u32 reserved2; + __u32 payload_len; } __attribute__((packed)); /** @@ -197,13 +196,13 @@ struct ep11_target_dev { */ struct ep11_urb { __u16 targets_num; - __u64 targets; + __u8 __user 
*targets; __u64 weight; __u64 req_no; __u64 req_len; - __u64 req; + __u8 __user *req; __u64 resp_len; - __u64 resp; + __u8 __user *resp; } __attribute__((packed)); /** @@ -237,7 +236,9 @@ struct zcrypt_device_matrix_ext { struct zcrypt_device_status_ext device[MAX_ZDEV_ENTRIES_EXT]; }; -#define AUTOSELECT 0xFFFFFFFF +#define AUTOSELECT 0xFFFFFFFF +#define AUTOSEL_AP ((__u16)0xFFFF) +#define AUTOSEL_DOM ((__u16)0xFFFF) #define ZCRYPT_IOCTL_MAGIC 'z' @@ -286,7 +287,7 @@ struct zcrypt_device_matrix_ext { * 0x08: CEX3A * 0x0a: CEX4 * 0x0b: CEX5 - * 0x0c: CEX6 and CEX7 + * 0x0c: CEX6, CEX7 or CEX8 * 0x0d: device is disabled * * ZCRYPT_QDEPTH_MASK @@ -303,12 +304,12 @@ struct zcrypt_device_matrix_ext { /** * Supported ioctl calls */ -#define ICARSAMODEXPO _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x05, 0) -#define ICARSACRT _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x06, 0) -#define ZSECSENDCPRB _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x81, 0) -#define ZSENDEP11CPRB _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x04, 0) +#define ICARSAMODEXPO _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x05, 0) +#define ICARSACRT _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x06, 0) +#define ZSECSENDCPRB _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x81, 0) +#define ZSENDEP11CPRB _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x04, 0) -#define ZCRYPT_DEVICE_STATUS _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x5f, 0) +#define ZCRYPT_DEVICE_STATUS _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x5f, 0) #define ZCRYPT_STATUS_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x58, char[MAX_ZDEV_CARDIDS_EXT]) #define ZCRYPT_QDEPTH_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x59, char[MAX_ZDEV_CARDIDS_EXT]) #define ZCRYPT_PERDEV_REQCNT _IOR(ZCRYPT_IOCTL_MAGIC, 0x5a, int[MAX_ZDEV_CARDIDS_EXT]) @@ -350,7 +351,7 @@ struct zcrypt_device_matrix { }; /* Deprecated: use ZCRYPT_DEVICE_STATUS */ -#define ZDEVICESTATUS _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x4f, 0) +#define ZDEVICESTATUS _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x4f, 0) /* Deprecated: use ZCRYPT_STATUS_MASK */ #define Z90STAT_STATUS_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x48, char[64]) /* Deprecated: use ZCRYPT_QDEPTH_MASK */ diff --git a/arch/s390/kernel/.gitignore b/arch/s390/kernel/.gitignore index c5f676c3c224..bbb90f92d051 100644 --- a/arch/s390/kernel/.gitignore +++ b/arch/s390/kernel/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only vmlinux.lds diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 2b1203cf7be6..7a562b4199c8 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -10,6 +10,7 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) # Do not trace early setup code CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rethook.o = $(CC_FLAGS_FTRACE) endif @@ -33,51 +34,52 @@ CFLAGS_stacktrace.o += -fno-optimize-sibling-calls CFLAGS_dumpstack.o += -fno-optimize-sibling-calls CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls -# -# Pass UTS_MACHINE for user_regset definition -# -CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' - -obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o -obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o -obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o -obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o +obj-y := head64.o traps.o time.o process.o earlypgm.o early.o setup.o idle.o vtime.o +obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o +obj-y += debug.o irq.o ipl.o dis.o diag.o 
vdso.o cpufeature.o +obj-y += sysinfo.o lgr.o os_info.o ctlreg.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o -obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o +obj-y += entry.o reipl.o kdebugfs.o alternative.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o -obj-y += smp.o +obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o -extra-y += head64.o vmlinux.lds +extra-y += vmlinux.lds obj-$(CONFIG_SYSFS) += nospec-sysfs.o CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o -obj-$(CONFIG_HIBERNATION) += suspend.o swsusp.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_AUDIT) += audit.o compat-obj-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o obj-$(CONFIG_COMPAT) += $(compat-obj-y) obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o +obj-$(CONFIG_KPROBES) += kprobes_insn_page.o +obj-$(CONFIG_KPROBES) += mcount.o +obj-$(CONFIG_RETHOOK) += rethook.o +obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o +obj-$(CONFIG_FUNCTION_TRACER) += mcount.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o +obj-$(CONFIG_CERT_STORE) += cert_store.o +obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o -obj-$(CONFIG_IMA) += ima_arch.o - -obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o -obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_diag.o +obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o perf_pai_ext.o obj-$(CONFIG_TRACEPOINTS) += trace.o +obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o # vdso obj-y += vdso64/ +obj-$(CONFIG_COMPAT) += vdso32/ diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c new file mode 100644 index 000000000000..f9efc54ec4b7 --- /dev/null +++ b/arch/s390/kernel/abs_lowcore.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/pgtable.h> +#include <asm/abs_lowcore.h> + +unsigned long __bootdata_preserved(__abs_lowcore); + +int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc) +{ + unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore)); + unsigned long phys = __pa(lc); + int rc, i; + + for (i = 0; i < LC_PAGES; i++) { + rc = __vmem_map_4k_page(addr, phys, PAGE_KERNEL, alloc); + if (rc) { + /* + * Do not unmap allocated page tables in case the + * allocation was not requested. In such a case the + * request is expected coming from an atomic context, + * while the unmap attempt might sleep. 
+ */ + if (alloc) { + for (--i; i >= 0; i--) { + addr -= PAGE_SIZE; + vmem_unmap_4k_page(addr); + } + } + return rc; + } + addr += PAGE_SIZE; + phys += PAGE_SIZE; + } + return 0; +} + +void abs_lowcore_unmap(int cpu) +{ + unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore)); + int i; + + for (i = 0; i < LC_PAGES; i++) { + vmem_unmap_4k_page(addr); + addr += PAGE_SIZE; + } +} diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index 8e1f2aee85ef..e7bca29f9c34 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -1,11 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/module.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/facility.h> #include <asm/nospec-branch.h> -#define MAX_PATCH_LEN (255 - 1) - static int __initdata_or_module alt_instr_disabled; static int __init disable_alternative_instructions(char *str) @@ -16,86 +17,30 @@ static int __init disable_alternative_instructions(char *str) early_param("noaltinstr", disable_alternative_instructions); -struct brcl_insn { - u16 opc; - s32 disp; -} __packed; - -static u16 __initdata_or_module nop16 = 0x0700; -static u32 __initdata_or_module nop32 = 0x47000000; -static struct brcl_insn __initdata_or_module nop48 = { - 0xc004, 0 -}; - -static const void *nops[] __initdata_or_module = { - &nop16, - &nop32, - &nop48 -}; - -static void __init_or_module add_jump_padding(void *insns, unsigned int len) -{ - struct brcl_insn brcl = { - 0xc0f4, - len / 2 - }; - - memcpy(insns, &brcl, sizeof(brcl)); - insns += sizeof(brcl); - len -= sizeof(brcl); - - while (len > 0) { - memcpy(insns, &nop16, 2); - insns += 2; - len -= 2; - } -} - -static void __init_or_module add_padding(void *insns, unsigned int len) -{ - if (len > 6) - add_jump_padding(insns, len); - else if (len >= 2) - memcpy(insns, nops[len / 2 - 1], len); -} - static void __init_or_module __apply_alternatives(struct alt_instr *start, struct alt_instr *end) { struct alt_instr *a; u8 *instr, *replacement; - u8 insnbuf[MAX_PATCH_LEN]; /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. 
*/ for (a = start; a < end; a++) { - int insnbuf_sz = 0; - instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; - if (!__test_facility(a->facility, - S390_lowcore.alt_stfle_fac_list)) + if (!__test_facility(a->facility, alt_stfle_fac_list)) continue; - if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) { + if (unlikely(a->instrlen % 2)) { WARN_ONCE(1, "cpu alternatives instructions length is " "odd, skipping patching\n"); continue; } - memcpy(insnbuf, replacement, a->replacementlen); - insnbuf_sz = a->replacementlen; - - if (a->instrlen > a->replacementlen) { - add_padding(insnbuf + a->replacementlen, - a->instrlen - a->replacementlen); - insnbuf_sz += a->instrlen - a->replacementlen; - } - - s390_kernel_write(instr, insnbuf, insnbuf_sz); + s390_kernel_write(instr, replacement, a->instrlen); } } @@ -111,3 +56,20 @@ void __init apply_alternative_instructions(void) { apply_alternatives(__alt_instructions, __alt_instructions_end); } + +static void do_sync_core(void *info) +{ + sync_core(); +} + +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + +void text_poke_sync_lock(void) +{ + cpus_read_lock(); + text_poke_sync(); + cpus_read_unlock(); +} diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index ce33406cfe83..fa5f6885c74a 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -11,11 +11,10 @@ #include <linux/kvm_host.h> #include <linux/sched.h> #include <linux/purgatory.h> +#include <linux/pgtable.h> +#include <linux/ftrace.h> #include <asm/idle.h> -#include <asm/vdso.h> -#include <asm/pgtable.h> #include <asm/gmap.h> -#include <asm/nmi.h> #include <asm/stacktrace.h> int main(void) @@ -27,80 +26,55 @@ int main(void) BLANK(); /* thread struct offsets */ OFFSET(__THREAD_ksp, thread_struct, ksp); - OFFSET(__THREAD_sysc_table, thread_struct, sys_call_table); - OFFSET(__THREAD_last_break, thread_struct, last_break); - OFFSET(__THREAD_FPU_fpc, thread_struct, fpu.fpc); - OFFSET(__THREAD_FPU_regs, thread_struct, fpu.regs); - OFFSET(__THREAD_per_cause, thread_struct, per_event.cause); - OFFSET(__THREAD_per_address, thread_struct, per_event.address); - OFFSET(__THREAD_per_paid, thread_struct, per_event.paid); - OFFSET(__THREAD_trap_tdb, thread_struct, trap_tdb); BLANK(); /* thread info offsets */ OFFSET(__TI_flags, task_struct, thread_info.flags); BLANK(); /* pt_regs offsets */ - OFFSET(__PT_ARGS, pt_regs, args); OFFSET(__PT_PSW, pt_regs, psw); OFFSET(__PT_GPRS, pt_regs, gprs); + OFFSET(__PT_R0, pt_regs, gprs[0]); + OFFSET(__PT_R1, pt_regs, gprs[1]); + OFFSET(__PT_R2, pt_regs, gprs[2]); + OFFSET(__PT_R3, pt_regs, gprs[3]); + OFFSET(__PT_R4, pt_regs, gprs[4]); + OFFSET(__PT_R5, pt_regs, gprs[5]); + OFFSET(__PT_R6, pt_regs, gprs[6]); + OFFSET(__PT_R7, pt_regs, gprs[7]); + OFFSET(__PT_R8, pt_regs, gprs[8]); + OFFSET(__PT_R9, pt_regs, gprs[9]); + OFFSET(__PT_R10, pt_regs, gprs[10]); + OFFSET(__PT_R11, pt_regs, gprs[11]); + OFFSET(__PT_R12, pt_regs, gprs[12]); + OFFSET(__PT_R13, pt_regs, gprs[13]); + OFFSET(__PT_R14, pt_regs, gprs[14]); + OFFSET(__PT_R15, pt_regs, gprs[15]); OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2); - OFFSET(__PT_INT_CODE, pt_regs, int_code); - OFFSET(__PT_INT_PARM, pt_regs, int_parm); - OFFSET(__PT_INT_PARM_LONG, pt_regs, int_parm_long); OFFSET(__PT_FLAGS, pt_regs, flags); + OFFSET(__PT_CR1, pt_regs, cr1); + OFFSET(__PT_LAST_BREAK, pt_regs, last_break); DEFINE(__PT_SIZE, sizeof(struct pt_regs)); BLANK(); /* stack_frame offsets */ OFFSET(__SF_BACKCHAIN, 
stack_frame, back_chain); OFFSET(__SF_GPRS, stack_frame, gprs); - OFFSET(__SF_EMPTY, stack_frame, empty1); - OFFSET(__SF_SIE_CONTROL, stack_frame, empty1[0]); - OFFSET(__SF_SIE_SAVEAREA, stack_frame, empty1[1]); - OFFSET(__SF_SIE_REASON, stack_frame, empty1[2]); - OFFSET(__SF_SIE_FLAGS, stack_frame, empty1[3]); - BLANK(); - /* timeval/timezone offsets for use by vdso */ - OFFSET(__VDSO_UPD_COUNT, vdso_data, tb_update_count); - OFFSET(__VDSO_XTIME_STAMP, vdso_data, xtime_tod_stamp); - OFFSET(__VDSO_XTIME_SEC, vdso_data, xtime_clock_sec); - OFFSET(__VDSO_XTIME_NSEC, vdso_data, xtime_clock_nsec); - OFFSET(__VDSO_XTIME_CRS_SEC, vdso_data, xtime_coarse_sec); - OFFSET(__VDSO_XTIME_CRS_NSEC, vdso_data, xtime_coarse_nsec); - OFFSET(__VDSO_WTOM_SEC, vdso_data, wtom_clock_sec); - OFFSET(__VDSO_WTOM_NSEC, vdso_data, wtom_clock_nsec); - OFFSET(__VDSO_WTOM_CRS_SEC, vdso_data, wtom_coarse_sec); - OFFSET(__VDSO_WTOM_CRS_NSEC, vdso_data, wtom_coarse_nsec); - OFFSET(__VDSO_TIMEZONE, vdso_data, tz_minuteswest); - OFFSET(__VDSO_ECTG_OK, vdso_data, ectg_available); - OFFSET(__VDSO_TK_MULT, vdso_data, tk_mult); - OFFSET(__VDSO_TK_SHIFT, vdso_data, tk_shift); - OFFSET(__VDSO_TS_DIR, vdso_data, ts_dir); - OFFSET(__VDSO_TS_END, vdso_data, ts_end); - OFFSET(__VDSO_ECTG_BASE, vdso_per_cpu_data, ectg_timer_base); - OFFSET(__VDSO_ECTG_USER, vdso_per_cpu_data, ectg_user_time); - OFFSET(__VDSO_GETCPU_VAL, vdso_per_cpu_data, getcpu_val); - BLANK(); - /* constants used by the vdso */ - DEFINE(__CLOCK_REALTIME, CLOCK_REALTIME); - DEFINE(__CLOCK_MONOTONIC, CLOCK_MONOTONIC); - DEFINE(__CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); - DEFINE(__CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE); - DEFINE(__CLOCK_THREAD_CPUTIME_ID, CLOCK_THREAD_CPUTIME_ID); - DEFINE(__CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC); - DEFINE(__CLOCK_COARSE_RES, LOW_RES_NSEC); + OFFSET(__SF_EMPTY, stack_frame, empty[0]); + OFFSET(__SF_SIE_CONTROL, stack_frame, sie_control_block); + OFFSET(__SF_SIE_SAVEAREA, stack_frame, sie_savearea); + OFFSET(__SF_SIE_REASON, stack_frame, sie_reason); + OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags); + OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys); + DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame)); BLANK(); /* idle data offsets */ OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter); - OFFSET(__CLOCK_IDLE_EXIT, s390_idle_data, clock_idle_exit); OFFSET(__TIMER_IDLE_ENTER, s390_idle_data, timer_idle_enter); - OFFSET(__TIMER_IDLE_EXIT, s390_idle_data, timer_idle_exit); + OFFSET(__MT_CYCLES_ENTER, s390_idle_data, mt_cycles_enter); BLANK(); /* hardware defined lowcore locations 0x000 - 0x1ff */ OFFSET(__LC_EXT_PARAMS, lowcore, ext_params); OFFSET(__LC_EXT_CPU_ADDR, lowcore, ext_cpu_addr); OFFSET(__LC_EXT_INT_CODE, lowcore, ext_int_code); - OFFSET(__LC_SVC_ILC, lowcore, svc_ilc); - OFFSET(__LC_SVC_INT_CODE, lowcore, svc_code); OFFSET(__LC_PGM_ILC, lowcore, pgm_ilc); OFFSET(__LC_PGM_INT_CODE, lowcore, pgm_code); OFFSET(__LC_DATA_EXC_CODE, lowcore, data_exc_code); @@ -118,12 +92,12 @@ int main(void) OFFSET(__LC_SUBCHANNEL_NR, lowcore, subchannel_nr); OFFSET(__LC_IO_INT_PARM, lowcore, io_int_parm); OFFSET(__LC_IO_INT_WORD, lowcore, io_int_word); - OFFSET(__LC_STFL_FAC_LIST, lowcore, stfl_fac_list); - OFFSET(__LC_STFLE_FAC_LIST, lowcore, stfle_fac_list); OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code); OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code); OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address); - OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr); 
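As background for readers new to asm-offsets.c: none of the OFFSET()/DEFINE() entries in this file ever execute. The file is only compiled to assembly, and the generic kbuild macros emit marker strings into the .s output that the build then turns into #defines usable from assembler sources such as entry.S. The following is a minimal standalone sketch of that mechanism with simplified stand-in macros and a toy struct, not the kernel's own definitions:

#include <stddef.h>

struct toy_regs {
	unsigned long psw[2];
	unsigned long gprs[16];
};

/* Emit "->SYM value" into the generated assembly; never meant to run. */
#define DEFINE_TOY(sym, val) \
	__asm__ volatile("\n.ascii \"->" #sym " %0\"" : : "i" (val))
#define OFFSET_TOY(sym, str, mem) \
	DEFINE_TOY(sym, offsetof(struct str, mem))

int main(void)
{
	OFFSET_TOY(__TOY_GPRS, toy_regs, gprs);
	DEFINE_TOY(__TOY_SIZE, sizeof(struct toy_regs));
	return 0;
}

Compiling this with gcc -S and grepping the output for "->" shows the markers that the kbuild post-processing step converts into a generated header of #defines.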
+ OFFSET(__LC_PGM_LAST_BREAK, lowcore, pgm_last_break); + OFFSET(__LC_RETURN_LPSWE, lowcore, return_lpswe); + OFFSET(__LC_RETURN_MCCK_LPSWE, lowcore, return_mcck_lpswe); OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw); OFFSET(__LC_EXT_OLD_PSW, lowcore, external_old_psw); OFFSET(__LC_SVC_OLD_PSW, lowcore, svc_old_psw); @@ -143,39 +117,33 @@ int main(void) OFFSET(__LC_CPU_FLAGS, lowcore, cpu_flags); OFFSET(__LC_RETURN_PSW, lowcore, return_psw); OFFSET(__LC_RETURN_MCCK_PSW, lowcore, return_mcck_psw); - OFFSET(__LC_SYNC_ENTER_TIMER, lowcore, sync_enter_timer); - OFFSET(__LC_ASYNC_ENTER_TIMER, lowcore, async_enter_timer); + OFFSET(__LC_SYS_ENTER_TIMER, lowcore, sys_enter_timer); OFFSET(__LC_MCCK_ENTER_TIMER, lowcore, mcck_enter_timer); OFFSET(__LC_EXIT_TIMER, lowcore, exit_timer); - OFFSET(__LC_USER_TIMER, lowcore, user_timer); - OFFSET(__LC_SYSTEM_TIMER, lowcore, system_timer); - OFFSET(__LC_STEAL_TIMER, lowcore, steal_timer); OFFSET(__LC_LAST_UPDATE_TIMER, lowcore, last_update_timer); OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); - OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); - OFFSET(__LC_CLOCK_COMPARATOR, lowcore, clock_comparator); OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); OFFSET(__LC_CURRENT, lowcore, current_task); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); OFFSET(__LC_ASYNC_STACK, lowcore, async_stack); OFFSET(__LC_NODAT_STACK, lowcore, nodat_stack); OFFSET(__LC_RESTART_STACK, lowcore, restart_stack); + OFFSET(__LC_MCCK_STACK, lowcore, mcck_stack); OFFSET(__LC_RESTART_FN, lowcore, restart_fn); OFFSET(__LC_RESTART_DATA, lowcore, restart_data); OFFSET(__LC_RESTART_SOURCE, lowcore, restart_source); + OFFSET(__LC_RESTART_FLAGS, lowcore, restart_flags); + OFFSET(__LC_KERNEL_ASCE, lowcore, kernel_asce); OFFSET(__LC_USER_ASCE, lowcore, user_asce); - OFFSET(__LC_VDSO_ASCE, lowcore, vdso_asce); OFFSET(__LC_LPP, lowcore, lpp); OFFSET(__LC_CURRENT_PID, lowcore, current_pid); - OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset); - OFFSET(__LC_VDSO_PER_CPU, lowcore, vdso_per_cpu_data); - OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags); - OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count); OFFSET(__LC_GMAP, lowcore, gmap); - OFFSET(__LC_BR_R1, lowcore, br_r1_trampoline); + OFFSET(__LC_LAST_BREAK, lowcore, last_break); /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ OFFSET(__LC_DUMP_REIPL, lowcore, ipib); + OFFSET(__LC_VMCORE_INFO, lowcore, vmcore_info); + OFFSET(__LC_OS_INFO, lowcore, os_info); /* hardware defined lowcore locations 0x1000 - 0x18ff */ OFFSET(__LC_MCESAD, lowcore, mcesad); OFFSET(__LC_EXT_PARAMS2, lowcore, ext_params2); @@ -187,13 +155,11 @@ int main(void) OFFSET(__LC_TOD_PROGREG_SAVE_AREA, lowcore, tod_progreg_save_area); OFFSET(__LC_CPU_TIMER_SAVE_AREA, lowcore, cpu_timer_save_area); OFFSET(__LC_CLOCK_COMP_SAVE_AREA, lowcore, clock_comp_save_area); + OFFSET(__LC_LAST_BREAK_SAVE_AREA, lowcore, last_break_save_area); OFFSET(__LC_AREGS_SAVE_AREA, lowcore, access_regs_save_area); OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area); OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb); BLANK(); - /* extended machine check save area */ - OFFSET(__MCESA_GS_SAVE_AREA, mcesa, guarded_storage_save_area); - BLANK(); /* gmap/sie offsets */ OFFSET(__GMAP_ASCE, gmap, asce); OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c); @@ -202,5 +168,23 @@ int main(void) OFFSET(__KEXEC_SHA_REGION_START, kexec_sha_region, start); OFFSET(__KEXEC_SHA_REGION_LEN, kexec_sha_region, len); DEFINE(__KEXEC_SHA_REGION_SIZE, 
sizeof(struct kexec_sha_region)); + /* sizeof kernel parameter area */ + DEFINE(__PARMAREA_SIZE, sizeof(struct parmarea)); + /* kernel parameter area offsets */ + DEFINE(IPL_DEVICE, PARMAREA + offsetof(struct parmarea, ipl_device)); + DEFINE(INITRD_START, PARMAREA + offsetof(struct parmarea, initrd_start)); + DEFINE(INITRD_SIZE, PARMAREA + offsetof(struct parmarea, initrd_size)); + DEFINE(OLDMEM_BASE, PARMAREA + offsetof(struct parmarea, oldmem_base)); + DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size)); + DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line)); + DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size)); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* function graph return value tracing */ + OFFSET(__FGRAPH_RET_GPR2, fgraph_ret_regs, gpr2); + OFFSET(__FGRAPH_RET_FP, fgraph_ret_regs, fp); + DEFINE(__FGRAPH_RET_SIZE, sizeof(struct fgraph_ret_regs)); +#endif + OFFSET(__FTRACE_REGS_PT_REGS, ftrace_regs, regs); + DEFINE(__FTRACE_REGS_SIZE, sizeof(struct ftrace_regs)); return 0; } diff --git a/arch/s390/kernel/audit.c b/arch/s390/kernel/audit.c index d395c6c9944c..02051a596b87 100644 --- a/arch/s390/kernel/audit.c +++ b/arch/s390/kernel/audit.c @@ -47,15 +47,17 @@ int audit_classify_syscall(int abi, unsigned syscall) #endif switch(syscall) { case __NR_open: - return 2; + return AUDITSC_OPEN; case __NR_openat: - return 3; + return AUDITSC_OPENAT; case __NR_socketcall: - return 4; + return AUDITSC_SOCKETCALL; case __NR_execve: - return 5; + return AUDITSC_EXECVE; + case __NR_openat2: + return AUDITSC_OPENAT2; default: - return 0; + return AUDITSC_NATIVE; } } diff --git a/arch/s390/kernel/base.S b/arch/s390/kernel/base.S deleted file mode 100644 index b79e0fd571f8..000000000000 --- a/arch/s390/kernel/base.S +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * arch/s390/kernel/base.S - * - * Copyright IBM Corp. 2006, 2007 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> - * Michael Holzheu <holzheu@de.ibm.com> - */ - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> -#include <asm/nospec-insn.h> -#include <asm/ptrace.h> -#include <asm/sigp.h> - - GEN_BR_THUNK %r9 - GEN_BR_THUNK %r14 - -ENTRY(s390_base_ext_handler) - stmg %r0,%r15,__LC_SAVE_AREA_ASYNC - basr %r13,0 -0: aghi %r15,-STACK_FRAME_OVERHEAD - larl %r1,s390_base_ext_handler_fn - lg %r9,0(%r1) - ltgr %r9,%r9 - jz 1f - BASR_EX %r14,%r9 -1: lmg %r0,%r15,__LC_SAVE_AREA_ASYNC - ni __LC_EXT_OLD_PSW+1,0xfd # clear wait state bit - lpswe __LC_EXT_OLD_PSW -ENDPROC(s390_base_ext_handler) - - .section .bss - .align 8 - .globl s390_base_ext_handler_fn -s390_base_ext_handler_fn: - .quad 0 - .previous - -ENTRY(s390_base_pgm_handler) - stmg %r0,%r15,__LC_SAVE_AREA_SYNC - basr %r13,0 -0: aghi %r15,-STACK_FRAME_OVERHEAD - larl %r1,s390_base_pgm_handler_fn - lg %r9,0(%r1) - ltgr %r9,%r9 - jz 1f - BASR_EX %r14,%r9 - lmg %r0,%r15,__LC_SAVE_AREA_SYNC - lpswe __LC_PGM_OLD_PSW -1: lpswe disabled_wait_psw-0b(%r13) -ENDPROC(s390_base_pgm_handler) - - .align 8 -disabled_wait_psw: - .quad 0x0002000180000000,0x0000000000000000 + s390_base_pgm_handler - - .section .bss - .align 8 - .globl s390_base_pgm_handler_fn -s390_base_pgm_handler_fn: - .quad 0 - .previous diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c index d66825e53fce..56254fa06f99 100644 --- a/arch/s390/kernel/cache.c +++ b/arch/s390/kernel/cache.c @@ -3,7 +3,6 @@ * Extract CPU cache information and expose them via sysfs. * * Copyright IBM Corp. 
2012 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ #include <linux/seq_file.h> @@ -47,7 +46,7 @@ struct cache_info { #define CACHE_MAX_LEVEL 8 union cache_topology { struct cache_info ci[CACHE_MAX_LEVEL]; - unsigned long long raw; + unsigned long raw; }; static const char * const cache_type_string[] = { @@ -71,8 +70,6 @@ void show_cacheinfo(struct seq_file *m) struct cacheinfo *cache; int idx; - if (!test_facility(34)) - return; this_cpu_ci = get_cpu_cacheinfo(cpumask_any(cpu_online_mask)); for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { cache = this_cpu_ci->info_list + idx; @@ -132,8 +129,6 @@ int init_cache_level(unsigned int cpu) union cache_topology ct; enum cache_type ctype; - if (!test_facility(34)) - return -EOPNOTSUPP; if (!this_cpu_ci) return -EINVAL; ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0); @@ -157,8 +152,6 @@ int populate_cache_leaves(unsigned int cpu) union cache_topology ct; enum cache_type ctype; - if (!test_facility(34)) - return -EOPNOTSUPP; ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0); for (idx = 0, level = 0; level < this_cpu_ci->num_levels && idx < this_cpu_ci->num_leaves; idx++, level++) { diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c new file mode 100644 index 000000000000..554447768bdd --- /dev/null +++ b/arch/s390/kernel/cert_store.c @@ -0,0 +1,812 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DIAG 0x320 support and certificate store handling + * + * Copyright IBM Corp. 2023 + * Author(s): Anastasia Eskova <anastasia.eskova@ibm.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/key-type.h> +#include <linux/key.h> +#include <linux/keyctl.h> +#include <linux/kobject.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/sysfs.h> +#include <crypto/sha2.h> +#include <keys/user-type.h> +#include <asm/debug.h> +#include <asm/diag.h> +#include <asm/ebcdic.h> +#include <asm/sclp.h> + +#define DIAG_MAX_RETRIES 10 + +#define VCE_FLAGS_VALID_MASK 0x80 + +#define ISM_LEN_DWORDS 4 +#define VCSSB_LEN_BYTES 128 +#define VCSSB_LEN_NO_CERTS 4 +#define VCB_LEN_NO_CERTS 64 +#define VC_NAME_LEN_BYTES 64 + +#define CERT_STORE_KEY_TYPE_NAME "cert_store_key" +#define CERT_STORE_KEYRING_NAME "cert_store" + +static debug_info_t *cert_store_dbf; +static debug_info_t *cert_store_hexdump; + +#define pr_dbf_msg(fmt, ...) \ + debug_sprintf_event(cert_store_dbf, 3, fmt "\n", ## __VA_ARGS__) + +enum diag320_subcode { + DIAG320_SUBCODES = 0, + DIAG320_STORAGE = 1, + DIAG320_CERT_BLOCK = 2, +}; + +enum diag320_rc { + DIAG320_RC_OK = 0x0001, + DIAG320_RC_CS_NOMATCH = 0x0306, +}; + +/* Verification Certificates Store Support Block (VCSSB). */ +struct vcssb { + u32 vcssb_length; + u8 pad_0x04[3]; + u8 version; + u8 pad_0x08[8]; + u32 cs_token; + u8 pad_0x14[12]; + u16 total_vc_index_count; + u16 max_vc_index_count; + u8 pad_0x24[28]; + u32 max_vce_length; + u32 max_vcxe_length; + u8 pad_0x48[8]; + u32 max_single_vcb_length; + u32 total_vcb_length; + u32 max_single_vcxb_length; + u32 total_vcxb_length; + u8 pad_0x60[32]; +} __packed __aligned(8); + +/* Verification Certificate Entry (VCE) Header. 
*/ +struct vce_header { + u32 vce_length; + u8 flags; + u8 key_type; + u16 vc_index; + u8 vc_name[VC_NAME_LEN_BYTES]; /* EBCDIC */ + u8 vc_format; + u8 pad_0x49; + u16 key_id_length; + u8 pad_0x4c; + u8 vc_hash_type; + u16 vc_hash_length; + u8 pad_0x50[4]; + u32 vc_length; + u8 pad_0x58[8]; + u16 vc_hash_offset; + u16 vc_offset; + u8 pad_0x64[28]; +} __packed __aligned(4); + +/* Verification Certificate Block (VCB) Header. */ +struct vcb_header { + u32 vcb_input_length; + u8 pad_0x04[4]; + u16 first_vc_index; + u16 last_vc_index; + u32 pad_0x0c; + u32 cs_token; + u8 pad_0x14[12]; + u32 vcb_output_length; + u8 pad_0x24[3]; + u8 version; + u16 stored_vc_count; + u16 remaining_vc_count; + u8 pad_0x2c[20]; +} __packed __aligned(4); + +/* Verification Certificate Block (VCB). */ +struct vcb { + struct vcb_header vcb_hdr; + u8 vcb_buf[]; +} __packed __aligned(4); + +/* Verification Certificate Entry (VCE). */ +struct vce { + struct vce_header vce_hdr; + u8 cert_data_buf[]; +} __packed __aligned(4); + +static void cert_store_key_describe(const struct key *key, struct seq_file *m) +{ + char ascii[VC_NAME_LEN_BYTES + 1]; + + /* + * First 64 bytes of the key description is key name in EBCDIC CP 500. + * Convert it to ASCII for displaying in /proc/keys. + */ + strscpy(ascii, key->description, sizeof(ascii)); + EBCASC_500(ascii, VC_NAME_LEN_BYTES); + seq_puts(m, ascii); + + seq_puts(m, &key->description[VC_NAME_LEN_BYTES]); + if (key_is_positive(key)) + seq_printf(m, ": %u", key->datalen); +} + +/* + * Certificate store key type takes over properties of + * user key but cannot be updated. + */ +static struct key_type key_type_cert_store_key = { + .name = CERT_STORE_KEY_TYPE_NAME, + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .instantiate = generic_key_instantiate, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = cert_store_key_describe, + .read = user_read, +}; + +/* Logging functions. */ +static void pr_dbf_vcb(const struct vcb *b) +{ + pr_dbf_msg("VCB Header:"); + pr_dbf_msg("vcb_input_length: %d", b->vcb_hdr.vcb_input_length); + pr_dbf_msg("first_vc_index: %d", b->vcb_hdr.first_vc_index); + pr_dbf_msg("last_vc_index: %d", b->vcb_hdr.last_vc_index); + pr_dbf_msg("cs_token: %d", b->vcb_hdr.cs_token); + pr_dbf_msg("vcb_output_length: %d", b->vcb_hdr.vcb_output_length); + pr_dbf_msg("version: %d", b->vcb_hdr.version); + pr_dbf_msg("stored_vc_count: %d", b->vcb_hdr.stored_vc_count); + pr_dbf_msg("remaining_vc_count: %d", b->vcb_hdr.remaining_vc_count); +} + +static void pr_dbf_vce(const struct vce *e) +{ + unsigned char vc_name[VC_NAME_LEN_BYTES + 1]; + char log_string[VC_NAME_LEN_BYTES + 40]; + + pr_dbf_msg("VCE Header:"); + pr_dbf_msg("vce_hdr.vce_length: %d", e->vce_hdr.vce_length); + pr_dbf_msg("vce_hdr.flags: %d", e->vce_hdr.flags); + pr_dbf_msg("vce_hdr.key_type: %d", e->vce_hdr.key_type); + pr_dbf_msg("vce_hdr.vc_index: %d", e->vce_hdr.vc_index); + pr_dbf_msg("vce_hdr.vc_format: %d", e->vce_hdr.vc_format); + pr_dbf_msg("vce_hdr.key_id_length: %d", e->vce_hdr.key_id_length); + pr_dbf_msg("vce_hdr.vc_hash_type: %d", e->vce_hdr.vc_hash_type); + pr_dbf_msg("vce_hdr.vc_hash_length: %d", e->vce_hdr.vc_hash_length); + pr_dbf_msg("vce_hdr.vc_hash_offset: %d", e->vce_hdr.vc_hash_offset); + pr_dbf_msg("vce_hdr.vc_length: %d", e->vce_hdr.vc_length); + pr_dbf_msg("vce_hdr.vc_offset: %d", e->vce_hdr.vc_offset); + + /* Certificate name in ASCII. 
*/ + memcpy(vc_name, e->vce_hdr.vc_name, VC_NAME_LEN_BYTES); + EBCASC_500(vc_name, VC_NAME_LEN_BYTES); + vc_name[VC_NAME_LEN_BYTES] = '\0'; + + snprintf(log_string, sizeof(log_string), + "index: %d vce_hdr.vc_name (ASCII): %s", + e->vce_hdr.vc_index, vc_name); + debug_text_event(cert_store_hexdump, 3, log_string); + + /* Certificate data. */ + debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data start"); + debug_event(cert_store_hexdump, 3, (u8 *)e->cert_data_buf, 128); + debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data end"); + debug_event(cert_store_hexdump, 3, + (u8 *)e->cert_data_buf + e->vce_hdr.vce_length - 128, 128); +} + +static void pr_dbf_vcssb(const struct vcssb *s) +{ + debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode1"); + debug_event(cert_store_hexdump, 3, (u8 *)s, VCSSB_LEN_BYTES); + + pr_dbf_msg("VCSSB:"); + pr_dbf_msg("vcssb_length: %u", s->vcssb_length); + pr_dbf_msg("version: %u", s->version); + pr_dbf_msg("cs_token: %u", s->cs_token); + pr_dbf_msg("total_vc_index_count: %u", s->total_vc_index_count); + pr_dbf_msg("max_vc_index_count: %u", s->max_vc_index_count); + pr_dbf_msg("max_vce_length: %u", s->max_vce_length); + pr_dbf_msg("max_vcxe_length: %u", s->max_vcxe_length); + pr_dbf_msg("max_single_vcb_length: %u", s->max_single_vcb_length); + pr_dbf_msg("total_vcb_length: %u", s->total_vcb_length); + pr_dbf_msg("max_single_vcxb_length: %u", s->max_single_vcxb_length); + pr_dbf_msg("total_vcxb_length: %u", s->total_vcxb_length); +} + +static int __diag320(unsigned long subcode, void *addr) +{ + union register_pair rp = { .even = (unsigned long)addr, }; + + asm volatile( + " diag %[rp],%[subcode],0x320\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rp] "+d" (rp.pair) + : [subcode] "d" (subcode) + : "cc", "memory"); + + return rp.odd; +} + +static int diag320(unsigned long subcode, void *addr) +{ + diag_stat_inc(DIAG_STAT_X320); + + return __diag320(subcode, addr); +} + +/* + * Calculate the SHA256 hash of the VCE certificate and compare it to the + * hash stored in the VCE. Return -EINVAL if the hashes don't match.
+ */ +static int check_certificate_hash(const struct vce *vce) +{ + u8 hash[SHA256_DIGEST_SIZE]; + u16 vc_hash_length; + u8 *vce_hash; + + vce_hash = (u8 *)vce + vce->vce_hdr.vc_hash_offset; + vc_hash_length = vce->vce_hdr.vc_hash_length; + sha256((u8 *)vce + vce->vce_hdr.vc_offset, vce->vce_hdr.vc_length, hash); + if (memcmp(vce_hash, hash, vc_hash_length) == 0) + return 0; + + pr_dbf_msg("SHA256 hash of received certificate does not match"); + debug_text_event(cert_store_hexdump, 3, "VCE hash:"); + debug_event(cert_store_hexdump, 3, vce_hash, SHA256_DIGEST_SIZE); + debug_text_event(cert_store_hexdump, 3, "Calculated hash:"); + debug_event(cert_store_hexdump, 3, hash, SHA256_DIGEST_SIZE); + + return -EINVAL; +} + +static int check_certificate_valid(const struct vce *vce) +{ + if (!(vce->vce_hdr.flags & VCE_FLAGS_VALID_MASK)) { + pr_dbf_msg("Certificate entry is invalid"); + return -EINVAL; + } + if (vce->vce_hdr.vc_format != 1) { + pr_dbf_msg("Certificate format is not supported"); + return -EINVAL; + } + if (vce->vce_hdr.vc_hash_type != 1) { + pr_dbf_msg("Hash type is not supported"); + return -EINVAL; + } + + return check_certificate_hash(vce); +} + +static struct key *get_user_session_keyring(void) +{ + key_ref_t us_keyring_ref; + + us_keyring_ref = lookup_user_key(KEY_SPEC_USER_SESSION_KEYRING, + KEY_LOOKUP_CREATE, KEY_NEED_LINK); + if (IS_ERR(us_keyring_ref)) { + pr_dbf_msg("Couldn't get user session keyring: %ld", + PTR_ERR(us_keyring_ref)); + return ERR_PTR(-ENOKEY); + } + key_ref_put(us_keyring_ref); + return key_ref_to_ptr(us_keyring_ref); +} + +/* Invalidate all keys from cert_store keyring. */ +static int invalidate_keyring_keys(struct key *keyring) +{ + unsigned long num_keys, key_index; + size_t keyring_payload_len; + key_serial_t *key_array; + struct key *current_key; + int rc; + + keyring_payload_len = key_type_keyring.read(keyring, NULL, 0); + num_keys = keyring_payload_len / sizeof(key_serial_t); + key_array = kcalloc(num_keys, sizeof(key_serial_t), GFP_KERNEL); + if (!key_array) + return -ENOMEM; + + rc = key_type_keyring.read(keyring, (char *)key_array, keyring_payload_len); + if (rc != keyring_payload_len) { + pr_dbf_msg("Couldn't read keyring payload"); + goto out; + } + + for (key_index = 0; key_index < num_keys; key_index++) { + current_key = key_lookup(key_array[key_index]); + pr_dbf_msg("Invalidating key %08x", current_key->serial); + + key_invalidate(current_key); + key_put(current_key); + rc = key_unlink(keyring, current_key); + if (rc) { + pr_dbf_msg("Couldn't unlink key %08x: %d", current_key->serial, rc); + break; + } + } +out: + kfree(key_array); + return rc; +} + +static struct key *find_cs_keyring(void) +{ + key_ref_t cs_keyring_ref; + struct key *cs_keyring; + + cs_keyring_ref = keyring_search(make_key_ref(get_user_session_keyring(), true), + &key_type_keyring, CERT_STORE_KEYRING_NAME, + false); + if (!IS_ERR(cs_keyring_ref)) { + cs_keyring = key_ref_to_ptr(cs_keyring_ref); + key_ref_put(cs_keyring_ref); + goto found; + } + /* Search default locations: thread, process, session keyrings */ + cs_keyring = request_key(&key_type_keyring, CERT_STORE_KEYRING_NAME, NULL); + if (IS_ERR(cs_keyring)) + return NULL; + key_put(cs_keyring); +found: + return cs_keyring; +} + +static void cleanup_cs_keys(void) +{ + struct key *cs_keyring; + + cs_keyring = find_cs_keyring(); + if (!cs_keyring) + return; + + pr_dbf_msg("Found cert_store keyring. Purging..."); + /* + * Remove cert_store_key_type in case invalidation + * of old cert_store keys failed (= severe error). 
+ */ + if (invalidate_keyring_keys(cs_keyring)) + unregister_key_type(&key_type_cert_store_key); + + keyring_clear(cs_keyring); + key_invalidate(cs_keyring); + key_put(cs_keyring); + key_unlink(get_user_session_keyring(), cs_keyring); +} + +static struct key *create_cs_keyring(void) +{ + static struct key *cs_keyring; + + /* Cleanup previous cs_keyring and all associated keys if any. */ + cleanup_cs_keys(); + cs_keyring = keyring_alloc(CERT_STORE_KEYRING_NAME, GLOBAL_ROOT_UID, + GLOBAL_ROOT_GID, current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_SET_KEEP, + NULL, get_user_session_keyring()); + if (IS_ERR(cs_keyring)) { + pr_dbf_msg("Can't allocate cert_store keyring"); + return NULL; + } + + pr_dbf_msg("Successfully allocated cert_store keyring: %08x", cs_keyring->serial); + + /* + * In case a previous clean-up ran into an + * error and unregistered the key type. + */ + register_key_type(&key_type_cert_store_key); + + return cs_keyring; +} + +/* + * Allocate memory and create the key description in the format + * [key name in EBCDIC]:[VCE index]:[CS token]. + * Return a pointer to the key description or NULL if memory + * allocation failed. Memory should be freed by the caller. + */ +static char *get_key_description(struct vcssb *vcssb, const struct vce *vce) +{ + size_t len, name_len; + u32 cs_token; + char *desc; + + cs_token = vcssb->cs_token; + /* Description string contains "%64s:%05u:%010u\0". */ + name_len = sizeof(vce->vce_hdr.vc_name); + len = name_len + 1 + 5 + 1 + 10 + 1; + desc = kmalloc(len, GFP_KERNEL); + if (!desc) + return NULL; + + memcpy(desc, vce->vce_hdr.vc_name, name_len); + snprintf(desc + name_len, len - name_len, ":%05u:%010u", + vce->vce_hdr.vc_index, cs_token); + + return desc; +} + +/* + * Create a key of type "cert_store_key" using the data from the VCE for the + * key payload and key description. Link the key to the "cert_store" keyring. + */ +static int create_key_from_vce(struct vcssb *vcssb, struct vce *vce, + struct key *keyring) +{ + key_ref_t newkey; + char *desc; + int rc; + + desc = get_key_description(vcssb, vce); + if (!desc) + return -ENOMEM; + + newkey = key_create_or_update( + make_key_ref(keyring, true), CERT_STORE_KEY_TYPE_NAME, + desc, (u8 *)vce + vce->vce_hdr.vc_offset, + vce->vce_hdr.vc_length, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + + rc = PTR_ERR_OR_ZERO(newkey); + if (rc) { + pr_dbf_msg("Couldn't create a key from Certificate Entry (%d)", rc); + rc = -ENOKEY; + goto out; + } + + key_ref_put(newkey); +out: + kfree(desc); + return rc; +} + +/* Get Verification Certificate Storage Size block with DIAG320 subcode 1. */ +static int get_vcssb(struct vcssb *vcssb) +{ + int diag320_rc; + + memset(vcssb, 0, sizeof(*vcssb)); + vcssb->vcssb_length = VCSSB_LEN_BYTES; + diag320_rc = diag320(DIAG320_STORAGE, vcssb); + pr_dbf_vcssb(vcssb); + + if (diag320_rc != DIAG320_RC_OK) { + pr_dbf_msg("Diag 320 Subcode 1 returned bad RC: %04x", diag320_rc); + return -EIO; + } + if (vcssb->vcssb_length == VCSSB_LEN_NO_CERTS) { + pr_dbf_msg("No certificates available for current configuration"); + return -ENOKEY; + } + + return 0; +} + +static u32 get_4k_mult_vcb_size(struct vcssb *vcssb) +{ + return round_up(vcssb->max_single_vcb_length, PAGE_SIZE); +} + +/* Fill input fields of single-entry VCB that will be read by LPAR.
*/ +static void fill_vcb_input(struct vcssb *vcssb, struct vcb *vcb, u16 index) +{ + memset(vcb, 0, sizeof(*vcb)); + vcb->vcb_hdr.vcb_input_length = get_4k_mult_vcb_size(vcssb); + vcb->vcb_hdr.cs_token = vcssb->cs_token; + + /* Request single entry. */ + vcb->vcb_hdr.first_vc_index = index; + vcb->vcb_hdr.last_vc_index = index; +} + +static void extract_vce_from_sevcb(struct vcb *vcb, struct vce *vce) +{ + struct vce *extracted_vce; + + extracted_vce = (struct vce *)vcb->vcb_buf; + memcpy(vce, vcb->vcb_buf, extracted_vce->vce_hdr.vce_length); + pr_dbf_vce(vce); +} + +static int get_sevcb(struct vcssb *vcssb, u16 index, struct vcb *vcb) +{ + int rc, diag320_rc; + + fill_vcb_input(vcssb, vcb, index); + + diag320_rc = diag320(DIAG320_CERT_BLOCK, vcb); + pr_dbf_msg("Diag 320 Subcode2 RC %2x", diag320_rc); + pr_dbf_vcb(vcb); + + switch (diag320_rc) { + case DIAG320_RC_OK: + rc = 0; + if (vcb->vcb_hdr.vcb_output_length == VCB_LEN_NO_CERTS) { + pr_dbf_msg("No certificate entry for index %u", index); + rc = -ENOKEY; + } else if (vcb->vcb_hdr.remaining_vc_count != 0) { + /* Retry on insufficient space. */ + pr_dbf_msg("Couldn't get all requested certificates"); + rc = -EAGAIN; + } + break; + case DIAG320_RC_CS_NOMATCH: + pr_dbf_msg("Certificate Store token mismatch"); + rc = -EAGAIN; + break; + default: + pr_dbf_msg("Diag 320 Subcode2 returned bad rc (0x%4x)", diag320_rc); + rc = -EINVAL; + break; + } + + return rc; +} + +/* + * Allocate memory for a single-entry VCB, get the VCB via a DIAG320 subcode 2 + * call, extract the VCE and create a key from its certificate. + */ +static int create_key_from_sevcb(struct vcssb *vcssb, u16 index, + struct key *keyring) +{ + struct vcb *vcb; + struct vce *vce; + int rc; + + rc = -ENOMEM; + vcb = vmalloc(get_4k_mult_vcb_size(vcssb)); + vce = vmalloc(vcssb->max_single_vcb_length - sizeof(vcb->vcb_hdr)); + if (!vcb || !vce) + goto out; + + rc = get_sevcb(vcssb, index, vcb); + if (rc) + goto out; + + extract_vce_from_sevcb(vcb, vce); + rc = check_certificate_valid(vce); + if (rc) + goto out; + + rc = create_key_from_vce(vcssb, vce, keyring); + if (rc) + goto out; + + pr_dbf_msg("Successfully created key from Certificate Entry %d", index); +out: + vfree(vce); + vfree(vcb); + return rc; +} + +/* + * Request a single-entry VCB for each VCE available for the partition. + * Create a key from it and link it to the cert_store keyring. If no keys + * could be created (i.e. VCEs were invalid) return -ENOKEY. + */ +static int add_certificates_to_keyring(struct vcssb *vcssb, struct key *keyring) +{ + int rc, index, count, added; + + count = 0; + added = 0; + /* Certificate Store entry indices start at 1 and have no gaps. */ + for (index = 1; index < vcssb->total_vc_index_count + 1; index++) { + pr_dbf_msg("Creating key from VCE %u", index); + rc = create_key_from_sevcb(vcssb, index, keyring); + count++; + + if (rc == -EAGAIN) + return rc; + + if (rc) + pr_dbf_msg("Creating key from VCE %u failed (%d)", index, rc); + else + added++; + } + + if (added == 0) { + pr_dbf_msg("Processed %d entries. No keys created", count); + return -ENOKEY; + } + + pr_info("Added %d of %d keys to cert_store keyring", added, count); + + /* + * Do not allow linking more keys to the certificate store keyring + * after all the VCEs have been processed. + */ + rc = keyring_restrict(make_key_ref(keyring, true), NULL, NULL); + if (rc) + pr_dbf_msg("Failed to set restriction to cert_store keyring (%d)", rc); + + return 0; +} + +/* + * Check which DIAG320 subcodes are installed.
+ * Return -ENOENT if subcodes 1 or 2 are not available. + */ +static int query_diag320_subcodes(void) +{ + unsigned long ism[ISM_LEN_DWORDS]; + int rc; + + rc = diag320(0, ism); + if (rc != DIAG320_RC_OK) { + pr_dbf_msg("DIAG320 subcode query returned %04x", rc); + return -ENOENT; + } + + debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode 0"); + debug_event(cert_store_hexdump, 3, ism, sizeof(ism)); + + if (!test_bit_inv(1, ism) || !test_bit_inv(2, ism)) { + pr_dbf_msg("Not all required DIAG320 subcodes are installed"); + return -ENOENT; + } + + return 0; +} + +/* + * Check if Certificate Store is supported by the firmware and DIAG320 subcodes + * 1 and 2 are installed. Create cert_store keyring and link all certificates + * available for the current partition to it as "cert_store_key" type + * keys. On refresh or error invalidate cert_store keyring and destroy + * all keys of "cert_store_key" type. + */ +static int fill_cs_keyring(void) +{ + struct key *cs_keyring; + struct vcssb *vcssb; + int rc; + + rc = -ENOMEM; + vcssb = kmalloc(VCSSB_LEN_BYTES, GFP_KERNEL); + if (!vcssb) + goto cleanup_keys; + + rc = -ENOENT; + if (!sclp.has_diag320) { + pr_dbf_msg("Certificate Store is not supported"); + goto cleanup_keys; + } + + rc = query_diag320_subcodes(); + if (rc) + goto cleanup_keys; + + rc = get_vcssb(vcssb); + if (rc) + goto cleanup_keys; + + rc = -ENOMEM; + cs_keyring = create_cs_keyring(); + if (!cs_keyring) + goto cleanup_keys; + + rc = add_certificates_to_keyring(vcssb, cs_keyring); + if (rc) + goto cleanup_cs_keyring; + + goto out; + +cleanup_cs_keyring: + key_put(cs_keyring); +cleanup_keys: + cleanup_cs_keys(); +out: + kfree(vcssb); + return rc; +} + +static DEFINE_MUTEX(cs_refresh_lock); +static int cs_status_val = -1; + +static ssize_t cs_status_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + if (cs_status_val == -1) + return sysfs_emit(buf, "uninitialized\n"); + else if (cs_status_val == 0) + return sysfs_emit(buf, "ok\n"); + + return sysfs_emit(buf, "failed (%d)\n", cs_status_val); +} + +static struct kobj_attribute cs_status_attr = __ATTR_RO(cs_status); + +static ssize_t refresh_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int rc, retries; + + pr_dbf_msg("Refresh certificate store information requested"); + rc = mutex_lock_interruptible(&cs_refresh_lock); + if (rc) + return rc; + + for (retries = 0; retries < DIAG_MAX_RETRIES; retries++) { + /* Request certificates from certificate store. */ + rc = fill_cs_keyring(); + if (rc) + pr_dbf_msg("Failed to refresh certificate store information (%d)", rc); + if (rc != -EAGAIN) + break; + } + cs_status_val = rc; + mutex_unlock(&cs_refresh_lock); + + return rc ?: count; +} + +static struct kobj_attribute refresh_attr = __ATTR_WO(refresh); + +static const struct attribute *cert_store_attrs[] __initconst = { + &cs_status_attr.attr, + &refresh_attr.attr, + NULL, +}; + +static struct kobject *cert_store_kobj; + +static int __init cert_store_init(void) +{ + int rc = -ENOMEM; + + cert_store_dbf = debug_register("cert_store_msg", 10, 1, 64); + if (!cert_store_dbf) + goto cleanup_dbf; + + cert_store_hexdump = debug_register("cert_store_hexdump", 3, 1, 128); + if (!cert_store_hexdump) + goto cleanup_dbf; + + debug_register_view(cert_store_hexdump, &debug_hex_ascii_view); + debug_register_view(cert_store_dbf, &debug_sprintf_view); + + /* Create directory /sys/firmware/cert_store. 
*/ + cert_store_kobj = kobject_create_and_add("cert_store", firmware_kobj); + if (!cert_store_kobj) + goto cleanup_dbf; + + rc = sysfs_create_files(cert_store_kobj, cert_store_attrs); + if (rc) + goto cleanup_kobj; + + register_key_type(&key_type_cert_store_key); + + return rc; + +cleanup_kobj: + kobject_put(cert_store_kobj); +cleanup_dbf: + debug_unregister(cert_store_dbf); + debug_unregister(cert_store_hexdump); + + return rc; +} +device_initcall(cert_store_init); diff --git a/arch/s390/kernel/compat_audit.c b/arch/s390/kernel/compat_audit.c index 444fb1f66944..a7c46e8310f0 100644 --- a/arch/s390/kernel/compat_audit.c +++ b/arch/s390/kernel/compat_audit.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #undef __s390x__ +#include <linux/audit_arch.h> #include <asm/unistd.h> #include "audit.h" @@ -32,14 +33,16 @@ int s390_classify_syscall(unsigned syscall) { switch(syscall) { case __NR_open: - return 2; + return AUDITSC_OPEN; case __NR_openat: - return 3; + return AUDITSC_OPENAT; case __NR_socketcall: - return 4; + return AUDITSC_SOCKETCALL; case __NR_execve: - return 5; + return AUDITSC_EXECVE; + case __NR_openat2: + return AUDITSC_OPENAT2; default: - return 1; + return AUDITSC_COMPAT; } } diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h index 64509e7dbd3b..ef23739b277c 100644 --- a/arch/s390/kernel/compat_linux.h +++ b/arch/s390/kernel/compat_linux.h @@ -5,69 +5,59 @@ #include <linux/compat.h> #include <linux/socket.h> #include <linux/syscalls.h> +#include <asm/ptrace.h> -/* Macro that masks the high order bit of an 32 bit pointer and converts it*/ -/* to a 64 bit pointer */ -#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL)) -#define AA(__x) \ - ((unsigned long)(__x)) +/* + * Macro that masks the high order bit of a 32 bit pointer and + * converts it to a 64 bit pointer. 
+ */ +#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL)) +#define AA(__x) ((unsigned long)(__x)) /* Now 32bit compatibility types */ struct ipc_kludge_32 { - __u32 msgp; /* pointer */ - __s32 msgtyp; + __u32 msgp; /* pointer */ + __s32 msgtyp; }; /* asm/sigcontext.h */ -typedef union -{ - __u64 d; - __u32 f; +typedef union { + __u64 d; + __u32 f; } freg_t32; -typedef struct -{ +typedef struct { unsigned int fpc; unsigned int pad; - freg_t32 fprs[__NUM_FPRS]; + freg_t32 fprs[__NUM_FPRS]; } _s390_fp_regs32; -typedef struct -{ - __u32 mask; - __u32 addr; -} _psw_t32 __attribute__ ((aligned(8))); - -typedef struct -{ - _psw_t32 psw; +typedef struct { + psw_t32 psw; __u32 gprs[__NUM_GPRS]; __u32 acrs[__NUM_ACRS]; } _s390_regs_common32; -typedef struct -{ +typedef struct { _s390_regs_common32 regs; - _s390_fp_regs32 fpregs; + _s390_fp_regs32 fpregs; } _sigregs32; -typedef struct -{ - __u32 gprs_high[__NUM_GPRS]; - __u64 vxrs_low[__NUM_VXRS_LOW]; - __vector128 vxrs_high[__NUM_VXRS_HIGH]; - __u8 __reserved[128]; +typedef struct { + __u32 gprs_high[__NUM_GPRS]; + __u64 vxrs_low[__NUM_VXRS_LOW]; + __vector128 vxrs_high[__NUM_VXRS_HIGH]; + __u8 __reserved[128]; } _sigregs_ext32; #define _SIGCONTEXT_NSIG32 64 #define _SIGCONTEXT_NSIG_BPW32 32 #define __SIGNAL_FRAMESIZE32 96 -#define _SIGMASK_COPY_SIZE32 (sizeof(u32)*2) +#define _SIGMASK_COPY_SIZE32 (sizeof(u32) * 2) -struct sigcontext32 -{ +struct sigcontext32 { __u32 oldmask[_COMPAT_NSIG_WORDS]; - __u32 sregs; /* pointer */ + __u32 sregs; /* pointer */ }; /* asm/signal.h */ @@ -75,11 +65,11 @@ struct sigcontext32 /* asm/ucontext.h */ struct ucontext32 { __u32 uc_flags; - __u32 uc_link; /* pointer */ + __u32 uc_link; /* pointer */ compat_stack_t uc_stack; _sigregs32 uc_mcontext; compat_sigset_t uc_sigmask; - /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */ + /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. 
*/ unsigned char __unused[128 - sizeof(compat_sigset_t)]; _sigregs_ext32 uc_mcontext_ext; }; @@ -88,25 +78,6 @@ struct stat64_emu31; struct mmap_arg_struct_emu31; struct fadvise64_64_args; -long compat_sys_s390_chown16(const char __user *filename, u16 user, u16 group); -long compat_sys_s390_lchown16(const char __user *filename, u16 user, u16 group); -long compat_sys_s390_fchown16(unsigned int fd, u16 user, u16 group); -long compat_sys_s390_setregid16(u16 rgid, u16 egid); -long compat_sys_s390_setgid16(u16 gid); -long compat_sys_s390_setreuid16(u16 ruid, u16 euid); -long compat_sys_s390_setuid16(u16 uid); -long compat_sys_s390_setresuid16(u16 ruid, u16 euid, u16 suid); -long compat_sys_s390_getresuid16(u16 __user *ruid, u16 __user *euid, u16 __user *suid); -long compat_sys_s390_setresgid16(u16 rgid, u16 egid, u16 sgid); -long compat_sys_s390_getresgid16(u16 __user *rgid, u16 __user *egid, u16 __user *sgid); -long compat_sys_s390_setfsuid16(u16 uid); -long compat_sys_s390_setfsgid16(u16 gid); -long compat_sys_s390_getgroups16(int gidsetsize, u16 __user *grouplist); -long compat_sys_s390_setgroups16(int gidsetsize, u16 __user *grouplist); -long compat_sys_s390_getuid16(void); -long compat_sys_s390_geteuid16(void); -long compat_sys_s390_getgid16(void); -long compat_sys_s390_getegid16(void); long compat_sys_s390_truncate64(const char __user *path, u32 high, u32 low); long compat_sys_s390_ftruncate64(unsigned int fd, u32 high, u32 low); long compat_sys_s390_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, u32 high, u32 low); @@ -118,8 +89,8 @@ long compat_sys_s390_fstat64(unsigned int fd, struct stat64_emu31 __user *statbu long compat_sys_s390_fstatat64(unsigned int dfd, const char __user *filename, struct stat64_emu31 __user *statbuf, int flag); long compat_sys_s390_old_mmap(struct mmap_arg_struct_emu31 __user *arg); long compat_sys_s390_mmap2(struct mmap_arg_struct_emu31 __user *arg); -long compat_sys_s390_read(unsigned int fd, char __user * buf, compat_size_t count); -long compat_sys_s390_write(unsigned int fd, const char __user * buf, compat_size_t count); +long compat_sys_s390_read(unsigned int fd, char __user *buf, compat_size_t count); +long compat_sys_s390_write(unsigned int fd, const char __user *buf, compat_size_t count); long compat_sys_s390_fadvise64(int fd, u32 high, u32 low, compat_size_t len, int advise); long compat_sys_s390_fadvise64_64(struct fadvise64_64_args __user *args); long compat_sys_s390_sync_file_range(int fd, u32 offhigh, u32 offlow, u32 nhigh, u32 nlow, unsigned int flags); diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 38d4bdbc34b9..f8fc6c25d051 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -28,6 +28,8 @@ #include <linux/uaccess.h> #include <asm/lowcore.h> #include <asm/switch_to.h> +#include <asm/vdso.h> +#include <asm/fpu/api.h> #include "compat_linux.h" #include "compat_ptrace.h" #include "entry.h" @@ -88,7 +90,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) _sigregs32 user_sregs; int i; - /* Alwys make any pending restarted system call return -EINTR */ + /* Always make any pending restarted system call return -EINTR */ current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) @@ -97,10 +99,6 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW32_MASK_RI)) return -EINVAL; - /* Test the 
floating-point-control word. */ - if (test_fp_ctl(user_sregs.fpregs.fpc)) - return -EINVAL; - /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) | (__u64)(user_sregs.regs.psw.mask & PSW32_MASK_USER) << 32 | @@ -136,9 +134,9 @@ static int save_sigregs_ext32(struct pt_regs *regs, return -EFAULT; /* Save vector registers to signal stack */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1); + vxrs[i] = current->thread.fpu.vxrs[i].low; if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, sizeof(sregs_ext->vxrs_low)) || __copy_to_user(&sregs_ext->vxrs_high, @@ -164,7 +162,7 @@ static int restore_sigregs_ext32(struct pt_regs *regs, *(__u32 *)®s->gprs[i] = gprs_high[i]; /* Restore vector registers from signal stack */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, sizeof(sregs_ext->vxrs_low)) || __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW, @@ -172,7 +170,7 @@ static int restore_sigregs_ext32(struct pt_regs *regs, sizeof(sregs_ext->vxrs_high))) return -EFAULT; for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i]; + current->thread.fpu.vxrs[i].low = vxrs[i]; } return 0; } @@ -264,7 +262,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set, * the machine supports it */ frame_size = sizeof(*frame) - sizeof(frame->sregs_ext.__reserved); - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) frame_size -= sizeof(frame->sregs_ext.vxrs_low) + sizeof(frame->sregs_ext.vxrs_high); frame = get_sigframe(&ksig->ka, regs, frame_size); @@ -303,11 +301,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set, restorer = (unsigned long __force) ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; } else { - /* Signal frames without vectors registers are short ! 
*/ - __u16 __user *svc = (void __user *) frame + frame_size - 2; - if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE; + restorer = VDSO32_SYMBOL(current, sigreturn); } /* Set up registers for signal handler */ @@ -351,11 +345,12 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, * the machine supports it */ uc_flags = UC_GPRS_HIGH; - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { uc_flags |= UC_VXRS; - } else + } else { frame_size -= sizeof(frame->uc.uc_mcontext_ext.vxrs_low) + sizeof(frame->uc.uc_mcontext_ext.vxrs_high); + } frame = get_sigframe(&ksig->ka, regs, frame_size); if (frame == (void __user *) -1UL) return -EFAULT; @@ -370,10 +365,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, restorer = (unsigned long __force) ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; } else { - __u16 __user *svc = &frame->svc_insn; - if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE; + restorer = VDSO32_SYMBOL(current, rt_sigreturn); } /* Create siginfo on the signal stack */ diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index af013b4244d3..b210a29d3ee9 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -16,41 +16,45 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/mm.h> +#include <linux/io.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/cpcmd.h> -#include <asm/io.h> static DEFINE_SPINLOCK(cpcmd_lock); static char cpcmd_buf[241]; static int diag8_noresponse(int cmdlen) { - register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf; - register unsigned long reg3 asm ("3") = cmdlen; - asm volatile( - " diag %1,%0,0x8\n" - : "+d" (reg3) : "d" (reg2) : "cc"); - return reg3; + " diag %[rx],%[ry],0x8\n" + : [ry] "+&d" (cmdlen) + : [rx] "d" (__pa(cpcmd_buf)) + : "cc"); + return cmdlen; } static int diag8_response(int cmdlen, char *response, int *rlen) { - register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf; - register unsigned long reg3 asm ("3") = (addr_t) response; - register unsigned long reg4 asm ("4") = cmdlen | 0x40000000L; - register unsigned long reg5 asm ("5") = *rlen; + union register_pair rx, ry; + int cc; + rx.even = __pa(cpcmd_buf); + rx.odd = __pa(response); + ry.even = cmdlen | 0x40000000L; + ry.odd = *rlen; asm volatile( - " diag %2,%0,0x8\n" - " brc 8,1f\n" - " agr %1,%4\n" - "1:\n" - : "+d" (reg4), "+d" (reg5) - : "d" (reg2), "d" (reg3), "d" (*rlen) : "cc"); - *rlen = reg5; - return reg4; + " diag %[rx],%[ry],0x8\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=&d" (cc), [ry] "+&d" (ry.pair) + : [rx] "d" (rx.pair) + : "cc"); + if (cc) + *rlen += ry.odd; + else + *rlen = ry.odd; + return ry.even; } /* diff --git a/arch/s390/kernel/cpufeature.c b/arch/s390/kernel/cpufeature.c new file mode 100644 index 000000000000..1b2ae42a0c15 --- /dev/null +++ b/arch/s390/kernel/cpufeature.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 
2022 + */ + +#include <linux/cpufeature.h> +#include <linux/bug.h> +#include <asm/elf.h> + +enum { + TYPE_HWCAP, + TYPE_FACILITY, +}; + +struct s390_cpu_feature { + unsigned int type : 4; + unsigned int num : 28; +}; + +static struct s390_cpu_feature s390_cpu_features[MAX_CPU_FEATURES] = { + [S390_CPU_FEATURE_MSA] = {.type = TYPE_HWCAP, .num = HWCAP_NR_MSA}, + [S390_CPU_FEATURE_VXRS] = {.type = TYPE_HWCAP, .num = HWCAP_NR_VXRS}, + [S390_CPU_FEATURE_UV] = {.type = TYPE_FACILITY, .num = 158}, +}; + +/* + * cpu_have_feature - Test CPU features on module initialization + */ +int cpu_have_feature(unsigned int num) +{ + struct s390_cpu_feature *feature; + + if (WARN_ON_ONCE(num >= MAX_CPU_FEATURES)) + return 0; + feature = &s390_cpu_features[num]; + switch (feature->type) { + case TYPE_HWCAP: + return !!(elf_hwcap & BIT(feature->num)); + case TYPE_FACILITY: + return test_facility(feature->num); + default: + WARN_ON_ONCE(1); + return 0; + } +} +EXPORT_SYMBOL(cpu_have_feature); diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index f96a5857bbfd..5c46c2659305 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -15,11 +15,14 @@ #include <linux/slab.h> #include <linux/memblock.h> #include <linux/elf.h> +#include <linux/uio.h> #include <asm/asm-offsets.h> #include <asm/os_info.h> #include <asm/elf.h> #include <asm/ipl.h> #include <asm/sclp.h> +#include <asm/maccess.h> +#include <asm/fpu/api.h> #define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y))) #define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) @@ -44,7 +47,7 @@ struct save_area { u64 fprs[16]; u32 fpc; u32 prefix; - u64 todpreg; + u32 todpreg; u64 timer; u64 todcmp; u64 vxrs_low[16]; @@ -60,9 +63,9 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu) { struct save_area *sa; - sa = (void *) memblock_phys_alloc(sizeof(*sa), 8); + sa = memblock_alloc(sizeof(*sa), 8); if (!sa) - panic("Failed to allocate save area\n"); + return NULL; if (is_boot_cpu) list_add(&sa->list, &dump_save_areas); @@ -108,126 +111,65 @@ void __init save_area_add_vxrs(struct save_area *sa, __vector128 *vxrs) /* Copy lower halves of vector registers 0-15 */ for (i = 0; i < 16; i++) - memcpy(&sa->vxrs_low[i], &vxrs[i].u[2], 8); + sa->vxrs_low[i] = vxrs[i].low; /* Copy vector registers 16-31 */ memcpy(sa->vxrs_high, vxrs + 16, 16 * sizeof(__vector128)); } -/* - * Return physical address for virtual address - */ -static inline void *load_real_addr(void *addr) -{ - unsigned long real_addr; - - asm volatile( - " lra %0,0(%1)\n" - " jz 0f\n" - " la %0,0\n" - "0:" - : "=a" (real_addr) : "a" (addr) : "cc"); - return (void *)real_addr; -} - -/* - * Copy memory of the old, dumped system to a kernel space virtual address - */ -int copy_oldmem_kernel(void *dst, void *src, size_t count) +static size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count) { - unsigned long from, len; - void *ra; - int rc; + size_t len, copied, res = 0; while (count) { - from = __pa(src); - if (!OLDMEM_BASE && from < sclp.hsa_size) { - /* Copy from zfcpdump HSA area */ - len = min(count, sclp.hsa_size - from); - rc = memcpy_hsa_kernel(dst, from, len); - if (rc) - return rc; + if (!oldmem_data.start && src < sclp.hsa_size) { + /* Copy from zfcp/nvme dump HSA area */ + len = min(count, sclp.hsa_size - src); + copied = memcpy_hsa_iter(iter, src, len); } else { /* Check for swapped kdump oldmem areas */ - if (OLDMEM_BASE && from - OLDMEM_BASE < OLDMEM_SIZE) { - from -= OLDMEM_BASE; - len = min(count, 
OLDMEM_SIZE - from); - } else if (OLDMEM_BASE && from < OLDMEM_SIZE) { - len = min(count, OLDMEM_SIZE - from); - from += OLDMEM_BASE; + if (oldmem_data.start && src - oldmem_data.start < oldmem_data.size) { + src -= oldmem_data.start; + len = min(count, oldmem_data.size - src); + } else if (oldmem_data.start && src < oldmem_data.size) { + len = min(count, oldmem_data.size - src); + src += oldmem_data.start; } else { len = count; } - if (is_vmalloc_or_module_addr(dst)) { - ra = load_real_addr(dst); - len = min(PAGE_SIZE - offset_in_page(ra), len); - } else { - ra = dst; - } - if (memcpy_real(ra, (void *) from, len)) - return -EFAULT; + copied = memcpy_real_iter(iter, src, len); } - dst += len; - src += len; - count -= len; + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; } - return 0; + return res; } -/* - * Copy memory of the old, dumped system to a user space virtual address - */ -static int copy_oldmem_user(void __user *dst, void *src, size_t count) +int copy_oldmem_kernel(void *dst, unsigned long src, size_t count) { - unsigned long from, len; - int rc; + struct iov_iter iter; + struct kvec kvec; - while (count) { - from = __pa(src); - if (!OLDMEM_BASE && from < sclp.hsa_size) { - /* Copy from zfcpdump HSA area */ - len = min(count, sclp.hsa_size - from); - rc = memcpy_hsa_user(dst, from, len); - if (rc) - return rc; - } else { - /* Check for swapped kdump oldmem areas */ - if (OLDMEM_BASE && from - OLDMEM_BASE < OLDMEM_SIZE) { - from -= OLDMEM_BASE; - len = min(count, OLDMEM_SIZE - from); - } else if (OLDMEM_BASE && from < OLDMEM_SIZE) { - len = min(count, OLDMEM_SIZE - from); - from += OLDMEM_BASE; - } else { - len = count; - } - rc = copy_to_user_real(dst, (void *) from, count); - if (rc) - return rc; - } - dst += len; - src += len; - count -= len; - } + kvec.iov_base = dst; + kvec.iov_len = count; + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + if (copy_oldmem_iter(&iter, src, count) < count) + return -EFAULT; return 0; } /* * Copy one page from "oldmem" */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { - void *src; - int rc; + unsigned long src; - if (!csize) - return 0; - src = (void *) (pfn << PAGE_SHIFT) + offset; - if (userbuf) - rc = copy_oldmem_user((void __force __user *) buf, src, csize); - else - rc = copy_oldmem_kernel((void *) buf, src, csize); - return rc; + src = pfn_to_phys(pfn) + offset; + return copy_oldmem_iter(iter, src, csize); } /* @@ -243,10 +185,10 @@ static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma, unsigned long size_old; int rc; - if (pfn < OLDMEM_SIZE >> PAGE_SHIFT) { - size_old = min(size, OLDMEM_SIZE - (pfn << PAGE_SHIFT)); + if (pfn < oldmem_data.size >> PAGE_SHIFT) { + size_old = min(size, oldmem_data.size - (pfn << PAGE_SHIFT)); rc = remap_pfn_range(vma, from, - pfn + (OLDMEM_BASE >> PAGE_SHIFT), + pfn + (oldmem_data.start >> PAGE_SHIFT), size_old, prot); if (rc || size == size_old) return rc; @@ -258,7 +200,7 @@ static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma, } /* - * Remap "oldmem" for zfcpdump + * Remap "oldmem" for zfcp/nvme dump * * We only map available memory above HSA size. Memory below HSA size * is read on demand using the copy_oldmem_page() function. 
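The chunk loop in copy_oldmem_iter() above decides, for every piece of the old kernel's memory, whether to read from the HSA (zfcp/nvme dump) or from real memory, undoing the kdump region swap on the way. The following is a hedged standalone sketch of just that routing step; oldmem_base, oldmem_size and hsa_size stand in for oldmem_data.start, oldmem_data.size and sclp.hsa_size, and the helper is an illustration, not the kernel implementation:

#include <stddef.h>

enum oldmem_backend { COPY_FROM_HSA, COPY_FROM_REAL };

struct oldmem_route {
	enum oldmem_backend backend; /* which copy routine serves this chunk */
	unsigned long addr;          /* translated source address */
	size_t len;                  /* largest contiguous copy possible */
};

static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

static struct oldmem_route route_oldmem(unsigned long src, size_t count,
					unsigned long oldmem_base,
					unsigned long oldmem_size,
					unsigned long hsa_size)
{
	struct oldmem_route r = { COPY_FROM_REAL, src, count };

	if (!oldmem_base && src < hsa_size) {
		/* zfcp/nvme dump: low storage was saved into the HSA */
		r.backend = COPY_FROM_HSA;
		r.len = min_sz(count, hsa_size - src);
	} else if (oldmem_base && src - oldmem_base < oldmem_size) {
		/* kdump swap: oldmem address inside the crashkernel area */
		r.addr = src - oldmem_base;
		r.len = min_sz(count, oldmem_size - r.addr);
	} else if (oldmem_base && src < oldmem_size) {
		/* kdump swap: oldmem address inside the swapped low area */
		r.addr = src + oldmem_base;
		r.len = min_sz(count, oldmem_size - src);
	}
	return r;
}

The caller then copies r.len bytes from r.addr via the selected backend and advances src, exactly as the loop in copy_oldmem_iter() does with memcpy_hsa_iter() and memcpy_real_iter().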
@@ -283,12 +225,12 @@ static int remap_oldmem_pfn_range_zfcpdump(struct vm_area_struct *vma, } /* - * Remap "oldmem" for kdump or zfcpdump + * Remap "oldmem" for kdump or zfcp/nvme dump */ int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { - if (OLDMEM_BASE) + if (oldmem_data.start) return remap_oldmem_pfn_range_kdump(vma, from, pfn, size, prot); else return remap_oldmem_pfn_range_zfcpdump(vma, from, pfn, size, @@ -365,7 +307,7 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa) memcpy(&nt_prstatus.pr_reg.gprs, sa->gprs, sizeof(sa->gprs)); memcpy(&nt_prstatus.pr_reg.psw, sa->psw, sizeof(sa->psw)); memcpy(&nt_prstatus.pr_reg.acrs, sa->acrs, sizeof(sa->acrs)); - nt_prstatus.pr_pid = cpu; + nt_prstatus.common.pr_pid = cpu; /* Prepare fpregset (floating point) note */ memset(&nt_fpregset, 0, sizeof(nt_fpregset)); memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc)); @@ -378,7 +320,7 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa) ptr = nt_init(ptr, NT_S390_TODPREG, &sa->todpreg, sizeof(sa->todpreg)); ptr = nt_init(ptr, NT_S390_CTRS, &sa->ctrs, sizeof(sa->ctrs)); ptr = nt_init(ptr, NT_S390_PREFIX, &sa->prefix, sizeof(sa->prefix)); - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { ptr = nt_init(ptr, NT_S390_VXRS_HIGH, &sa->vxrs_high, sizeof(sa->vxrs_high)); ptr = nt_init(ptr, NT_S390_VXRS_LOW, @@ -402,7 +344,7 @@ static size_t get_cpu_elf_notes_size(void) size += nt_size(NT_S390_TODPREG, sizeof(sa->todpreg)); size += nt_size(NT_S390_CTRS, sizeof(sa->ctrs)); size += nt_size(NT_S390_PREFIX, sizeof(sa->prefix)); - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { size += nt_size(NT_S390_VXRS_HIGH, sizeof(sa->vxrs_high)); size += nt_size(NT_S390_VXRS_LOW, sizeof(sa->vxrs_low)); } @@ -429,10 +371,10 @@ static void *nt_prpsinfo(void *ptr) static void *get_vmcoreinfo_old(unsigned long *size) { char nt_name[11], *vmcoreinfo; + unsigned long addr; Elf64_Nhdr note; - void *addr; - if (copy_oldmem_kernel(&addr, &S390_lowcore.vmcore_info, sizeof(addr))) + if (copy_oldmem_kernel(&addr, __LC_VMCORE_INFO, sizeof(addr))) return NULL; memset(nt_name, 0, sizeof(nt_name)); if (copy_oldmem_kernel(¬e, addr, sizeof(note))) @@ -549,8 +491,7 @@ static int get_mem_chunk_cnt(void) int cnt = 0; u64 idx; - for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE, - MEMBLOCK_NONE, NULL, NULL, NULL) + for_each_physmem_range(idx, &oldmem_type, NULL, NULL) cnt++; return cnt; } @@ -558,17 +499,16 @@ static int get_mem_chunk_cnt(void) /* * Initialize ELF loads (new kernel) */ -static void loads_init(Elf64_Phdr *phdr, u64 loads_offset) +static void loads_init(Elf64_Phdr *phdr) { phys_addr_t start, end; u64 idx; - for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE, - MEMBLOCK_NONE, &start, &end, NULL) { + for_each_physmem_range(idx, &oldmem_type, &start, &end) { phdr->p_filesz = end - start; phdr->p_type = PT_LOAD; phdr->p_offset = start; - phdr->p_vaddr = start; + phdr->p_vaddr = (unsigned long)__va(start); phdr->p_paddr = start; phdr->p_memsz = end - start; phdr->p_flags = PF_R | PF_W | PF_X; @@ -629,23 +569,23 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) { Elf64_Phdr *phdr_notes, *phdr_loads; + size_t alloc_size; int mem_chunk_cnt; void *ptr, *hdr; - u32 alloc_size; u64 hdr_off; - /* If we are not in kdump or zfcpdump mode return */ - if (!OLDMEM_BASE && ipl_info.type != IPL_TYPE_FCP_DUMP) + 
/* If we are not in kdump or zfcp/nvme dump mode return */ + if (!oldmem_data.start && !is_ipl_type_dump()) return 0; - /* If we cannot get HSA size for zfcpdump return error */ - if (ipl_info.type == IPL_TYPE_FCP_DUMP && !sclp.hsa_size) + /* If we cannot get HSA size for zfcp/nvme dump return error */ + if (is_ipl_type_dump() && !sclp.hsa_size) return -ENODEV; /* For kdump, exclude previous crashkernel memory */ - if (OLDMEM_BASE) { - oldmem_region.base = OLDMEM_BASE; - oldmem_region.size = OLDMEM_SIZE; - oldmem_type.total_size = OLDMEM_SIZE; + if (oldmem_data.start) { + oldmem_region.base = oldmem_data.start; + oldmem_region.size = oldmem_data.size; + oldmem_type.total_size = oldmem_data.size; } mem_chunk_cnt = get_mem_chunk_cnt(); @@ -673,7 +613,7 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); /* Init loads */ hdr_off = PTR_DIFF(ptr, hdr); - loads_init(phdr_loads, hdr_off); + loads_init(phdr_loads); *addr = (unsigned long long) hdr; *size = (unsigned long long) hdr_off; BUG_ON(elfcorehdr_size > alloc_size); diff --git a/arch/s390/kernel/ctlreg.c b/arch/s390/kernel/ctlreg.c new file mode 100644 index 000000000000..8cc26cf2c64a --- /dev/null +++ b/arch/s390/kernel/ctlreg.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 1999, 2023 + */ + +#include <linux/irqflags.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/cache.h> +#include <asm/abs_lowcore.h> +#include <asm/ctlreg.h> + +/* + * ctl_lock guards access to global control register contents which + * are kept in the control register save area within absolute lowcore + * at physical address zero. + */ +static DEFINE_SPINLOCK(system_ctl_lock); + +void system_ctlreg_lock(void) + __acquires(&system_ctl_lock) +{ + spin_lock(&system_ctl_lock); +} + +void system_ctlreg_unlock(void) + __releases(&system_ctl_lock) +{ + spin_unlock(&system_ctl_lock); +} + +static bool system_ctlreg_area_init __ro_after_init; + +void __init system_ctlreg_init_save_area(struct lowcore *lc) +{ + struct lowcore *abs_lc; + + abs_lc = get_abs_lowcore(); + __local_ctl_store(0, 15, lc->cregs_save_area); + __local_ctl_store(0, 15, abs_lc->cregs_save_area); + put_abs_lowcore(abs_lc); + system_ctlreg_area_init = true; +} + +struct ctlreg_parms { + unsigned long andval; + unsigned long orval; + unsigned long val; + int request; + int cr; +}; + +static void ctlreg_callback(void *info) +{ + struct ctlreg_parms *pp = info; + struct ctlreg regs[16]; + + __local_ctl_store(0, 15, regs); + if (pp->request == CTLREG_LOAD) { + regs[pp->cr].val = pp->val; + } else { + regs[pp->cr].val &= pp->andval; + regs[pp->cr].val |= pp->orval; + } + __local_ctl_load(0, 15, regs); +} + +static void system_ctlreg_update(void *info) +{ + unsigned long flags; + + if (system_state == SYSTEM_BOOTING) { + /* + * For very early calls do not call on_each_cpu() + * since not everything might be setup. 
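system_ctlreg_update() above picks between a local, IRQs-off update during early boot and an on_each_cpu() broadcast once SMP is up; the per-CPU callback itself reduces to a small read-modify-write. A standalone sketch of that rule, with the request constants redeclared here for illustration (the kernel defines them in asm/ctlreg.h):

enum { CTLREG_SET_BIT, CTLREG_CLEAR_BIT, CTLREG_LOAD };

/*
 * Apply one update request to a control register value, as
 * ctlreg_callback() does: either a full load, or a masked AND
 * followed by OR. system_ctlreg_modify() encodes set/clear into
 * andval/orval (set: orval = bit, andval = -1UL; clear: orval = 0,
 * andval = ~bit), so one callback serves all three requests.
 */
static unsigned long ctlreg_apply(unsigned long val, int request,
				  unsigned long andval, unsigned long orval,
				  unsigned long loadval)
{
	if (request == CTLREG_LOAD)
		return loadval;
	return (val & andval) | orval;
}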
+ */ + local_irq_save(flags); + ctlreg_callback(info); + local_irq_restore(flags); + } else { + on_each_cpu(ctlreg_callback, info, 1); + } +} + +void system_ctlreg_modify(unsigned int cr, unsigned long data, int request) +{ + struct ctlreg_parms pp = { .cr = cr, .request = request, }; + struct lowcore *abs_lc; + + switch (request) { + case CTLREG_SET_BIT: + pp.orval = 1UL << data; + pp.andval = -1UL; + break; + case CTLREG_CLEAR_BIT: + pp.orval = 0; + pp.andval = ~(1UL << data); + break; + case CTLREG_LOAD: + pp.val = data; + break; + } + if (system_ctlreg_area_init) { + system_ctlreg_lock(); + abs_lc = get_abs_lowcore(); + if (request == CTLREG_LOAD) { + abs_lc->cregs_save_area[cr].val = pp.val; + } else { + abs_lc->cregs_save_area[cr].val &= pp.andval; + abs_lc->cregs_save_area[cr].val |= pp.orval; + } + put_abs_lowcore(abs_lc); + system_ctlreg_update(&pp); + system_ctlreg_unlock(); + } else { + system_ctlreg_update(&pp); + } +} +EXPORT_SYMBOL(system_ctlreg_modify); diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 6d321f5f101d..85328a0ef3b6 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -2,7 +2,7 @@ /* * S/390 debug facility * - * Copyright IBM Corp. 1999, 2012 + * Copyright IBM Corp. 1999, 2020 * * Author(s): Michael Holzheu (holzheu@de.ibm.com), * Holger Smolinski (Holger.Smolinski@de.ibm.com) @@ -24,6 +24,7 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/minmax.h> #include <linux/debugfs.h> #include <asm/debug.h> @@ -59,7 +60,7 @@ typedef struct { * except of floats, and long long (32 bit) * */ - long args[0]; + long args[]; } debug_sprintf_entry_t; /* internal function prototyes */ @@ -90,27 +91,13 @@ static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view, size_t user_buf_size, loff_t *offset); static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, char *out_buf, const char *in_buf); -static int debug_raw_format_fn(debug_info_t *id, - struct debug_view *view, char *out_buf, - const char *in_buf); -static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view, - int area, debug_entry_t *entry, char *out_buf); - static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, debug_sprintf_entry_t *curr_event); + char *out_buf, const char *inbuf); +static void debug_areas_swap(debug_info_t *a, debug_info_t *b); +static void debug_events_append(debug_info_t *dest, debug_info_t *src); /* globals */ -struct debug_view debug_raw_view = { - "raw", - NULL, - &debug_raw_header_fn, - &debug_raw_format_fn, - NULL, - NULL -}; -EXPORT_SYMBOL(debug_raw_view); - struct debug_view debug_hex_ascii_view = { "hex_ascii", NULL, @@ -152,7 +139,7 @@ struct debug_view debug_sprintf_view = { "sprintf", NULL, &debug_dflt_header_fn, - (debug_format_proc_t *)&debug_sprintf_format_fn, + &debug_sprintf_format_fn, NULL, NULL }; @@ -198,9 +185,10 @@ static debug_entry_t ***debug_areas_alloc(int pages_per_area, int nr_areas) if (!areas) goto fail_malloc_areas; for (i = 0; i < nr_areas; i++) { + /* GFP_NOWARN to avoid user triggerable WARN, we handle fails */ areas[i] = kmalloc_array(pages_per_area, sizeof(debug_entry_t *), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (!areas[i]) goto fail_malloc_areas2; for (j = 0; j < pages_per_area; j++) { @@ -262,7 +250,7 @@ static debug_info_t *debug_info_alloc(const char *name, int pages_per_area, rc->level = level; rc->buf_size = buf_size; rc->entry_size = sizeof(debug_entry_t) + buf_size; - 
strlcpy(rc->name, name, sizeof(rc->name)); + strscpy(rc->name, name, sizeof(rc->name)); memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *)); memset(rc->debugfs_entries, 0, DEBUG_MAX_VIEWS * sizeof(struct dentry *)); refcount_set(&(rc->ref_count), 0); @@ -326,24 +314,6 @@ static debug_info_t *debug_info_create(const char *name, int pages_per_area, goto out; rc->mode = mode & ~S_IFMT; - - /* create root directory */ - rc->debugfs_root_entry = debugfs_create_dir(rc->name, - debug_debugfs_root_entry); - - /* append new element to linked list */ - if (!debug_area_first) { - /* first element in list */ - debug_area_first = rc; - rc->prev = NULL; - } else { - /* append element to end of list */ - debug_area_last->next = rc; - rc->prev = debug_area_last; - } - debug_area_last = rc; - rc->next = NULL; - refcount_set(&rc->ref_count, 1); out: return rc; @@ -403,27 +373,10 @@ static void debug_info_get(debug_info_t *db_info) */ static void debug_info_put(debug_info_t *db_info) { - int i; - if (!db_info) return; - if (refcount_dec_and_test(&db_info->ref_count)) { - for (i = 0; i < DEBUG_MAX_VIEWS; i++) { - if (!db_info->views[i]) - continue; - debugfs_remove(db_info->debugfs_entries[i]); - } - debugfs_remove(db_info->debugfs_root_entry); - if (db_info == debug_area_first) - debug_area_first = db_info->next; - if (db_info == debug_area_last) - debug_area_last = db_info->prev; - if (db_info->prev) - db_info->prev->next = db_info->next; - if (db_info->next) - db_info->next->prev = db_info->prev; + if (refcount_dec_and_test(&db_info->ref_count)) debug_info_free(db_info); - } } /* @@ -448,7 +401,7 @@ static int debug_format_entry(file_private_info_t *p_info) act_entry = (debug_entry_t *) ((char *)id_snap->areas[p_info->act_area] [p_info->act_page] + p_info->act_entry); - if (act_entry->id.stck == 0LL) + if (act_entry->clock == 0LL) goto out; /* empty entry */ if (view->header_proc) len += view->header_proc(id_snap, view, p_info->act_area, @@ -647,6 +600,31 @@ static int debug_close(struct inode *inode, struct file *file) return 0; /* success */ } +/* Create debugfs entries and add to internal list. */ +static void _debug_register(debug_info_t *id) +{ + /* create root directory */ + id->debugfs_root_entry = debugfs_create_dir(id->name, + debug_debugfs_root_entry); + + /* append new element to linked list */ + if (!debug_area_first) { + /* first element in list */ + debug_area_first = id; + id->prev = NULL; + } else { + /* append element to end of list */ + debug_area_last->next = id; + id->prev = debug_area_last; + } + debug_area_last = id; + id->next = NULL; + + debug_register_view(id, &debug_level_view); + debug_register_view(id, &debug_flush_view); + debug_register_view(id, &debug_pages_view); +} + /** * debug_register_mode() - creates and initializes debug area. 
* @@ -676,19 +654,16 @@ debug_info_t *debug_register_mode(const char *name, int pages_per_area, if ((uid != 0) || (gid != 0)) pr_warn("Root becomes the owner of all s390dbf files in sysfs\n"); BUG_ON(!initialized); - mutex_lock(&debug_mutex); /* create new debug_info */ rc = debug_info_create(name, pages_per_area, nr_areas, buf_size, mode); - if (!rc) - goto out; - debug_register_view(rc, &debug_level_view); - debug_register_view(rc, &debug_flush_view); - debug_register_view(rc, &debug_pages_view); -out: - if (!rc) + if (rc) { + mutex_lock(&debug_mutex); + _debug_register(rc); + mutex_unlock(&debug_mutex); + } else { pr_err("Registering debug feature %s failed\n", name); - mutex_unlock(&debug_mutex); + } return rc; } EXPORT_SYMBOL(debug_register_mode); @@ -718,6 +693,82 @@ debug_info_t *debug_register(const char *name, int pages_per_area, EXPORT_SYMBOL(debug_register); /** + * debug_register_static() - registers a static debug area + * + * @id: Handle for static debug area + * @pages_per_area: Number of pages per area + * @nr_areas: Number of debug areas + * + * Register debug_info_t defined using DEFINE_STATIC_DEBUG_INFO. + * + * Note: This function is called automatically via an initcall generated by + * DEFINE_STATIC_DEBUG_INFO. + */ +void debug_register_static(debug_info_t *id, int pages_per_area, int nr_areas) +{ + unsigned long flags; + debug_info_t *copy; + + if (!initialized) { + pr_err("Tried to register debug feature %s too early\n", + id->name); + return; + } + + copy = debug_info_alloc("", pages_per_area, nr_areas, id->buf_size, + id->level, ALL_AREAS); + if (!copy) { + pr_err("Registering debug feature %s failed\n", id->name); + + /* Clear pointers to prevent tracing into released initdata. */ + spin_lock_irqsave(&id->lock, flags); + id->areas = NULL; + id->active_pages = NULL; + id->active_entries = NULL; + spin_unlock_irqrestore(&id->lock, flags); + + return; + } + + /* Replace static trace area with dynamic copy. */ + spin_lock_irqsave(&id->lock, flags); + debug_events_append(copy, id); + debug_areas_swap(id, copy); + spin_unlock_irqrestore(&id->lock, flags); + + /* Clear pointers to initdata and discard copy. */ + copy->areas = NULL; + copy->active_pages = NULL; + copy->active_entries = NULL; + debug_info_free(copy); + + mutex_lock(&debug_mutex); + _debug_register(id); + mutex_unlock(&debug_mutex); +} + +/* Remove debugfs entries and remove from internal list. */ +static void _debug_unregister(debug_info_t *id) +{ + int i; + + for (i = 0; i < DEBUG_MAX_VIEWS; i++) { + if (!id->views[i]) + continue; + debugfs_remove(id->debugfs_entries[i]); + } + debugfs_remove(id->debugfs_root_entry); + if (id == debug_area_first) + debug_area_first = id->next; + if (id == debug_area_last) + debug_area_last = id->prev; + if (id->prev) + id->prev->next = id->next; + if (id->next) + id->next->prev = id->prev; +} + +/** * debug_unregister() - give back debug area. 
* * @id: handle for debug log @@ -730,8 +781,10 @@ void debug_unregister(debug_info_t *id) if (!id) return; mutex_lock(&debug_mutex); - debug_info_put(id); + _debug_unregister(id); mutex_unlock(&debug_mutex); + + debug_info_put(id); } EXPORT_SYMBOL(debug_unregister); @@ -741,35 +794,28 @@ EXPORT_SYMBOL(debug_unregister); */ static int debug_set_size(debug_info_t *id, int nr_areas, int pages_per_area) { - debug_entry_t ***new_areas; + debug_info_t *new_id; unsigned long flags; - int rc = 0; if (!id || (nr_areas <= 0) || (pages_per_area < 0)) return -EINVAL; - if (pages_per_area > 0) { - new_areas = debug_areas_alloc(pages_per_area, nr_areas); - if (!new_areas) { - pr_info("Allocating memory for %i pages failed\n", - pages_per_area); - rc = -ENOMEM; - goto out; - } - } else { - new_areas = NULL; + + new_id = debug_info_alloc("", pages_per_area, nr_areas, id->buf_size, + id->level, ALL_AREAS); + if (!new_id) { + pr_info("Allocating memory for %i pages failed\n", + pages_per_area); + return -ENOMEM; } + spin_lock_irqsave(&id->lock, flags); - debug_areas_free(id); - id->areas = new_areas; - id->nr_areas = nr_areas; - id->pages_per_area = pages_per_area; - id->active_area = 0; - memset(id->active_entries, 0, sizeof(int)*id->nr_areas); - memset(id->active_pages, 0, sizeof(int)*id->nr_areas); + debug_events_append(new_id, id); + debug_areas_swap(new_id, id); + debug_info_free(new_id); spin_unlock_irqrestore(&id->lock, flags); pr_info("%s: set new size (%i pages)\n", id->name, pages_per_area); -out: - return rc; + + return 0; } /** @@ -787,16 +833,17 @@ void debug_set_level(debug_info_t *id, int new_level) if (!id) return; - spin_lock_irqsave(&id->lock, flags); + if (new_level == DEBUG_OFF_LEVEL) { - id->level = DEBUG_OFF_LEVEL; pr_info("%s: switched off\n", id->name); } else if ((new_level > DEBUG_MAX_LEVEL) || (new_level < 0)) { pr_info("%s: level %i is out of range (%i - %i)\n", id->name, new_level, 0, DEBUG_MAX_LEVEL); - } else { - id->level = new_level; + return; } + + spin_lock_irqsave(&id->lock, flags); + id->level = new_level; spin_unlock_irqrestore(&id->lock, flags); } EXPORT_SYMBOL(debug_set_level); @@ -836,6 +883,42 @@ static inline debug_entry_t *get_active_entry(debug_info_t *id) id->active_entries[id->active_area]); } +/* Swap debug areas of a and b. */ +static void debug_areas_swap(debug_info_t *a, debug_info_t *b) +{ + swap(a->nr_areas, b->nr_areas); + swap(a->pages_per_area, b->pages_per_area); + swap(a->areas, b->areas); + swap(a->active_area, b->active_area); + swap(a->active_pages, b->active_pages); + swap(a->active_entries, b->active_entries); +} + +/* Append all debug events in active area from source to destination log. */ +static void debug_events_append(debug_info_t *dest, debug_info_t *src) +{ + debug_entry_t *from, *to, *last; + + if (!src->areas || !dest->areas) + return; + + /* Loop over all entries in src, starting with oldest. */ + from = get_active_entry(src); + last = from; + do { + if (from->clock != 0LL) { + to = get_active_entry(dest); + memset(to, 0, dest->entry_size); + memcpy(to, from, min(src->entry_size, + dest->entry_size)); + proceed_active_entry(dest); + } + + proceed_active_entry(src); + from = get_active_entry(src); + } while (from != last); +} + /* * debug_finish_entry: * - set timestamp, caller address, cpu number etc. 
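debug_events_append() above walks the source ring starting at the current write slot (the oldest entry) and copies every used entry until the walk wraps back around. A self-contained sketch of the same walk over a simplified single-area ring of fixed-size entries; struct ring and its helpers are illustrative stand-ins for debug_info_t with get_active_entry()/proceed_active_entry(), and the multi-area/multi-page layout is elided:

#include <stddef.h>
#include <string.h>

struct ring {
	unsigned char *buf;	/* nr * entry_size bytes */
	int nr, pos;		/* slot count and current write slot */
	size_t entry_size;
};

static unsigned char *ring_entry(struct ring *r)
{
	return r->buf + (size_t)r->pos * r->entry_size;
}

static void ring_next(struct ring *r)
{
	r->pos = (r->pos + 1) % r->nr;
}

static void ring_append(struct ring *dst, struct ring *src)
{
	int last = src->pos;	/* oldest slot: stop when we return here */

	do {
		unsigned char *from = ring_entry(src);

		if (from[0]) {	/* used slot; the kernel tests entry->clock != 0 */
			size_t n = src->entry_size < dst->entry_size ?
				   src->entry_size : dst->entry_size;

			/* zero first so a shorter source leaves no stale tail */
			memset(ring_entry(dst), 0, dst->entry_size);
			memcpy(ring_entry(dst), from, n);
			ring_next(dst);
		}
		ring_next(src);
	} while (src->pos != last);
}

As in the kernel code, copying min(src, dst) entry sizes lets the destination ring have a different buffer size than the source.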
@@ -844,12 +927,17 @@ static inline debug_entry_t *get_active_entry(debug_info_t *id) static inline void debug_finish_entry(debug_info_t *id, debug_entry_t *active, int level, int exception) { - active->id.stck = get_tod_clock_fast() - - *(unsigned long long *) &tod_clock_base[1]; - active->id.fields.cpuid = smp_processor_id(); + unsigned long timestamp; + union tod_clock clk; + + store_tod_clock_ext(&clk); + timestamp = clk.us; + timestamp -= TOD_UNIX_EPOCH >> 12; + active->clock = timestamp; + active->cpu = smp_processor_id(); active->caller = __builtin_return_address(0); - active->id.fields.exception = exception; - active->id.fields.level = level; + active->exception = exception; + active->level = level; proceed_active_entry(id); if (exception) proceed_active_area(id); @@ -867,7 +955,7 @@ static int debug_active = 1; * if debug_active is already off */ static int s390dbf_procactive(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!write || debug_stoppable || !debug_active) return proc_dointvec(table, write, buffer, lenp, ppos); @@ -890,17 +978,6 @@ static struct ctl_table s390dbf_table[] = { .mode = S_IRUGO | S_IWUSR, .proc_handler = s390dbf_procactive, }, - { } -}; - -static struct ctl_table s390dbf_dir_table[] = { - { - .procname = "s390dbf", - .maxlen = 0, - .mode = S_IRUGO | S_IXUGO, - .child = s390dbf_table, - }, - { } }; static struct ctl_table_header *s390dbf_sysctl_header; @@ -1121,16 +1198,17 @@ int debug_register_view(debug_info_t *id, struct debug_view *view) break; } if (i == DEBUG_MAX_VIEWS) { - pr_err("Registering view %s/%s would exceed the maximum " - "number of views %i\n", id->name, view->name, i); rc = -1; } else { id->views[i] = view; id->debugfs_entries[i] = pde; } spin_unlock_irqrestore(&id->lock, flags); - if (rc) + if (rc) { + pr_err("Registering view %s/%s would exceed the maximum " + "number of views %i\n", id->name, view->name, i); debugfs_remove(pde); + } out: return rc; } @@ -1385,32 +1463,6 @@ out: } /* - * prints debug header in raw format - */ -static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view, - int area, debug_entry_t *entry, char *out_buf) -{ - int rc; - - rc = sizeof(debug_entry_t); - memcpy(out_buf, entry, sizeof(debug_entry_t)); - return rc; -} - -/* - * prints debug data in raw format - */ -static int debug_raw_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, const char *in_buf) -{ - int rc; - - rc = id->buf_size; - memcpy(out_buf, in_buf, id->buf_size); - return rc; -} - -/* * prints debug data in hex/ascii format */ static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, @@ -1439,25 +1491,24 @@ static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, int area, debug_entry_t *entry, char *out_buf) { - unsigned long base, sec, usec; + unsigned long sec, usec; unsigned long caller; unsigned int level; char *except_str; int rc = 0; - level = entry->id.fields.level; - base = (*(unsigned long *) &tod_clock_base[0]) >> 4; - sec = (entry->id.stck >> 12) + base - (TOD_UNIX_EPOCH >> 12); + level = entry->level; + sec = entry->clock; usec = do_div(sec, USEC_PER_SEC); - if (entry->id.fields.exception) + if (entry->exception) except_str = "*"; else except_str = "-"; caller = (unsigned long) entry->caller; - rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %02i %pK ", + rc += sprintf(out_buf, "%02i %011ld:%06lu %1u 
%1s %04u %px ", area, sec, usec, level, except_str, - entry->id.fields.cpuid, (void *)caller); + entry->cpu, (void *)caller); return rc; } EXPORT_SYMBOL(debug_dflt_header_fn); @@ -1470,8 +1521,9 @@ EXPORT_SYMBOL(debug_dflt_header_fn); #define DEBUG_SPRINTF_MAX_ARGS 10 static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, debug_sprintf_entry_t *curr_event) + char *out_buf, const char *inbuf) { + debug_sprintf_entry_t *curr_event = (debug_sprintf_entry_t *)inbuf; int num_longs, num_used_args = 0, i, rc = 0; int index[DEBUG_SPRINTF_MAX_ARGS]; @@ -1511,7 +1563,7 @@ out: */ static int __init debug_init(void) { - s390dbf_sysctl_header = register_sysctl_table(s390dbf_dir_table); + s390dbf_sysctl_header = register_sysctl("s390dbf", s390dbf_table); mutex_lock(&debug_mutex); debug_debugfs_root_entry = debugfs_create_dir(DEBUG_DIR_ROOT, NULL); initialized = 1; diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index e9dac9a24d3f..92fdc35f028c 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -11,9 +11,12 @@ #include <linux/cpu.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/vmalloc.h> +#include <asm/asm-extable.h> #include <asm/diag.h> #include <asm/trace/diag.h> #include <asm/sections.h> +#include "entry.h" struct diag_stat { unsigned int counter[NR_DIAG_STAT]; @@ -33,6 +36,7 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X014] = { .code = 0x014, .name = "Spool File Services" }, [DIAG_STAT_X044] = { .code = 0x044, .name = "Voluntary Timeslice End" }, [DIAG_STAT_X064] = { .code = 0x064, .name = "NSS Manipulation" }, + [DIAG_STAT_X08C] = { .code = 0x08c, .name = "Access 3270 Display Device Information" }, [DIAG_STAT_X09C] = { .code = 0x09c, .name = "Relinquish Timeslice" }, [DIAG_STAT_X0DC] = { .code = 0x0dc, .name = "Appldata Control" }, [DIAG_STAT_X204] = { .code = 0x204, .name = "Logical-CPU Utilization" }, @@ -47,11 +51,24 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X304] = { .code = 0x304, .name = "Partition-Resource Service" }, [DIAG_STAT_X308] = { .code = 0x308, .name = "List-Directed IPL" }, [DIAG_STAT_X318] = { .code = 0x318, .name = "CP Name and Version Codes" }, + [DIAG_STAT_X320] = { .code = 0x320, .name = "Certificate Store" }, [DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" }, }; -struct diag_ops __bootdata_preserved(diag_dma_ops); -struct diag210 *__bootdata_preserved(__diag210_tmp_dma); +struct diag_ops __amode31_ref diag_amode31_ops = { + .diag210 = _diag210_amode31, + .diag26c = _diag26c_amode31, + .diag14 = _diag14_amode31, + .diag0c = _diag0c_amode31, + .diag8c = _diag8c_amode31, + .diag308_reset = _diag308_reset_amode31 +}; + +static struct diag210 _diag210_tmp_amode31 __section(".amode31.data"); +struct diag210 __amode31_ref *__diag210_tmp_amode31 = &_diag210_tmp_amode31; + +static struct diag8c _diag8c_tmp_amode31 __section(".amode31.data"); +static struct diag8c __amode31_ref *__diag8c_tmp_amode31 = &_diag8c_tmp_amode31; static int show_diag_stat(struct seq_file *m, void *v) { @@ -59,7 +76,7 @@ static int show_diag_stat(struct seq_file *m, void *v) unsigned long n = (unsigned long) v - 1; int cpu, prec, tmp; - get_online_cpus(); + cpus_read_lock(); if (n == 0) { seq_puts(m, " "); @@ -78,13 +95,13 @@ static int show_diag_stat(struct seq_file *m, void *v) } seq_printf(m, " %s\n", diag_map[n-1].name); } - put_online_cpus(); + cpus_read_unlock(); return 0; } static void *show_diag_stat_start(struct seq_file *m, loff_t 
*pos) { - return *pos <= nr_cpu_ids ? (void *)((unsigned long) *pos + 1) : NULL; + return *pos <= NR_DIAG_STAT ? (void *)((unsigned long) *pos + 1) : NULL; } static void *show_diag_stat_next(struct seq_file *m, void *v, loff_t *pos) @@ -104,18 +121,7 @@ static const struct seq_operations show_diag_stat_sops = { .show = show_diag_stat, }; -static int show_diag_stat_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &show_diag_stat_sops); -} - -static const struct file_operations show_diag_stat_fops = { - .open = show_diag_stat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - +DEFINE_SEQ_ATTRIBUTE(show_diag_stat); static int __init show_diag_stat_init(void) { @@ -133,7 +139,7 @@ void diag_stat_inc(enum diag_stat_enum nr) } EXPORT_SYMBOL(diag_stat_inc); -void diag_stat_inc_norecursion(enum diag_stat_enum nr) +void notrace diag_stat_inc_norecursion(enum diag_stat_enum nr) { this_cpu_inc(diag_stat.counter[nr]); trace_s390_diagnose_norecursion(diag_map[nr].code); @@ -146,26 +152,46 @@ EXPORT_SYMBOL(diag_stat_inc_norecursion); int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode) { diag_stat_inc(DIAG_STAT_X014); - return diag_dma_ops.diag14(rx, ry1, subcode); + return diag_amode31_ops.diag14(rx, ry1, subcode); } EXPORT_SYMBOL(diag14); static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr) { - register unsigned long _subcode asm("0") = *subcode; - register unsigned long _size asm("1") = size; + union register_pair rp = { .even = *subcode, .odd = size }; asm volatile( - " diag %2,%0,0x204\n" + " diag %[addr],%[rp],0x204\n" "0: nopr %%r7\n" EX_TABLE(0b,0b) - : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory"); - *subcode = _subcode; - return _size; + : [rp] "+&d" (rp.pair) : [addr] "d" (addr) : "memory"); + *subcode = rp.even; + return rp.odd; } +/** + * diag204() - Issue diagnose 204 call. + * @subcode: Subcode of diagnose 204 to be executed. + * @size: Size of area in pages which @area points to, if given. + * @addr: Vmalloc'ed memory area where the result is written to. + * + * Execute diagnose 204 with the given subcode and write the result to the + * memory area specified with @addr. For subcodes which do not write a + * result to memory both @size and @addr must be zero. If @addr is + * specified it must be page aligned and must have been allocated with + * vmalloc(). Conversion to real / physical addresses will be handled by + * this function if required. 
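Given the preconditions spelled out in this kernel-doc, a hedged usage sketch for a result-storing subcode follows; the helper name and error handling are chosen here for illustration, not taken from the patch:

#include <linux/errno.h>
#include <linux/vmalloc.h>
#include <asm/diag.h>

/*
 * Query a result-storing diagnose 204 subcode into a freshly allocated
 * buffer. vmalloc() memory is page aligned by construction, which
 * satisfies both WARN_ON_ONCE() checks in diag204(); @pages matches
 * the @size parameter ("size of area in pages"). Subcodes that store
 * no result must instead be called with size and addr both zero.
 */
static int diag204_query(unsigned long subcode, unsigned long pages)
{
	void *buf;
	int rc;

	buf = vmalloc(pages * PAGE_SIZE);
	if (!buf)
		return -ENOMEM;
	rc = diag204(subcode, pages, buf);
	vfree(buf);
	return rc;
}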
+ */ int diag204(unsigned long subcode, unsigned long size, void *addr) { + if (addr) { + if (WARN_ON_ONCE(!is_vmalloc_addr(addr))) + return -1; + if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)addr, PAGE_SIZE))) + return -1; + } + if ((subcode & DIAG204_SUBCODE_MASK) == DIAG204_SUBC_STIB4) + addr = (void *)pfn_to_phys(vmalloc_to_pfn(addr)); diag_stat_inc(DIAG_STAT_X204); size = __diag204(&subcode, size, addr); if (subcode) @@ -184,20 +210,42 @@ int diag210(struct diag210 *addr) int ccode; spin_lock_irqsave(&diag210_lock, flags); - *__diag210_tmp_dma = *addr; + *__diag210_tmp_amode31 = *addr; diag_stat_inc(DIAG_STAT_X210); - ccode = diag_dma_ops.diag210(__diag210_tmp_dma); + ccode = diag_amode31_ops.diag210(__diag210_tmp_amode31); - *addr = *__diag210_tmp_dma; + *addr = *__diag210_tmp_amode31; spin_unlock_irqrestore(&diag210_lock, flags); return ccode; } EXPORT_SYMBOL(diag210); +/* + * Diagnose 8C: Access 3270 Display Device Information + */ +int diag8c(struct diag8c *addr, struct ccw_dev_id *devno) +{ + static DEFINE_SPINLOCK(diag8c_lock); + unsigned long flags; + int ccode; + + spin_lock_irqsave(&diag8c_lock, flags); + + diag_stat_inc(DIAG_STAT_X08C); + ccode = diag_amode31_ops.diag8c(__diag8c_tmp_amode31, devno, sizeof(*addr)); + + *addr = *__diag8c_tmp_amode31; + spin_unlock_irqrestore(&diag8c_lock, flags); + + return ccode; +} +EXPORT_SYMBOL(diag8c); + int diag224(void *ptr) { + unsigned long addr = __pa(ptr); int rc = -EOPNOTSUPP; diag_stat_inc(DIAG_STAT_X224); @@ -206,7 +254,7 @@ int diag224(void *ptr) "0: lhi %0,0x0\n" "1:\n" EX_TABLE(0b,1b) - : "+d" (rc) :"d" (0), "d" (ptr) : "memory"); + : "+d" (rc) :"d" (0), "d" (addr) : "memory"); return rc; } EXPORT_SYMBOL(diag224); @@ -217,6 +265,6 @@ EXPORT_SYMBOL(diag224); int diag26c(void *req, void *resp, enum diag26c_sc subcode) { diag_stat_inc(DIAG_STAT_X26C); - return diag_dma_ops.diag26c(req, resp, subcode); + return diag_amode31_ops.diag26c(req, resp, subcode); } EXPORT_SYMBOL(diag26c); diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index f304802ecf7b..89dc826a8d2e 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -24,8 +24,8 @@ #include <linux/kdebug.h> #include <linux/uaccess.h> #include <linux/atomic.h> +#include <linux/io.h> #include <asm/dis.h> -#include <asm/io.h> #include <asm/cpcmd.h> #include <asm/lowcore.h> #include <asm/debug.h> @@ -278,6 +278,7 @@ static const unsigned char formats[][6] = { [INSTR_SIL_RDI] = { D_20, B_16, I16_32, 0, 0, 0 }, [INSTR_SIL_RDU] = { D_20, B_16, U16_32, 0, 0, 0 }, [INSTR_SIY_IRD] = { D20_20, B_16, I8_8, 0, 0, 0 }, + [INSTR_SIY_RD] = { D20_20, B_16, 0, 0, 0, 0 }, [INSTR_SIY_URD] = { D20_20, B_16, U8_8, 0, 0, 0 }, [INSTR_SI_RD] = { D_20, B_16, 0, 0, 0, 0 }, [INSTR_SI_URD] = { D_20, B_16, U8_8, 0, 0, 0 }, @@ -312,10 +313,12 @@ static const unsigned char formats[][6] = { [INSTR_VRR_VV] = { V_8, V_12, 0, 0, 0, 0 }, [INSTR_VRR_VV0U] = { V_8, V_12, U4_32, 0, 0, 0 }, [INSTR_VRR_VV0U0U] = { V_8, V_12, U4_32, U4_24, 0, 0 }, + [INSTR_VRR_VV0U2] = { V_8, V_12, U4_24, 0, 0, 0 }, [INSTR_VRR_VV0UU2] = { V_8, V_12, U4_32, U4_28, 0, 0 }, [INSTR_VRR_VV0UUU] = { V_8, V_12, U4_32, U4_28, U4_24, 0 }, [INSTR_VRR_VVV] = { V_8, V_12, V_16, 0, 0, 0 }, [INSTR_VRR_VVV0U] = { V_8, V_12, V_16, U4_32, 0, 0 }, + [INSTR_VRR_VVV0U0] = { V_8, V_12, V_16, U4_24, 0, 0 }, [INSTR_VRR_VVV0U0U] = { V_8, V_12, V_16, U4_32, U4_24, 0 }, [INSTR_VRR_VVV0UU] = { V_8, V_12, V_16, U4_32, U4_28, 0 }, [INSTR_VRR_VVV0UUU] = { V_8, V_12, V_16, U4_32, U4_28, U4_24 }, @@ -482,32 +485,38 @@ static int print_insn(char 
*buffer, unsigned char *code, unsigned long addr) return (int) (ptr - buffer); } +static int copy_from_regs(struct pt_regs *regs, void *dst, void *src, int len) +{ + if (user_mode(regs)) { + if (copy_from_user(dst, (char __user *)src, len)) + return -EFAULT; + } else { + if (copy_from_kernel_nofault(dst, src, len)) + return -EFAULT; + } + return 0; +} + void show_code(struct pt_regs *regs) { char *mode = user_mode(regs) ? "User" : "Krnl"; unsigned char code[64]; char buffer[128], *ptr; - mm_segment_t old_fs; unsigned long addr; int start, end, opsize, hops, i; /* Get a snapshot of the 64 bytes surrounding the fault address. */ - old_fs = get_fs(); - set_fs(user_mode(regs) ? USER_DS : KERNEL_DS); for (start = 32; start && regs->psw.addr >= 34 - start; start -= 2) { addr = regs->psw.addr - 34 + start; - if (__copy_from_user(code + start - 2, - (char __user *) addr, 2)) + if (copy_from_regs(regs, code + start - 2, (void *)addr, 2)) break; } for (end = 32; end < 64; end += 2) { addr = regs->psw.addr + end - 32; - if (__copy_from_user(code + end, - (char __user *) addr, 2)) + if (copy_from_regs(regs, code + end, (void *)addr, 2)) break; } - set_fs(old_fs); - /* Code snapshot useable ? */ + /* Code snapshot usable ? */ if ((regs->psw.addr & 1) || start >= end) { printk("%s Code: Bad PSW.\n", mode); return; @@ -557,7 +566,7 @@ void show_code(struct pt_regs *regs) void print_fn_code(unsigned char *code, unsigned long len) { - char buffer[64], *ptr; + char buffer[128], *ptr; int opsize, i; while (len) { diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index d306fe04489a..d2012635b093 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -41,51 +41,50 @@ const char *stack_type_name(enum stack_type type) EXPORT_SYMBOL_GPL(stack_type_name); static inline bool in_stack(unsigned long sp, struct stack_info *info, - enum stack_type type, unsigned long low, - unsigned long high) + enum stack_type type, unsigned long stack) { - if (sp < low || sp >= high) + if (sp < stack || sp >= stack + THREAD_SIZE) return false; info->type = type; - info->begin = low; - info->end = high; + info->begin = stack; + info->end = stack + THREAD_SIZE; return true; } static bool in_task_stack(unsigned long sp, struct task_struct *task, struct stack_info *info) { - unsigned long stack; + unsigned long stack = (unsigned long)task_stack_page(task); - stack = (unsigned long) task_stack_page(task); - return in_stack(sp, info, STACK_TYPE_TASK, stack, stack + THREAD_SIZE); + return in_stack(sp, info, STACK_TYPE_TASK, stack); } static bool in_irq_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size, top; + unsigned long stack = S390_lowcore.async_stack - STACK_INIT_OFFSET; - frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - top = S390_lowcore.async_stack + frame_size; - return in_stack(sp, info, STACK_TYPE_IRQ, top - THREAD_SIZE, top); + return in_stack(sp, info, STACK_TYPE_IRQ, stack); } static bool in_nodat_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size, top; + unsigned long stack = S390_lowcore.nodat_stack - STACK_INIT_OFFSET; - frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - top = S390_lowcore.nodat_stack + frame_size; - return in_stack(sp, info, STACK_TYPE_NODAT, top - THREAD_SIZE, top); + return in_stack(sp, info, STACK_TYPE_NODAT, stack); +} + +static bool in_mcck_stack(unsigned long sp, struct stack_info *info) +{ + unsigned long stack = S390_lowcore.mcck_stack - STACK_INIT_OFFSET; + + return 
in_stack(sp, info, STACK_TYPE_MCCK, stack); } static bool in_restart_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size, top; + unsigned long stack = S390_lowcore.restart_stack - STACK_INIT_OFFSET; - frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - top = S390_lowcore.restart_stack + frame_size; - return in_stack(sp, info, STACK_TYPE_RESTART, top - THREAD_SIZE, top); + return in_stack(sp, info, STACK_TYPE_RESTART, stack); } int get_stack_info(unsigned long sp, struct task_struct *task, @@ -108,7 +107,8 @@ int get_stack_info(unsigned long sp, struct task_struct *task, /* Check per-cpu stacks */ if (!in_irq_stack(sp, info) && !in_nodat_stack(sp, info) && - !in_restart_stack(sp, info)) + !in_restart_stack(sp, info) && + !in_mcck_stack(sp, info)) goto unknown; recursion_check: @@ -126,22 +126,29 @@ unknown: return -EINVAL; } -void show_stack(struct task_struct *task, unsigned long *stack) +void show_stack(struct task_struct *task, unsigned long *stack, + const char *loglvl) { struct unwind_state state; - printk("Call Trace:\n"); + printk("%sCall Trace:\n", loglvl); unwind_for_each_frame(&state, task, NULL, (unsigned long) stack) - printk(state.reliable ? " [<%016lx>] %pSR \n" : - "([<%016lx>] %pSR)\n", - state.ip, (void *) state.ip); + printk(state.reliable ? "%s [<%016lx>] %pSR \n" : + "%s([<%016lx>] %pSR)\n", + loglvl, state.ip, (void *) state.ip); debug_show_held_locks(task ? : current); } static void show_last_breaking_event(struct pt_regs *regs) { printk("Last Breaking-Event-Address:\n"); - printk(" [<%016lx>] %pSR\n", regs->args[0], (void *)regs->args[0]); + printk(" [<%016lx>] ", regs->last_break); + if (user_mode(regs)) { + print_vma_addr(KERN_CONT, regs->last_break); + pr_cont("\n"); + } else { + pr_cont("%pSR\n", (void *)regs->last_break); + } } void show_registers(struct pt_regs *regs) @@ -175,13 +182,13 @@ void show_regs(struct pt_regs *regs) show_registers(regs); /* Show stack backtrace if pt_regs is from kernel mode */ if (!user_mode(regs)) - show_stack(NULL, (unsigned long *) regs->gprs[15]); + show_stack(NULL, (unsigned long *) regs->gprs[15], KERN_DEFAULT); show_last_breaking_event(regs); } static DEFINE_SPINLOCK(die_lock); -void die(struct pt_regs *regs, const char *str) +void __noreturn die(struct pt_regs *regs, const char *str) { static int die_counter; @@ -195,6 +202,8 @@ void die(struct pt_regs *regs, const char *str) regs->int_code >> 17, ++die_counter); #ifdef CONFIG_PREEMPT pr_cont("PREEMPT "); +#elif defined(CONFIG_PREEMPT_RT) + pr_cont("PREEMPT_RT "); #endif pr_cont("SMP "); if (debug_pagealloc_enabled()) @@ -211,5 +220,5 @@ void die(struct pt_regs *regs, const char *str) if (panic_on_oops) panic("Fatal exception: panic_on_oops"); oops_exit(); - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index db32a55daaec..2345ea332b97 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -2,7 +2,6 @@ /* * Copyright IBM Corp. 
2007, 2009 * Author(s): Hongjie Yang <hongjie@us.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com> */ #define KMSG_COMPONENT "setup" @@ -18,6 +17,8 @@ #include <linux/pfn.h> #include <linux/uaccess.h> #include <linux/kernel.h> +#include <asm/asm-extable.h> +#include <linux/memblock.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/ipl.h> @@ -33,18 +34,43 @@ #include <asm/switch_to.h> #include "entry.h" +#define decompressor_handled_param(param) \ +static int __init ignore_decompressor_param_##param(char *s) \ +{ \ + return 0; \ +} \ +early_param(#param, ignore_decompressor_param_##param) + +decompressor_handled_param(mem); +decompressor_handled_param(vmalloc); +decompressor_handled_param(dfltcc); +decompressor_handled_param(facilities); +decompressor_handled_param(nokaslr); +decompressor_handled_param(cmma); +#if IS_ENABLED(CONFIG_KVM) +decompressor_handled_param(prot_virt); +#endif + +static void __init kasan_early_init(void) +{ +#ifdef CONFIG_KASAN + init_task.kasan_depth = 0; + sclp_early_printk("KernelAddressSanitizer initialized\n"); +#endif +} + static void __init reset_tod_clock(void) { - u64 time; + union tod_clock clk; - if (store_tod_clock(&time) == 0) + if (store_tod_clock_ext_cc(&clk) == 0) return; /* TOD clock not running. Set the clock to Unix Epoch. */ - if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock(&time) != 0) + if (set_tod_clock(TOD_UNIX_EPOCH) || store_tod_clock_ext_cc(&clk)) disabled_wait(); - memset(tod_clock_base, 0, 16); - *(__u64 *) &tod_clock_base[1] = TOD_UNIX_EPOCH; + memset(&tod_clock_base, 0, sizeof(tod_clock_base)); + tod_clock_base.tod = TOD_UNIX_EPOCH; S390_lowcore.last_update_clock = TOD_UNIX_EPOCH; } @@ -147,44 +173,27 @@ static __init void setup_topology(void) topology_max_mnest = max_mnest; } -static void early_pgm_check_handler(void) +void __do_early_pgm_check(struct pt_regs *regs) { - const struct exception_table_entry *fixup; - unsigned long cr0, cr0_new; - unsigned long addr; - - addr = S390_lowcore.program_old_psw.addr; - fixup = s390_search_extables(addr); - if (!fixup) + if (!fixup_exception(regs)) disabled_wait(); - /* Disable low address protection before storing into lowcore. 
*/ - __ctl_store(cr0, 0, 0); - cr0_new = cr0 & ~(1UL << 28); - __ctl_load(cr0_new, 0, 0); - S390_lowcore.program_old_psw.addr = extable_fixup(fixup); - __ctl_load(cr0, 0, 0); } static noinline __init void setup_lowcore_early(void) { psw_t psw; - psw.mask = PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA; - psw.addr = (unsigned long) s390_base_ext_handler; - S390_lowcore.external_new_psw = psw; - psw.addr = (unsigned long) s390_base_pgm_handler; + psw.addr = (unsigned long)early_pgm_check_handler; + psw.mask = PSW_KERNEL_BITS; S390_lowcore.program_new_psw = psw; - s390_base_pgm_handler_fn = early_pgm_check_handler; S390_lowcore.preempt_count = INIT_PREEMPT_COUNT; } static noinline __init void setup_facility_list(void) { - memcpy(S390_lowcore.alt_stfle_fac_list, - S390_lowcore.stfle_fac_list, - sizeof(S390_lowcore.alt_stfle_fac_list)); + memcpy(alt_stfle_fac_list, stfle_fac_list, sizeof(alt_stfle_fac_list)); if (!IS_ENABLED(CONFIG_KERNEL_NOBP)) - __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + __clear_facility(82, alt_stfle_fac_list); } static __init void detect_diag9c(void) @@ -204,26 +213,11 @@ static __init void detect_diag9c(void) S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG9C; } -static __init void detect_diag44(void) -{ - int rc; - - diag_stat_inc(DIAG_STAT_X044); - asm volatile( - " diag 0,0,0x44\n" - "0: la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "=d" (rc) : "0" (-EOPNOTSUPP) : "cc"); - if (!rc) - S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG44; -} - static __init void detect_machine_facilities(void) { if (test_facility(8)) { S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT1; - __ctl_set_bit(0, 23); + system_ctl_set_bit(0, CR0_EDAT_BIT); } if (test_facility(78)) S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT2; @@ -231,26 +225,28 @@ static __init void detect_machine_facilities(void) S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE; if (test_facility(50) && test_facility(73)) { S390_lowcore.machine_flags |= MACHINE_FLAG_TE; - __ctl_set_bit(0, 55); + system_ctl_set_bit(0, CR0_TRANSACTIONAL_EXECUTION_BIT); } if (test_facility(51)) S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC; - if (test_facility(129)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_VX; - __ctl_set_bit(0, 17); - } - if (test_facility(130) && !noexec_disabled) { + if (test_facility(129)) + system_ctl_set_bit(0, CR0_VECTOR_BIT); + if (test_facility(130)) S390_lowcore.machine_flags |= MACHINE_FLAG_NX; - __ctl_set_bit(0, 20); - } if (test_facility(133)) S390_lowcore.machine_flags |= MACHINE_FLAG_GS; - if (test_facility(139) && (tod_clock_base[1] & 0x80)) { + if (test_facility(139) && (tod_clock_base.tod >> 63)) { /* Enabled signed clock comparator comparisons */ S390_lowcore.machine_flags |= MACHINE_FLAG_SCC; clock_comparator_max = -1ULL >> 1; - __ctl_set_bit(0, 53); + system_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SIGN_BIT); } + if (IS_ENABLED(CONFIG_PCI) && test_facility(153)) { + S390_lowcore.machine_flags |= MACHINE_FLAG_PCI_MIO; + /* the control bit is set during PCI initialization */ + } + if (test_facility(194)) + S390_lowcore.machine_flags |= MACHINE_FLAG_RDP; } static inline void save_vector_registers(void) @@ -261,15 +257,9 @@ static inline void save_vector_registers(void) #endif } -static inline void setup_control_registers(void) +static inline void setup_low_address_protection(void) { - unsigned long reg; - - __ctl_store(reg, 0, 0); - reg |= CR0_LOW_ADDRESS_PROTECTION; - reg |= CR0_EMERGENCY_SIGNAL_SUBMASK; - reg |= CR0_EXTERNAL_CALL_SUBMASK; - __ctl_load(reg, 0, 0); + system_ctl_set_bit(0, 
CR0_LOW_ADDRESS_PROTECTION_BIT); } static inline void setup_access_registers(void) @@ -279,64 +269,37 @@ static inline void setup_access_registers(void) restore_access_regs(acrs); } -static int __init disable_vector_extension(char *str) -{ - S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX; - __ctl_clear_bit(0, 17); - return 0; -} -early_param("novx", disable_vector_extension); - -static int __init cad_setup(char *str) -{ - bool enabled; - int rc; - - rc = kstrtobool(str, &enabled); - if (!rc && enabled && test_facility(128)) - /* Enable problem state CAD. */ - __ctl_set_bit(2, 3); - return rc; -} -early_param("cad", cad_setup); - char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; static void __init setup_boot_command_line(void) { /* copy arch command line */ - strlcpy(boot_command_line, early_command_line, ARCH_COMMAND_LINE_SIZE); + strscpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE); } -static void __init check_image_bootable(void) +static void __init sort_amode31_extable(void) { - if (!memcmp(EP_STRING, (void *)EP_OFFSET, strlen(EP_STRING))) - return; - - sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n"); - sclp_early_printk("This image does not contain all parts necessary for starting up. Use\n"); - sclp_early_printk("bzImage or arch/s390/boot/compressed/vmlinux instead.\n"); - disabled_wait(); + sort_extable(__start_amode31_ex_table, __stop_amode31_ex_table); } void __init startup_init(void) { + kasan_early_init(); reset_tod_clock(); - check_image_bootable(); time_early_init(); init_kernel_storage_key(); lockdep_off(); + sort_amode31_extable(); setup_lowcore_early(); setup_facility_list(); detect_machine_type(); setup_arch_string(); setup_boot_command_line(); detect_diag9c(); - detect_diag44(); detect_machine_facilities(); save_vector_registers(); setup_topology(); sclp_early_detect(); - setup_control_registers(); + setup_low_address_protection(); setup_access_registers(); lockdep_on(); } diff --git a/arch/s390/kernel/early_printk.c b/arch/s390/kernel/early_printk.c index 6f24d83bc5dc..d9d53f44008a 100644 --- a/arch/s390/kernel/early_printk.c +++ b/arch/s390/kernel/early_printk.c @@ -10,7 +10,7 @@ static void sclp_early_write(struct console *con, const char *s, unsigned int len) { - __sclp_early_printk(s, len, 0); + __sclp_early_printk(s, len); } static struct console sclp_early_console = { diff --git a/arch/s390/kernel/earlypgm.S b/arch/s390/kernel/earlypgm.S new file mode 100644 index 000000000000..c634871f0d90 --- /dev/null +++ b/arch/s390/kernel/earlypgm.S @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 
2006, 2007 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#include <linux/linkage.h> +#include <asm/asm-offsets.h> + +SYM_CODE_START(early_pgm_check_handler) + stmg %r8,%r15,__LC_SAVE_AREA_SYNC + aghi %r15,-(STACK_FRAME_OVERHEAD+__PT_SIZE) + la %r11,STACK_FRAME_OVERHEAD(%r15) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + stmg %r0,%r7,__PT_R0(%r11) + mvc __PT_PSW(16,%r11),__LC_PGM_OLD_PSW + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC + lgr %r2,%r11 + brasl %r14,__do_early_pgm_check + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + lpswe __LC_RETURN_PSW +SYM_CODE_END(early_pgm_check_handler) diff --git a/arch/s390/kernel/ebcdic.c b/arch/s390/kernel/ebcdic.c index 7f8246c9be08..0e51fa537262 100644 --- a/arch/s390/kernel/ebcdic.c +++ b/arch/s390/kernel/ebcdic.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * ECBDIC -> ASCII, ASCII -> ECBDIC, + * EBCDIC -> ASCII, ASCII -> EBCDIC, * upper to lower case (EBCDIC) conversion tables. * * S390 version diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 270d1d145761..49a11f6dd7ae 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -6,15 +6,15 @@ * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), * Hartmut Penner (hp@de.ibm.com), * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), - * Heiko Carstens <heiko.carstens@de.ibm.com> */ +#include <linux/export.h> #include <linux/init.h> #include <linux/linkage.h> +#include <asm/asm-extable.h> #include <asm/alternative-asm.h> #include <asm/processor.h> #include <asm/cache.h> -#include <asm/ctl_reg.h> #include <asm/dwarf.h> #include <asm/errno.h> #include <asm/ptrace.h> @@ -27,67 +27,29 @@ #include <asm/vx-insn.h> #include <asm/setup.h> #include <asm/nmi.h> -#include <asm/export.h> #include <asm/nospec-insn.h> -__PT_R0 = __PT_GPRS -__PT_R1 = __PT_GPRS + 8 -__PT_R2 = __PT_GPRS + 16 -__PT_R3 = __PT_GPRS + 24 -__PT_R4 = __PT_GPRS + 32 -__PT_R5 = __PT_GPRS + 40 -__PT_R6 = __PT_GPRS + 48 -__PT_R7 = __PT_GPRS + 56 -__PT_R8 = __PT_GPRS + 64 -__PT_R9 = __PT_GPRS + 72 -__PT_R10 = __PT_GPRS + 80 -__PT_R11 = __PT_GPRS + 88 -__PT_R12 = __PT_GPRS + 96 -__PT_R13 = __PT_GPRS + 104 -__PT_R14 = __PT_GPRS + 112 -__PT_R15 = __PT_GPRS + 120 - -STACK_SHIFT = PAGE_SHIFT + THREAD_SIZE_ORDER -STACK_SIZE = 1 << STACK_SHIFT -STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE - -_TIF_WORK = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ - _TIF_UPROBE | _TIF_GUARDED_STORAGE | _TIF_PATCH_PENDING) -_TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ - _TIF_SYSCALL_TRACEPOINT) -_CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \ - _CIF_ASCE_SECONDARY | _CIF_FPU) -_PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART) - _LPP_OFFSET = __LC_LPP -#define BASED(name) name-cleanup_critical(%r13) + .macro STBEAR address + ALTERNATIVE "nop", ".insn s,0xb2010000,\address", 193 + .endm - .macro TRACE_IRQS_ON -#ifdef CONFIG_TRACE_IRQFLAGS - basr %r2,%r0 - brasl %r14,trace_hardirqs_on_caller -#endif + .macro LBEAR address + ALTERNATIVE "nop", ".insn s,0xb2000000,\address", 193 .endm - .macro TRACE_IRQS_OFF -#ifdef CONFIG_TRACE_IRQFLAGS - basr %r2,%r0 - brasl %r14,trace_hardirqs_off_caller -#endif + .macro LPSWEY address,lpswe + ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", 193 .endm - .macro LOCKDEP_SYS_EXIT -#ifdef CONFIG_LOCKDEP - tm __PT_PSW+1(%r11),0x01 # returning to user ? 
- jz .+10 - brasl %r14,lockdep_sys_exit -#endif + .macro MBEAR reg + ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193 .endm .macro CHECK_STACK savearea #ifdef CONFIG_CHECK_STACK - tml %r15,STACK_SIZE - CONFIG_STACK_GUARD + tml %r15,THREAD_SIZE - CONFIG_STACK_GUARD lghi %r14,\savearea jz stack_overflow #endif @@ -96,12 +58,14 @@ _LPP_OFFSET = __LC_LPP .macro CHECK_VMAP_STACK savearea,oklabel #ifdef CONFIG_VMAP_STACK lgr %r14,%r15 - nill %r14,0x10000 - STACK_SIZE - oill %r14,STACK_INIT + nill %r14,0x10000 - THREAD_SIZE + oill %r14,STACK_INIT_OFFSET clg %r14,__LC_KERNEL_STACK je \oklabel clg %r14,__LC_ASYNC_STACK je \oklabel + clg %r14,__LC_MCCK_STACK + je \oklabel clg %r14,__LC_NODAT_STACK je \oklabel clg %r14,__LC_RESTART_STACK @@ -113,56 +77,6 @@ _LPP_OFFSET = __LC_LPP #endif .endm - .macro SWITCH_ASYNC savearea,timer - tmhh %r8,0x0001 # interrupting from user ? - jnz 1f - lgr %r14,%r9 - slg %r14,BASED(.Lcritical_start) - clg %r14,BASED(.Lcritical_length) - jhe 0f - lghi %r11,\savearea # inside critical section, do cleanup - brasl %r14,cleanup_critical - tmhh %r8,0x0001 # retest problem state after cleanup - jnz 1f -0: lg %r14,__LC_ASYNC_STACK # are we already on the target stack? - slgr %r14,%r15 - srag %r14,%r14,STACK_SHIFT - jnz 2f - CHECK_STACK \savearea - aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - j 3f -1: UPDATE_VTIME %r14,%r15,\timer - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP -2: lg %r15,__LC_ASYNC_STACK # load async stack -3: la %r11,STACK_FRAME_OVERHEAD(%r15) - .endm - - .macro UPDATE_VTIME w1,w2,enter_timer - lg \w1,__LC_EXIT_TIMER - lg \w2,__LC_LAST_UPDATE_TIMER - slg \w1,\enter_timer - slg \w2,__LC_EXIT_TIMER - alg \w1,__LC_USER_TIMER - alg \w2,__LC_SYSTEM_TIMER - stg \w1,__LC_USER_TIMER - stg \w2,__LC_SYSTEM_TIMER - mvc __LC_LAST_UPDATE_TIMER(8),\enter_timer - .endm - - .macro REENABLE_IRQS - stg %r8,__LC_RETURN_PSW - ni __LC_RETURN_PSW,0xbf - ssm __LC_RETURN_PSW - .endm - - .macro STCK savearea -#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES - .insn s,0xb27c0000,\savearea # store clock fast -#else - .insn s,0xb2050000,\savearea # store clock -#endif - .endm - /* * The TSTMSK macro generates a test-under-mask instruction by * calculating the memory offset for the specified mask value. @@ -186,46 +100,76 @@ _LPP_OFFSET = __LC_LPP .endm .macro BPOFF - ALTERNATIVE "", ".long 0xb2e8c000", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", 82 .endm .macro BPON - ALTERNATIVE "", ".long 0xb2e8d000", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", 82 .endm .macro BPENTER tif_ptr,tif_mask - ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .long 0xb2e8d000", \ - "", 82 + ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \ + "j .+12; nop; nop", 82 .endm .macro BPEXIT tif_ptr,tif_mask TSTMSK \tif_ptr,\tif_mask - ALTERNATIVE "jz .+8; .long 0xb2e8c000", \ - "jnz .+8; .long 0xb2e8d000", 82 + ALTERNATIVE "jz .+8; .insn rrf,0xb2e80000,0,0,12,0", \ + "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", 82 + .endm + +#if IS_ENABLED(CONFIG_KVM) + /* + * The OUTSIDE macro jumps to the provided label in case the value + * in the provided register is outside of the provided range. The + * macro is useful for checking whether a PSW stored in a register + * pair points inside or outside of a block of instructions. 
+ * @reg: register to check + * @start: start of the range + * @end: end of the range + * @outside_label: jump here if @reg is outside of [@start..@end) + */ + .macro OUTSIDE reg,start,end,outside_label + lgr %r14,\reg + larl %r13,\start + slgr %r14,%r13 + clgfrl %r14,.Lrange_size\@ + jhe \outside_label + .section .rodata, "a" + .balign 4 +.Lrange_size\@: + .long \end - \start + .previous + .endm + + .macro SIEEXIT + lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer + ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce + larl %r9,sie_exit # skip forward to sie_exit + .endm +#endif + + .macro STACKLEAK_ERASE +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK + brasl %r14,stackleak_erase_on_task_stack +#endif .endm - GEN_BR_THUNK %r9 GEN_BR_THUNK %r14 - GEN_BR_THUNK %r14,%r11 .section .kprobes.text, "ax" .Ldummy: /* - * This nop exists only in order to avoid that __switch_to starts at - * the beginning of the kprobes text section. In that case we would - * have several symbols at the same address. E.g. objdump would take - * an arbitrary symbol name when disassembling this code. - * With the added nop in between the __switch_to symbol is unique - * again. + * The following nop exists only in order to avoid that the next + * symbol starts at the beginning of the kprobes text section. + * In that case there would be several symbols at the same address. + * E.g. objdump would take an arbitrary symbol when disassembling + * the code. + * With the added nop in between this cannot happen. */ nop 0 -ENTRY(__bpon) - .globl __bpon - BPON - BR_EX %r14 -ENDPROC(__bpon) - /* * Scheduler resume function, called by switch_to * gpr2 = (task_struct *) prev @@ -233,11 +177,11 @@ ENDPROC(__bpon) * Returns: * gpr2 = prev */ -ENTRY(__switch_to) +SYM_FUNC_START(__switch_to) stmg %r6,%r15,__SF_GPRS(%r15) # store gprs of prev task lghi %r4,__TASK_stack lghi %r1,__TASK_thread - llill %r5,STACK_INIT + llill %r5,STACK_INIT_OFFSET stg %r15,__THREAD_ksp(%r1,%r2) # store kernel stack of prev lg %r15,0(%r4,%r3) # start of kernel stack of next agr %r15,%r5 # end of kernel stack of next @@ -247,30 +191,26 @@ ENTRY(__switch_to) aghi %r3,__TASK_pid mvc __LC_CURRENT_PID(4,%r0),0(%r3) # store pid of next lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task - ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40 + ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 BR_EX %r14 -ENDPROC(__switch_to) - -.L__critical_start: +SYM_FUNC_END(__switch_to) #if IS_ENABLED(CONFIG_KVM) /* - * sie64a calling convention: - * %r2 pointer to sie control block - * %r3 guest register save area + * __sie64a calling convention: + * %r2 pointer to sie control block phys + * %r3 pointer to sie control block virt + * %r4 guest register save area */ -ENTRY(sie64a) +SYM_FUNC_START(__sie64a) stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers lg %r12,__LC_CURRENT - stg %r2,__SF_SIE_CONTROL(%r15) # save control block pointer - stg %r3,__SF_SIE_SAVEAREA(%r15) # save guest register save area + stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical.. + stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses + stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0 mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags - TSTMSK __LC_CPU_FLAGS,_CIF_FPU # load guest fp/vx registers ? 
- jno .Lsie_load_guest_gprs - brasl %r14,load_fpu_regs # load guest fp/vx regs -.Lsie_load_guest_gprs: - lmg %r0,%r13,0(%r3) # load guest gprs 0-13 + lmg %r0,%r13,0(%r4) # load guest gprs 0-13 lg %r14,__LC_GMAP # get gmap pointer ltgr %r14,%r14 jz .Lsie_gmap @@ -282,35 +222,37 @@ ENTRY(sie64a) jnz .Lsie_skip TSTMSK __LC_CPU_FLAGS,_CIF_FPU jo .Lsie_skip # exit if fp/vx regs changed - BPEXIT __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) + lg %r14,__SF_SIE_CONTROL_PHYS(%r15) # get sie block phys addr + BPEXIT __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST .Lsie_entry: sie 0(%r14) -.Lsie_exit: +# Let the next instruction be NOP to avoid triggering a machine check +# and handling it in a guest as result of the instruction execution. + nopr 7 +.Lsie_leave: BPOFF - BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST .Lsie_skip: + lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_USER_ASCE # load primary asce + lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce .Lsie_done: # some program checks are suppressing. C code (e.g. do_protection_exception) # will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There # are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable. -# Other instructions between sie64a and .Lsie_done should not cause program +# Other instructions between __sie64a and .Lsie_done should not cause program # interrupts. So lets use 3 nops as a landing pad for all possible rewinds. -# See also .Lcleanup_sie .Lrewind_pad6: nopr 7 .Lrewind_pad4: nopr 7 .Lrewind_pad2: nopr 7 - .globl sie_exit -sie_exit: +SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL) lg %r14,__SF_SIE_SAVEAREA(%r15) # load guest register save area stmg %r0,%r13,0(%r14) # save guest gprs 0-13 xgr %r0,%r0 # clear guest registers to xgr %r1,%r1 # prevent speculative use - xgr %r2,%r2 xgr %r3,%r3 xgr %r4,%r4 xgr %r5,%r5 @@ -326,838 +268,244 @@ sie_exit: EX_TABLE(.Lrewind_pad4,.Lsie_fault) EX_TABLE(.Lrewind_pad2,.Lsie_fault) EX_TABLE(sie_exit,.Lsie_fault) -ENDPROC(sie64a) -EXPORT_SYMBOL(sie64a) +SYM_FUNC_END(__sie64a) +EXPORT_SYMBOL(__sie64a) EXPORT_SYMBOL(sie_exit) #endif /* * SVC interrupt handler routine. System calls are synchronous events and - * are executed with interrupts enabled. + * are entered with interrupts disabled. 
*/ -ENTRY(system_call) - stpt __LC_SYNC_ENTER_TIMER -.Lsysc_stmg: +SYM_CODE_START(system_call) + stpt __LC_SYS_ENTER_TIMER stmg %r8,%r15,__LC_SAVE_AREA_SYNC BPOFF - lg %r12,__LC_CURRENT - lghi %r13,__TASK_thread - lghi %r14,_PIF_SYSCALL + lghi %r14,0 .Lsysc_per: + STBEAR __LC_LAST_BREAK + lctlg %c1,%c1,__LC_KERNEL_ASCE lg %r15,__LC_KERNEL_STACK - la %r11,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs -.Lsysc_vtime: - UPDATE_VTIME %r8,%r9,__LC_SYNC_ENTER_TIMER - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP - stmg %r0,%r7,__PT_R0(%r11) - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC - mvc __PT_PSW(16,%r11),__LC_SVC_OLD_PSW - mvc __PT_INT_CODE(4,%r11),__LC_SVC_ILC - stg %r14,__PT_FLAGS(%r11) -.Lsysc_do_svc: + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + stmg %r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15) # clear user controlled register to prevent speculative use xgr %r0,%r0 - # load address of system call table - lg %r10,__THREAD_sysc_table(%r13,%r12) - llgh %r8,__PT_INT_CODE+2(%r11) - slag %r8,%r8,3 # shift and test for svc 0 - jnz .Lsysc_nr_ok - # svc 0: system call number in %r1 - llgfr %r1,%r1 # clear high word in r1 - cghi %r1,NR_syscalls - jnl .Lsysc_nr_ok - sth %r1,__PT_INT_CODE+2(%r11) - slag %r8,%r1,3 -.Lsysc_nr_ok: - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - stg %r2,__PT_ORIG_GPR2(%r11) - stg %r7,STACK_FRAME_OVERHEAD(%r15) - lg %r9,0(%r8,%r10) # get system call add. - TSTMSK __TI_flags(%r12),_TIF_TRACE - jnz .Lsysc_tracesys - BASR_EX %r14,%r9 # call sys_xxxx - stg %r2,__PT_R2(%r11) # store return value - -.Lsysc_return: -#ifdef CONFIG_DEBUG_RSEQ - lgr %r2,%r11 - brasl %r14,rseq_syscall -#endif - LOCKDEP_SYS_EXIT -.Lsysc_tif: - TSTMSK __PT_FLAGS(%r11),_PIF_WORK - jnz .Lsysc_work - TSTMSK __TI_flags(%r12),_TIF_WORK - jnz .Lsysc_work # check for work - TSTMSK __LC_CPU_FLAGS,_CIF_WORK - jnz .Lsysc_work - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP -.Lsysc_restore: - lg %r14,__LC_VDSO_PER_CPU - lmg %r0,%r10,__PT_R0(%r11) - mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) -.Lsysc_exit_timer: + xgr %r1,%r1 + xgr %r4,%r4 + xgr %r5,%r5 + xgr %r6,%r6 + xgr %r7,%r7 + xgr %r8,%r8 + xgr %r9,%r9 + xgr %r10,%r10 + xgr %r11,%r11 + la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs + mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC + MBEAR %r2 + lgr %r3,%r14 + brasl %r14,__do_syscall + STACKLEAK_ERASE + lctlg %c1,%c1,__LC_USER_ASCE + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + BPON + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) stpt __LC_EXIT_TIMER - mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER - lmg %r11,%r15,__PT_R11(%r11) - lpswe __LC_RETURN_PSW -.Lsysc_done: - -# -# One of the work bits is on. Find out which one. 
-# -.Lsysc_work: - TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING - jo .Lsysc_mcck_pending - TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED - jo .Lsysc_reschedule - TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART - jo .Lsysc_syscall_restart -#ifdef CONFIG_UPROBES - TSTMSK __TI_flags(%r12),_TIF_UPROBE - jo .Lsysc_uprobe_notify -#endif - TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE - jo .Lsysc_guarded_storage - TSTMSK __PT_FLAGS(%r11),_PIF_PER_TRAP - jo .Lsysc_singlestep -#ifdef CONFIG_LIVEPATCH - TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING - jo .Lsysc_patch_pending # handle live patching just before - # signals and possible syscall restart -#endif - TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART - jo .Lsysc_syscall_restart - TSTMSK __TI_flags(%r12),_TIF_SIGPENDING - jo .Lsysc_sigpending - TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME - jo .Lsysc_notify_resume - TSTMSK __LC_CPU_FLAGS,_CIF_FPU - jo .Lsysc_vxrs - TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY) - jnz .Lsysc_asce - j .Lsysc_return # beware of critical section cleanup - -# -# _TIF_NEED_RESCHED is set, call schedule -# -.Lsysc_reschedule: - larl %r14,.Lsysc_return - jg schedule - -# -# _CIF_MCCK_PENDING is set, call handler -# -.Lsysc_mcck_pending: - larl %r14,.Lsysc_return - jg s390_handle_mcck # TIF bit will be cleared by handler - -# -# _CIF_ASCE_PRIMARY and/or _CIF_ASCE_SECONDARY set, load user space asce -# -.Lsysc_asce: - ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY - lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce - TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY - jz .Lsysc_return -#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES - tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ? - jnz .Lsysc_set_fs_fixup - ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY - lctlg %c1,%c1,__LC_USER_ASCE # load primary asce - j .Lsysc_return -.Lsysc_set_fs_fixup: -#endif - larl %r14,.Lsysc_return - jg set_fs_fixup - -# -# CIF_FPU is set, restore floating-point controls and floating-point registers. 
-# -.Lsysc_vxrs: - larl %r14,.Lsysc_return - jg load_fpu_regs - -# -# _TIF_SIGPENDING is set, call do_signal -# -.Lsysc_sigpending: - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,do_signal - TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL - jno .Lsysc_return -.Lsysc_do_syscall: - lghi %r13,__TASK_thread - lmg %r2,%r7,__PT_R2(%r11) # load svc arguments - lghi %r1,0 # svc 0 returns -ENOSYS - j .Lsysc_do_svc - -# -# _TIF_NOTIFY_RESUME is set, call do_notify_resume -# -.Lsysc_notify_resume: - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg do_notify_resume - -# -# _TIF_UPROBE is set, call uprobe_notify_resume -# -#ifdef CONFIG_UPROBES -.Lsysc_uprobe_notify: - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg uprobe_notify_resume -#endif - -# -# _TIF_GUARDED_STORAGE is set, call guarded_storage_load -# -.Lsysc_guarded_storage: - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg gs_load_bc_cb -# -# _TIF_PATCH_PENDING is set, call klp_update_patch_state -# -#ifdef CONFIG_LIVEPATCH -.Lsysc_patch_pending: - lg %r2,__LC_CURRENT # pass pointer to task struct - larl %r14,.Lsysc_return - jg klp_update_patch_state -#endif - -# -# _PIF_PER_TRAP is set, call do_per_trap -# -.Lsysc_singlestep: - ni __PT_FLAGS+7(%r11),255-_PIF_PER_TRAP - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg do_per_trap - -# -# _PIF_SYSCALL_RESTART is set, repeat the current system call -# -.Lsysc_syscall_restart: - ni __PT_FLAGS+7(%r11),255-_PIF_SYSCALL_RESTART - lmg %r1,%r7,__PT_R1(%r11) # load svc arguments - lg %r2,__PT_ORIG_GPR2(%r11) - j .Lsysc_do_svc - -# -# call tracehook_report_syscall_entry/tracehook_report_syscall_exit before -# and after the system call -# -.Lsysc_tracesys: - lgr %r2,%r11 # pass pointer to pt_regs - la %r3,0 - llgh %r0,__PT_INT_CODE+2(%r11) - stg %r0,__PT_R2(%r11) - brasl %r14,do_syscall_trace_enter - lghi %r0,NR_syscalls - clgr %r0,%r2 - jnh .Lsysc_tracenogo - sllg %r8,%r2,3 - lg %r9,0(%r8,%r10) -.Lsysc_tracego: - lmg %r3,%r7,__PT_R3(%r11) - stg %r7,STACK_FRAME_OVERHEAD(%r15) - lg %r2,__PT_ORIG_GPR2(%r11) - BASR_EX %r14,%r9 # call sys_xxx - stg %r2,__PT_R2(%r11) # store return value -.Lsysc_tracenogo: - TSTMSK __TI_flags(%r12),_TIF_TRACE - jz .Lsysc_return - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg do_syscall_trace_exit -ENDPROC(system_call) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(system_call) # # a new process exits the kernel with ret_from_fork # -ENTRY(ret_from_fork) - la %r11,STACK_FRAME_OVERHEAD(%r15) - lg %r12,__LC_CURRENT - brasl %r14,schedule_tail - TRACE_IRQS_ON - ssm __LC_SVC_NEW_PSW # reenable interrupts - tm __PT_PSW+1(%r11),0x01 # forking a kernel thread ? 
- jne .Lsysc_tracenogo - # it's a kernel thread - lmg %r9,%r10,__PT_R9(%r11) # load gprs - la %r2,0(%r10) - BASR_EX %r14,%r9 - j .Lsysc_tracenogo -ENDPROC(ret_from_fork) - -ENTRY(kernel_thread_starter) - la %r2,0(%r10) - BASR_EX %r14,%r9 - j .Lsysc_tracenogo -ENDPROC(kernel_thread_starter) +SYM_CODE_START(ret_from_fork) + lgr %r3,%r11 + brasl %r14,__ret_from_fork + STACKLEAK_ERASE + lctlg %c1,%c1,__LC_USER_ASCE + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + BPON + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + stpt __LC_EXIT_TIMER + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(ret_from_fork) /* * Program check handler routine */ -ENTRY(pgm_check_handler) - stpt __LC_SYNC_ENTER_TIMER +SYM_CODE_START(pgm_check_handler) + stpt __LC_SYS_ENTER_TIMER BPOFF stmg %r8,%r15,__LC_SAVE_AREA_SYNC - lg %r10,__LC_LAST_BREAK - lg %r12,__LC_CURRENT - lghi %r11,0 - larl %r13,cleanup_critical + lghi %r10,0 lmg %r8,%r9,__LC_PGM_OLD_PSW - tmhh %r8,0x0001 # test problem state bit - jnz 2f # -> fault in user space + tmhh %r8,0x0001 # coming from user space? + jno .Lpgm_skip_asce + lctlg %c1,%c1,__LC_KERNEL_ASCE + j 3f # -> fault in user space +.Lpgm_skip_asce: #if IS_ENABLED(CONFIG_KVM) - # cleanup critical section for program checks in sie64a - lgr %r14,%r9 - slg %r14,BASED(.Lsie_critical_start) - clg %r14,BASED(.Lsie_critical_length) - jhe 0f - lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer - ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_USER_ASCE # load primary asce - larl %r9,sie_exit # skip forward to sie_exit - lghi %r11,_PIF_GUEST_FAULT + # cleanup critical section for program checks in __sie64a + OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT + lghi %r10,_PIF_GUEST_FAULT #endif -0: tmhh %r8,0x4000 # PER bit set in old PSW ? - jnz 1f # -> enabled, can't be a double fault +1: tmhh %r8,0x4000 # PER bit set in old PSW ? + jnz 2f # -> enabled, can't be a double fault tm __LC_PGM_ILC+3,0x80 # check for per exception jnz .Lpgm_svcper # -> single stepped svc -1: CHECK_STACK __LC_SAVE_AREA_SYNC +2: CHECK_STACK __LC_SAVE_AREA_SYNC aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) # CHECK_VMAP_STACK branches to stack_overflow or 4f CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,4f -2: UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP - lg %r15,__LC_KERNEL_STACK - lgr %r14,%r12 - aghi %r14,__TASK_thread # pointer to thread_struct - lghi %r13,__LC_PGM_TDB - tm __LC_PGM_ILC+2,0x02 # check for transaction abort - jz 3f - mvc __THREAD_trap_tdb(256,%r14),0(%r13) -3: stg %r10,__THREAD_last_break(%r14) -4: lgr %r13,%r11 - la %r11,STACK_FRAME_OVERHEAD(%r15) +3: lg %r15,__LC_KERNEL_STACK +4: la %r11,STACK_FRAME_OVERHEAD(%r15) + stg %r10,__PT_FLAGS(%r11) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,__PT_R0(%r11) + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC + mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK + stmg %r8,%r9,__PT_PSW(%r11) + # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 - xgr %r2,%r2 xgr %r3,%r3 xgr %r4,%r4 xgr %r5,%r5 xgr %r6,%r6 xgr %r7,%r7 - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC - stmg %r8,%r9,__PT_PSW(%r11) - mvc __PT_INT_CODE(4,%r11),__LC_PGM_ILC - mvc __PT_INT_PARM_LONG(8,%r11),__LC_TRANS_EXC_CODE - stg %r13,__PT_FLAGS(%r11) - stg %r10,__PT_ARGS(%r11) - tm __LC_PGM_ILC+3,0x80 # check for per exception - jz 5f - tmhh %r8,0x0001 # kernel per event ? 
- jz .Lpgm_kprobe - oi __PT_FLAGS+7(%r11),_PIF_PER_TRAP - mvc __THREAD_per_address(8,%r14),__LC_PER_ADDRESS - mvc __THREAD_per_cause(2,%r14),__LC_PER_CODE - mvc __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID -5: REENABLE_IRQS - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - larl %r1,pgm_check_table - llgh %r10,__PT_INT_CODE+2(%r11) - nill %r10,0x007f - sll %r10,3 - je .Lpgm_return - lg %r9,0(%r10,%r1) # load address of handler routine - lgr %r2,%r11 # pass pointer to pt_regs - BASR_EX %r14,%r9 # branch to interrupt-handler -.Lpgm_return: - LOCKDEP_SYS_EXIT - tm __PT_PSW+1(%r11),0x01 # returning to user ? - jno .Lsysc_restore - TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL - jo .Lsysc_do_syscall - j .Lsysc_tif - -# -# PER event in supervisor state, must be kprobes -# -.Lpgm_kprobe: - REENABLE_IRQS - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,do_per_trap - j .Lpgm_return + lgr %r2,%r11 + brasl %r14,__do_pgm_check + tmhh %r8,0x0001 # returning to user space? + jno .Lpgm_exit_kernel + STACKLEAK_ERASE + lctlg %c1,%c1,__LC_USER_ASCE + BPON + stpt __LC_EXIT_TIMER +.Lpgm_exit_kernel: + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # # single stepped system call # .Lpgm_svcper: mvc __LC_RETURN_PSW(8),__LC_SVC_NEW_PSW - lghi %r13,__TASK_thread larl %r14,.Lsysc_per stg %r14,__LC_RETURN_PSW+8 - lghi %r14,_PIF_SYSCALL | _PIF_PER_TRAP - lpswe __LC_RETURN_PSW # branch to .Lsysc_per and enable irqs -ENDPROC(pgm_check_handler) + lghi %r14,1 + LBEAR __LC_PGM_LAST_BREAK + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # branch to .Lsysc_per +SYM_CODE_END(pgm_check_handler) /* - * IO interrupt handler routine + * Interrupt handler macro used for external and IO interrupts. */ -ENTRY(io_int_handler) - STCK __LC_INT_CLOCK - stpt __LC_ASYNC_ENTER_TIMER +.macro INT_HANDLER name,lc_old_psw,handler +SYM_CODE_START(\name) + stckf __LC_INT_CLOCK + stpt __LC_SYS_ENTER_TIMER + STBEAR __LC_LAST_BREAK BPOFF stmg %r8,%r15,__LC_SAVE_AREA_ASYNC - lg %r12,__LC_CURRENT - larl %r13,cleanup_critical - lmg %r8,%r9,__LC_IO_OLD_PSW - SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER + lmg %r8,%r9,\lc_old_psw + tmhh %r8,0x0001 # interrupting from user ? + jnz 1f +#if IS_ENABLED(CONFIG_KVM) + OUTSIDE %r9,.Lsie_gmap,.Lsie_done,0f + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT +#endif +0: CHECK_STACK __LC_SAVE_AREA_ASYNC + aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) + j 2f +1: lctlg %c1,%c1,__LC_KERNEL_ASCE + lg %r15,__LC_KERNEL_STACK +2: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 - xgr %r2,%r2 xgr %r3,%r3 xgr %r4,%r4 xgr %r5,%r5 xgr %r6,%r6 xgr %r7,%r7 xgr %r10,%r10 + xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC + MBEAR %r11 stmg %r8,%r9,__PT_PSW(%r11) - mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID - xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) - TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ - jo .Lio_restore - TRACE_IRQS_OFF - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) -.Lio_loop: lgr %r2,%r11 # pass pointer to pt_regs - lghi %r3,IO_INTERRUPT - tm __PT_INT_CODE+8(%r11),0x80 # adapter interrupt ? 
- jz .Lio_call - lghi %r3,THIN_INTERRUPT -.Lio_call: - brasl %r14,do_IRQ - TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_LPAR - jz .Lio_return - tpi 0 - jz .Lio_return - mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID - j .Lio_loop -.Lio_return: - LOCKDEP_SYS_EXIT - TRACE_IRQS_ON -.Lio_tif: - TSTMSK __TI_flags(%r12),_TIF_WORK - jnz .Lio_work # there is work to do (signals etc.) - TSTMSK __LC_CPU_FLAGS,_CIF_WORK - jnz .Lio_work -.Lio_restore: - lg %r14,__LC_VDSO_PER_CPU - lmg %r0,%r10,__PT_R0(%r11) + brasl %r14,\handler mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) - tm __PT_PSW+1(%r11),0x01 # returning to user ? - jno .Lio_exit_kernel - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP -.Lio_exit_timer: + tmhh %r8,0x0001 # returning to user ? + jno 2f + STACKLEAK_ERASE + lctlg %c1,%c1,__LC_USER_ASCE + BPON stpt __LC_EXIT_TIMER - mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER -.Lio_exit_kernel: - lmg %r11,%r15,__PT_R11(%r11) - lpswe __LC_RETURN_PSW -.Lio_done: - -# -# There is work todo, find out in which context we have been interrupted: -# 1) if we return to user space we can do all _TIF_WORK work -# 2) if we return to kernel code and kvm is enabled check if we need to -# modify the psw to leave SIE -# 3) if we return to kernel code and preemptive scheduling is enabled check -# the preemption counter and if it is zero call preempt_schedule_irq -# Before any work can be done, a switch to the kernel stack is required. -# -.Lio_work: - tm __PT_PSW+1(%r11),0x01 # returning to user ? - jo .Lio_work_user # yes -> do resched & signal -#ifdef CONFIG_PREEMPT - # check for preemptive scheduling - icm %r0,15,__LC_PREEMPT_COUNT - jnz .Lio_restore # preemption is disabled - TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED - jno .Lio_restore - # switch to kernel stack - lg %r1,__PT_R15(%r11) - aghi %r1,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) - xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) - la %r11,STACK_FRAME_OVERHEAD(%r1) - lgr %r15,%r1 - # TRACE_IRQS_ON already done at .Lio_return, call - # TRACE_IRQS_OFF to keep things symmetrical - TRACE_IRQS_OFF - brasl %r14,preempt_schedule_irq - j .Lio_return -#else - j .Lio_restore -#endif - -# -# Need to do work before returning to userspace, switch to kernel stack -# -.Lio_work_user: - lg %r1,__LC_KERNEL_STACK - mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) - xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) - la %r11,STACK_FRAME_OVERHEAD(%r1) - lgr %r15,%r1 - -# -# One of the work bits is on. Find out which one. 
-# -.Lio_work_tif: - TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING - jo .Lio_mcck_pending - TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED - jo .Lio_reschedule -#ifdef CONFIG_LIVEPATCH - TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING - jo .Lio_patch_pending -#endif - TSTMSK __TI_flags(%r12),_TIF_SIGPENDING - jo .Lio_sigpending - TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME - jo .Lio_notify_resume - TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE - jo .Lio_guarded_storage - TSTMSK __LC_CPU_FLAGS,_CIF_FPU - jo .Lio_vxrs - TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY) - jnz .Lio_asce - j .Lio_return # beware of critical section cleanup - -# -# _CIF_MCCK_PENDING is set, call handler -# -.Lio_mcck_pending: - # TRACE_IRQS_ON already done at .Lio_return - brasl %r14,s390_handle_mcck # TIF bit will be cleared by handler - TRACE_IRQS_OFF - j .Lio_return +2: LBEAR __PT_LAST_BREAK(%r11) + lmg %r0,%r15,__PT_R0(%r11) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(\name) +.endm -# -# _CIF_ASCE_PRIMARY and/or CIF_ASCE_SECONDARY set, load user space asce -# -.Lio_asce: - ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY - lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce - TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY - jz .Lio_return -#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES - tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ? - jnz .Lio_set_fs_fixup - ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY - lctlg %c1,%c1,__LC_USER_ASCE # load primary asce - j .Lio_return -.Lio_set_fs_fixup: -#endif - larl %r14,.Lio_return - jg set_fs_fixup - -# -# CIF_FPU is set, restore floating-point controls and floating-point registers. -# -.Lio_vxrs: - larl %r14,.Lio_return - jg load_fpu_regs - -# -# _TIF_GUARDED_STORAGE is set, call guarded_storage_load -# -.Lio_guarded_storage: - # TRACE_IRQS_ON already done at .Lio_return - ssm __LC_SVC_NEW_PSW # reenable interrupts - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,gs_load_bc_cb - ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts - TRACE_IRQS_OFF - j .Lio_return - -# -# _TIF_NEED_RESCHED is set, call schedule -# -.Lio_reschedule: - # TRACE_IRQS_ON already done at .Lio_return - ssm __LC_SVC_NEW_PSW # reenable interrupts - brasl %r14,schedule # call scheduler - ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts - TRACE_IRQS_OFF - j .Lio_return - -# -# _TIF_PATCH_PENDING is set, call klp_update_patch_state -# -#ifdef CONFIG_LIVEPATCH -.Lio_patch_pending: - lg %r2,__LC_CURRENT # pass pointer to task struct - larl %r14,.Lio_return - jg klp_update_patch_state -#endif - -# -# _TIF_SIGPENDING or is set, call do_signal -# -.Lio_sigpending: - # TRACE_IRQS_ON already done at .Lio_return - ssm __LC_SVC_NEW_PSW # reenable interrupts - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,do_signal - ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts - TRACE_IRQS_OFF - j .Lio_return - -# -# _TIF_NOTIFY_RESUME or is set, call do_notify_resume -# -.Lio_notify_resume: - # TRACE_IRQS_ON already done at .Lio_return - ssm __LC_SVC_NEW_PSW # reenable interrupts - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,do_notify_resume - ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts - TRACE_IRQS_OFF - j .Lio_return -ENDPROC(io_int_handler) +INT_HANDLER ext_int_handler,__LC_EXT_OLD_PSW,do_ext_irq +INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq /* - * External interrupt handler routine + * Load idle PSW. 
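 * psw_idle() loads an enabled-wait PSW whose address part points at
 * psw_idle_exit, so execution resumes there once an interrupt has been
 * handled; interrupts are still disabled on return. A caller sketch,
 * following arch_cpu_idle() further below:
 *
 *	psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT |
 *		   PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
 *	psw_idle(idle, psw_mask);	// returns with irqs disabled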
*/ -ENTRY(ext_int_handler) - STCK __LC_INT_CLOCK - stpt __LC_ASYNC_ENTER_TIMER - BPOFF - stmg %r8,%r15,__LC_SAVE_AREA_ASYNC - lg %r12,__LC_CURRENT - larl %r13,cleanup_critical - lmg %r8,%r9,__LC_EXT_OLD_PSW - SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER - stmg %r0,%r7,__PT_R0(%r11) - # clear user controlled registers to prevent speculative use - xgr %r0,%r0 - xgr %r1,%r1 - xgr %r2,%r2 - xgr %r3,%r3 - xgr %r4,%r4 - xgr %r5,%r5 - xgr %r6,%r6 - xgr %r7,%r7 - xgr %r10,%r10 - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC - stmg %r8,%r9,__PT_PSW(%r11) - lghi %r1,__LC_EXT_PARAMS2 - mvc __PT_INT_CODE(4,%r11),__LC_EXT_CPU_ADDR - mvc __PT_INT_PARM(4,%r11),__LC_EXT_PARAMS - mvc __PT_INT_PARM_LONG(8,%r11),0(%r1) - xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) - TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ - jo .Lio_restore - TRACE_IRQS_OFF - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - lgr %r2,%r11 # pass pointer to pt_regs - lghi %r3,EXT_INTERRUPT - brasl %r14,do_IRQ - j .Lio_return -ENDPROC(ext_int_handler) - -/* - * Load idle PSW. The second "half" of this function is in .Lcleanup_idle. - */ -ENTRY(psw_idle) +SYM_FUNC_START(psw_idle) + stg %r14,(__SF_GPRS+8*8)(%r15) stg %r3,__SF_EMPTY(%r15) - larl %r1,.Lpsw_idle_lpsw+4 + larl %r1,psw_idle_exit stg %r1,__SF_EMPTY+8(%r15) larl %r1,smp_cpu_mtid llgf %r1,0(%r1) ltgr %r1,%r1 jz .Lpsw_idle_stcctm - .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+16(%r15) + .insn rsy,0xeb0000000017,%r1,5,__MT_CYCLES_ENTER(%r2) .Lpsw_idle_stcctm: oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT BPON - STCK __CLOCK_IDLE_ENTER(%r2) + stckf __CLOCK_IDLE_ENTER(%r2) stpt __TIMER_IDLE_ENTER(%r2) -.Lpsw_idle_lpsw: lpswe __SF_EMPTY(%r15) +SYM_INNER_LABEL(psw_idle_exit, SYM_L_GLOBAL) BR_EX %r14 -.Lpsw_idle_end: -ENDPROC(psw_idle) - -/* - * Store floating-point controls and floating-point or vector register - * depending whether the vector facility is available. A critical section - * cleanup assures that the registers are stored even if interrupted for - * some other work. The CIF_FPU flag is set to trigger a lazy restore - * of the register contents at return from io or a system call. - */ -ENTRY(save_fpu_regs) - lg %r2,__LC_CURRENT - aghi %r2,__TASK_thread - TSTMSK __LC_CPU_FLAGS,_CIF_FPU - jo .Lsave_fpu_regs_exit - stfpc __THREAD_FPU_fpc(%r2) - lg %r3,__THREAD_FPU_regs(%r2) - TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX - jz .Lsave_fpu_regs_fp # no -> store FP regs - VSTM %v0,%v15,0,%r3 # vstm 0,15,0(3) - VSTM %v16,%v31,256,%r3 # vstm 16,31,256(3) - j .Lsave_fpu_regs_done # -> set CIF_FPU flag -.Lsave_fpu_regs_fp: - std 0,0(%r3) - std 1,8(%r3) - std 2,16(%r3) - std 3,24(%r3) - std 4,32(%r3) - std 5,40(%r3) - std 6,48(%r3) - std 7,56(%r3) - std 8,64(%r3) - std 9,72(%r3) - std 10,80(%r3) - std 11,88(%r3) - std 12,96(%r3) - std 13,104(%r3) - std 14,112(%r3) - std 15,120(%r3) -.Lsave_fpu_regs_done: - oi __LC_CPU_FLAGS+7,_CIF_FPU -.Lsave_fpu_regs_exit: - BR_EX %r14 -.Lsave_fpu_regs_end: -ENDPROC(save_fpu_regs) -EXPORT_SYMBOL(save_fpu_regs) - -/* - * Load floating-point controls and floating-point or vector registers. - * A critical section cleanup assures that the register contents are - * loaded even if interrupted for some other work. 
- * - * There are special calling conventions to fit into sysc and io return work: - * %r15: <kernel stack> - * The function requires: - * %r4 - */ -load_fpu_regs: - lg %r4,__LC_CURRENT - aghi %r4,__TASK_thread - TSTMSK __LC_CPU_FLAGS,_CIF_FPU - jno .Lload_fpu_regs_exit - lfpc __THREAD_FPU_fpc(%r4) - TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX - lg %r4,__THREAD_FPU_regs(%r4) # %r4 <- reg save area - jz .Lload_fpu_regs_fp # -> no VX, load FP regs - VLM %v0,%v15,0,%r4 - VLM %v16,%v31,256,%r4 - j .Lload_fpu_regs_done -.Lload_fpu_regs_fp: - ld 0,0(%r4) - ld 1,8(%r4) - ld 2,16(%r4) - ld 3,24(%r4) - ld 4,32(%r4) - ld 5,40(%r4) - ld 6,48(%r4) - ld 7,56(%r4) - ld 8,64(%r4) - ld 9,72(%r4) - ld 10,80(%r4) - ld 11,88(%r4) - ld 12,96(%r4) - ld 13,104(%r4) - ld 14,112(%r4) - ld 15,120(%r4) -.Lload_fpu_regs_done: - ni __LC_CPU_FLAGS+7,255-_CIF_FPU -.Lload_fpu_regs_exit: - BR_EX %r14 -.Lload_fpu_regs_end: -ENDPROC(load_fpu_regs) - -.L__critical_end: +SYM_FUNC_END(psw_idle) /* * Machine check handler routines */ -ENTRY(mcck_int_handler) - STCK __LC_MCCK_CLOCK +SYM_CODE_START(mcck_int_handler) BPOFF la %r1,4095 # validate r1 spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer - sckc __LC_CLOCK_COMPARATOR # validate comparator - lam %a0,%a15,__LC_AREGS_SAVE_AREA-4095(%r1) # validate acrs - lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs - lg %r12,__LC_CURRENT - larl %r13,cleanup_critical + LBEAR __LC_LAST_BREAK_SAVE_AREA-4095(%r1) # validate bear + lmg %r0,%r15,__LC_GPREGS_SAVE_AREA # validate gprs lmg %r8,%r9,__LC_MCK_OLD_PSW TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE jo .Lmcck_panic # yes -> rest of mcck code invalid TSTMSK __LC_MCCK_CODE,MCCK_CODE_CR_VALID jno .Lmcck_panic # control registers invalid -> panic - la %r14,4095 - lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs + lctlg %c0,%c15,__LC_CREGS_SAVE_AREA # validate ctl regs ptlb - lg %r11,__LC_MCESAD-4095(%r14) # extended machine check save area - nill %r11,0xfc00 # MCESA_ORIGIN_MASK - TSTMSK __LC_CREGS_SAVE_AREA+16-4095(%r14),CR2_GUARDED_STORAGE - jno 0f - TSTMSK __LC_MCCK_CODE,MCCK_CODE_GS_VALID - jno 0f - .insn rxy,0xe3000000004d,0,__MCESA_GS_SAVE_AREA(%r11) # LGSC -0: l %r14,__LC_FP_CREG_SAVE_AREA-4095(%r14) - TSTMSK __LC_MCCK_CODE,MCCK_CODE_FC_VALID - jo 0f - sr %r14,%r14 -0: sfpc %r14 - TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX - jo 0f - lghi %r14,__LC_FPREGS_SAVE_AREA - ld %f0,0(%r14) - ld %f1,8(%r14) - ld %f2,16(%r14) - ld %f3,24(%r14) - ld %f4,32(%r14) - ld %f5,40(%r14) - ld %f6,48(%r14) - ld %f7,56(%r14) - ld %f8,64(%r14) - ld %f9,72(%r14) - ld %f10,80(%r14) - ld %f11,88(%r14) - ld %f12,96(%r14) - ld %f13,104(%r14) - ld %f14,112(%r14) - ld %f15,120(%r14) - j 1f -0: VLM %v0,%v15,0,%r11 - VLM %v16,%v31,256,%r11 -1: lghi %r14,__LC_CPU_TIMER_SAVE_AREA + lghi %r14,__LC_CPU_TIMER_SAVE_AREA mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID jo 3f - la %r14,__LC_SYNC_ENTER_TIMER - clc 0(8,%r14),__LC_ASYNC_ENTER_TIMER - jl 0f - la %r14,__LC_ASYNC_ENTER_TIMER -0: clc 0(8,%r14),__LC_EXIT_TIMER + la %r14,__LC_SYS_ENTER_TIMER + clc 0(8,%r14),__LC_EXIT_TIMER jl 1f la %r14,__LC_EXIT_TIMER 1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER @@ -1168,18 +516,27 @@ ENTRY(mcck_int_handler) 3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID jno .Lmcck_panic tmhh %r8,0x0001 # interrupting from user ? 
- jnz 4f + jnz .Lmcck_user TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID jno .Lmcck_panic -4: ssm __LC_PGM_NEW_PSW # turn dat on, keep irqs off - SWITCH_ASYNC __LC_GPREGS_SAVE_AREA+64,__LC_MCCK_ENTER_TIMER -.Lmcck_skip: +#if IS_ENABLED(CONFIG_KVM) + OUTSIDE %r9,.Lsie_gmap,.Lsie_done,.Lmcck_user + OUTSIDE %r9,.Lsie_entry,.Lsie_leave,4f + oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST +4: BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT +#endif +.Lmcck_user: + lg %r15,__LC_MCCK_STACK + la %r11,STACK_FRAME_OVERHEAD(%r15) + stctg %c1,%c1,__PT_CR1(%r11) + lctlg %c1,%c1,__LC_KERNEL_ASCE + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lghi %r14,__LC_GPREGS_SAVE_AREA+64 stmg %r0,%r7,__PT_R0(%r11) # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 - xgr %r2,%r2 xgr %r3,%r3 xgr %r4,%r4 xgr %r5,%r5 @@ -1192,42 +549,57 @@ ENTRY(mcck_int_handler) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,s390_do_machine_check - tm __PT_PSW+1(%r11),0x01 # returning to user ? - jno .Lmcck_return - lg %r1,__LC_KERNEL_STACK # switch to kernel stack - mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) - xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) - la %r11,STACK_FRAME_OVERHEAD(%r1) - lgr %r15,%r1 - TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING - jno .Lmcck_return - TRACE_IRQS_OFF - brasl %r14,s390_handle_mcck - TRACE_IRQS_ON -.Lmcck_return: - lg %r14,__LC_VDSO_PER_CPU + lctlg %c1,%c1,__PT_CR1(%r11) lmg %r0,%r10,__PT_R0(%r11) mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ? jno 0f - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP + BPON stpt __LC_EXIT_TIMER - mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER -0: lmg %r11,%r15,__PT_R11(%r11) - lpswe __LC_RETURN_MCCK_PSW +0: ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193 + LBEAR 0(%r12) + lmg %r11,%r15,__PT_R11(%r11) + LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE .Lmcck_panic: - lg %r15,__LC_NODAT_STACK - la %r11,STACK_FRAME_OVERHEAD(%r15) - j .Lmcck_skip -ENDPROC(mcck_int_handler) - -# -# PSW restart interrupt handler -# -ENTRY(restart_int_handler) - ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40 + /* + * Iterate over all possible CPU addresses in the range 0..0xffff + * and stop each CPU using signal processor. Use compare and swap + * to allow just one CPU-stopper and prevent concurrent CPUs from + * stopping each other while leaving the others running. 
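 * A rough C equivalent of the assembly below; stap() and sigp() are
 * hypothetical wrappers for the instructions of the same name:
 *
 *	if (cmpxchg(&stop_lock, 0, 1) != 0)
 *		for (;;) ;			// lost the race, spin
 *	me = stap();				// own CPU address
 *	for (addr = 0; addr <= 0xffff; addr++)
 *		if (addr != me)
 *			while (sigp(addr, SIGP_STOP) == SIGP_CC_BUSY) ;
 *	while (sigp(me, SIGP_STOP) == SIGP_CC_BUSY) ;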
+ */ + lhi %r5,0 + lhi %r6,1 + larl %r7,stop_lock + cs %r5,%r6,0(%r7) # single CPU-stopper only + jnz 4f + larl %r7,this_cpu + stap 0(%r7) # this CPU address + lh %r4,0(%r7) + nilh %r4,0 + lhi %r0,1 + sll %r0,16 # CPU counter + lhi %r3,0 # next CPU address +0: cr %r3,%r4 + je 2f +1: sigp %r1,%r3,SIGP_STOP # stop next CPU + brc SIGP_CC_BUSY,1b +2: ahi %r3,1 + brct %r0,0b +3: sigp %r1,%r4,SIGP_STOP # stop this CPU + brc SIGP_CC_BUSY,3b +4: j 4b +SYM_CODE_END(mcck_int_handler) + +SYM_CODE_START(restart_int_handler) + ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 stg %r15,__LC_SAVE_AREA_RESTART + TSTMSK __LC_RESTART_FLAGS,RESTART_FLAG_CTLREGS,4 + jz 0f + lctlg %c0,%c15,__LC_CREGS_SAVE_AREA +0: larl %r15,daton_psw + lpswe 0(%r15) # turn dat on, keep irqs off +.Ldaton: lg %r15,__LC_RESTART_STACK xc STACK_FRAME_OVERHEAD(__PT_SIZE,%r15),STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r14,STACK_FRAME_OVERHEAD+__PT_R0(%r15) @@ -1236,7 +608,7 @@ ENTRY(restart_int_handler) xc 0(STACK_FRAME_OVERHEAD,%r15),0(%r15) lg %r1,__LC_RESTART_FN # load fn, parm & source cpu lg %r2,__LC_RESTART_DATA - lg %r3,__LC_RESTART_SOURCE + lgf %r3,__LC_RESTART_SOURCE ltgr %r3,%r3 # test source cpu address jm 1f # negative -> skip source stop 0: sigp %r4,%r3,SIGP_SENSE # sigp sense to source cpu @@ -1247,7 +619,7 @@ ENTRY(restart_int_handler) 2: sigp %r4,%r3,SIGP_STOP # sigp stop to current cpu brc 2,2b 3: j 3b -ENDPROC(restart_int_handler) +SYM_CODE_END(restart_int_handler) .section .kprobes.text, "ax" @@ -1257,7 +629,7 @@ ENDPROC(restart_int_handler) * No need to properly save the registers, we are going to panic anyway. * Setup a pt_regs so that show_trace can provide a good call trace. */ -ENTRY(stack_overflow) +SYM_CODE_START(stack_overflow) lg %r15,__LC_NODAT_STACK # change to panic stack la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) @@ -1267,278 +639,31 @@ ENTRY(stack_overflow) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs jg kernel_stack_overflow -ENDPROC(stack_overflow) +SYM_CODE_END(stack_overflow) #endif -ENTRY(cleanup_critical) -#if IS_ENABLED(CONFIG_KVM) - clg %r9,BASED(.Lcleanup_table_sie) # .Lsie_gmap - jl 0f - clg %r9,BASED(.Lcleanup_table_sie+8)# .Lsie_done - jl .Lcleanup_sie -#endif - clg %r9,BASED(.Lcleanup_table) # system_call - jl 0f - clg %r9,BASED(.Lcleanup_table+8) # .Lsysc_do_svc - jl .Lcleanup_system_call - clg %r9,BASED(.Lcleanup_table+16) # .Lsysc_tif - jl 0f - clg %r9,BASED(.Lcleanup_table+24) # .Lsysc_restore - jl .Lcleanup_sysc_tif - clg %r9,BASED(.Lcleanup_table+32) # .Lsysc_done - jl .Lcleanup_sysc_restore - clg %r9,BASED(.Lcleanup_table+40) # .Lio_tif - jl 0f - clg %r9,BASED(.Lcleanup_table+48) # .Lio_restore - jl .Lcleanup_io_tif - clg %r9,BASED(.Lcleanup_table+56) # .Lio_done - jl .Lcleanup_io_restore - clg %r9,BASED(.Lcleanup_table+64) # psw_idle - jl 0f - clg %r9,BASED(.Lcleanup_table+72) # .Lpsw_idle_end - jl .Lcleanup_idle - clg %r9,BASED(.Lcleanup_table+80) # save_fpu_regs - jl 0f - clg %r9,BASED(.Lcleanup_table+88) # .Lsave_fpu_regs_end - jl .Lcleanup_save_fpu_regs - clg %r9,BASED(.Lcleanup_table+96) # load_fpu_regs - jl 0f - clg %r9,BASED(.Lcleanup_table+104) # .Lload_fpu_regs_end - jl .Lcleanup_load_fpu_regs -0: BR_EX %r14,%r11 -ENDPROC(cleanup_critical) - - .align 8 -.Lcleanup_table: - .quad system_call - .quad .Lsysc_do_svc - .quad .Lsysc_tif - .quad .Lsysc_restore - .quad .Lsysc_done - .quad .Lio_tif - .quad .Lio_restore - .quad .Lio_done - .quad psw_idle - .quad .Lpsw_idle_end - .quad save_fpu_regs - .quad .Lsave_fpu_regs_end - .quad 
load_fpu_regs - .quad .Lload_fpu_regs_end - -#if IS_ENABLED(CONFIG_KVM) -.Lcleanup_table_sie: - .quad .Lsie_gmap - .quad .Lsie_done - -.Lcleanup_sie: - cghi %r11,__LC_SAVE_AREA_ASYNC #Is this in normal interrupt? - je 1f - slg %r9,BASED(.Lsie_crit_mcck_start) - clg %r9,BASED(.Lsie_crit_mcck_length) - jh 1f - oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST -1: BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) - lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer - ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_USER_ASCE # load primary asce - larl %r9,sie_exit # skip forward to sie_exit - BR_EX %r14,%r11 -#endif + .section .data, "aw" + .balign 4 +SYM_DATA_LOCAL(stop_lock, .long 0) +SYM_DATA_LOCAL(this_cpu, .short 0) + .balign 8 +SYM_DATA_START_LOCAL(daton_psw) + .quad PSW_KERNEL_BITS + .quad .Ldaton +SYM_DATA_END(daton_psw) -.Lcleanup_system_call: - # check if stpt has been executed - clg %r9,BASED(.Lcleanup_system_call_insn) - jh 0f - mvc __LC_SYNC_ENTER_TIMER(8),__LC_ASYNC_ENTER_TIMER - cghi %r11,__LC_SAVE_AREA_ASYNC - je 0f - mvc __LC_SYNC_ENTER_TIMER(8),__LC_MCCK_ENTER_TIMER -0: # check if stmg has been executed - clg %r9,BASED(.Lcleanup_system_call_insn+8) - jh 0f - mvc __LC_SAVE_AREA_SYNC(64),0(%r11) -0: # check if base register setup + TIF bit load has been done - clg %r9,BASED(.Lcleanup_system_call_insn+16) - jhe 0f - # set up saved register r12 task struct pointer - stg %r12,32(%r11) - # set up saved register r13 __TASK_thread offset - mvc 40(8,%r11),BASED(.Lcleanup_system_call_const) -0: # check if the user time update has been done - clg %r9,BASED(.Lcleanup_system_call_insn+24) - jh 0f - lg %r15,__LC_EXIT_TIMER - slg %r15,__LC_SYNC_ENTER_TIMER - alg %r15,__LC_USER_TIMER - stg %r15,__LC_USER_TIMER -0: # check if the system time update has been done - clg %r9,BASED(.Lcleanup_system_call_insn+32) - jh 0f - lg %r15,__LC_LAST_UPDATE_TIMER - slg %r15,__LC_EXIT_TIMER - alg %r15,__LC_SYSTEM_TIMER - stg %r15,__LC_SYSTEM_TIMER -0: # update accounting time stamp - mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP - # set up saved register r11 - lg %r15,__LC_KERNEL_STACK - la %r9,STACK_FRAME_OVERHEAD(%r15) - stg %r9,24(%r11) # r11 pt_regs pointer - # fill pt_regs - mvc __PT_R8(64,%r9),__LC_SAVE_AREA_SYNC - stmg %r0,%r7,__PT_R0(%r9) - mvc __PT_PSW(16,%r9),__LC_SVC_OLD_PSW - mvc __PT_INT_CODE(4,%r9),__LC_SVC_ILC - xc __PT_FLAGS(8,%r9),__PT_FLAGS(%r9) - mvi __PT_FLAGS+7(%r9),_PIF_SYSCALL - # setup saved register r15 - stg %r15,56(%r11) # r15 stack pointer - # set new psw address and exit - larl %r9,.Lsysc_do_svc - BR_EX %r14,%r11 -.Lcleanup_system_call_insn: - .quad system_call - .quad .Lsysc_stmg - .quad .Lsysc_per - .quad .Lsysc_vtime+36 - .quad .Lsysc_vtime+42 -.Lcleanup_system_call_const: - .quad __TASK_thread - -.Lcleanup_sysc_tif: - larl %r9,.Lsysc_tif - BR_EX %r14,%r11 - -.Lcleanup_sysc_restore: - # check if stpt has been executed - clg %r9,BASED(.Lcleanup_sysc_restore_insn) - jh 0f - mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER - cghi %r11,__LC_SAVE_AREA_ASYNC - je 0f - mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER -0: clg %r9,BASED(.Lcleanup_sysc_restore_insn+8) - je 1f - lg %r9,24(%r11) # get saved pointer to pt_regs - mvc __LC_RETURN_PSW(16),__PT_PSW(%r9) - mvc 0(64,%r11),__PT_R8(%r9) - lmg %r0,%r7,__PT_R0(%r9) -1: lmg %r8,%r9,__LC_RETURN_PSW - BR_EX %r14,%r11 -.Lcleanup_sysc_restore_insn: - .quad .Lsysc_exit_timer - .quad .Lsysc_done - 4 - -.Lcleanup_io_tif: - larl %r9,.Lio_tif - BR_EX %r14,%r11 - 
-.Lcleanup_io_restore: - # check if stpt has been executed - clg %r9,BASED(.Lcleanup_io_restore_insn) - jh 0f - mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER -0: clg %r9,BASED(.Lcleanup_io_restore_insn+8) - je 1f - lg %r9,24(%r11) # get saved r11 pointer to pt_regs - mvc __LC_RETURN_PSW(16),__PT_PSW(%r9) - mvc 0(64,%r11),__PT_R8(%r9) - lmg %r0,%r7,__PT_R0(%r9) -1: lmg %r8,%r9,__LC_RETURN_PSW - BR_EX %r14,%r11 -.Lcleanup_io_restore_insn: - .quad .Lio_exit_timer - .quad .Lio_done - 4 - -.Lcleanup_idle: - ni __LC_CPU_FLAGS+7,255-_CIF_ENABLED_WAIT - # copy interrupt clock & cpu timer - mvc __CLOCK_IDLE_EXIT(8,%r2),__LC_INT_CLOCK - mvc __TIMER_IDLE_EXIT(8,%r2),__LC_ASYNC_ENTER_TIMER - cghi %r11,__LC_SAVE_AREA_ASYNC - je 0f - mvc __CLOCK_IDLE_EXIT(8,%r2),__LC_MCCK_CLOCK - mvc __TIMER_IDLE_EXIT(8,%r2),__LC_MCCK_ENTER_TIMER -0: # check if stck & stpt have been executed - clg %r9,BASED(.Lcleanup_idle_insn) - jhe 1f - mvc __CLOCK_IDLE_ENTER(8,%r2),__CLOCK_IDLE_EXIT(%r2) - mvc __TIMER_IDLE_ENTER(8,%r2),__TIMER_IDLE_EXIT(%r2) -1: # calculate idle cycles - clg %r9,BASED(.Lcleanup_idle_insn) - jl 3f - larl %r1,smp_cpu_mtid - llgf %r1,0(%r1) - ltgr %r1,%r1 - jz 3f - .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+80(%r15) - larl %r3,mt_cycles - ag %r3,__LC_PERCPU_OFFSET - la %r4,__SF_EMPTY+16(%r15) -2: lg %r0,0(%r3) - slg %r0,0(%r4) - alg %r0,64(%r4) - stg %r0,0(%r3) - la %r3,8(%r3) - la %r4,8(%r4) - brct %r1,2b -3: # account system time going idle - lg %r9,__LC_STEAL_TIMER - alg %r9,__CLOCK_IDLE_ENTER(%r2) - slg %r9,__LC_LAST_UPDATE_CLOCK - stg %r9,__LC_STEAL_TIMER - mvc __LC_LAST_UPDATE_CLOCK(8),__CLOCK_IDLE_EXIT(%r2) - lg %r9,__LC_SYSTEM_TIMER - alg %r9,__LC_LAST_UPDATE_TIMER - slg %r9,__TIMER_IDLE_ENTER(%r2) - stg %r9,__LC_SYSTEM_TIMER - mvc __LC_LAST_UPDATE_TIMER(8),__TIMER_IDLE_EXIT(%r2) - # prepare return psw - nihh %r8,0xfcfd # clear irq & wait state bits - lg %r9,48(%r11) # return from psw_idle - BR_EX %r14,%r11 -.Lcleanup_idle_insn: - .quad .Lpsw_idle_lpsw - -.Lcleanup_save_fpu_regs: - larl %r9,save_fpu_regs - BR_EX %r14,%r11 - -.Lcleanup_load_fpu_regs: - larl %r9,load_fpu_regs - BR_EX %r14,%r11 - -/* - * Integer constants - */ - .align 8 -.Lcritical_start: - .quad .L__critical_start -.Lcritical_length: - .quad .L__critical_end - .L__critical_start -#if IS_ENABLED(CONFIG_KVM) -.Lsie_critical_start: - .quad .Lsie_gmap -.Lsie_critical_length: - .quad .Lsie_done - .Lsie_gmap -.Lsie_crit_mcck_start: - .quad .Lsie_entry -.Lsie_crit_mcck_length: - .quad .Lsie_skip - .Lsie_entry -#endif .section .rodata, "a" #define SYSCALL(esame,emu) .quad __s390x_ ## esame - .globl sys_call_table -sys_call_table: +SYM_DATA_START(sys_call_table) #include "asm/syscall_table.h" +SYM_DATA_END(sys_call_table) #undef SYSCALL #ifdef CONFIG_COMPAT #define SYSCALL(esame,emu) .quad __s390_ ## emu - .globl sys_call_table_emu -sys_call_table_emu: +SYM_DATA_START(sys_call_table_emu) #include "asm/syscall_table.h" +SYM_DATA_END(sys_call_table_emu) #undef SYSCALL #endif diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index b2956d49b6ad..9f41853f36b9 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -5,11 +5,11 @@ #include <linux/percpu.h> #include <linux/types.h> #include <linux/signal.h> +#include <asm/extable.h> #include <asm/ptrace.h> #include <asm/idle.h> extern void *restart_stack; -extern unsigned long suspend_zero_pages; void system_call(void); void pgm_check_handler(void); @@ -17,53 +17,29 @@ void ext_int_handler(void); void io_int_handler(void); void mcck_int_handler(void); void 
restart_int_handler(void); -void restart_call_handler(void); +void early_pgm_check_handler(void); -asmlinkage long do_syscall_trace_enter(struct pt_regs *regs); -asmlinkage void do_syscall_trace_exit(struct pt_regs *regs); +void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs); +void __do_pgm_check(struct pt_regs *regs); +void __do_syscall(struct pt_regs *regs, int per_trap); +void __do_early_pgm_check(struct pt_regs *regs); void do_protection_exception(struct pt_regs *regs); void do_dat_exception(struct pt_regs *regs); - -void addressing_exception(struct pt_regs *regs); -void data_exception(struct pt_regs *regs); -void default_trap_handler(struct pt_regs *regs); -void divide_exception(struct pt_regs *regs); -void execute_exception(struct pt_regs *regs); -void hfp_divide_exception(struct pt_regs *regs); -void hfp_overflow_exception(struct pt_regs *regs); -void hfp_significance_exception(struct pt_regs *regs); -void hfp_sqrt_exception(struct pt_regs *regs); -void hfp_underflow_exception(struct pt_regs *regs); -void illegal_op(struct pt_regs *regs); -void operand_exception(struct pt_regs *regs); -void overflow_exception(struct pt_regs *regs); -void privileged_op(struct pt_regs *regs); -void space_switch_exception(struct pt_regs *regs); -void special_op_exception(struct pt_regs *regs); -void specification_exception(struct pt_regs *regs); -void transaction_exception(struct pt_regs *regs); -void translation_exception(struct pt_regs *regs); -void vector_exception(struct pt_regs *regs); - -void do_per_trap(struct pt_regs *regs); +void do_secure_storage_access(struct pt_regs *regs); +void do_non_secure_storage_access(struct pt_regs *regs); +void do_secure_storage_violation(struct pt_regs *regs); void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str); -void syscall_trace(struct pt_regs *regs, int entryexit); void kernel_stack_overflow(struct pt_regs * regs); -void do_signal(struct pt_regs *regs); void handle_signal32(struct ksignal *ksig, sigset_t *oldset, struct pt_regs *regs); -void do_notify_resume(struct pt_regs *regs); -void __init init_IRQ(void); -void do_IRQ(struct pt_regs *regs, int irq); -void do_restart(void); -void __init startup_init_nobss(void); +void do_io_irq(struct pt_regs *regs); +void do_ext_irq(struct pt_regs *regs); +void do_restart(void *arg); void __init startup_init(void); void die(struct pt_regs *regs, const char *str); int setup_profiling_timer(unsigned int multiplier); -void __init time_init(void); -void s390_early_resume(void); unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip); struct s390_mmap_arg_struct; @@ -82,10 +58,18 @@ long sys_s390_sthyi(unsigned long function_code, void __user *buffer, u64 __user DECLARE_PER_CPU(u64, mt_cycles[8]); -void gs_load_bc_cb(struct pt_regs *regs); -void set_fs_fixup(void); - unsigned long stack_alloc(void); void stack_free(unsigned long stack); +extern char kprobes_insn_page[]; + +extern char _samode31[], _eamode31[]; +extern char _stext_amode31[], _etext_amode31[]; +extern struct exception_table_entry _start_amode31_ex_table[]; +extern struct exception_table_entry _stop_amode31_ex_table[]; + +#define __amode31_data __section(".amode31.data") +#define __amode31_ref __section(".amode31.refs") +extern long _start_amode31_refs[], _end_amode31_refs[]; + #endif /* _ENTRY_H */ diff --git a/arch/s390/kernel/facility.c b/arch/s390/kernel/facility.c new file mode 100644 index 000000000000..f02127219a27 --- /dev/null +++ b/arch/s390/kernel/facility.c @@ -0,0 +1,21 
@@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2023 + */ + +#include <asm/facility.h> + +unsigned int stfle_size(void) +{ + static unsigned int size; + unsigned int r; + u64 dummy; + + r = READ_ONCE(size); + if (!r) { + r = __stfle_asm(&dummy, 1) + 1; + WRITE_ONCE(size, r); + } + return r; +} +EXPORT_SYMBOL(stfle_size); diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c index 0da378e2eb25..a4f3449cc814 100644 --- a/arch/s390/kernel/fpu.c +++ b/arch/s390/kernel/fpu.c @@ -10,8 +10,7 @@ #include <linux/sched.h> #include <asm/fpu/types.h> #include <asm/fpu/api.h> - -asm(".include \"asm/vx-insn.h\"\n"); +#include <asm/vx-insn.h> void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags) { @@ -25,7 +24,7 @@ void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags) /* Save floating point control */ asm volatile("stfpc %0" : "=Q" (state->fpc)); - if (!MACHINE_HAS_VX) { + if (!cpu_has_vx()) { if (flags & KERNEL_VXR_V0V7) { /* Save floating-point registers */ asm volatile("std 0,%0" : "=Q" (state->fprs[0])); @@ -107,7 +106,7 @@ void __kernel_fpu_end(struct kernel_fpu *state, u32 flags) /* Restore floating-point controls */ asm volatile("lfpc %0" : : "Q" (state->fpc)); - if (!MACHINE_HAS_VX) { + if (!cpu_has_vx()) { if (flags & KERNEL_VXR_V0V7) { /* Restore floating-point registers */ asm volatile("ld 0,%0" : : "Q" (state->fprs[0])); @@ -175,3 +174,90 @@ void __kernel_fpu_end(struct kernel_fpu *state, u32 flags) : "1", "cc"); } EXPORT_SYMBOL(__kernel_fpu_end); + +void __load_fpu_regs(void) +{ + unsigned long *regs = current->thread.fpu.regs; + struct fpu *state = ¤t->thread.fpu; + + sfpc_safe(state->fpc); + if (likely(cpu_has_vx())) { + asm volatile("lgr 1,%0\n" + "VLM 0,15,0,1\n" + "VLM 16,31,256,1\n" + : + : "d" (regs) + : "1", "cc", "memory"); + } else { + asm volatile("ld 0,%0" : : "Q" (regs[0])); + asm volatile("ld 1,%0" : : "Q" (regs[1])); + asm volatile("ld 2,%0" : : "Q" (regs[2])); + asm volatile("ld 3,%0" : : "Q" (regs[3])); + asm volatile("ld 4,%0" : : "Q" (regs[4])); + asm volatile("ld 5,%0" : : "Q" (regs[5])); + asm volatile("ld 6,%0" : : "Q" (regs[6])); + asm volatile("ld 7,%0" : : "Q" (regs[7])); + asm volatile("ld 8,%0" : : "Q" (regs[8])); + asm volatile("ld 9,%0" : : "Q" (regs[9])); + asm volatile("ld 10,%0" : : "Q" (regs[10])); + asm volatile("ld 11,%0" : : "Q" (regs[11])); + asm volatile("ld 12,%0" : : "Q" (regs[12])); + asm volatile("ld 13,%0" : : "Q" (regs[13])); + asm volatile("ld 14,%0" : : "Q" (regs[14])); + asm volatile("ld 15,%0" : : "Q" (regs[15])); + } + clear_cpu_flag(CIF_FPU); +} + +void load_fpu_regs(void) +{ + raw_local_irq_disable(); + __load_fpu_regs(); + raw_local_irq_enable(); +} +EXPORT_SYMBOL(load_fpu_regs); + +void save_fpu_regs(void) +{ + unsigned long flags, *regs; + struct fpu *state; + + local_irq_save(flags); + + if (test_cpu_flag(CIF_FPU)) + goto out; + + state = ¤t->thread.fpu; + regs = current->thread.fpu.regs; + + asm volatile("stfpc %0" : "=Q" (state->fpc)); + if (likely(cpu_has_vx())) { + asm volatile("lgr 1,%0\n" + "VSTM 0,15,0,1\n" + "VSTM 16,31,256,1\n" + : + : "d" (regs) + : "1", "cc", "memory"); + } else { + asm volatile("std 0,%0" : "=Q" (regs[0])); + asm volatile("std 1,%0" : "=Q" (regs[1])); + asm volatile("std 2,%0" : "=Q" (regs[2])); + asm volatile("std 3,%0" : "=Q" (regs[3])); + asm volatile("std 4,%0" : "=Q" (regs[4])); + asm volatile("std 5,%0" : "=Q" (regs[5])); + asm volatile("std 6,%0" : "=Q" (regs[6])); + asm volatile("std 7,%0" : "=Q" (regs[7])); + asm volatile("std 8,%0" : "=Q" (regs[8])); + asm 
volatile("std 9,%0" : "=Q" (regs[9])); + asm volatile("std 10,%0" : "=Q" (regs[10])); + asm volatile("std 11,%0" : "=Q" (regs[11])); + asm volatile("std 12,%0" : "=Q" (regs[12])); + asm volatile("std 13,%0" : "=Q" (regs[13])); + asm volatile("std 14,%0" : "=Q" (regs[14])); + asm volatile("std 15,%0" : "=Q" (regs[15])); + } + set_cpu_flag(CIF_FPU); +out: + local_irq_restore(flags); +} +EXPORT_SYMBOL(save_fpu_regs); diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 1bb85f60c0dd..c46381ea04ec 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -4,8 +4,7 @@ * * Copyright IBM Corp. 2009,2014 * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, - * Martin Schwidefsky <schwidefsky@de.ibm.com> + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ #include <linux/moduleloader.h> @@ -17,179 +16,217 @@ #include <linux/kprobes.h> #include <trace/syscall.h> #include <asm/asm-offsets.h> +#include <asm/text-patching.h> #include <asm/cacheflush.h> +#include <asm/ftrace.lds.h> +#include <asm/nospec-branch.h> #include <asm/set_memory.h> #include "entry.h" +#include "ftrace.h" /* - * The mcount code looks like this: - * stg %r14,8(%r15) # offset 0 - * larl %r1,<&counter> # offset 6 - * brasl %r14,_mcount # offset 12 - * lg %r14,8(%r15) # offset 18 - * Total length is 24 bytes. Only the first instruction will be patched - * by ftrace_make_call / ftrace_make_nop. - * The enabled ftrace code block looks like this: + * To generate function prologue either gcc's hotpatch feature (since gcc 4.8) + * or a combination of -pg -mrecord-mcount -mnop-mcount -mfentry flags + * (since gcc 9 / clang 10) is used. + * In both cases the original and also the disabled function prologue contains + * only a single six byte instruction and looks like this: + * > brcl 0,0 # offset 0 + * To enable ftrace the code gets patched like above and afterwards looks + * like this: * > brasl %r0,ftrace_caller # offset 0 - * larl %r1,<&counter> # offset 6 - * brasl %r14,_mcount # offset 12 - * lg %r14,8(%r15) # offset 18 + * + * The instruction will be patched by ftrace_make_call / ftrace_make_nop. * The ftrace function gets called with a non-standard C function call ABI * where r0 contains the return address. It is also expected that the called * function only clobbers r0 and r1, but restores r2-r15. * For module code we can't directly jump to ftrace caller, but need a * trampoline (ftrace_plt), which clobbers also r1. - * The return point of the ftrace function has offset 24, so execution - * continues behind the mcount block. - * The disabled ftrace code block looks like this: - * > jg .+24 # offset 0 - * larl %r1,<&counter> # offset 6 - * brasl %r14,_mcount # offset 12 - * lg %r14,8(%r15) # offset 18 - * The jg instruction branches to offset 24 to skip as many instructions - * as possible. 
- * In case we use gcc's hotpatch feature the original and also the disabled - * function prologue contains only a single six byte instruction and looks - * like this: - * > brcl 0,0 # offset 0 - * To enable ftrace the code gets patched like above and afterwards looks - * like this: - * > brasl %r0,ftrace_caller # offset 0 */ -unsigned long ftrace_plt; +void *ftrace_func __read_mostly = ftrace_stub; +struct ftrace_insn { + u16 opc; + s32 disp; +} __packed; -static inline void ftrace_generate_orig_insn(struct ftrace_insn *insn) +#ifdef CONFIG_MODULES +static char *ftrace_plt; +#endif /* CONFIG_MODULES */ + +static const char *ftrace_shared_hotpatch_trampoline(const char **end) { -#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT) - /* brcl 0,0 */ - insn->opc = 0xc004; - insn->disp = 0; -#else - /* stg r14,8(r15) */ - insn->opc = 0xe3e0; - insn->disp = 0xf0080024; -#endif + const char *tstart, *tend; + + tstart = ftrace_shared_hotpatch_trampoline_br; + tend = ftrace_shared_hotpatch_trampoline_br_end; +#ifdef CONFIG_EXPOLINE + if (!nospec_disable) { + tstart = ftrace_shared_hotpatch_trampoline_exrl; + tend = ftrace_shared_hotpatch_trampoline_exrl_end; + } +#endif /* CONFIG_EXPOLINE */ + if (end) + *end = tend; + return tstart; } -static inline int is_kprobe_on_ftrace(struct ftrace_insn *insn) +bool ftrace_need_init_nop(void) { -#ifdef CONFIG_KPROBES - if (insn->opc == BREAKPOINT_INSTRUCTION) - return 1; -#endif - return 0; + return true; } -static inline void ftrace_generate_kprobe_nop_insn(struct ftrace_insn *insn) +int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) { -#ifdef CONFIG_KPROBES - insn->opc = BREAKPOINT_INSTRUCTION; - insn->disp = KPROBE_ON_FTRACE_NOP; + static struct ftrace_hotpatch_trampoline *next_vmlinux_trampoline = + __ftrace_hotpatch_trampolines_start; + static const char orig[6] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 }; + static struct ftrace_hotpatch_trampoline *trampoline; + struct ftrace_hotpatch_trampoline **next_trampoline; + struct ftrace_hotpatch_trampoline *trampolines_end; + struct ftrace_hotpatch_trampoline tmp; + struct ftrace_insn *insn; + const char *shared; + s32 disp; + + BUILD_BUG_ON(sizeof(struct ftrace_hotpatch_trampoline) != + SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE); + + next_trampoline = &next_vmlinux_trampoline; + trampolines_end = __ftrace_hotpatch_trampolines_end; + shared = ftrace_shared_hotpatch_trampoline(NULL); +#ifdef CONFIG_MODULES + if (mod) { + next_trampoline = &mod->arch.next_trampoline; + trampolines_end = mod->arch.trampolines_end; + shared = ftrace_plt; + } #endif + + if (WARN_ON_ONCE(*next_trampoline >= trampolines_end)) + return -ENOMEM; + trampoline = (*next_trampoline)++; + + /* Check for the compiler-generated fentry nop (brcl 0, .). */ + if (WARN_ON_ONCE(memcmp((const void *)rec->ip, &orig, sizeof(orig)))) + return -EINVAL; + + /* Generate the trampoline. */ + tmp.brasl_opc = 0xc015; /* brasl %r1, shared */ + tmp.brasl_disp = (shared - (const char *)&trampoline->brasl_opc) / 2; + tmp.interceptor = FTRACE_ADDR; + tmp.rest_of_intercepted_function = rec->ip + sizeof(struct ftrace_insn); + s390_kernel_write(trampoline, &tmp, sizeof(tmp)); + + /* Generate a jump to the trampoline. 
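	 * brcl encodes its target as a signed halfword displacement, hence
	 * the division by 2 below. Only the 32-bit displacement field of the
	 * original "brcl 0,0" is rewritten; opcode and mask stay untouched,
	 * so the instruction remains a nop until ftrace_make_call() flips
	 * the mask.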
*/ + disp = ((char *)trampoline - (char *)rec->ip) / 2; + insn = (struct ftrace_insn *)rec->ip; + s390_kernel_write(&insn->disp, &disp, sizeof(disp)); + + return 0; } -static inline void ftrace_generate_kprobe_call_insn(struct ftrace_insn *insn) +static struct ftrace_hotpatch_trampoline *ftrace_get_trampoline(struct dyn_ftrace *rec) { -#ifdef CONFIG_KPROBES - insn->opc = BREAKPOINT_INSTRUCTION; - insn->disp = KPROBE_ON_FTRACE_CALL; -#endif + struct ftrace_hotpatch_trampoline *trampoline; + struct ftrace_insn insn; + s64 disp; + u16 opc; + + if (copy_from_kernel_nofault(&insn, (void *)rec->ip, sizeof(insn))) + return ERR_PTR(-EFAULT); + disp = (s64)insn.disp * 2; + trampoline = (void *)(rec->ip + disp); + if (get_kernel_nofault(opc, &trampoline->brasl_opc)) + return ERR_PTR(-EFAULT); + if (opc != 0xc015) + return ERR_PTR(-EINVAL); + return trampoline; } int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { + struct ftrace_hotpatch_trampoline *trampoline; + u64 old; + + trampoline = ftrace_get_trampoline(rec); + if (IS_ERR(trampoline)) + return PTR_ERR(trampoline); + if (get_kernel_nofault(old, &trampoline->interceptor)) + return -EFAULT; + if (old != old_addr) + return -EINVAL; + s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr)); return 0; } -int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, - unsigned long addr) +static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable) { - struct ftrace_insn orig, new, old; + u16 old; + u8 op; - if (probe_kernel_read(&old, (void *) rec->ip, sizeof(old))) + if (get_kernel_nofault(old, addr)) return -EFAULT; - if (addr == MCOUNT_ADDR) { - /* Initial code replacement */ - ftrace_generate_orig_insn(&orig); - ftrace_generate_nop_insn(&new); - } else if (is_kprobe_on_ftrace(&old)) { - /* - * If we find a breakpoint instruction, a kprobe has been - * placed at the beginning of the function. We write the - * constant KPROBE_ON_FTRACE_NOP into the remaining four - * bytes of the original instruction so that the kprobes - * handler can execute a nop, if it reaches this breakpoint. - */ - ftrace_generate_kprobe_call_insn(&orig); - ftrace_generate_kprobe_nop_insn(&new); - } else { - /* Replace ftrace call with a nop. */ - ftrace_generate_call_insn(&orig, rec->ip); - ftrace_generate_nop_insn(&new); - } - /* Verify that the to be replaced code matches what we expect. */ - if (memcmp(&orig, &old, sizeof(old))) + if (old != expected) return -EINVAL; - s390_kernel_write((void *) rec->ip, &new, sizeof(new)); + /* set mask field to all ones or zeroes */ + op = enable ? 0xf4 : 0x04; + s390_kernel_write((char *)addr + 1, &op, sizeof(op)); return 0; } +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, + unsigned long addr) +{ + /* Expect brcl 0xf,... */ + return ftrace_patch_branch_mask((void *)rec->ip, 0xc0f4, false); +} + int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - struct ftrace_insn orig, new, old; + struct ftrace_hotpatch_trampoline *trampoline; - if (probe_kernel_read(&old, (void *) rec->ip, sizeof(old))) - return -EFAULT; - if (is_kprobe_on_ftrace(&old)) { - /* - * If we find a breakpoint instruction, a kprobe has been - * placed at the beginning of the function. We write the - * constant KPROBE_ON_FTRACE_CALL into the remaining four - * bytes of the original instruction so that the kprobes - * handler can execute a brasl if it reaches this breakpoint. 
- */ - ftrace_generate_kprobe_nop_insn(&orig); - ftrace_generate_kprobe_call_insn(&new); - } else { - /* Replace nop with an ftrace call. */ - ftrace_generate_nop_insn(&orig); - ftrace_generate_call_insn(&new, rec->ip); - } - /* Verify that the to be replaced code matches what we expect. */ - if (memcmp(&orig, &old, sizeof(old))) - return -EINVAL; - s390_kernel_write((void *) rec->ip, &new, sizeof(new)); - return 0; + trampoline = ftrace_get_trampoline(rec); + if (IS_ERR(trampoline)) + return PTR_ERR(trampoline); + s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr)); + /* Expect brcl 0x0,... */ + return ftrace_patch_branch_mask((void *)rec->ip, 0xc004, true); } int ftrace_update_ftrace_func(ftrace_func_t func) { + ftrace_func = func; return 0; } -int __init ftrace_dyn_arch_init(void) +void arch_ftrace_update_code(int command) { - return 0; + ftrace_modify_all_code(command); +} + +void ftrace_arch_code_modify_post_process(void) +{ + /* + * Flush any pre-fetched instructions on all + * CPUs to make the new code visible. + */ + text_poke_sync_lock(); } #ifdef CONFIG_MODULES static int __init ftrace_plt_init(void) { - unsigned int *ip; + const char *start, *end; - ftrace_plt = (unsigned long) module_alloc(PAGE_SIZE); + ftrace_plt = module_alloc(PAGE_SIZE); if (!ftrace_plt) panic("cannot allocate ftrace plt\n"); - ip = (unsigned int *) ftrace_plt; - ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */ - ip[1] = 0x100a0004; - ip[2] = 0x07f10000; - ip[3] = FTRACE_ADDR >> 32; - ip[4] = FTRACE_ADDR & 0xffffffff; - set_memory_ro(ftrace_plt, 1); + + start = ftrace_shared_hotpatch_trampoline(&end); + memcpy(ftrace_plt, start, end - start); + set_memory_rox((unsigned long)ftrace_plt, 1); return 0; } device_initcall(ftrace_plt_init); @@ -226,18 +263,78 @@ NOKPROBE_SYMBOL(prepare_ftrace_return); */ int ftrace_enable_ftrace_graph_caller(void) { - u8 op = 0x04; /* set mask field to zero */ + int rc; - s390_kernel_write(__va(ftrace_graph_caller)+1, &op, sizeof(op)); + /* Expect brc 0xf,... */ + rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa7f4, false); + if (rc) + return rc; + text_poke_sync_lock(); return 0; } int ftrace_disable_ftrace_graph_caller(void) { - u8 op = 0xf4; /* set mask field to all ones */ + int rc; - s390_kernel_write(__va(ftrace_graph_caller)+1, &op, sizeof(op)); + /* Expect brc 0x0,... 
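	 * i.e. the branch around the graph caller is currently a nop (mask
	 * 0). Writing mask 0xf below makes it unconditional again, so the
	 * graph code is skipped; either way only the single mask byte is
	 * patched.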
*/ + rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa704, true); + if (rc) + return rc; + text_poke_sync_lock(); return 0; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#ifdef CONFIG_KPROBES_ON_FTRACE +void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct kprobe_ctlblk *kcb; + struct pt_regs *regs; + struct kprobe *p; + int bit; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + regs = ftrace_get_regs(fregs); + p = get_kprobe((kprobe_opcode_t *)ip); + if (!regs || unlikely(!p) || kprobe_disabled(p)) + goto out; + + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + goto out; + } + + __this_cpu_write(current_kprobe, p); + + kcb = get_kprobe_ctlblk(); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + instruction_pointer_set(regs, ip); + + if (!p->pre_handler || !p->pre_handler(p, regs)) { + + instruction_pointer_set(regs, ip + MCOUNT_INSN_SIZE); + + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + } + __this_cpu_write(current_kprobe, NULL); +out: + ftrace_test_recursion_unlock(bit); +} +NOKPROBE_SYMBOL(kprobe_ftrace_handler); + +int arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.insn = NULL; + return 0; +} +#endif diff --git a/arch/s390/kernel/ftrace.h b/arch/s390/kernel/ftrace.h new file mode 100644 index 000000000000..7f75a9616406 --- /dev/null +++ b/arch/s390/kernel/ftrace.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FTRACE_H +#define _FTRACE_H + +#include <asm/types.h> + +struct ftrace_hotpatch_trampoline { + u16 brasl_opc; + s32 brasl_disp; + s16: 16; + u64 rest_of_intercepted_function; + u64 interceptor; +} __packed; + +extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_start[]; +extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_end[]; +extern const char ftrace_shared_hotpatch_trampoline_br[]; +extern const char ftrace_shared_hotpatch_trampoline_br_end[]; +extern const char ftrace_shared_hotpatch_trampoline_exrl[]; +extern const char ftrace_shared_hotpatch_trampoline_exrl_end[]; +extern const char ftrace_plt_template[]; +extern const char ftrace_plt_template_end[]; + +#endif /* _FTRACE_H */ diff --git a/arch/s390/kernel/guarded_storage.c b/arch/s390/kernel/guarded_storage.c index d14dd1c2e524..0b68168d9566 100644 --- a/arch/s390/kernel/guarded_storage.c +++ b/arch/s390/kernel/guarded_storage.c @@ -28,7 +28,7 @@ static int gs_enable(void) return -ENOMEM; gs_cb->gsd = 25; preempt_disable(); - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); load_gs_cb(gs_cb); current->thread.gs_cb = gs_cb; preempt_enable(); @@ -42,7 +42,7 @@ static int gs_disable(void) preempt_disable(); kfree(current->thread.gs_cb); current->thread.gs_cb = NULL; - __ctl_clear_bit(2, 4); + local_ctl_clear_bit(2, CR2_GUARDED_STORAGE_BIT); preempt_enable(); } return 0; @@ -84,7 +84,7 @@ void gs_load_bc_cb(struct pt_regs *regs) if (gs_cb) { kfree(current->thread.gs_cb); current->thread.gs_bc_cb = NULL; - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); load_gs_cb(gs_cb); current->thread.gs_cb = gs_cb; } diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 8b88dbbda7df..45413b04efc5 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -5,7 +5,6 @@ * Author(s): Hartmut Penner <hp@de.ibm.com> * Martin Schwidefsky <schwidefsky@de.ibm.com> * Rob van der Heij <rvdhei@iae.nl> - * Heiko 
Carstens <heiko.carstens@de.ibm.com> * */ @@ -17,32 +16,25 @@ #include <asm/ptrace.h> __HEAD -ENTRY(startup_continue) - tm __LC_STFLE_FAC_LIST+5,0x80 # LPP available ? - jz 0f - xc __LC_LPP+1(7,0),__LC_LPP+1 # clear lpp and current_pid - mvi __LC_LPP,0x80 # and set LPP_MAGIC - .insn s,0xb2800000,__LC_LPP # load program parameter -0: larl %r1,tod_clock_base +SYM_CODE_START(startup_continue) + larl %r1,tod_clock_base mvc 0(16,%r1),__LC_BOOT_CLOCK - larl %r13,.LPG1 # get base # # Setup stack # larl %r14,init_task stg %r14,__LC_CURRENT - larl %r15,init_thread_union+THREAD_SIZE-STACK_FRAME_OVERHEAD-__PT_SIZE -#ifdef CONFIG_KASAN - brasl %r14,kasan_early_init -#endif + larl %r15,init_thread_union+STACK_INIT_OFFSET + stg %r15,__LC_KERNEL_STACK + brasl %r14,sclp_early_adjust_va # allow sclp_early_printk brasl %r14,startup_init # s390 specific early init brasl %r14,start_kernel # common init code # # We returned from start_kernel ?!? PANIK # basr %r13,0 - lpswe .Ldw-.(%r13) # load disabled wait psw + lpswe dw_psw-.(%r13) # load disabled wait psw +SYM_CODE_END(startup_continue) - .align 16 -.LPG1: -.Ldw: .quad 0x0002000180000000,0x0000000000000000 + .balign 16 +SYM_DATA_LOCAL(dw_psw, .quad 0x0002000180000000,0x0000000000000000) diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 8f8456816d83..e7239aaf428b 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -9,134 +9,86 @@ #include <linux/kernel.h> #include <linux/kernel_stat.h> -#include <linux/kprobes.h> #include <linux/notifier.h> #include <linux/init.h> #include <linux/cpu.h> -#include <linux/sched/cputime.h> +#include <trace/events/power.h> +#include <asm/cpu_mf.h> +#include <asm/cputime.h> #include <asm/nmi.h> #include <asm/smp.h> #include "entry.h" static DEFINE_PER_CPU(struct s390_idle_data, s390_idle); -void enabled_wait(void) +void account_idle_time_irq(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); - unsigned long long idle_time; - unsigned long psw_mask; + unsigned long idle_time; + u64 cycles_new[8]; + int i; + + if (smp_cpu_mtid) { + stcctm(MT_DIAG, smp_cpu_mtid, cycles_new); + for (i = 0; i < smp_cpu_mtid; i++) + this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); + } - trace_hardirqs_on(); + idle_time = S390_lowcore.int_clock - idle->clock_idle_enter; - /* Wait for external, I/O or machine check interrupt. */ - psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_DAT | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; - clear_cpu_flag(CIF_NOHZ_DELAY); + S390_lowcore.steal_timer += idle->clock_idle_enter - S390_lowcore.last_update_clock; + S390_lowcore.last_update_clock = S390_lowcore.int_clock; - /* Call the assembler magic in entry.S */ - psw_idle(idle, psw_mask); - - trace_hardirqs_off(); + S390_lowcore.system_timer += S390_lowcore.last_update_timer - idle->timer_idle_enter; + S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer; /* Account time spent with enabled wait psw loaded as idle time. 
*/ - write_seqcount_begin(&idle->seqcount); - idle_time = idle->clock_idle_exit - idle->clock_idle_enter; - idle->clock_idle_enter = idle->clock_idle_exit = 0ULL; - idle->idle_time += idle_time; - idle->idle_count++; + WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time); + WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1); account_idle_time(cputime_to_nsecs(idle_time)); - write_seqcount_end(&idle->seqcount); } -NOKPROBE_SYMBOL(enabled_wait); + +void noinstr arch_cpu_idle(void) +{ + struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + unsigned long psw_mask; + + /* Wait for external, I/O or machine check interrupt. */ + psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | + PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + clear_cpu_flag(CIF_NOHZ_DELAY); + + /* psw_idle() returns with interrupts disabled. */ + psw_idle(idle, psw_mask); +} static ssize_t show_idle_count(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); - unsigned long long idle_count; - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_count = READ_ONCE(idle->idle_count); - if (READ_ONCE(idle->clock_idle_enter)) - idle_count++; - } while (read_seqcount_retry(&idle->seqcount, seq)); - return sprintf(buf, "%llu\n", idle_count); + + return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_count)); } DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL); static ssize_t show_idle_time(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { - unsigned long long now, idle_time, idle_enter, idle_exit, in_idle; struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_time = READ_ONCE(idle->idle_time); - idle_enter = READ_ONCE(idle->clock_idle_enter); - idle_exit = READ_ONCE(idle->clock_idle_exit); - } while (read_seqcount_retry(&idle->seqcount, seq)); - in_idle = 0; - now = get_tod_clock(); - if (idle_enter) { - if (idle_exit) { - in_idle = idle_exit - idle_enter; - } else if (now > idle_enter) { - in_idle = now - idle_enter; - } - } - idle_time += in_idle; - return sprintf(buf, "%llu\n", idle_time >> 12); -} -DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); -u64 arch_cpu_idle_time(int cpu) -{ - struct s390_idle_data *idle = &per_cpu(s390_idle, cpu); - unsigned long long now, idle_enter, idle_exit, in_idle; - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_enter = READ_ONCE(idle->clock_idle_enter); - idle_exit = READ_ONCE(idle->clock_idle_exit); - } while (read_seqcount_retry(&idle->seqcount, seq)); - in_idle = 0; - now = get_tod_clock(); - if (idle_enter) { - if (idle_exit) { - in_idle = idle_exit - idle_enter; - } else if (now > idle_enter) { - in_idle = now - idle_enter; - } - } - return cputime_to_nsecs(in_idle); + return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_time) >> 12); } +DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); void arch_cpu_idle_enter(void) { - local_mcck_disable(); -} - -void arch_cpu_idle(void) -{ - if (!test_cpu_flag(CIF_MCCK_PENDING)) - /* Halt the cpu and keep track of cpu time accounting. 
*/ - enabled_wait(); - local_irq_enable(); } void arch_cpu_idle_exit(void) { - local_mcck_enable(); - if (test_cpu_flag(CIF_MCCK_PENDING)) - s390_handle_mcck(); } -void arch_cpu_idle_dead(void) +void __noreturn arch_cpu_idle_dead(void) { cpu_die(); } diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 6837affc19e8..ba75f6bee774 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -4,7 +4,6 @@ * * Copyright IBM Corp. 2005, 2012 * Author(s): Michael Holzheu <holzheu@de.ibm.com> - * Heiko Carstens <heiko.carstens@de.ibm.com> * Volker Sameske <sameske@de.ibm.com> */ @@ -13,12 +12,15 @@ #include <linux/init.h> #include <linux/device.h> #include <linux/delay.h> +#include <linux/kstrtox.h> +#include <linux/panic_notifier.h> #include <linux/reboot.h> #include <linux/ctype.h> #include <linux/fs.h> #include <linux/gfp.h> #include <linux/crash_dump.h> #include <linux/debug_locks.h> +#include <asm/asm-extable.h> #include <asm/diag.h> #include <asm/ipl.h> #include <asm/smp.h> @@ -28,6 +30,7 @@ #include <asm/sclp.h> #include <asm/checksum.h> #include <asm/debug.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> #include <asm/sections.h> #include <asm/boot_data.h> @@ -37,12 +40,18 @@ #define IPL_UNKNOWN_STR "unknown" #define IPL_CCW_STR "ccw" +#define IPL_ECKD_STR "eckd" +#define IPL_ECKD_DUMP_STR "eckd_dump" #define IPL_FCP_STR "fcp" #define IPL_FCP_DUMP_STR "fcp_dump" +#define IPL_NVME_STR "nvme" +#define IPL_NVME_DUMP_STR "nvme_dump" #define IPL_NSS_STR "nss" #define DUMP_CCW_STR "ccw" +#define DUMP_ECKD_STR "eckd" #define DUMP_FCP_STR "fcp" +#define DUMP_NVME_STR "nvme" #define DUMP_NONE_STR "none" /* @@ -87,12 +96,20 @@ static char *ipl_type_str(enum ipl_type type) switch (type) { case IPL_TYPE_CCW: return IPL_CCW_STR; + case IPL_TYPE_ECKD: + return IPL_ECKD_STR; + case IPL_TYPE_ECKD_DUMP: + return IPL_ECKD_DUMP_STR; case IPL_TYPE_FCP: return IPL_FCP_STR; case IPL_TYPE_FCP_DUMP: return IPL_FCP_DUMP_STR; case IPL_TYPE_NSS: return IPL_NSS_STR; + case IPL_TYPE_NVME: + return IPL_NVME_STR; + case IPL_TYPE_NVME_DUMP: + return IPL_NVME_DUMP_STR; case IPL_TYPE_UNKNOWN: default: return IPL_UNKNOWN_STR; @@ -103,6 +120,8 @@ enum dump_type { DUMP_TYPE_NONE = 1, DUMP_TYPE_CCW = 2, DUMP_TYPE_FCP = 4, + DUMP_TYPE_NVME = 8, + DUMP_TYPE_ECKD = 16, }; static char *dump_type_str(enum dump_type type) @@ -112,8 +131,12 @@ static char *dump_type_str(enum dump_type type) return DUMP_NONE_STR; case DUMP_TYPE_CCW: return DUMP_CCW_STR; + case DUMP_TYPE_ECKD: + return DUMP_ECKD_STR; case DUMP_TYPE_FCP: return DUMP_FCP_STR; + case DUMP_TYPE_NVME: + return DUMP_NVME_STR; default: return NULL; } @@ -133,37 +156,48 @@ static int reipl_capabilities = IPL_TYPE_UNKNOWN; static enum ipl_type reipl_type = IPL_TYPE_UNKNOWN; static struct ipl_parameter_block *reipl_block_fcp; +static struct ipl_parameter_block *reipl_block_nvme; static struct ipl_parameter_block *reipl_block_ccw; +static struct ipl_parameter_block *reipl_block_eckd; static struct ipl_parameter_block *reipl_block_nss; static struct ipl_parameter_block *reipl_block_actual; static int dump_capabilities = DUMP_TYPE_NONE; static enum dump_type dump_type = DUMP_TYPE_NONE; static struct ipl_parameter_block *dump_block_fcp; +static struct ipl_parameter_block *dump_block_nvme; static struct ipl_parameter_block *dump_block_ccw; +static struct ipl_parameter_block *dump_block_eckd; static struct sclp_ipl_info sclp_ipl_info; -static inline int __diag308(unsigned long subcode, void *addr) +static bool reipl_nvme_clear; +static bool 
reipl_fcp_clear; +static bool reipl_ccw_clear; +static bool reipl_eckd_clear; + +static unsigned long os_info_flags; + +static inline int __diag308(unsigned long subcode, unsigned long addr) { - register unsigned long _addr asm("0") = (unsigned long) addr; - register unsigned long _rc asm("1") = 0; + union register_pair r1; + r1.even = addr; + r1.odd = 0; asm volatile( - " diag %0,%2,0x308\n" + " diag %[r1],%[subcode],0x308\n" "0: nopr %%r7\n" EX_TABLE(0b,0b) - : "+d" (_addr), "+d" (_rc) - : "d" (subcode) : "cc", "memory"); - return _rc; + : [r1] "+&d" (r1.pair) + : [subcode] "d" (subcode) + : "cc", "memory"); + return r1.odd; } int diag308(unsigned long subcode, void *addr) { - if (IS_ENABLED(CONFIG_KASAN)) - __arch_local_irq_stosm(0x04); /* enable DAT */ diag_stat_inc(DIAG_STAT_X308); - return __diag308(subcode, addr); + return __diag308(subcode, addr ? virt_to_phys(addr) : 0); } EXPORT_SYMBOL_GPL(diag308); @@ -174,7 +208,7 @@ static ssize_t sys_##_prefix##_##_name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, \ char *page) \ { \ - return snprintf(page, PAGE_SIZE, _format, ##args); \ + return scnprintf(page, PAGE_SIZE, _format, ##args); \ } #define IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk) \ @@ -200,14 +234,14 @@ IPL_ATTR_SHOW_FN(_prefix, _name, "0.%x.%04x\n", \ _ipl_blk.ssid, _ipl_blk.devno); \ IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk); \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name, (S_IRUGO | S_IWUSR), \ + __ATTR(_name, 0644, \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) \ #define DEFINE_IPL_ATTR_RO(_prefix, _name, _format, _value) \ IPL_ATTR_SHOW_FN(_prefix, _name, _format, _value) \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name, S_IRUGO, sys_##_prefix##_##_name##_show, NULL) + __ATTR(_name, 0444, sys_##_prefix##_##_name##_show, NULL) #define DEFINE_IPL_ATTR_RW(_prefix, _name, _fmt_out, _fmt_in, _value) \ IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, (unsigned long long) _value) \ @@ -222,7 +256,7 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ return len; \ } \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name,(S_IRUGO | S_IWUSR), \ + __ATTR(_name, 0644, \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) @@ -232,12 +266,12 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ struct kobj_attribute *attr, \ const char *buf, size_t len) \ { \ - strncpy(_value, buf, sizeof(_value) - 1); \ + strscpy(_value, buf, sizeof(_value)); \ strim(_value); \ return len; \ } \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name,(S_IRUGO | S_IWUSR), \ + __ATTR(_name, 0644, \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) @@ -258,6 +292,16 @@ static __init enum ipl_type get_ipl_type(void) return IPL_TYPE_FCP_DUMP; else return IPL_TYPE_FCP; + case IPL_PBT_NVME: + if (ipl_block.nvme.opt == IPL_PB0_NVME_OPT_DUMP) + return IPL_TYPE_NVME_DUMP; + else + return IPL_TYPE_NVME; + case IPL_PBT_ECKD: + if (ipl_block.eckd.opt == IPL_PB0_ECKD_OPT_DUMP) + return IPL_TYPE_ECKD_DUMP; + else + return IPL_TYPE_ECKD; } return IPL_TYPE_UNKNOWN; } @@ -302,7 +346,7 @@ static ssize_t ipl_vm_parm_show(struct kobject *kobj, } static struct kobj_attribute sys_ipl_vm_parm_attr = - __ATTR(parm, S_IRUGO, ipl_vm_parm_show, NULL); + __ATTR(parm, 0444, ipl_vm_parm_show, NULL); static ssize_t sys_ipl_device_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) @@ -311,16 
+355,23 @@ static ssize_t sys_ipl_device_show(struct kobject *kobj, case IPL_TYPE_CCW: return sprintf(page, "0.%x.%04x\n", ipl_block.ccw.ssid, ipl_block.ccw.devno); + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + return sprintf(page, "0.%x.%04x\n", ipl_block.eckd.ssid, + ipl_block.eckd.devno); case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno); + case IPL_TYPE_NVME: + case IPL_TYPE_NVME_DUMP: + return sprintf(page, "%08ux\n", ipl_block.nvme.fid); default: return 0; } } static struct kobj_attribute sys_ipl_device_attr = - __ATTR(device, S_IRUGO, sys_ipl_device_show, NULL); + __ATTR(device, 0444, sys_ipl_device_show, NULL); static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, @@ -330,7 +381,7 @@ static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj, ipl_block.hdr.len); } static struct bin_attribute ipl_parameter_attr = - __BIN_ATTR(binary_parameter, S_IRUGO, ipl_parameter_read, NULL, + __BIN_ATTR(binary_parameter, 0444, ipl_parameter_read, NULL, PAGE_SIZE); static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, @@ -342,8 +393,35 @@ static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, return memory_read_from_buffer(buf, count, &off, scp_data, size); } + +static ssize_t ipl_nvme_scp_data_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + unsigned int size = ipl_block.nvme.scp_data_len; + void *scp_data = &ipl_block.nvme.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + +static ssize_t ipl_eckd_scp_data_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + unsigned int size = ipl_block.eckd.scp_data_len; + void *scp_data = &ipl_block.eckd.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + static struct bin_attribute ipl_scp_data_attr = - __BIN_ATTR(scp_data, S_IRUGO, ipl_scp_data_read, NULL, PAGE_SIZE); + __BIN_ATTR(scp_data, 0444, ipl_scp_data_read, NULL, PAGE_SIZE); + +static struct bin_attribute ipl_nvme_scp_data_attr = + __BIN_ATTR(scp_data, 0444, ipl_nvme_scp_data_read, NULL, PAGE_SIZE); + +static struct bin_attribute ipl_eckd_scp_data_attr = + __BIN_ATTR(scp_data, 0444, ipl_eckd_scp_data_read, NULL, PAGE_SIZE); static struct bin_attribute *ipl_fcp_bin_attrs[] = { &ipl_parameter_attr, @@ -351,6 +429,18 @@ static struct bin_attribute *ipl_fcp_bin_attrs[] = { NULL, }; +static struct bin_attribute *ipl_nvme_bin_attrs[] = { + &ipl_parameter_attr, + &ipl_nvme_scp_data_attr, + NULL, +}; + +static struct bin_attribute *ipl_eckd_bin_attrs[] = { + &ipl_parameter_attr, + &ipl_eckd_scp_data_attr, + NULL, +}; + /* FCP ipl device attributes */ DEFINE_IPL_ATTR_RO(ipl_fcp, wwpn, "0x%016llx\n", @@ -362,6 +452,94 @@ DEFINE_IPL_ATTR_RO(ipl_fcp, bootprog, "%lld\n", DEFINE_IPL_ATTR_RO(ipl_fcp, br_lba, "%lld\n", (unsigned long long)ipl_block.fcp.br_lba); +/* NVMe ipl device attributes */ +DEFINE_IPL_ATTR_RO(ipl_nvme, fid, "0x%08llx\n", + (unsigned long long)ipl_block.nvme.fid); +DEFINE_IPL_ATTR_RO(ipl_nvme, nsid, "0x%08llx\n", + (unsigned long long)ipl_block.nvme.nsid); +DEFINE_IPL_ATTR_RO(ipl_nvme, bootprog, "%lld\n", + (unsigned long long)ipl_block.nvme.bootprog); +DEFINE_IPL_ATTR_RO(ipl_nvme, br_lba, "%lld\n", + (unsigned long long)ipl_block.nvme.br_lba); + +/* ECKD ipl device attributes */ +DEFINE_IPL_ATTR_RO(ipl_eckd, bootprog, 
"%lld\n", + (unsigned long long)ipl_block.eckd.bootprog); + +#define IPL_ATTR_BR_CHR_SHOW_FN(_name, _ipb) \ +static ssize_t eckd_##_name##_br_chr_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *buf) \ +{ \ + struct ipl_pb0_eckd *ipb = &(_ipb); \ + \ + if (!ipb->br_chr.cyl && \ + !ipb->br_chr.head && \ + !ipb->br_chr.record) \ + return sprintf(buf, "auto\n"); \ + \ + return sprintf(buf, "0x%x,0x%x,0x%x\n", \ + ipb->br_chr.cyl, \ + ipb->br_chr.head, \ + ipb->br_chr.record); \ +} + +#define IPL_ATTR_BR_CHR_STORE_FN(_name, _ipb) \ +static ssize_t eckd_##_name##_br_chr_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + struct ipl_pb0_eckd *ipb = &(_ipb); \ + unsigned long args[3] = { 0 }; \ + char *p, *p1, *tmp = NULL; \ + int i, rc; \ + \ + if (!strncmp(buf, "auto", 4)) \ + goto out; \ + \ + tmp = kstrdup(buf, GFP_KERNEL); \ + p = tmp; \ + for (i = 0; i < 3; i++) { \ + p1 = strsep(&p, ", "); \ + if (!p1) { \ + rc = -EINVAL; \ + goto err; \ + } \ + rc = kstrtoul(p1, 0, args + i); \ + if (rc) \ + goto err; \ + } \ + \ + rc = -EINVAL; \ + if (i != 3) \ + goto err; \ + \ + if ((args[0] || args[1]) && !args[2]) \ + goto err; \ + \ + if (args[0] > UINT_MAX || args[1] > 255 || args[2] > 255) \ + goto err; \ + \ +out: \ + ipb->br_chr.cyl = args[0]; \ + ipb->br_chr.head = args[1]; \ + ipb->br_chr.record = args[2]; \ + rc = len; \ +err: \ + kfree(tmp); \ + return rc; \ +} + +IPL_ATTR_BR_CHR_SHOW_FN(ipl, ipl_block.eckd); +static struct kobj_attribute sys_ipl_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_ipl_br_chr_show, NULL); + +IPL_ATTR_BR_CHR_SHOW_FN(reipl, reipl_block_eckd->eckd); +IPL_ATTR_BR_CHR_STORE_FN(reipl, reipl_block_eckd->eckd); + +static struct kobj_attribute sys_reipl_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_reipl_br_chr_show, eckd_reipl_br_chr_store); + static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { @@ -379,15 +557,12 @@ static struct kobj_attribute sys_ipl_ccw_loadparm_attr = __ATTR(loadparm, 0444, ipl_ccw_loadparm_show, NULL); static struct attribute *ipl_fcp_attrs[] = { - &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_fcp_wwpn_attr.attr, &sys_ipl_fcp_lun_attr.attr, &sys_ipl_fcp_bootprog_attr.attr, &sys_ipl_fcp_br_lba_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; @@ -396,24 +571,45 @@ static struct attribute_group ipl_fcp_attr_group = { .bin_attrs = ipl_fcp_bin_attrs, }; +static struct attribute *ipl_nvme_attrs[] = { + &sys_ipl_nvme_fid_attr.attr, + &sys_ipl_nvme_nsid_attr.attr, + &sys_ipl_nvme_bootprog_attr.attr, + &sys_ipl_nvme_br_lba_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group ipl_nvme_attr_group = { + .attrs = ipl_nvme_attrs, + .bin_attrs = ipl_nvme_bin_attrs, +}; + +static struct attribute *ipl_eckd_attrs[] = { + &sys_ipl_eckd_bootprog_attr.attr, + &sys_ipl_eckd_br_chr_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + &sys_ipl_device_attr.attr, + NULL, +}; + +static struct attribute_group ipl_eckd_attr_group = { + .attrs = ipl_eckd_attrs, + .bin_attrs = ipl_eckd_bin_attrs, +}; + /* CCW ipl device attributes */ static struct attribute *ipl_ccw_attrs_vm[] = { - &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, &sys_ipl_vm_parm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; static struct attribute *ipl_ccw_attrs_lpar[] = { - &sys_ipl_type_attr.attr, 
&sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; @@ -425,22 +621,21 @@ static struct attribute_group ipl_ccw_attr_group_lpar = { .attrs = ipl_ccw_attrs_lpar }; -/* UNKNOWN ipl device attributes */ - -static struct attribute *ipl_unknown_attrs[] = { +static struct attribute *ipl_common_attrs[] = { &sys_ipl_type_attr.attr, + &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, NULL, }; -static struct attribute_group ipl_unknown_attr_group = { - .attrs = ipl_unknown_attrs, +static struct attribute_group ipl_common_attr_group = { + .attrs = ipl_common_attrs, }; static struct kset *ipl_kset; static void __ipl_run(void *unused) { - __bpon(); diag308(DIAG308_LOAD_CLEAR, NULL); } @@ -458,6 +653,9 @@ static int __init ipl_init(void) rc = -ENOMEM; goto out; } + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_common_attr_group); + if (rc) + goto out; switch (ipl_info.type) { case IPL_TYPE_CCW: if (MACHINE_IS_VM) @@ -467,13 +665,19 @@ static int __init ipl_init(void) rc = sysfs_create_group(&ipl_kset->kobj, &ipl_ccw_attr_group_lpar); break; + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_eckd_attr_group); + break; case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: rc = sysfs_create_group(&ipl_kset->kobj, &ipl_fcp_attr_group); break; + case IPL_TYPE_NVME: + case IPL_TYPE_NVME_DUMP: + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_nvme_attr_group); + break; default: - rc = sysfs_create_group(&ipl_kset->kobj, - &ipl_unknown_attr_group); break; } out: @@ -564,11 +768,11 @@ static ssize_t reipl_ccw_vmparm_store(struct kobject *kobj, } static struct kobj_attribute sys_reipl_nss_vmparm_attr = - __ATTR(parm, S_IRUGO | S_IWUSR, reipl_nss_vmparm_show, - reipl_nss_vmparm_store); + __ATTR(parm, 0644, reipl_nss_vmparm_show, + reipl_nss_vmparm_store); static struct kobj_attribute sys_reipl_ccw_vmparm_attr = - __ATTR(parm, S_IRUGO | S_IWUSR, reipl_ccw_vmparm_show, - reipl_ccw_vmparm_store); + __ATTR(parm, 0644, reipl_ccw_vmparm_show, + reipl_ccw_vmparm_store); /* FCP reipl device attributes */ @@ -608,7 +812,7 @@ static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj, return count; } static struct bin_attribute sys_reipl_fcp_scp_data_attr = - __BIN_ATTR(scp_data, (S_IRUGO | S_IWUSR), reipl_fcp_scpdata_read, + __BIN_ATTR(scp_data, 0644, reipl_fcp_scpdata_read, reipl_fcp_scpdata_write, DIAG308_SCPDATA_SIZE); static struct bin_attribute *reipl_fcp_bin_attrs[] = { @@ -673,24 +877,43 @@ static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, return len; } -/* FCP wrapper */ -static ssize_t reipl_fcp_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) +#define DEFINE_GENERIC_LOADPARM(name) \ +static ssize_t reipl_##name##_loadparm_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *page) \ +{ \ + return reipl_generic_loadparm_show(reipl_block_##name, page); \ +} \ +static ssize_t reipl_##name##_loadparm_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return reipl_generic_loadparm_store(reipl_block_##name, buf, len); \ +} \ +static struct kobj_attribute sys_reipl_##name##_loadparm_attr = \ + __ATTR(loadparm, 0644, reipl_##name##_loadparm_show, \ + reipl_##name##_loadparm_store) + +DEFINE_GENERIC_LOADPARM(fcp); +DEFINE_GENERIC_LOADPARM(nvme); +DEFINE_GENERIC_LOADPARM(ccw); +DEFINE_GENERIC_LOADPARM(nss); +DEFINE_GENERIC_LOADPARM(eckd); + +static ssize_t 
reipl_fcp_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) { - return reipl_generic_loadparm_show(reipl_block_fcp, page); + return sprintf(page, "%u\n", reipl_fcp_clear); } -static ssize_t reipl_fcp_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) +static ssize_t reipl_fcp_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) { - return reipl_generic_loadparm_store(reipl_block_fcp, buf, len); + if (kstrtobool(buf, &reipl_fcp_clear) < 0) + return -EINVAL; + return len; } -static struct kobj_attribute sys_reipl_fcp_loadparm_attr = - __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_fcp_loadparm_show, - reipl_fcp_loadparm_store); - static struct attribute *reipl_fcp_attrs[] = { &sys_reipl_fcp_device_attr.attr, &sys_reipl_fcp_wwpn_attr.attr, @@ -706,51 +929,129 @@ static struct attribute_group reipl_fcp_attr_group = { .bin_attrs = reipl_fcp_bin_attrs, }; -/* CCW reipl device attributes */ -DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); +static struct kobj_attribute sys_reipl_fcp_clear_attr = + __ATTR(clear, 0644, reipl_fcp_clear_show, reipl_fcp_clear_store); -/* NSS wrapper */ -static ssize_t reipl_nss_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) +/* NVME reipl device attributes */ + +static ssize_t reipl_nvme_scpdata_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) { - return reipl_generic_loadparm_show(reipl_block_nss, page); + size_t size = reipl_block_nvme->nvme.scp_data_len; + void *scp_data = reipl_block_nvme->nvme.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); } -static ssize_t reipl_nss_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) +static ssize_t reipl_nvme_scpdata_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) { - return reipl_generic_loadparm_store(reipl_block_nss, buf, len); + size_t scpdata_len = count; + size_t padding; + + if (off) + return -EINVAL; + + memcpy(reipl_block_nvme->nvme.scp_data, buf, count); + if (scpdata_len % 8) { + padding = 8 - (scpdata_len % 8); + memset(reipl_block_nvme->nvme.scp_data + scpdata_len, + 0, padding); + scpdata_len += padding; + } + + reipl_block_nvme->hdr.len = IPL_BP_FCP_LEN + scpdata_len; + reipl_block_nvme->nvme.len = IPL_BP0_FCP_LEN + scpdata_len; + reipl_block_nvme->nvme.scp_data_len = scpdata_len; + + return count; } -/* CCW wrapper */ -static ssize_t reipl_ccw_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) +static struct bin_attribute sys_reipl_nvme_scp_data_attr = + __BIN_ATTR(scp_data, 0644, reipl_nvme_scpdata_read, + reipl_nvme_scpdata_write, DIAG308_SCPDATA_SIZE); + +static struct bin_attribute *reipl_nvme_bin_attrs[] = { + &sys_reipl_nvme_scp_data_attr, + NULL, +}; + +DEFINE_IPL_ATTR_RW(reipl_nvme, fid, "0x%08llx\n", "%llx\n", + reipl_block_nvme->nvme.fid); +DEFINE_IPL_ATTR_RW(reipl_nvme, nsid, "0x%08llx\n", "%llx\n", + reipl_block_nvme->nvme.nsid); +DEFINE_IPL_ATTR_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n", + reipl_block_nvme->nvme.bootprog); +DEFINE_IPL_ATTR_RW(reipl_nvme, br_lba, "%lld\n", "%lld\n", + reipl_block_nvme->nvme.br_lba); + +static struct attribute *reipl_nvme_attrs[] = { + &sys_reipl_nvme_fid_attr.attr, + &sys_reipl_nvme_nsid_attr.attr, + &sys_reipl_nvme_bootprog_attr.attr, + &sys_reipl_nvme_br_lba_attr.attr, 
+ &sys_reipl_nvme_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group reipl_nvme_attr_group = { + .attrs = reipl_nvme_attrs, + .bin_attrs = reipl_nvme_bin_attrs +}; + +static ssize_t reipl_nvme_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%u\n", reipl_nvme_clear); +} + +static ssize_t reipl_nvme_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (kstrtobool(buf, &reipl_nvme_clear) < 0) + return -EINVAL; + return len; +} + +static struct kobj_attribute sys_reipl_nvme_clear_attr = + __ATTR(clear, 0644, reipl_nvme_clear_show, reipl_nvme_clear_store); + +/* CCW reipl device attributes */ +DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); + +static ssize_t reipl_ccw_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) { - return reipl_generic_loadparm_show(reipl_block_ccw, page); + return sprintf(page, "%u\n", reipl_ccw_clear); } -static ssize_t reipl_ccw_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) +static ssize_t reipl_ccw_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) { - return reipl_generic_loadparm_store(reipl_block_ccw, buf, len); + if (kstrtobool(buf, &reipl_ccw_clear) < 0) + return -EINVAL; + return len; } -static struct kobj_attribute sys_reipl_ccw_loadparm_attr = - __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_ccw_loadparm_show, - reipl_ccw_loadparm_store); +static struct kobj_attribute sys_reipl_ccw_clear_attr = + __ATTR(clear, 0644, reipl_ccw_clear_show, reipl_ccw_clear_store); static struct attribute *reipl_ccw_attrs_vm[] = { &sys_reipl_ccw_device_attr.attr, &sys_reipl_ccw_loadparm_attr.attr, &sys_reipl_ccw_vmparm_attr.attr, + &sys_reipl_ccw_clear_attr.attr, NULL, }; static struct attribute *reipl_ccw_attrs_lpar[] = { &sys_reipl_ccw_device_attr.attr, &sys_reipl_ccw_loadparm_attr.attr, + &sys_reipl_ccw_clear_attr.attr, NULL, }; @@ -764,6 +1065,86 @@ static struct attribute_group reipl_ccw_attr_group_lpar = { .attrs = reipl_ccw_attrs_lpar, }; +/* ECKD reipl device attributes */ + +static ssize_t reipl_eckd_scpdata_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t size = reipl_block_eckd->eckd.scp_data_len; + void *scp_data = reipl_block_eckd->eckd.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + +static ssize_t reipl_eckd_scpdata_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t scpdata_len = count; + size_t padding; + + if (off) + return -EINVAL; + + memcpy(reipl_block_eckd->eckd.scp_data, buf, count); + if (scpdata_len % 8) { + padding = 8 - (scpdata_len % 8); + memset(reipl_block_eckd->eckd.scp_data + scpdata_len, + 0, padding); + scpdata_len += padding; + } + + reipl_block_eckd->hdr.len = IPL_BP_ECKD_LEN + scpdata_len; + reipl_block_eckd->eckd.len = IPL_BP0_ECKD_LEN + scpdata_len; + reipl_block_eckd->eckd.scp_data_len = scpdata_len; + + return count; +} + +static struct bin_attribute sys_reipl_eckd_scp_data_attr = + __BIN_ATTR(scp_data, 0644, reipl_eckd_scpdata_read, + reipl_eckd_scpdata_write, DIAG308_SCPDATA_SIZE); + +static struct bin_attribute *reipl_eckd_bin_attrs[] = { + &sys_reipl_eckd_scp_data_attr, + NULL, +}; + +DEFINE_IPL_CCW_ATTR_RW(reipl_eckd, device, reipl_block_eckd->eckd); +DEFINE_IPL_ATTR_RW(reipl_eckd, bootprog, 
"%lld\n", "%lld\n", + reipl_block_eckd->eckd.bootprog); + +static struct attribute *reipl_eckd_attrs[] = { + &sys_reipl_eckd_device_attr.attr, + &sys_reipl_eckd_bootprog_attr.attr, + &sys_reipl_eckd_br_chr_attr.attr, + &sys_reipl_eckd_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group reipl_eckd_attr_group = { + .attrs = reipl_eckd_attrs, + .bin_attrs = reipl_eckd_bin_attrs +}; + +static ssize_t reipl_eckd_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%u\n", reipl_eckd_clear); +} + +static ssize_t reipl_eckd_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (kstrtobool(buf, &reipl_eckd_clear) < 0) + return -EINVAL; + return len; +} + +static struct kobj_attribute sys_reipl_eckd_clear_attr = + __ATTR(clear, 0644, reipl_eckd_clear_show, reipl_eckd_clear_store); /* NSS reipl device attributes */ static void reipl_get_ascii_nss_name(char *dst, @@ -811,12 +1192,8 @@ static ssize_t reipl_nss_name_store(struct kobject *kobj, } static struct kobj_attribute sys_reipl_nss_name_attr = - __ATTR(name, S_IRUGO | S_IWUSR, reipl_nss_name_show, - reipl_nss_name_store); - -static struct kobj_attribute sys_reipl_nss_loadparm_attr = - __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_nss_loadparm_show, - reipl_nss_loadparm_store); + __ATTR(name, 0644, reipl_nss_name_show, + reipl_nss_name_store); static struct attribute *reipl_nss_attrs[] = { &sys_reipl_nss_name_attr.attr, @@ -847,9 +1224,15 @@ static int reipl_set_type(enum ipl_type type) case IPL_TYPE_CCW: reipl_block_actual = reipl_block_ccw; break; + case IPL_TYPE_ECKD: + reipl_block_actual = reipl_block_eckd; + break; case IPL_TYPE_FCP: reipl_block_actual = reipl_block_fcp; break; + case IPL_TYPE_NVME: + reipl_block_actual = reipl_block_nvme; + break; case IPL_TYPE_NSS: reipl_block_actual = reipl_block_nss; break; @@ -874,8 +1257,12 @@ static ssize_t reipl_type_store(struct kobject *kobj, if (strncmp(buf, IPL_CCW_STR, strlen(IPL_CCW_STR)) == 0) rc = reipl_set_type(IPL_TYPE_CCW); + else if (strncmp(buf, IPL_ECKD_STR, strlen(IPL_ECKD_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_ECKD); else if (strncmp(buf, IPL_FCP_STR, strlen(IPL_FCP_STR)) == 0) rc = reipl_set_type(IPL_TYPE_FCP); + else if (strncmp(buf, IPL_NVME_STR, strlen(IPL_NVME_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_NVME); else if (strncmp(buf, IPL_NSS_STR, strlen(IPL_NSS_STR)) == 0) rc = reipl_set_type(IPL_TYPE_NSS); return (rc != 0) ? 
rc : len; @@ -886,17 +1273,39 @@ static struct kobj_attribute reipl_type_attr = static struct kset *reipl_kset; static struct kset *reipl_fcp_kset; +static struct kset *reipl_nvme_kset; +static struct kset *reipl_eckd_kset; static void __reipl_run(void *unused) { switch (reipl_type) { case IPL_TYPE_CCW: diag308(DIAG308_SET, reipl_block_ccw); - diag308(DIAG308_LOAD_CLEAR, NULL); + if (reipl_ccw_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL_DUMP, NULL); + break; + case IPL_TYPE_ECKD: + diag308(DIAG308_SET, reipl_block_eckd); + if (reipl_eckd_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL, NULL); break; case IPL_TYPE_FCP: diag308(DIAG308_SET, reipl_block_fcp); - diag308(DIAG308_LOAD_CLEAR, NULL); + if (reipl_fcp_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL, NULL); + break; + case IPL_TYPE_NVME: + diag308(DIAG308_SET, reipl_block_nvme); + if (reipl_nvme_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL, NULL); break; case IPL_TYPE_NSS: diag308(DIAG308_SET, reipl_block_nss); @@ -906,6 +1315,8 @@ static void __reipl_run(void *unused) diag308(DIAG308_LOAD_CLEAR, NULL); break; case IPL_TYPE_FCP_DUMP: + case IPL_TYPE_NVME_DUMP: + case IPL_TYPE_ECKD_DUMP: break; } disabled_wait(); @@ -1008,10 +1419,16 @@ static int __init reipl_fcp_init(void) } rc = sysfs_create_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group); - if (rc) { - kset_unregister(reipl_fcp_kset); - free_page((unsigned long) reipl_block_fcp); - return rc; + if (rc) + goto out1; + + if (test_facility(141)) { + rc = sysfs_create_file(&reipl_fcp_kset->kobj, + &sys_reipl_fcp_clear_attr.attr); + if (rc) + goto out2; + } else { + reipl_fcp_clear = true; } if (ipl_info.type == IPL_TYPE_FCP) { @@ -1032,6 +1449,121 @@ static int __init reipl_fcp_init(void) } reipl_capabilities |= IPL_TYPE_FCP; return 0; + +out2: + sysfs_remove_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group); +out1: + kset_unregister(reipl_fcp_kset); + free_page((unsigned long) reipl_block_fcp); + return rc; +} + +static int __init reipl_nvme_init(void) +{ + int rc; + + reipl_block_nvme = (void *) get_zeroed_page(GFP_KERNEL); + if (!reipl_block_nvme) + return -ENOMEM; + + /* sysfs: create kset for mixing attr group and bin attrs */ + reipl_nvme_kset = kset_create_and_add(IPL_NVME_STR, NULL, + &reipl_kset->kobj); + if (!reipl_nvme_kset) { + free_page((unsigned long) reipl_block_nvme); + return -ENOMEM; + } + + rc = sysfs_create_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group); + if (rc) + goto out1; + + if (test_facility(141)) { + rc = sysfs_create_file(&reipl_nvme_kset->kobj, + &sys_reipl_nvme_clear_attr.attr); + if (rc) + goto out2; + } else { + reipl_nvme_clear = true; + } + + if (ipl_info.type == IPL_TYPE_NVME) { + memcpy(reipl_block_nvme, &ipl_block, sizeof(ipl_block)); + /* + * Fix loadparm: There are systems where the (SCSI) LOADPARM + * is invalid in the IPL parameter block, so take it + * always from sclp_ipl_info. 
+ */ + memcpy(reipl_block_nvme->nvme.loadparm, sclp_ipl_info.loadparm, + LOADPARM_LEN); + } else { + reipl_block_nvme->hdr.len = IPL_BP_NVME_LEN; + reipl_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION; + reipl_block_nvme->nvme.len = IPL_BP0_NVME_LEN; + reipl_block_nvme->nvme.pbt = IPL_PBT_NVME; + reipl_block_nvme->nvme.opt = IPL_PB0_NVME_OPT_IPL; + } + reipl_capabilities |= IPL_TYPE_NVME; + return 0; + +out2: + sysfs_remove_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group); +out1: + kset_unregister(reipl_nvme_kset); + free_page((unsigned long) reipl_block_nvme); + return rc; +} + +static int __init reipl_eckd_init(void) +{ + int rc; + + if (!sclp.has_sipl_eckd) + return 0; + + reipl_block_eckd = (void *)get_zeroed_page(GFP_KERNEL); + if (!reipl_block_eckd) + return -ENOMEM; + + /* sysfs: create kset for mixing attr group and bin attrs */ + reipl_eckd_kset = kset_create_and_add(IPL_ECKD_STR, NULL, + &reipl_kset->kobj); + if (!reipl_eckd_kset) { + free_page((unsigned long)reipl_block_eckd); + return -ENOMEM; + } + + rc = sysfs_create_group(&reipl_eckd_kset->kobj, &reipl_eckd_attr_group); + if (rc) + goto out1; + + if (test_facility(141)) { + rc = sysfs_create_file(&reipl_eckd_kset->kobj, + &sys_reipl_eckd_clear_attr.attr); + if (rc) + goto out2; + } else { + reipl_eckd_clear = true; + } + + if (ipl_info.type == IPL_TYPE_ECKD) { + memcpy(reipl_block_eckd, &ipl_block, sizeof(ipl_block)); + } else { + reipl_block_eckd->hdr.len = IPL_BP_ECKD_LEN; + reipl_block_eckd->hdr.version = IPL_PARM_BLOCK_VERSION; + reipl_block_eckd->eckd.len = IPL_BP0_ECKD_LEN; + reipl_block_eckd->eckd.pbt = IPL_PBT_ECKD; + reipl_block_eckd->eckd.opt = IPL_PB0_ECKD_OPT_IPL; + } + reipl_capabilities |= IPL_TYPE_ECKD; + return 0; + +out2: + sysfs_remove_group(&reipl_eckd_kset->kobj, &reipl_eckd_attr_group); +out1: + kset_unregister(reipl_eckd_kset); + free_page((unsigned long)reipl_block_eckd); + return rc; } static int __init reipl_type_init(void) @@ -1049,9 +1581,15 @@ static int __init reipl_type_init(void) if (reipl_block->pb0_hdr.pbt == IPL_PBT_FCP) { memcpy(reipl_block_fcp, reipl_block, size); reipl_type = IPL_TYPE_FCP; + } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_NVME) { + memcpy(reipl_block_nvme, reipl_block, size); + reipl_type = IPL_TYPE_NVME; } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_CCW) { memcpy(reipl_block_ccw, reipl_block, size); reipl_type = IPL_TYPE_CCW; + } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_ECKD) { + memcpy(reipl_block_eckd, reipl_block, size); + reipl_type = IPL_TYPE_ECKD; } out: return reipl_set_type(reipl_type); @@ -1072,9 +1610,15 @@ static int __init reipl_init(void) rc = reipl_ccw_init(); if (rc) return rc; + rc = reipl_eckd_init(); + if (rc) + return rc; rc = reipl_fcp_init(); if (rc) return rc; + rc = reipl_nvme_init(); + if (rc) + return rc; rc = reipl_nss_init(); if (rc) return rc; @@ -1118,6 +1662,52 @@ static struct attribute_group dump_fcp_attr_group = { .attrs = dump_fcp_attrs, }; +/* NVME dump device attributes */ +DEFINE_IPL_ATTR_RW(dump_nvme, fid, "0x%08llx\n", "%llx\n", + dump_block_nvme->nvme.fid); +DEFINE_IPL_ATTR_RW(dump_nvme, nsid, "0x%08llx\n", "%llx\n", + dump_block_nvme->nvme.nsid); +DEFINE_IPL_ATTR_RW(dump_nvme, bootprog, "%lld\n", "%llx\n", + dump_block_nvme->nvme.bootprog); +DEFINE_IPL_ATTR_RW(dump_nvme, br_lba, "%lld\n", "%llx\n", + dump_block_nvme->nvme.br_lba); + +static struct attribute *dump_nvme_attrs[] = { + &sys_dump_nvme_fid_attr.attr, + &sys_dump_nvme_nsid_attr.attr, + &sys_dump_nvme_bootprog_attr.attr, + &sys_dump_nvme_br_lba_attr.attr, + 
NULL, +}; + +static struct attribute_group dump_nvme_attr_group = { + .name = IPL_NVME_STR, + .attrs = dump_nvme_attrs, +}; + +/* ECKD dump device attributes */ +DEFINE_IPL_CCW_ATTR_RW(dump_eckd, device, dump_block_eckd->eckd); +DEFINE_IPL_ATTR_RW(dump_eckd, bootprog, "%lld\n", "%llx\n", + dump_block_eckd->eckd.bootprog); + +IPL_ATTR_BR_CHR_SHOW_FN(dump, dump_block_eckd->eckd); +IPL_ATTR_BR_CHR_STORE_FN(dump, dump_block_eckd->eckd); + +static struct kobj_attribute sys_dump_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_dump_br_chr_show, eckd_dump_br_chr_store); + +static struct attribute *dump_eckd_attrs[] = { + &sys_dump_eckd_device_attr.attr, + &sys_dump_eckd_bootprog_attr.attr, + &sys_dump_eckd_br_chr_attr.attr, + NULL, +}; + +static struct attribute_group dump_eckd_attr_group = { + .name = IPL_ECKD_STR, + .attrs = dump_eckd_attrs, +}; + /* CCW dump device attributes */ DEFINE_IPL_CCW_ATTR_RW(dump_ccw, device, dump_block_ccw->ccw); @@ -1157,8 +1747,12 @@ static ssize_t dump_type_store(struct kobject *kobj, rc = dump_set_type(DUMP_TYPE_NONE); else if (strncmp(buf, DUMP_CCW_STR, strlen(DUMP_CCW_STR)) == 0) rc = dump_set_type(DUMP_TYPE_CCW); + else if (strncmp(buf, DUMP_ECKD_STR, strlen(DUMP_ECKD_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_ECKD); else if (strncmp(buf, DUMP_FCP_STR, strlen(DUMP_FCP_STR)) == 0) rc = dump_set_type(DUMP_TYPE_FCP); + else if (strncmp(buf, DUMP_NVME_STR, strlen(DUMP_NVME_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_NVME); return (rc != 0) ? rc : len; } @@ -1173,7 +1767,7 @@ static void diag308_dump(void *dump_block) while (1) { if (diag308(DIAG308_LOAD_NORMAL_DUMP, NULL) != 0x302) break; - udelay_simple(USEC_PER_SEC); + udelay(USEC_PER_SEC); } } @@ -1183,9 +1777,15 @@ static void __dump_run(void *unused) case DUMP_TYPE_CCW: diag308_dump(dump_block_ccw); break; + case DUMP_TYPE_ECKD: + diag308_dump(dump_block_eckd); + break; case DUMP_TYPE_FCP: diag308_dump(dump_block_fcp); break; + case DUMP_TYPE_NVME: + diag308_dump(dump_block_nvme); + break; default: break; } @@ -1242,6 +1842,52 @@ static int __init dump_fcp_init(void) return 0; } +static int __init dump_nvme_init(void) +{ + int rc; + + if (!sclp_ipl_info.has_dump) + return 0; /* LDIPL DUMP is not installed */ + dump_block_nvme = (void *) get_zeroed_page(GFP_KERNEL); + if (!dump_block_nvme) + return -ENOMEM; + rc = sysfs_create_group(&dump_kset->kobj, &dump_nvme_attr_group); + if (rc) { + free_page((unsigned long)dump_block_nvme); + return rc; + } + dump_block_nvme->hdr.len = IPL_BP_NVME_LEN; + dump_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION; + dump_block_nvme->fcp.len = IPL_BP0_NVME_LEN; + dump_block_nvme->fcp.pbt = IPL_PBT_NVME; + dump_block_nvme->fcp.opt = IPL_PB0_NVME_OPT_DUMP; + dump_capabilities |= DUMP_TYPE_NVME; + return 0; +} + +static int __init dump_eckd_init(void) +{ + int rc; + + if (!sclp_ipl_info.has_dump || !sclp.has_sipl_eckd) + return 0; /* LDIPL DUMP is not installed */ + dump_block_eckd = (void *)get_zeroed_page(GFP_KERNEL); + if (!dump_block_eckd) + return -ENOMEM; + rc = sysfs_create_group(&dump_kset->kobj, &dump_eckd_attr_group); + if (rc) { + free_page((unsigned long)dump_block_eckd); + return rc; + } + dump_block_eckd->hdr.len = IPL_BP_ECKD_LEN; + dump_block_eckd->hdr.version = IPL_PARM_BLOCK_VERSION; + dump_block_eckd->eckd.len = IPL_BP0_ECKD_LEN; + dump_block_eckd->eckd.pbt = IPL_PBT_ECKD; + dump_block_eckd->eckd.opt = IPL_PB0_ECKD_OPT_DUMP; + dump_capabilities |= DUMP_TYPE_ECKD; + return 0; +} + static int __init dump_init(void) { int rc; @@ -1257,9 +1903,15 @@ static int __init 
dump_init(void) rc = dump_ccw_init(); if (rc) return rc; + rc = dump_eckd_init(); + if (rc) + return rc; rc = dump_fcp_init(); if (rc) return rc; + rc = dump_nvme_init(); + if (rc) + return rc; dump_set_type(DUMP_TYPE_NONE); return 0; } @@ -1272,13 +1924,29 @@ static struct shutdown_action __refdata dump_action = { static void dump_reipl_run(struct shutdown_trigger *trigger) { - unsigned long ipib = (unsigned long) reipl_block_actual; + struct lowcore *abs_lc; unsigned int csum; + /* + * Set REIPL_CLEAR flag in os_info flags entry indicating + * 'clear' sysfs attribute has been set on the panicked system + * for specified reipl type. + * Always set for IPL_TYPE_NSS and IPL_TYPE_UNKNOWN. + */ + if ((reipl_type == IPL_TYPE_CCW && reipl_ccw_clear) || + (reipl_type == IPL_TYPE_ECKD && reipl_eckd_clear) || + (reipl_type == IPL_TYPE_FCP && reipl_fcp_clear) || + (reipl_type == IPL_TYPE_NVME && reipl_nvme_clear) || + reipl_type == IPL_TYPE_NSS || + reipl_type == IPL_TYPE_UNKNOWN) + os_info_flags |= OS_INFO_FLAG_REIPL_CLEAR; + os_info_entry_add(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags)); csum = (__force unsigned int) csum_partial(reipl_block_actual, reipl_block_actual->hdr.len, 0); - mem_assign_absolute(S390_lowcore.ipib, ipib); - mem_assign_absolute(S390_lowcore.ipib_checksum, csum); + abs_lc = get_abs_lowcore(); + abs_lc->ipib = __pa(reipl_block_actual); + abs_lc->ipib_checksum = csum; + put_abs_lowcore(abs_lc); dump_run(trigger); } @@ -1472,7 +2140,6 @@ static struct kobj_attribute on_restart_attr = __ATTR_RW(on_restart); static void __do_restart(void *ignore) { - __arch_local_irq_stosm(0x04); /* enable DAT */ smp_send_stop(); #ifdef CONFIG_CRASH_DUMP crash_kexec(NULL); @@ -1481,12 +2148,12 @@ static void __do_restart(void *ignore) stop_run(&on_restart_trigger); } -void do_restart(void) +void do_restart(void *arg) { tracing_off(); debug_locks_off(); lgr_info_log(); - smp_call_online_cpu(__do_restart, NULL); + smp_call_online_cpu(__do_restart, arg); } /* on halt */ @@ -1684,6 +2351,11 @@ void __init setup_ipl(void) ipl_info.data.ccw.dev_id.ssid = ipl_block.ccw.ssid; ipl_info.data.ccw.dev_id.devno = ipl_block.ccw.devno; break; + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + ipl_info.data.eckd.dev_id.ssid = ipl_block.eckd.ssid; + ipl_info.data.eckd.dev_id.devno = ipl_block.eckd.devno; + break; case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: ipl_info.data.fcp.dev_id.ssid = 0; @@ -1691,6 +2363,11 @@ void __init setup_ipl(void) ipl_info.data.fcp.wwpn = ipl_block.fcp.wwpn; ipl_info.data.fcp.lun = ipl_block.fcp.lun; break; + case IPL_TYPE_NVME: + case IPL_TYPE_NVME_DUMP: + ipl_info.data.nvme.fid = ipl_block.nvme.fid; + ipl_info.data.nvme.nsid = ipl_block.nvme.nsid; + break; case IPL_TYPE_NSS: case IPL_TYPE_UNKNOWN: /* We have no info to copy */ @@ -1705,8 +2382,8 @@ void s390_reset_system(void) set_prefix(0); /* Disable lowcore protection */ - __ctl_clear_bit(0, 28); - diag_dma_ops.diag308_reset(); + local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT); + diag_amode31_ops.diag308_reset(); } #ifdef CONFIG_KEXEC_FILE @@ -1783,7 +2460,7 @@ void *ipl_report_finish(struct ipl_report *report) buf = vzalloc(report->size); if (!buf) - return ERR_PTR(-ENOMEM); + goto out; ptr = buf; memcpy(ptr, report->ipib, report->ipib->hdr.len); @@ -1822,6 +2499,7 @@ void *ipl_report_finish(struct ipl_report *report) } BUG_ON(ptr > buf + report->size); +out: return buf; } diff --git a/arch/s390/kernel/ipl_vmparm.c b/arch/s390/kernel/ipl_vmparm.c index af43535a976d..b5245fadcfb0 100644 --- 
a/arch/s390/kernel/ipl_vmparm.c +++ b/arch/s390/kernel/ipl_vmparm.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/minmax.h> +#include <linux/string.h> #include <asm/ebcdic.h> #include <asm/ipl.h> diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 8371855042dc..6f71b0ce1068 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -21,12 +21,14 @@ #include <linux/init.h> #include <linux/cpu.h> #include <linux/irq.h> +#include <linux/entry-common.h> #include <asm/irq_regs.h> #include <asm/cputime.h> #include <asm/lowcore.h> #include <asm/irq.h> #include <asm/hw_irq.h> #include <asm/stacktrace.h> +#include <asm/softirq_stack.h> #include "entry.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat); @@ -95,27 +97,106 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"}, }; -void __init init_IRQ(void) +static void do_IRQ(struct pt_regs *regs, int irq) { - BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS); - init_cio_interrupts(); - init_airq_interrupts(); - init_ext_interrupts(); -} - -void do_IRQ(struct pt_regs *regs, int irq) -{ - struct pt_regs *old_regs; - - old_regs = set_irq_regs(regs); - irq_enter(); if (tod_after_eq(S390_lowcore.int_clock, S390_lowcore.clock_comparator)) /* Serve timer interrupts first. */ clock_comparator_work(); generic_handle_irq(irq); - irq_exit(); +} + +static int on_async_stack(void) +{ + unsigned long frame = current_frame_address(); + + return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0; +} + +static void do_irq_async(struct pt_regs *regs, int irq) +{ + if (on_async_stack()) { + do_IRQ(regs, irq); + } else { + call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ, + struct pt_regs *, regs, int, irq); + } +} + +static int irq_pending(struct pt_regs *regs) +{ + int cc; + + asm volatile("tpi 0\n" + "ipm %0" : "=d" (cc) : : "cc"); + return cc >> 28; +} + +void noinstr do_io_irq(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + struct pt_regs *old_regs = set_irq_regs(regs); + bool from_idle; + + irq_enter_rcu(); + + if (user_mode(regs)) { + update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; + } + + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); + if (from_idle) + account_idle_time_irq(); + + do { + regs->tpi_info = S390_lowcore.tpi_info; + if (S390_lowcore.tpi_info.adapter_IO) + do_irq_async(regs, THIN_INTERRUPT); + else + do_irq_async(regs, IO_INTERRUPT); + } while (MACHINE_IS_LPAR && irq_pending(regs)); + + irq_exit_rcu(); + set_irq_regs(old_regs); + irqentry_exit(regs, state); + + if (from_idle) + regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); +} + +void noinstr do_ext_irq(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + struct pt_regs *old_regs = set_irq_regs(regs); + bool from_idle; + + irq_enter_rcu(); + + if (user_mode(regs)) { + update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; + } + + regs->int_code = S390_lowcore.ext_int_code_addr; + regs->int_parm = S390_lowcore.ext_params; + regs->int_parm_long = S390_lowcore.ext_params2; + + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); + if (from_idle) + account_idle_time_irq(); + + do_irq_async(regs, EXT_INTERRUPT); + + irq_exit_rcu(); + set_irq_regs(old_regs); + irqentry_exit(regs, state); + + if (from_idle) + regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | 
PSW_MASK_WAIT); } static void show_msi_interrupt(struct seq_file *p, int irq) @@ -124,7 +205,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq) unsigned long flags; int cpu; - irq_lock_sparse(); + rcu_read_lock(); desc = irq_to_desc(irq); if (!desc) goto out; @@ -132,7 +213,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq) raw_spin_lock_irqsave(&desc->lock, flags); seq_printf(p, "%3d: ", irq); for_each_online_cpu(cpu) - seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu)); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(desc, cpu)); if (desc->irq_data.chip) seq_printf(p, " %8s", desc->irq_data.chip->name); @@ -143,7 +224,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq) seq_putc(p, '\n'); raw_spin_unlock_irqrestore(&desc->lock, flags); out: - irq_unlock_sparse(); + rcu_read_unlock(); } /* @@ -154,7 +235,7 @@ int show_interrupts(struct seq_file *p, void *v) int index = *(loff_t *) v; int cpu, irq; - get_online_cpus(); + cpus_read_lock(); if (index == 0) { seq_puts(p, " "); for_each_online_cpu(cpu) @@ -184,7 +265,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } out: - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -194,24 +275,6 @@ unsigned int arch_dynirq_lower_bound(unsigned int from) } /* - * Switch to the asynchronous interrupt stack for softirq execution. - */ -void do_softirq_own_stack(void) -{ - unsigned long old, new; - - old = current_stack_pointer(); - /* Check against async. stack address range. */ - new = S390_lowcore.async_stack; - if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) { - CALL_ON_STACK(__do_softirq, new, 0); - } else { - /* We are already on the async stack. */ - __do_softirq(); - } -} - -/* * ext_int_hash[index] is the list head for all external interrupts that hash * to this index. 
*/ @@ -279,7 +342,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy) struct ext_int_info *p; int index; - ext_code = *(struct ext_code *) &regs->int_code; + ext_code.int_code = regs->int_code; if (ext_code.code != EXT_IRQ_CLK_COMP) set_cpu_flag(CIF_NOHZ_DELAY); @@ -294,12 +357,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy) return IRQ_HANDLED; } -static struct irqaction external_interrupt = { - .name = "EXT", - .handler = do_ext_interrupt, -}; - -void __init init_ext_interrupts(void) +static void __init init_ext_interrupts(void) { int idx; @@ -308,7 +366,16 @@ void __init init_ext_interrupts(void) irq_set_chip_and_handler(EXT_INTERRUPT, &dummy_irq_chip, handle_percpu_irq); - setup_irq(EXT_INTERRUPT, &external_interrupt); + if (request_irq(EXT_INTERRUPT, do_ext_interrupt, 0, "EXT", NULL)) + panic("Failed to register EXT interrupt\n"); +} + +void __init init_IRQ(void) +{ + BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS); + init_cio_interrupts(); + init_airq_interrupts(); + init_ext_interrupts(); } static DEFINE_SPINLOCK(irq_subclass_lock); @@ -318,7 +385,7 @@ void irq_subclass_register(enum irq_subclass subclass) { spin_lock(&irq_subclass_lock); if (!irq_subclass_refcount[subclass]) - ctl_set_bit(0, subclass); + system_ctl_set_bit(0, subclass); irq_subclass_refcount[subclass]++; spin_unlock(&irq_subclass_lock); } @@ -329,7 +396,7 @@ void irq_subclass_unregister(enum irq_subclass subclass) { spin_lock(&irq_subclass_lock); irq_subclass_refcount[subclass]--; if (!irq_subclass_refcount[subclass]) - ctl_clear_bit(0, subclass); + system_ctl_clear_bit(0, subclass); spin_unlock(&irq_subclass_lock); } EXPORT_SYMBOL(irq_subclass_unregister); diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index ab584e8e3527..e808bb8bc0da 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -6,8 +6,9 @@ * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> */ #include <linux/uaccess.h> -#include <linux/stop_machine.h> #include <linux/jump_label.h> +#include <linux/module.h> +#include <asm/text-patching.h> #include <asm/ipl.h> struct insn { @@ -36,21 +37,15 @@ static void jump_label_bug(struct jump_entry *entry, struct insn *expected, unsigned char *ipe = (unsigned char *)expected; unsigned char *ipn = (unsigned char *)new; - pr_emerg("Jump label code mismatch at %pS [%p]\n", ipc, ipc); + pr_emerg("Jump label code mismatch at %pS [%px]\n", ipc, ipc); pr_emerg("Found: %6ph\n", ipc); pr_emerg("Expected: %6ph\n", ipe); pr_emerg("New: %6ph\n", ipn); panic("Corrupted kernel text"); } -static struct insn orignop = { - .opcode = 0xc004, - .offset = JUMP_LABEL_NOP_OFFSET >> 1, -}; - -static void __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - int init) +static void jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) { void *code = (void *)jump_entry_code(entry); struct insn old, new; @@ -62,29 +57,26 @@ static void __jump_label_transform(struct jump_entry *entry, jump_label_make_branch(entry, &old); jump_label_make_nop(entry, &new); } - if (init) { - if (memcmp(code, &orignop, sizeof(orignop))) - jump_label_bug(entry, &orignop, &new); - } else { - if (memcmp(code, &old, sizeof(old))) - jump_label_bug(entry, &old, &new); - } + if (memcmp(code, &old, sizeof(old))) + jump_label_bug(entry, &old, &new); s390_kernel_write(code, &new, sizeof(new)); } -static void __jump_label_sync(void *dummy) +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) { +
jump_label_transform(entry, type); + text_poke_sync(); } -void arch_jump_label_transform(struct jump_entry *entry, - enum jump_label_type type) +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) { - __jump_label_transform(entry, type, 0); - smp_call_function(__jump_label_sync, NULL, 1); + jump_label_transform(entry, type); + return true; } -void arch_jump_label_transform_static(struct jump_entry *entry, - enum jump_label_type type) +void arch_jump_label_transform_apply(void) { - __jump_label_transform(entry, type, 1); + text_poke_sync(); } diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 6f1388391620..f0cf20d4b3c5 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -7,6 +7,9 @@ * s390 port, used ppc64 as template. Mike Grundy <grundym@us.ibm.com> */ +#define pr_fmt(fmt) "kprobes: " fmt + +#include <linux/moduleloader.h> #include <linux/kprobes.h> #include <linux/ptrace.h> #include <linux/preempt.h> @@ -21,28 +24,36 @@ #include <asm/set_memory.h> #include <asm/sections.h> #include <asm/dis.h> +#include "kprobes.h" +#include "entry.h" DEFINE_PER_CPU(struct kprobe *, current_kprobe); DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = { }; -DEFINE_INSN_CACHE_OPS(s390_insn); - static int insn_page_in_use; -static char insn_page[PAGE_SIZE] __aligned(PAGE_SIZE); + +void *alloc_insn_page(void) +{ + void *page; + + page = module_alloc(PAGE_SIZE); + if (!page) + return NULL; + set_memory_rox((unsigned long)page, 1); + return page; +} static void *alloc_s390_insn_page(void) { if (xchg(&insn_page_in_use, 1) == 1) return NULL; - set_memory_x((unsigned long) &insn_page, 1); - return &insn_page; + return &kprobes_insn_page; } static void free_s390_insn_page(void *page) { - set_memory_nx((unsigned long) page, 1); xchg(&insn_page_in_use, 0); } @@ -56,44 +67,32 @@ struct kprobe_insn_cache kprobe_s390_insn_slots = { static void copy_instruction(struct kprobe *p) { - unsigned long ip = (unsigned long) p->addr; + kprobe_opcode_t insn[MAX_INSN_SIZE]; s64 disp, new_disp; u64 addr, new_addr; + unsigned int len; - if (ftrace_location(ip) == ip) { + len = insn_length(*p->addr >> 8); + memcpy(&insn, p->addr, len); + p->opcode = insn[0]; + if (probe_is_insn_relative_long(&insn[0])) { /* - * If kprobes patches the instruction that is morphed by - * ftrace make sure that kprobes always sees the branch - * "jg .+24" that skips the mcount block or the "brcl 0,0" - * in case of hotpatch. + * For pc-relative instructions in RIL-b or RIL-c format patch + * the RI2 displacement field. We have already made sure that + * the insn slot for the patched instruction is within the same + * 2GB area as the original instruction (either kernel image or + * module area). Therefore the new displacement will always fit. */ - ftrace_generate_nop_insn((struct ftrace_insn *)p->ainsn.insn); - p->ainsn.is_ftrace_insn = 1; - } else - memcpy(p->ainsn.insn, p->addr, insn_length(*p->addr >> 8)); - p->opcode = p->ainsn.insn[0]; - if (!probe_is_insn_relative_long(p->ainsn.insn)) - return; - /* - * For pc-relative instructions in RIL-b or RIL-c format patch the - * RI2 displacement field. We have already made sure that the insn - * slot for the patched instruction is within the same 2GB area - * as the original instruction (either kernel image or module area). - * Therefore the new displacement will always fit. 
- */ - disp = *(s32 *)&p->ainsn.insn[1]; - addr = (u64)(unsigned long)p->addr; - new_addr = (u64)(unsigned long)p->ainsn.insn; - new_disp = ((addr + (disp * 2)) - new_addr) / 2; - *(s32 *)&p->ainsn.insn[1] = new_disp; + disp = *(s32 *)&insn[1]; + addr = (u64)(unsigned long)p->addr; + new_addr = (u64)(unsigned long)p->ainsn.insn; + new_disp = ((addr + (disp * 2)) - new_addr) / 2; + *(s32 *)&insn[1] = new_disp; + } + s390_kernel_write(p->ainsn.insn, &insn, len); } NOKPROBE_SYMBOL(copy_instruction); -static inline int is_kernel_addr(void *addr) -{ - return addr < (void *)_end; -} - static int s390_get_insn_slot(struct kprobe *p) { /* @@ -102,7 +101,7 @@ static int s390_get_insn_slot(struct kprobe *p) * field can be patched and executed within the insn slot. */ p->ainsn.insn = NULL; - if (is_kernel_addr(p->addr)) + if (is_kernel((unsigned long)p->addr)) p->ainsn.insn = get_s390_insn_slot(); else if (is_module_addr(p->addr)) p->ainsn.insn = get_insn_slot(); @@ -114,7 +113,7 @@ static void s390_free_insn_slot(struct kprobe *p) { if (!p->ainsn.insn) return; - if (is_kernel_addr(p->addr)) + if (is_kernel((unsigned long)p->addr)) free_s390_insn_slot(p->ainsn.insn, 0); else free_insn_slot(p->ainsn.insn, 0); @@ -122,9 +121,55 @@ static void s390_free_insn_slot(struct kprobe *p) } NOKPROBE_SYMBOL(s390_free_insn_slot); +/* Check if paddr is at an instruction boundary */ +static bool can_probe(unsigned long paddr) +{ + unsigned long addr, offset = 0; + kprobe_opcode_t insn; + struct kprobe *kp; + + if (paddr & 0x01) + return false; + + if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) + return false; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(insn))) + return false; + + if (insn >> 8 == 0) { + if (insn != BREAKPOINT_INSTRUCTION) { + /* + * Note that QEMU inserts opcode 0x0000 to implement + * software breakpoints for guests. Since the size of + * the original instruction is unknown, stop following + * instructions and prevent setting a kprobe. + */ + return false; + } + /* + * Check if the instruction has been modified by another + * kprobe, in which case the original instruction is + * decoded. + */ + kp = get_kprobe((void *)addr); + if (!kp) { + /* not a kprobe */ + return false; + } + insn = kp->opcode; + } + addr += insn_length(insn >> 8); + } + return addr == paddr; +} + int arch_prepare_kprobe(struct kprobe *p) { - if ((unsigned long) p->addr & 0x01) + if (!can_probe((unsigned long)p->addr)) return -EINVAL; /* Make sure the probe isn't going on a difficult instruction */ if (probe_is_prohibited_opcode(p->addr)) @@ -136,11 +181,6 @@ int arch_prepare_kprobe(struct kprobe *p) } NOKPROBE_SYMBOL(arch_prepare_kprobe); -int arch_check_ftrace_location(struct kprobe *p) -{ - return 0; -} - struct swap_insn_args { struct kprobe *p; unsigned int arm_kprobe : 1; @@ -149,28 +189,11 @@ struct swap_insn_args { static int swap_instruction(void *data) { struct swap_insn_args *args = data; - struct ftrace_insn new_insn, *insn; struct kprobe *p = args->p; - size_t len; - - new_insn.opc = args->arm_kprobe ? 
BREAKPOINT_INSTRUCTION : p->opcode; - len = sizeof(new_insn.opc); - if (!p->ainsn.is_ftrace_insn) - goto skip_ftrace; - len = sizeof(new_insn); - insn = (struct ftrace_insn *) p->addr; - if (args->arm_kprobe) { - if (is_ftrace_nop(insn)) - new_insn.disp = KPROBE_ON_FTRACE_NOP; - else - new_insn.disp = KPROBE_ON_FTRACE_CALL; - } else { - ftrace_generate_call_insn(&new_insn, (unsigned long)p->addr); - if (insn->disp == KPROBE_ON_FTRACE_NOP) - ftrace_generate_nop_insn(&new_insn); - } -skip_ftrace: - s390_kernel_write(p->addr, &new_insn, len); + u16 opc; + + opc = args->arm_kprobe ? BREAKPOINT_INSTRUCTION : p->opcode; + s390_kernel_write(p->addr, &opc, sizeof(opc)); return 0; } NOKPROBE_SYMBOL(swap_instruction); @@ -201,20 +224,27 @@ static void enable_singlestep(struct kprobe_ctlblk *kcb, struct pt_regs *regs, unsigned long ip) { - struct per_regs per_kprobe; + union { + struct ctlreg regs[3]; + struct { + struct ctlreg control; + struct ctlreg start; + struct ctlreg end; + }; + } per_kprobe; /* Set up the PER control registers %cr9-%cr11 */ - per_kprobe.control = PER_EVENT_IFETCH; - per_kprobe.start = ip; - per_kprobe.end = ip; + per_kprobe.control.val = PER_EVENT_IFETCH; + per_kprobe.start.val = ip; + per_kprobe.end.val = ip; /* Save control regs and psw mask */ - __ctl_store(kcb->kprobe_saved_ctl, 9, 11); + __local_ctl_store(9, 11, kcb->kprobe_saved_ctl); kcb->kprobe_saved_imask = regs->psw.mask & (PSW_MASK_PER | PSW_MASK_IO | PSW_MASK_EXT); /* Set PER control regs, turns on single step for the given address */ - __ctl_load(per_kprobe, 9, 11); + __local_ctl_load(9, 11, per_kprobe.regs); regs->psw.mask |= PSW_MASK_PER; regs->psw.mask &= ~(PSW_MASK_IO | PSW_MASK_EXT); regs->psw.addr = ip; @@ -226,7 +256,7 @@ static void disable_singlestep(struct kprobe_ctlblk *kcb, unsigned long ip) { /* Restore control regs and psw mask, set new psw address */ - __ctl_load(kcb->kprobe_saved_ctl, 9, 11); + __local_ctl_load(9, 11, kcb->kprobe_saved_ctl); regs->psw.mask &= ~PSW_MASK_PER; regs->psw.mask |= kcb->kprobe_saved_imask; regs->psw.addr = ip; @@ -255,18 +285,10 @@ static void pop_kprobe(struct kprobe_ctlblk *kcb) { __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->prev_kprobe.kp = NULL; } NOKPROBE_SYMBOL(pop_kprobe); -void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - ri->ret_addr = (kprobe_opcode_t *) regs->gprs[14]; - - /* Replace the return addr with trampoline addr */ - regs->gprs[14] = (unsigned long) &kretprobe_trampoline; -} -NOKPROBE_SYMBOL(arch_prepare_kretprobe); - static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p) { switch (kcb->kprobe_status) { @@ -282,7 +304,7 @@ static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p) * is a BUG. The code path resides in the .kprobes.text * section and is executed with interrupts disabled. 
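The per_kprobe union in enable_singlestep() above exists so the PER control, start, and end values can be filled in by name but handed to __local_ctl_load() as one three-register block for %cr9-%cr11. A self-contained sketch of that overlay, with a stand-in ctlreg type and an invented event bit:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct ctlreg { uint64_t val; };    /* stand-in for the kernel's wrapped u64 */

int main(void)
{
	/* Same shape as per_kprobe above: the named fields alias regs[0..2]. */
	union {
		struct ctlreg regs[3];
		struct {
			struct ctlreg control;
			struct ctlreg start;
			struct ctlreg end;
		};
	} per_kprobe;

	per_kprobe.control.val = 1UL << 30;  /* made-up instruction-fetch event bit */
	per_kprobe.start.val = 0x1000;
	per_kprobe.end.val = 0x1000;

	assert(&per_kprobe.regs[1] == &per_kprobe.start);
	assert(per_kprobe.regs[0].val == per_kprobe.control.val);
	puts("control/start/end overlay regs[0..2]");
	return 0;
}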
*/ - pr_err("Invalid kprobe detected.\n"); + pr_err("Failed to recover from reentered kprobes.\n"); dump_kprobe(p); BUG(); } @@ -348,109 +370,6 @@ static int kprobe_handler(struct pt_regs *regs) NOKPROBE_SYMBOL(kprobe_handler); /* - * Function return probe trampoline: - * - init_kprobes() establishes a probepoint here - * - When the probed function returns, this probe - * causes the handlers to fire - */ -static void __used kretprobe_trampoline_holder(void) -{ - asm volatile(".global kretprobe_trampoline\n" - "kretprobe_trampoline: bcr 0,0\n"); -} - -/* - * Called when the probe at kretprobe trampoline is hit - */ -static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kretprobe_instance *ri; - struct hlist_head *head, empty_rp; - struct hlist_node *tmp; - unsigned long flags, orig_ret_address; - unsigned long trampoline_address; - kprobe_opcode_t *correct_ret_addr; - - INIT_HLIST_HEAD(&empty_rp); - kretprobe_hash_lock(current, &head, &flags); - - /* - * It is possible to have multiple instances associated with a given - * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more than one return - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always inserted at the head of the list - * - when multiple return probes are registered for the same - * function, the first instance's ret_addr will point to the - * real return address, and all the rest will point to - * kretprobe_trampoline - */ - ri = NULL; - orig_ret_address = 0; - correct_ret_addr = NULL; - trampoline_address = (unsigned long) &kretprobe_trampoline; - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - orig_ret_address = (unsigned long) ri->ret_addr; - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - - correct_ret_addr = ri->ret_addr; - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - orig_ret_address = (unsigned long) ri->ret_addr; - - if (ri->rp && ri->rp->handler) { - ri->ret_addr = correct_ret_addr; - ri->rp->handler(ri, regs); - } - - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - regs->psw.addr = orig_ret_address; - - kretprobe_hash_unlock(current, &flags); - - hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - /* - * By returning a non-zero value, we are telling - * kprobe_handler() that we don't want the post_handler - * to run (and have re-enabled preemption) - */ - return 1; -} -NOKPROBE_SYMBOL(trampoline_probe_handler); - -/* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "breakpoint" * instruction. 
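The can_probe() walk added earlier relies on s390 encoding each instruction's length in the two leftmost bits of its first opcode byte. A userspace sketch of that decoding, mirroring the kernel's insn_length() helper from asm/dis.h (the sample opcodes are merely illustrative):

#include <stdio.h>

/* Bits 0-1 of the first byte: 00 -> 2 bytes, 01/10 -> 4 bytes, 11 -> 6 bytes. */
static int insn_length(unsigned char code)
{
	return ((((int)code + 64) >> 7) + 1) << 1;
}

int main(void)
{
	printf("%d\n", insn_length(0x07)); /* 00xxxxxx: 2-byte, e.g. bcr  */
	printf("%d\n", insn_length(0xa7)); /* 10xxxxxx: 4-byte, e.g. aghi */
	printf("%d\n", insn_length(0xc0)); /* 11xxxxxx: 6-byte, e.g. brcl */
	return 0;
}

This is what lets can_probe() start at the symbol base and step forward instruction by instruction until it either lands exactly on paddr or proves paddr is mid-instruction.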
To avoid the SMP problems that can occur when we @@ -464,24 +383,6 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs) unsigned long ip = regs->psw.addr; int fixup = probe_get_fixup_type(p->ainsn.insn); - /* Check if the kprobes location is an enabled ftrace caller */ - if (p->ainsn.is_ftrace_insn) { - struct ftrace_insn *insn = (struct ftrace_insn *) p->addr; - struct ftrace_insn call_insn; - - ftrace_generate_call_insn(&call_insn, (unsigned long) p->addr); - /* - * A kprobe on an enabled ftrace call site actually single - * stepped an unconditional branch (ftrace nop equivalent). - * Now we need to fixup things and pretend that a brasl r0,... - * was executed instead. - */ - if (insn->disp == KPROBE_ON_FTRACE_CALL) { - ip += call_insn.disp * 2 - MCOUNT_INSN_SIZE; - regs->gprs[0] = (unsigned long)p->addr + sizeof(*insn); - } - } - if (fixup & FIXUP_PSW_NORMAL) ip += (unsigned long) p->addr - (unsigned long) p->ainsn.insn; @@ -509,12 +410,11 @@ static int post_kprobe_handler(struct pt_regs *regs) if (!p) return 0; + resume_execution(p, regs); if (kcb->kprobe_status != KPROBE_REENTER && p->post_handler) { kcb->kprobe_status = KPROBE_HIT_SSDONE; p->post_handler(p, regs, 0); } - - resume_execution(p, regs); pop_kprobe(kcb); preempt_enable_no_resched(); @@ -534,7 +434,6 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); struct kprobe *p = kprobe_running(); - const struct exception_table_entry *entry; switch(kcb->kprobe_status) { case KPROBE_HIT_SS: @@ -553,32 +452,11 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accounting - * these specific fault cases. - */ - kprobes_inc_nmissed_count(p); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (p->fault_handler && p->fault_handler(p, regs, trapnr)) - return 1; - - /* * In case the user-specified fault handler returned * zero, try to fix up. */ - entry = s390_search_extables(regs->psw.addr); - if (entry) { - regs->psw.addr = extable_fixup(entry); + if (fixup_exception(regs)) return 1; - } - /* * fixup_exception() could not handle it, * Let do_page_fault() fix it. 
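In the resume_execution() hunk above, the FIXUP_PSW_NORMAL case rebases the PSW address from the out-of-line insn slot back to the probed location by adding the distance between the original instruction and its copy. The arithmetic, as a sketch with made-up addresses:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t addr = 0x10000a00;  /* p->addr: probed instruction (made up) */
	uint64_t slot = 0x13fff000;  /* p->ainsn.insn: out-of-line copy (made up) */
	uint64_t ip = slot + 6;      /* PSW address after stepping a 6-byte insn */

	/* FIXUP_PSW_NORMAL: translate the slot-relative address back */
	ip += addr - slot;
	printf("resume at 0x%llx\n", (unsigned long long)ip); /* 0x10000a06 */
	return 0;
}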
@@ -642,18 +520,13 @@ int kprobe_exceptions_notify(struct notifier_block *self, } NOKPROBE_SYMBOL(kprobe_exceptions_notify); -static struct kprobe trampoline = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, - .pre_handler = trampoline_probe_handler -}; - int __init arch_init_kprobes(void) { - return register_kprobe(&trampoline); + return 0; } int arch_trampoline_kprobe(struct kprobe *p) { - return p->addr == (kprobe_opcode_t *) &kretprobe_trampoline; + return 0; } NOKPROBE_SYMBOL(arch_trampoline_kprobe); diff --git a/arch/s390/kernel/kprobes.h b/arch/s390/kernel/kprobes.h new file mode 100644 index 000000000000..dc3ed5098ee7 --- /dev/null +++ b/arch/s390/kernel/kprobes.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _ARCH_S390_KPROBES_H +#define _ARCH_S390_KPROBES_H + +#include <linux/kprobes.h> + +DEFINE_INSN_CACHE_OPS(s390_insn); + +#endif diff --git a/arch/s390/kernel/kprobes_insn_page.S b/arch/s390/kernel/kprobes_insn_page.S new file mode 100644 index 000000000000..0fe4d725e98b --- /dev/null +++ b/arch/s390/kernel/kprobes_insn_page.S @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/linkage.h> + +/* + * insn_page is a special 4k aligned dummy function for kprobes. + * It will contain all kprobed instructions that are executed out of line. + * The page must be within the kernel image to guarantee that the + * out-of-line instructions are within 2GB distance of their original + * location. Using a dummy function ensures that the insn_page is within + * the text section of the kernel and mapped read-only/executable from + * the beginning, thus avoiding the need to split large mappings if the + * page would be in the data section instead. + */ + .section .kprobes.text, "ax" + .balign 4096 +SYM_CODE_START(kprobes_insn_page) + .rept 2048 + .word 0x07fe + .endr +SYM_CODE_END(kprobes_insn_page) + .previous diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c index 452502f9a0d9..6652e54cf3db 100644 --- a/arch/s390/kernel/lgr.c +++ b/arch/s390/kernel/lgr.c @@ -88,8 +88,7 @@ static void lgr_stsi_2_2_2(struct lgr_info *lgr_info) if (stsi(si, 2, 2, 2)) return; cpascii(lgr_info->name, si->name, sizeof(si->name)); - memcpy(&lgr_info->lpar_number, &si->lpar_number, - sizeof(lgr_info->lpar_number)); + lgr_info->lpar_number = si->lpar_number; } /* @@ -167,7 +166,7 @@ static struct timer_list lgr_timer; */ static void lgr_timer_set(void) { - mod_timer(&lgr_timer, jiffies + LGR_TIMER_INTERVAL_SECS * HZ); + mod_timer(&lgr_timer, jiffies + msecs_to_jiffies(LGR_TIMER_INTERVAL_SECS * MSEC_PER_SEC)); } /* diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index cb8b1cc285c9..aa22ffc16bcd 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -3,7 +3,6 @@ * Copyright IBM Corp. 
2005, 2011 * * Author(s): Rolf Adelsberger, - Heiko Carstens <heiko.carstens@de.ibm.com> * Michael Holzheu <holzheu@linux.vnet.ibm.com> */ @@ -14,24 +13,25 @@ #include <linux/reboot.h> #include <linux/ftrace.h> #include <linux/debug_locks.h> -#include <linux/suspend.h> +#include <asm/pfault.h> #include <asm/cio.h> #include <asm/setup.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/smp.h> #include <asm/ipl.h> #include <asm/diag.h> #include <asm/elf.h> #include <asm/asm-offsets.h> #include <asm/cacheflush.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> #include <asm/set_memory.h> #include <asm/stacktrace.h> #include <asm/switch_to.h> #include <asm/nmi.h> +#include <asm/sclp.h> -typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long); +typedef void (*relocate_kernel_t)(unsigned long, unsigned long, unsigned long); +typedef int (*purgatory_t)(int); extern const unsigned char relocate_kernel[]; extern const unsigned long long relocate_kernel_len; @@ -39,44 +39,17 @@ extern const unsigned long long relocate_kernel_len; #ifdef CONFIG_CRASH_DUMP /* - * PM notifier callback for kdump - */ -static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action, - void *ptr) -{ - switch (action) { - case PM_SUSPEND_PREPARE: - case PM_HIBERNATION_PREPARE: - if (kexec_crash_image) - arch_kexec_unprotect_crashkres(); - break; - case PM_POST_SUSPEND: - case PM_POST_HIBERNATION: - if (kexec_crash_image) - arch_kexec_protect_crashkres(); - break; - default: - return NOTIFY_DONE; - } - return NOTIFY_OK; -} - -static int __init machine_kdump_pm_init(void) -{ - pm_notifier(machine_kdump_pm_cb, 0); - return 0; -} -arch_initcall(machine_kdump_pm_init); - -/* * Reset the system, copy boot CPU registers to absolute zero, * and jump to the kdump image */ -static void __do_machine_kdump(void *image) +static void __do_machine_kdump(void *data) { - int (*start_kdump)(int); + struct kimage *image = data; + purgatory_t purgatory; unsigned long prefix; + purgatory = (purgatory_t)image->start; + /* store_status() saved the prefix register to lowcore */ prefix = (unsigned long) S390_lowcore.prefixreg_save_area; @@ -88,14 +61,12 @@ static void __do_machine_kdump(void *image) * This needs to be done *after* s390_reset_system has set the * prefix register of this CPU to zero */ - memcpy((void *) __LC_FPREGS_SAVE_AREA, - (void *)(prefix + __LC_FPREGS_SAVE_AREA), 512); + memcpy(absolute_pointer(__LC_FPREGS_SAVE_AREA), + phys_to_virt(prefix + __LC_FPREGS_SAVE_AREA), 512); - __load_psw_mask(PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA); - start_kdump = (void *)((struct kimage *) image)->start; - start_kdump(1); + call_nodat(1, int, purgatory, int, 1); - /* Die if start_kdump returns */ + /* Die if kdump returns */ disabled_wait(); } @@ -119,16 +90,16 @@ static noinline void __machine_kdump(void *image) continue; } /* Store status of the boot CPU */ - mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); - if (MACHINE_HAS_VX) + mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); + if (cpu_has_vx()) save_vx_regs((__vector128 *) mcesa->vector_save_area); if (MACHINE_HAS_GS) { - __ctl_store(cr2_old.val, 2, 2); + local_ctl_store(2, &cr2_old.reg); cr2_new = cr2_old; cr2_new.gse = 1; - __ctl_load(cr2_new.val, 2, 2); + local_ctl_load(2, &cr2_new.reg); save_gs_cb((struct gs_cb *) mcesa->guarded_storage_save_area); - __ctl_load(cr2_old.val, 2, 2); + local_ctl_load(2, &cr2_old.reg); } /* * To create a good backchain for this CPU in the dump store_status @@ 
-142,18 +113,6 @@ static noinline void __machine_kdump(void *image) store_status(__do_machine_kdump, image); } -static unsigned long do_start_kdump(unsigned long addr) -{ - struct kimage *image = (struct kimage *) addr; - int (*start_kdump)(int) = (void *)image->start; - int rc; - - __arch_local_irq_stnsm(0xfb); /* disable DAT */ - rc = start_kdump(0); - __arch_local_irq_stosm(0x04); /* enable DAT */ - return rc; -} - #endif /* CONFIG_CRASH_DUMP */ /* @@ -162,11 +121,10 @@ static unsigned long do_start_kdump(unsigned long addr) static bool kdump_csum_valid(struct kimage *image) { #ifdef CONFIG_CRASH_DUMP + purgatory_t purgatory = (purgatory_t)image->start; int rc; - preempt_disable(); - rc = CALL_ON_STACK(do_start_kdump, S390_lowcore.nodat_stack, 1, image); - preempt_enable(); + rc = call_nodat(1, int, purgatory, int, 0); return rc == 0; #else return false; @@ -240,7 +198,7 @@ int machine_kexec_prepare(struct kimage *image) return -EINVAL; /* Get the destination where the assembler code should be copied to.*/ - reboot_code_buffer = (void *) page_to_phys(image->control_code_page); + reboot_code_buffer = page_to_virt(image->control_code_page); /* Then copy it */ memcpy(reboot_code_buffer, relocate_kernel, relocate_kernel_len); @@ -253,13 +211,17 @@ void machine_kexec_cleanup(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { + struct lowcore *abs_lc; + VMCOREINFO_SYMBOL(lowcore_ptr); VMCOREINFO_SYMBOL(high_memory); VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); - vmcoreinfo_append_str("SDMA=%lx\n", __sdma); - vmcoreinfo_append_str("EDMA=%lx\n", __edma); + vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31); + vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31); vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); - mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); + abs_lc = get_abs_lowcore(); + abs_lc->vmcore_info = paddr_vmcoreinfo_note(); + put_abs_lowcore(abs_lc); } void machine_shutdown(void) @@ -276,15 +238,20 @@ void machine_crash_shutdown(struct pt_regs *regs) */ static void __do_machine_kexec(void *data) { - relocate_kernel_t data_mover; + unsigned long data_mover, entry, diag308_subcode; struct kimage *image = data; + data_mover = page_to_phys(image->control_code_page); + entry = virt_to_phys(&image->head); + diag308_subcode = DIAG308_CLEAR_RESET; + if (sclp.has_iplcc) + diag308_subcode |= DIAG308_FLAG_EI; s390_reset_system(); - data_mover = (relocate_kernel_t) page_to_phys(image->control_code_page); - __arch_local_irq_stnsm(0xfb); /* disable DAT - avoid no-execute */ - /* Call the moving routine */ - (*data_mover)(&image->head, image->start); + call_nodat(3, void, (relocate_kernel_t)data_mover, + unsigned long, entry, + unsigned long, image->start, + unsigned long, diag308_subcode); /* Die if kexec returns */ disabled_wait(); @@ -295,7 +262,6 @@ static void __do_machine_kexec(void *data) */ static void __machine_kexec(void *data) { - __arch_local_irq_stosm(0x04); /* enable DAT */ pfault_fini(); tracing_off(); debug_locks_off(); diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 8415ae7d2a23..8d207b82d9fe 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -7,11 +7,14 @@ * Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com> */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/elf.h> #include <linux/errno.h> #include <linux/kexec.h> #include <linux/module_signature.h> #include <linux/verification.h> +#include <linux/vmalloc.h> 
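The signature check in the s390_verify_sig() hunk that follows tries the secondary keyring first and only retries against the platform keyring when the failure was a missing key (-ENOKEY), so a genuinely bad signature is never retried. The shape of that fallback, sketched with invented stub verifiers:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Stubs standing in for verify_pkcs7_signature() against two keyrings. */
static int verify_with_secondary(const void *sig) { (void)sig; return -ENOKEY; }
static int verify_with_platform(const void *sig)  { (void)sig; return 0; }

static int verify_sig(const void *sig, bool have_platform_keyring)
{
	int ret = verify_with_secondary(sig);

	/* Fall back only for "no matching key", never for a bad signature. */
	if (ret == -ENOKEY && have_platform_keyring)
		ret = verify_with_platform(sig);
	return ret;
}

int main(void)
{
	printf("verify: %d\n", verify_sig("sig", true)); /* 0: platform keyring matched */
	return 0;
}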
#include <asm/boot_data.h> #include <asm/ipl.h> #include <asm/setup.h> @@ -28,6 +31,7 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len) const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1; struct module_signature *ms; unsigned long sig_len; + int ret; /* Skip signature verification when not secure IPLed. */ if (!ipl_secure_flag) @@ -62,11 +66,18 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len) return -EBADMSG; } - return verify_pkcs7_signature(kernel, kernel_len, - kernel + kernel_len, sig_len, - VERIFY_USE_PLATFORM_KEYRING, - VERIFYING_MODULE_SIGNATURE, - NULL, NULL); + ret = verify_pkcs7_signature(kernel, kernel_len, + kernel + kernel_len, sig_len, + VERIFY_USE_SECONDARY_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); + if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) + ret = verify_pkcs7_signature(kernel, kernel_len, + kernel + kernel_len, sig_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); + return ret; } #endif /* CONFIG_KEXEC_SIG */ @@ -151,7 +162,7 @@ static int kexec_file_add_initrd(struct kimage *image, buf.mem += crashk_res.start; buf.memsz = buf.bufsz; - data->parm->initrd_start = buf.mem; + data->parm->initrd_start = data->memsz; data->parm->initrd_size = buf.memsz; data->memsz += buf.memsz; @@ -170,15 +181,14 @@ static int kexec_file_add_ipl_report(struct kimage *image, struct kexec_buf buf; unsigned long addr; void *ptr, *end; + int ret; buf.image = image; data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; - if (image->type == KEXEC_TYPE_CRASH) - buf.mem += crashk_res.start; - ptr = (void *)ipl_cert_list_addr; + ptr = __va(ipl_cert_list_addr); end = ptr + ipl_cert_list_size; ncerts = 0; while (ptr < end) { @@ -190,7 +200,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, addr = data->memsz + data->report->size; addr += ncerts * sizeof(struct ipl_rb_certificate_entry); - ptr = (void *)ipl_cert_list_addr; + ptr = __va(ipl_cert_list_addr); while (ptr < end) { len = *(unsigned int *)ptr; ptr += sizeof(len); @@ -199,9 +209,13 @@ static int kexec_file_add_ipl_report(struct kimage *image, ptr += len; } + ret = -ENOMEM; buf.buffer = ipl_report_finish(data->report); + if (!buf.buffer) + goto out; buf.bufsz = data->report->size; buf.memsz = buf.bufsz; + image->arch.ipl_buf = buf.buffer; data->memsz += buf.memsz; @@ -209,14 +223,21 @@ static int kexec_file_add_ipl_report(struct kimage *image, data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr); *lc_ipl_parmblock_ptr = (__u32)buf.mem; - return kexec_add_buffer(&buf); + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; + + ret = kexec_add_buffer(&buf); +out: + return ret; } void *kexec_file_add_components(struct kimage *image, int (*add_kernel)(struct kimage *image, struct s390_load_data *data)) { + unsigned long max_command_line_size = LEGACY_COMMAND_LINE_SIZE; struct s390_load_data data = {0}; + unsigned long minsize; int ret; data.report = ipl_report_init(&ipl_block); @@ -227,10 +248,23 @@ void *kexec_file_add_components(struct kimage *image, if (ret) goto out; - if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) { - ret = -EINVAL; + ret = -EINVAL; + minsize = PARMAREA + offsetof(struct parmarea, command_line); + if (image->kernel_buf_len < minsize) goto out; - } + + if (data.parm->max_command_line_size) + max_command_line_size = data.parm->max_command_line_size; + + if (minsize + max_command_line_size < minsize) + goto out; + + if (image->kernel_buf_len < minsize + 
max_command_line_size) + goto out; + + if (image->cmdline_buf_len >= max_command_line_size) + goto out; + memcpy(data.parm->command_line, image->cmdline_buf, image->cmdline_buf_len); @@ -267,8 +301,16 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *relsec, const Elf_Shdr *symtab) { + const char *strtab, *name, *shstrtab; + const Elf_Shdr *sechdrs; Elf_Rela *relas; int i, r_type; + int ret; + + /* String & section header string table */ + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset; + shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; relas = (void *)pi->ehdr + relsec->sh_offset; @@ -281,15 +323,27 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, sym = (void *)pi->ehdr + symtab->sh_offset; sym += ELF64_R_SYM(relas[i].r_info); - if (sym->st_shndx == SHN_UNDEF) + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + if (sym->st_shndx == SHN_UNDEF) { + pr_err("Undefined symbol: %s\n", name); return -ENOEXEC; + } - if (sym->st_shndx == SHN_COMMON) + if (sym->st_shndx == SHN_COMMON) { + pr_err("symbol '%s' in common section\n", name); return -ENOEXEC; + } if (sym->st_shndx >= pi->ehdr->e_shnum && - sym->st_shndx != SHN_ABS) + sym->st_shndx != SHN_ABS) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); return -ENOEXEC; + } loc = pi->purgatory_buf; loc += section->sh_offset; @@ -303,21 +357,23 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, addr = section->sh_addr + relas[i].r_offset; r_type = ELF64_R_TYPE(relas[i].r_info); - arch_kexec_do_relocs(r_type, loc, val, addr); + + if (r_type == R_390_PLT32DBL) + r_type = R_390_PC32DBL; + + ret = arch_kexec_do_relocs(r_type, loc, val, addr); + if (ret) { + pr_err("Unknown rela relocation: %d\n", r_type); + return -ENOEXEC; + } } return 0; } -int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) +int arch_kimage_file_post_load_cleanup(struct kimage *image) { - /* A kernel must be at least large enough to contain head.S. During - * load memory in head.S will be accessed, e.g. to register the next - * command line. If the next kernel were smaller the current kernel - * will panic at load. - */ - if (buf_len < HEAD_END) - return -ENOEXEC; - - return kexec_image_probe_default(image, buf, buf_len); + vfree(image->arch.ipl_buf); + image->arch.ipl_buf = NULL; + + return kexec_image_post_load_cleanup_default(image); } diff --git a/arch/s390/kernel/machine_kexec_reloc.c b/arch/s390/kernel/machine_kexec_reloc.c index d5035de9020e..b7182cec48dc 100644 --- a/arch/s390/kernel/machine_kexec_reloc.c +++ b/arch/s390/kernel/machine_kexec_reloc.c @@ -28,6 +28,7 @@ int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, break; case R_390_64: /* Direct 64 bit. */ case R_390_GLOB_DAT: + case R_390_JMP_SLOT: *(u64 *)loc = val; break; case R_390_PC16: /* PC relative 16 bit. */ diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index 9e1660a6b9db..ae4d4fd9afcd 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -2,8 +2,6 @@ /* * Copyright IBM Corp. 
2008, 2009 * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, - * */ #include <linux/linkage.h> @@ -11,84 +9,187 @@ #include <asm/ftrace.h> #include <asm/nospec-insn.h> #include <asm/ptrace.h> -#include <asm/export.h> + +#define STACK_FRAME_SIZE_PTREGS (STACK_FRAME_OVERHEAD + __PT_SIZE) +#define STACK_PTREGS (STACK_FRAME_OVERHEAD) +#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) +#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) + +#define STACK_FRAME_SIZE_FREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_SIZE) +#define STACK_FREGS (STACK_FRAME_OVERHEAD) +#define STACK_FREGS_PTREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_PT_REGS) +#define STACK_FREGS_PTREGS_GPRS (STACK_FREGS_PTREGS + __PT_GPRS) +#define STACK_FREGS_PTREGS_PSW (STACK_FREGS_PTREGS + __PT_PSW) +#define STACK_FREGS_PTREGS_ORIG_GPR2 (STACK_FREGS_PTREGS + __PT_ORIG_GPR2) +#define STACK_FREGS_PTREGS_FLAGS (STACK_FREGS_PTREGS + __PT_FLAGS) + +/* packed stack: allocate just enough for r14, r15 and backchain */ +#define TRACED_FUNC_FRAME_SIZE 24 + +#ifdef CONFIG_FUNCTION_TRACER GEN_BR_THUNK %r1 GEN_BR_THUNK %r14 .section .kprobes.text, "ax" -ENTRY(ftrace_stub) +SYM_FUNC_START(ftrace_stub) BR_EX %r14 -ENDPROC(ftrace_stub) +SYM_FUNC_END(ftrace_stub) -#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) -#define STACK_PTREGS (STACK_FRAME_OVERHEAD) -#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) -#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) +SYM_CODE_START(ftrace_stub_direct_tramp) + lgr %r1, %r0 + BR_EX %r1 +SYM_CODE_END(ftrace_stub_direct_tramp) -ENTRY(_mcount) - BR_EX %r14 -ENDPROC(_mcount) -EXPORT_SYMBOL(_mcount) + .macro ftrace_regs_entry, allregs=0 + stg %r14,(__SF_GPRS+8*8)(%r15) # save traced function caller + + .if \allregs == 1 + # save psw mask + # don't put any instructions clobbering CC before this point + epsw %r1,%r14 + risbg %r14,%r1,0,31,32 + .endif -ENTRY(ftrace_caller) - .globl ftrace_regs_caller - .set ftrace_regs_caller,ftrace_caller lgr %r1,%r15 -#if !(defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT)) - aghi %r0,MCOUNT_RETURN_FIXUP -#endif - aghi %r15,-STACK_FRAME_SIZE + # allocate stack frame for ftrace_caller to contain traced function + aghi %r15,-TRACED_FUNC_FRAME_SIZE stg %r1,__SF_BACKCHAIN(%r15) - stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15) - stg %r0,(STACK_PTREGS_PSW+8)(%r15) - stmg %r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15) + stg %r0,(__SF_GPRS+8*8)(%r15) + stg %r15,(__SF_GPRS+9*8)(%r15) + # allocate ftrace_regs and stack frame for ftrace_trace_function + aghi %r15,-STACK_FRAME_SIZE_FREGS + stg %r1,(STACK_FREGS_PTREGS_GPRS+15*8)(%r15) + xc STACK_FREGS_PTREGS_ORIG_GPR2(8,%r15),STACK_FREGS_PTREGS_ORIG_GPR2(%r15) + + .if \allregs == 1 + stg %r14,(STACK_FREGS_PTREGS_PSW)(%r15) + mvghi STACK_FREGS_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS + .else + xc STACK_FREGS_PTREGS_FLAGS(8,%r15),STACK_FREGS_PTREGS_FLAGS(%r15) + .endif + + lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address + aghi %r1,-TRACED_FUNC_FRAME_SIZE + stg %r1,__SF_BACKCHAIN(%r15) + stg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15) + stmg %r2,%r14,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) + .endm + +SYM_CODE_START(ftrace_regs_caller) + ftrace_regs_entry 1 + j ftrace_common +SYM_CODE_END(ftrace_regs_caller) + +SYM_CODE_START(ftrace_caller) + ftrace_regs_entry 0 + j ftrace_common +SYM_CODE_END(ftrace_caller) + +SYM_CODE_START(ftrace_common) #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES aghik %r2,%r0,-MCOUNT_INSN_SIZE lgrl %r4,function_trace_op - lgrl %r1,ftrace_trace_function + lgrl %r1,ftrace_func #else lgr %r2,%r0 aghi 
%r2,-MCOUNT_INSN_SIZE larl %r4,function_trace_op lg %r4,0(%r4) - larl %r1,ftrace_trace_function + larl %r1,ftrace_func lg %r1,0(%r1) #endif lgr %r3,%r14 - la %r5,STACK_PTREGS(%r15) + la %r5,STACK_FREGS(%r15) BASR_EX %r14,%r1 #ifdef CONFIG_FUNCTION_GRAPH_TRACER # The j instruction gets runtime patched to a nop instruction. # See ftrace_enable_ftrace_graph_caller. - .globl ftrace_graph_caller -ftrace_graph_caller: - j ftrace_graph_caller_end - lmg %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15) - lg %r4,(STACK_PTREGS_PSW+8)(%r15) +SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL) + j .Lftrace_graph_caller_end + lmg %r2,%r3,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15) + lg %r4,(STACK_FREGS_PTREGS_PSW+8)(%r15) brasl %r14,prepare_ftrace_return - stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) -ftrace_graph_caller_end: - .globl ftrace_graph_caller_end + stg %r2,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15) +.Lftrace_graph_caller_end: +#endif + lg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15) +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + ltg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15) + locgrz %r1,%r0 +#else + lg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15) + ltgr %r1,%r1 + jnz 0f + lgr %r1,%r0 #endif - lg %r1,(STACK_PTREGS_PSW+8)(%r15) - lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) +0: lmg %r2,%r15,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) BR_EX %r1 -ENDPROC(ftrace_caller) +SYM_CODE_END(ftrace_common) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(return_to_handler) +SYM_FUNC_START(return_to_handler) stmg %r2,%r5,32(%r15) lgr %r1,%r15 - aghi %r15,-STACK_FRAME_OVERHEAD + aghi %r15,-(STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE) stg %r1,__SF_BACKCHAIN(%r15) + la %r3,STACK_FRAME_OVERHEAD(%r15) + stg %r1,__FGRAPH_RET_FP(%r3) + stg %r2,__FGRAPH_RET_GPR2(%r3) + lgr %r2,%r3 brasl %r14,ftrace_return_to_handler - aghi %r15,STACK_FRAME_OVERHEAD + aghi %r15,STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE lgr %r14,%r2 lmg %r2,%r5,32(%r15) BR_EX %r14 -ENDPROC(return_to_handler) +SYM_FUNC_END(return_to_handler) #endif +#endif /* CONFIG_FUNCTION_TRACER */ + +SYM_CODE_START(ftrace_shared_hotpatch_trampoline_br) + lmg %r0,%r1,2(%r1) + br %r1 +SYM_INNER_LABEL(ftrace_shared_hotpatch_trampoline_br_end, SYM_L_GLOBAL) +SYM_CODE_END(ftrace_shared_hotpatch_trampoline_br) + +#ifdef CONFIG_EXPOLINE +SYM_CODE_START(ftrace_shared_hotpatch_trampoline_exrl) + lmg %r0,%r1,2(%r1) + exrl %r0,0f + j . 
+0: br %r1 +SYM_INNER_LABEL(ftrace_shared_hotpatch_trampoline_exrl_end, SYM_L_GLOBAL) +SYM_CODE_END(ftrace_shared_hotpatch_trampoline_exrl) +#endif /* CONFIG_EXPOLINE */ + +#ifdef CONFIG_RETHOOK + +SYM_CODE_START(arch_rethook_trampoline) + stg %r14,(__SF_GPRS+8*8)(%r15) + lay %r15,-STACK_FRAME_SIZE_PTREGS(%r15) + stmg %r0,%r14,STACK_PTREGS_GPRS(%r15) + + # store original stack pointer in backchain and pt_regs + lay %r7,STACK_FRAME_SIZE_PTREGS(%r15) + stg %r7,__SF_BACKCHAIN(%r15) + stg %r7,STACK_PTREGS_GPRS+(15*8)(%r15) + + # store full psw + epsw %r2,%r3 + risbg %r3,%r2,0,31,32 + stg %r3,STACK_PTREGS_PSW(%r15) + larl %r1,arch_rethook_trampoline + stg %r1,STACK_PTREGS_PSW+8(%r15) + + lay %r2,STACK_PTREGS(%r15) + brasl %r14,arch_rethook_trampoline_callback + + mvc __SF_EMPTY(16,%r7),STACK_PTREGS_PSW(%r15) + lmg %r0,%r15,STACK_PTREGS_GPRS(%r15) + lpswe __SF_EMPTY(%r15) +SYM_CODE_END(arch_rethook_trampoline) + +#endif /* CONFIG_RETHOOK */ diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index ba8f19bb438b..42215f9404af 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -14,14 +14,19 @@ #include <linux/elf.h> #include <linux/vmalloc.h> #include <linux/fs.h> +#include <linux/ftrace.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/kasan.h> #include <linux/moduleloader.h> #include <linux/bug.h> +#include <linux/memory.h> #include <asm/alternative.h> #include <asm/nospec-branch.h> #include <asm/facility.h> +#include <asm/ftrace.lds.h> +#include <asm/set_memory.h> +#include <asm/setup.h> #if 0 #define DEBUGP printk @@ -29,24 +34,52 @@ #define DEBUGP(fmt , ...) #endif -#define PLT_ENTRY_SIZE 20 +#define PLT_ENTRY_SIZE 22 + +static unsigned long get_module_load_offset(void) +{ + static DEFINE_MUTEX(module_kaslr_mutex); + static unsigned long module_load_offset; + + if (!kaslr_enabled()) + return 0; + /* + * Calculate the module_load_offset the first time this code + * is called. Once calculated it stays the same until reboot. + */ + mutex_lock(&module_kaslr_mutex); + if (!module_load_offset) + module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; + mutex_unlock(&module_kaslr_mutex); + return module_load_offset; +} void *module_alloc(unsigned long size) { + gfp_t gfp_mask = GFP_KERNEL; void *p; if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + p = __vmalloc_node_range(size, MODULE_ALIGN, + MODULES_VADDR + get_module_load_offset(), + MODULES_END, gfp_mask, PAGE_KERNEL, + VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, + NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } return p; } +#ifdef CONFIG_FUNCTION_TRACER +void module_arch_cleanup(struct module *mod) +{ + module_memfree(mod->arch.trampolines_start); +} +#endif + void module_arch_freeing_init(struct module *mod) { if (is_livepatch_module(mod) && @@ -114,6 +147,7 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, Elf_Rela *rela; char *strings; int nrela, i, j; + struct module_memory *mod_mem; /* Find symbol table and string table. */ symtab = NULL; @@ -161,23 +195,26 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, /* Increase core size by size of got & plt and set start offsets for got and plt. 
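The sizing code that follows this comment appends the GOT and PLT to the module's text: align the current size, place the GOT there, place the PLT behind it, and reserve one extra 22-byte entry for the shared expoline thunk when expolines are active. A toy computation of the resulting offsets (all sizes invented):

#include <stdio.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))
#define PLT_ENTRY_SIZE 22

int main(void)
{
	unsigned long size = 0x1802a;                /* MOD_TEXT size so far (made up) */
	unsigned long got_size = 15 * 8;             /* 15 GOT slots (made up) */
	unsigned long plt_size = 7 * PLT_ENTRY_SIZE; /* 7 PLT entries (made up) */
	int expoline = 1;                            /* CONFIG_EXPOLINE && !nospec_disable */
	unsigned long got_offset, plt_offset;

	size = ALIGN_UP(size, 4);
	got_offset = size;
	size += got_size;
	plt_offset = size;
	if (plt_size) {
		if (expoline)
			plt_size += PLT_ENTRY_SIZE; /* shared __jump_r1 thunk at the end */
		size += plt_size;
	}
	printf("got@0x%lx plt@0x%lx total 0x%lx\n", got_offset, plt_offset, size);
	return 0;
}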
*/ - me->core_layout.size = ALIGN(me->core_layout.size, 4); - me->arch.got_offset = me->core_layout.size; - me->core_layout.size += me->arch.got_size; - me->arch.plt_offset = me->core_layout.size; + mod_mem = &me->mem[MOD_TEXT]; + mod_mem->size = ALIGN(mod_mem->size, 4); + me->arch.got_offset = mod_mem->size; + mod_mem->size += me->arch.got_size; + me->arch.plt_offset = mod_mem->size; if (me->arch.plt_size) { if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) me->arch.plt_size += PLT_ENTRY_SIZE; - me->core_layout.size += me->arch.plt_size; + mod_mem->size += me->arch.plt_size; } return 0; } static int apply_rela_bits(Elf_Addr loc, Elf_Addr val, - int sign, int bits, int shift) + int sign, int bits, int shift, + void *(*write)(void *dest, const void *src, size_t len)) { unsigned long umax; long min, max; + void *dest = (void *)loc; if (val & ((1UL << shift) - 1)) return -ENOEXEC; @@ -194,26 +231,33 @@ static int apply_rela_bits(Elf_Addr loc, Elf_Addr val, return -ENOEXEC; } - if (bits == 8) - *(unsigned char *) loc = val; - else if (bits == 12) - *(unsigned short *) loc = (val & 0xfff) | + if (bits == 8) { + unsigned char tmp = val; + write(dest, &tmp, 1); + } else if (bits == 12) { + unsigned short tmp = (val & 0xfff) | (*(unsigned short *) loc & 0xf000); - else if (bits == 16) - *(unsigned short *) loc = val; - else if (bits == 20) - *(unsigned int *) loc = (val & 0xfff) << 16 | - (val & 0xff000) >> 4 | - (*(unsigned int *) loc & 0xf00000ff); - else if (bits == 32) - *(unsigned int *) loc = val; - else if (bits == 64) - *(unsigned long *) loc = val; + write(dest, &tmp, 2); + } else if (bits == 16) { + unsigned short tmp = val; + write(dest, &tmp, 2); + } else if (bits == 20) { + unsigned int tmp = (val & 0xfff) << 16 | + (val & 0xff000) >> 4 | (*(unsigned int *) loc & 0xf00000ff); + write(dest, &tmp, 4); + } else if (bits == 32) { + unsigned int tmp = val; + write(dest, &tmp, 4); + } else if (bits == 64) { + unsigned long tmp = val; + write(dest, &tmp, 8); + } return 0; } static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, - const char *strtab, struct module *me) + const char *strtab, struct module *me, + void *(*write)(void *dest, const void *src, size_t len)) { struct mod_arch_syminfo *info; Elf_Addr loc, val; @@ -241,17 +285,17 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_64: /* Direct 64 bit. */ val += rela->r_addend; if (r_type == R_390_8) - rc = apply_rela_bits(loc, val, 0, 8, 0); + rc = apply_rela_bits(loc, val, 0, 8, 0, write); else if (r_type == R_390_12) - rc = apply_rela_bits(loc, val, 0, 12, 0); + rc = apply_rela_bits(loc, val, 0, 12, 0, write); else if (r_type == R_390_16) - rc = apply_rela_bits(loc, val, 0, 16, 0); + rc = apply_rela_bits(loc, val, 0, 16, 0, write); else if (r_type == R_390_20) - rc = apply_rela_bits(loc, val, 1, 20, 0); + rc = apply_rela_bits(loc, val, 1, 20, 0, write); else if (r_type == R_390_32) - rc = apply_rela_bits(loc, val, 0, 32, 0); + rc = apply_rela_bits(loc, val, 0, 32, 0, write); else if (r_type == R_390_64) - rc = apply_rela_bits(loc, val, 0, 64, 0); + rc = apply_rela_bits(loc, val, 0, 64, 0, write); break; case R_390_PC16: /* PC relative 16 bit. */ case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. */ @@ -260,15 +304,15 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_PC64: /* PC relative 64 bit. 
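apply_rela_bits() above now routes every store through a write callback, but the bit packing itself is unchanged; the 20-bit case is the only non-obvious one, merging the 12-bit DL and 8-bit DH displacement parts into the preserved instruction word. Just that packing, as a sketch with sample values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t loc_word = 0xe3102000; /* pretend insn word at the reloc site (made up) */
	int32_t val = 0x4b0a5;          /* 20-bit signed displacement to apply (made up) */
	unsigned int packed;

	/* bits == 20: DL = low 12 bits, DH = next 8 bits; all other
	 * instruction bits are preserved via the 0xf00000ff mask. */
	packed = ((uint32_t)val & 0xfff) << 16 |
		 ((uint32_t)val & 0xff000) >> 4 |
		 (loc_word & 0xf00000ff);
	printf("0x%08x\n", packed);
	return 0;
}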
*/ val += rela->r_addend - loc; if (r_type == R_390_PC16) - rc = apply_rela_bits(loc, val, 1, 16, 0); + rc = apply_rela_bits(loc, val, 1, 16, 0, write); else if (r_type == R_390_PC16DBL) - rc = apply_rela_bits(loc, val, 1, 16, 1); + rc = apply_rela_bits(loc, val, 1, 16, 1, write); else if (r_type == R_390_PC32DBL) - rc = apply_rela_bits(loc, val, 1, 32, 1); + rc = apply_rela_bits(loc, val, 1, 32, 1, write); else if (r_type == R_390_PC32) - rc = apply_rela_bits(loc, val, 1, 32, 0); + rc = apply_rela_bits(loc, val, 1, 32, 0, write); else if (r_type == R_390_PC64) - rc = apply_rela_bits(loc, val, 1, 64, 0); + rc = apply_rela_bits(loc, val, 1, 64, 0, write); break; case R_390_GOT12: /* 12 bit GOT offset. */ case R_390_GOT16: /* 16 bit GOT offset. */ @@ -283,33 +327,34 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_GOTPLT64: /* 64 bit offset to jump slot. */ case R_390_GOTPLTENT: /* 32 bit rel. offset to jump slot >> 1. */ if (info->got_initialized == 0) { - Elf_Addr *gotent; + Elf_Addr *gotent = me->mem[MOD_TEXT].base + + me->arch.got_offset + + info->got_offset; - gotent = me->core_layout.base + me->arch.got_offset + - info->got_offset; - *gotent = val; + write(gotent, &val, sizeof(*gotent)); info->got_initialized = 1; } val = info->got_offset + rela->r_addend; if (r_type == R_390_GOT12 || r_type == R_390_GOTPLT12) - rc = apply_rela_bits(loc, val, 0, 12, 0); + rc = apply_rela_bits(loc, val, 0, 12, 0, write); else if (r_type == R_390_GOT16 || r_type == R_390_GOTPLT16) - rc = apply_rela_bits(loc, val, 0, 16, 0); + rc = apply_rela_bits(loc, val, 0, 16, 0, write); else if (r_type == R_390_GOT20 || r_type == R_390_GOTPLT20) - rc = apply_rela_bits(loc, val, 1, 20, 0); + rc = apply_rela_bits(loc, val, 1, 20, 0, write); else if (r_type == R_390_GOT32 || r_type == R_390_GOTPLT32) - rc = apply_rela_bits(loc, val, 0, 32, 0); + rc = apply_rela_bits(loc, val, 0, 32, 0, write); else if (r_type == R_390_GOT64 || r_type == R_390_GOTPLT64) - rc = apply_rela_bits(loc, val, 0, 64, 0); + rc = apply_rela_bits(loc, val, 0, 64, 0, write); else if (r_type == R_390_GOTENT || r_type == R_390_GOTPLTENT) { - val += (Elf_Addr) me->core_layout.base - loc; - rc = apply_rela_bits(loc, val, 1, 32, 1); + val += (Elf_Addr)me->mem[MOD_TEXT].base + + me->arch.got_offset - loc; + rc = apply_rela_bits(loc, val, 1, 32, 1, write); } break; case R_390_PLT16DBL: /* 16 bit PC rel. PLT shifted by 1. */ @@ -320,25 +365,28 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_PLTOFF32: /* 32 bit offset from GOT to PLT. */ case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. 
*/ if (info->plt_initialized == 0) { - unsigned int *ip; - ip = me->core_layout.base + me->arch.plt_offset + - info->plt_offset; - ip[0] = 0x0d10e310; /* basr 1,0 */ - ip[1] = 0x100a0004; /* lg 1,10(1) */ + unsigned char insn[PLT_ENTRY_SIZE]; + char *plt_base; + char *ip; + + plt_base = me->mem[MOD_TEXT].base + me->arch.plt_offset; + ip = plt_base + info->plt_offset; + *(int *)insn = 0x0d10e310; /* basr 1,0 */ + *(int *)&insn[4] = 0x100c0004; /* lg 1,12(1) */ if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) { - unsigned int *ij; - ij = me->core_layout.base + - me->arch.plt_offset + - me->arch.plt_size - PLT_ENTRY_SIZE; - ip[2] = 0xa7f40000 + /* j __jump_r1 */ - (unsigned int)(u16) - (((unsigned long) ij - 8 - - (unsigned long) ip) / 2); + char *jump_r1; + + jump_r1 = plt_base + me->arch.plt_size - + PLT_ENTRY_SIZE; + /* brcl 0xf,__jump_r1 */ + *(short *)&insn[8] = 0xc0f4; + *(int *)&insn[10] = (jump_r1 - (ip + 8)) / 2; } else { - ip[2] = 0x07f10000; /* br %r1 */ + *(int *)&insn[8] = 0x07f10000; /* br %r1 */ } - ip[3] = (unsigned int) (val >> 32); - ip[4] = (unsigned int) val; + *(long *)&insn[14] = val; + + write(ip, insn, sizeof(insn)); info->plt_initialized = 1; } if (r_type == R_390_PLTOFF16 || @@ -351,44 +399,44 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, val - loc + 0xffffUL < 0x1ffffeUL) || (r_type == R_390_PLT32DBL && val - loc + 0xffffffffULL < 0x1fffffffeULL))) - val = (Elf_Addr) me->core_layout.base + + val = (Elf_Addr) me->mem[MOD_TEXT].base + me->arch.plt_offset + info->plt_offset; val += rela->r_addend - loc; } if (r_type == R_390_PLT16DBL) - rc = apply_rela_bits(loc, val, 1, 16, 1); + rc = apply_rela_bits(loc, val, 1, 16, 1, write); else if (r_type == R_390_PLTOFF16) - rc = apply_rela_bits(loc, val, 0, 16, 0); + rc = apply_rela_bits(loc, val, 0, 16, 0, write); else if (r_type == R_390_PLT32DBL) - rc = apply_rela_bits(loc, val, 1, 32, 1); + rc = apply_rela_bits(loc, val, 1, 32, 1, write); else if (r_type == R_390_PLT32 || r_type == R_390_PLTOFF32) - rc = apply_rela_bits(loc, val, 0, 32, 0); + rc = apply_rela_bits(loc, val, 0, 32, 0, write); else if (r_type == R_390_PLT64 || r_type == R_390_PLTOFF64) - rc = apply_rela_bits(loc, val, 0, 64, 0); + rc = apply_rela_bits(loc, val, 0, 64, 0, write); break; case R_390_GOTOFF16: /* 16 bit offset to GOT. */ case R_390_GOTOFF32: /* 32 bit offset to GOT. */ case R_390_GOTOFF64: /* 64 bit offset to GOT. */ val = val + rela->r_addend - - ((Elf_Addr) me->core_layout.base + me->arch.got_offset); + ((Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset); if (r_type == R_390_GOTOFF16) - rc = apply_rela_bits(loc, val, 0, 16, 0); + rc = apply_rela_bits(loc, val, 0, 16, 0, write); else if (r_type == R_390_GOTOFF32) - rc = apply_rela_bits(loc, val, 0, 32, 0); + rc = apply_rela_bits(loc, val, 0, 32, 0, write); else if (r_type == R_390_GOTOFF64) - rc = apply_rela_bits(loc, val, 0, 64, 0); + rc = apply_rela_bits(loc, val, 0, 64, 0, write); break; case R_390_GOTPC: /* 32 bit PC relative offset to GOT. */ case R_390_GOTPCDBL: /* 32 bit PC rel. off. to GOT shifted by 1. 
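The 22-byte PLT entry built above works because basr %r1,0 leaves the address of the following instruction in %r1, so lg %r1,12(%r1) fetches the 8-byte target stored at entry offset 14 (= 2 + 12). A byte-level sketch of the non-expoline entry (target address invented; bytes written in s390 big-endian order):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned char insn[22];
	uint64_t target = 0x3ff80124680ULL; /* final symbol address (made up) */
	int i;

	/* basr %r1,0 : %r1 = address of the next instruction (entry + 2) */
	insn[0] = 0x0d; insn[1] = 0x10;
	/* lg %r1,12(%r1) : load the target stored at entry offset 14 */
	insn[2] = 0xe3; insn[3] = 0x10; insn[4] = 0x10; insn[5] = 0x0c;
	insn[6] = 0x00; insn[7] = 0x04;
	/* br %r1 (the non-expoline case); padding up to offset 14 */
	insn[8] = 0x07; insn[9] = 0xf1;
	insn[10] = insn[11] = insn[12] = insn[13] = 0;
	/* 8-byte target address at offset 14 */
	for (i = 0; i < 8; i++)
		insn[14 + i] = (unsigned char)(target >> (56 - 8 * i));

	for (i = 0; i < 22; i++)
		printf("%02x%s", insn[i], i == 21 ? "\n" : " ");
	return 0;
}

In the expoline case, bytes 8-13 instead hold a brcl 0xf,__jump_r1 to the shared thunk at the end of the PLT.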
*/ - val = (Elf_Addr) me->core_layout.base + me->arch.got_offset + + val = (Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset + rela->r_addend - loc; if (r_type == R_390_GOTPC) - rc = apply_rela_bits(loc, val, 1, 32, 0); + rc = apply_rela_bits(loc, val, 1, 32, 0, write); else if (r_type == R_390_GOTPCDBL) - rc = apply_rela_bits(loc, val, 1, 32, 1); + rc = apply_rela_bits(loc, val, 1, 32, 1, write); break; case R_390_COPY: case R_390_GLOB_DAT: /* Create GOT entry. */ @@ -412,9 +460,10 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, return 0; } -int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, +static int __apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, unsigned int symindex, unsigned int relsec, - struct module *me) + struct module *me, + void *(*write)(void *dest, const void *src, size_t len)) { Elf_Addr base; Elf_Sym *symtab; @@ -430,13 +479,51 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, n = sechdrs[relsec].sh_size / sizeof(Elf_Rela); for (i = 0; i < n; i++, rela++) { - rc = apply_rela(rela, base, symtab, strtab, me); + rc = apply_rela(rela, base, symtab, strtab, me, write); if (rc) return rc; } return 0; } +int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, + unsigned int symindex, unsigned int relsec, + struct module *me) +{ + bool early = me->state == MODULE_STATE_UNFORMED; + void *(*write)(void *, const void *, size_t) = memcpy; + + if (!early) + write = s390_kernel_write; + + return __apply_relocate_add(sechdrs, strtab, symindex, relsec, me, + write); +} + +#ifdef CONFIG_FUNCTION_TRACER +static int module_alloc_ftrace_hotpatch_trampolines(struct module *me, + const Elf_Shdr *s) +{ + char *start, *end; + int numpages; + size_t size; + + size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size); + numpages = DIV_ROUND_UP(size, PAGE_SIZE); + start = module_alloc(numpages * PAGE_SIZE); + if (!start) + return -ENOMEM; + set_memory_rox((unsigned long)start, numpages); + end = start + size; + + me->arch.trampolines_start = (struct ftrace_hotpatch_trampoline *)start; + me->arch.trampolines_end = (struct ftrace_hotpatch_trampoline *)end; + me->arch.next_trampoline = me->arch.trampolines_start; + + return 0; +} +#endif /* CONFIG_FUNCTION_TRACER */ + int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) @@ -444,22 +531,19 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *s; char *secstrings, *secname; void *aseg; +#ifdef CONFIG_FUNCTION_TRACER + int ret; +#endif if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable && me->arch.plt_size) { unsigned int *ij; - ij = me->core_layout.base + me->arch.plt_offset + + ij = me->mem[MOD_TEXT].base + me->arch.plt_offset + me->arch.plt_size - PLT_ENTRY_SIZE; - if (test_facility(35)) { - ij[0] = 0xc6000000; /* exrl %r0,.+10 */ - ij[1] = 0x0005a7f4; /* j . */ - ij[2] = 0x000007f1; /* br %r1 */ - } else { - ij[0] = 0x44000000 | (unsigned int) - offsetof(struct lowcore, br_r1_trampoline); - ij[1] = 0xa7f40000; /* j . */ - } + ij[0] = 0xc6000000; /* exrl %r0,.+10 */ + ij[1] = 0x0005a7f4; /* j . 
*/ + ij[2] = 0x000007f1; /* br %r1 */ } secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; @@ -478,8 +562,15 @@ int module_finalize(const Elf_Ehdr *hdr, if (IS_ENABLED(CONFIG_EXPOLINE) && (str_has_prefix(secname, ".s390_return"))) nospec_revert(aseg, aseg + s->sh_size); + +#ifdef CONFIG_FUNCTION_TRACER + if (!strcmp(FTRACE_CALLSITE_SECTION, secname)) { + ret = module_alloc_ftrace_hotpatch_trampolines(me, s); + if (ret < 0) + return ret; + } +#endif /* CONFIG_FUNCTION_TRACER */ } - jump_label_apply_nops(me); return 0; } diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 0a487fae763e..9ad44c26d1a2 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -6,12 +6,12 @@ * Author(s): Ingo Adlung <adlung@de.ibm.com>, * Martin Schwidefsky <schwidefsky@de.ibm.com>, * Cornelia Huck <cornelia.huck@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, */ #include <linux/kernel_stat.h> #include <linux/init.h> #include <linux/errno.h> +#include <linux/entry-common.h> #include <linux/hardirq.h> #include <linux/log2.h> #include <linux/kprobes.h> @@ -19,18 +19,20 @@ #include <linux/time.h> #include <linux/module.h> #include <linux/sched/signal.h> - +#include <linux/kvm_host.h> #include <linux/export.h> #include <asm/lowcore.h> +#include <asm/ctlreg.h> #include <asm/smp.h> #include <asm/stp.h> #include <asm/cputime.h> #include <asm/nmi.h> #include <asm/crw.h> #include <asm/switch_to.h> -#include <asm/ctl_reg.h> #include <asm/asm-offsets.h> -#include <linux/kvm_host.h> +#include <asm/pai.h> +#include <asm/vx-insn.h> +#include <asm/fpu/api.h> struct mcck_struct { unsigned int kill_task : 1; @@ -41,116 +43,134 @@ struct mcck_struct { }; static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); -static struct kmem_cache *mcesa_cache; -static unsigned long mcesa_origin_lc; static inline int nmi_needs_mcesa(void) { - return MACHINE_HAS_VX || MACHINE_HAS_GS; -} - -static inline unsigned long nmi_get_mcesa_size(void) -{ - if (MACHINE_HAS_GS) - return MCESA_MAX_SIZE; - return MCESA_MIN_SIZE; + return cpu_has_vx() || MACHINE_HAS_GS; } /* * The initial machine check extended save area for the boot CPU. - * It will be replaced by nmi_init() with an allocated structure. - * The structure is required for machine check happening early in - * the boot process. + * It will be replaced on the boot CPU reinit with an allocated + * structure. The structure is required for machine check happening + * early in the boot process. */ -static struct mcesa boot_mcesa __initdata __aligned(MCESA_MAX_SIZE); +static struct mcesa boot_mcesa __aligned(MCESA_MAX_SIZE); -void __init nmi_alloc_boot_cpu(struct lowcore *lc) +void __init nmi_alloc_mcesa_early(u64 *mcesad) { if (!nmi_needs_mcesa()) return; - lc->mcesad = (unsigned long) &boot_mcesa; + *mcesad = __pa(&boot_mcesa); if (MACHINE_HAS_GS) - lc->mcesad |= ilog2(MCESA_MAX_SIZE); + *mcesad |= ilog2(MCESA_MAX_SIZE); } -static int __init nmi_init(void) +int nmi_alloc_mcesa(u64 *mcesad) { - unsigned long origin, cr0, size; + unsigned long size; + void *origin; + *mcesad = 0; if (!nmi_needs_mcesa()) return 0; - size = nmi_get_mcesa_size(); - if (size > MCESA_MIN_SIZE) - mcesa_origin_lc = ilog2(size); - /* create slab cache for the machine-check-extended-save-areas */ - mcesa_cache = kmem_cache_create("nmi_save_areas", size, size, 0, NULL); - if (!mcesa_cache) - panic("Couldn't create nmi save area cache"); - origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL); + size = MACHINE_HAS_GS ? 
MCESA_MAX_SIZE : MCESA_MIN_SIZE; + origin = kmalloc(size, GFP_KERNEL); if (!origin) - panic("Couldn't allocate nmi save area"); + return -ENOMEM; /* The pointer is stored with mcesa_bits ORed in */ - kmemleak_not_leak((void *) origin); - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); /* disable lowcore protection */ - /* Replace boot_mcesa on the boot CPU */ - S390_lowcore.mcesad = origin | mcesa_origin_lc; - __ctl_load(cr0, 0, 0); + kmemleak_not_leak(origin); + *mcesad = __pa(origin); + if (MACHINE_HAS_GS) + *mcesad |= ilog2(MCESA_MAX_SIZE); return 0; } -early_initcall(nmi_init); -int nmi_alloc_per_cpu(struct lowcore *lc) +void nmi_free_mcesa(u64 *mcesad) { - unsigned long origin; - if (!nmi_needs_mcesa()) - return 0; - origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL); - if (!origin) - return -ENOMEM; - /* The pointer is stored with mcesa_bits ORed in */ - kmemleak_not_leak((void *) origin); - lc->mcesad = origin | mcesa_origin_lc; - return 0; + return; + kfree(__va(*mcesad & MCESA_ORIGIN_MASK)); } -void nmi_free_per_cpu(struct lowcore *lc) +static __always_inline char *nmi_puts(char *dest, const char *src) { - if (!nmi_needs_mcesa()) - return; - kmem_cache_free(mcesa_cache, (void *)(lc->mcesad & MCESA_ORIGIN_MASK)); + while (*src) + *dest++ = *src++; + *dest = 0; + return dest; +} + +static __always_inline char *u64_to_hex(char *dest, u64 val) +{ + int i, num; + + for (i = 1; i <= 16; i++) { + num = (val >> (64 - 4 * i)) & 0xf; + if (num >= 10) + *dest++ = 'A' + num - 10; + else + *dest++ = '0' + num; + } + *dest = 0; + return dest; } static notrace void s390_handle_damage(void) { + union ctlreg0 cr0, cr0_new; + char message[100]; + psw_t psw_save; + char *ptr; + smp_emergency_stop(); + diag_amode31_ops.diag308_reset(); + ptr = nmi_puts(message, "System stopped due to unrecoverable machine check, code: 0x"); + u64_to_hex(ptr, S390_lowcore.mcck_interruption_code); + + /* + * Disable low address protection and make machine check new PSW a + * disabled wait PSW. Any additional machine check cannot be handled. + */ + local_ctl_store(0, &cr0.reg); + cr0_new = cr0; + cr0_new.lap = 0; + local_ctl_load(0, &cr0_new.reg); + psw_save = S390_lowcore.mcck_new_psw; + psw_bits(S390_lowcore.mcck_new_psw).io = 0; + psw_bits(S390_lowcore.mcck_new_psw).ext = 0; + psw_bits(S390_lowcore.mcck_new_psw).wait = 1; + sclp_emergency_printk(message); + + /* + * Restore machine check new PSW and control register 0 to original + * values. This makes system dump analysis easier. + */ + S390_lowcore.mcck_new_psw = psw_save; + local_ctl_load(0, &cr0.reg); disabled_wait(); while (1); } NOKPROBE_SYMBOL(s390_handle_damage); /* - * Main machine check handler function. Will be called with interrupts enabled - * or disabled and machine checks enabled or disabled. + * Main machine check handler function. Will be called with interrupts disabled + * and machine checks enabled. */ void s390_handle_mcck(void) { - unsigned long flags; struct mcck_struct mcck; + unsigned long mflags; /* * Disable machine checks and get the current state of accumulated * machine checks. Afterwards delete the old state and enable machine * checks again. 
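s390_handle_damage() above builds its console message with the hand-rolled nmi_puts()/u64_to_hex() helpers rather than sprintf(), since it runs with the machine in a barely trustworthy state. The helpers are plain C and can be exercised in userspace (the sample code value is invented):

#include <stdio.h>
#include <stdint.h>

static char *nmi_puts(char *dest, const char *src)
{
	while (*src)
		*dest++ = *src++;
	*dest = 0;
	return dest;
}

static char *u64_to_hex(char *dest, uint64_t val)
{
	int i, num;

	for (i = 1; i <= 16; i++) {
		num = (val >> (64 - 4 * i)) & 0xf;
		*dest++ = num >= 10 ? 'A' + num - 10 : '0' + num;
	}
	*dest = 0;
	return dest;
}

int main(void)
{
	char message[100];
	char *ptr;

	ptr = nmi_puts(message, "machine check code: 0x");
	u64_to_hex(ptr, 0x00400f1000000000ULL); /* sample interruption code */
	puts(message); /* machine check code: 0x00400F1000000000 */
	return 0;
}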
*/ - local_irq_save(flags); - local_mcck_disable(); + local_mcck_save(mflags); mcck = *this_cpu_ptr(&cpu_mcck); memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck)); - clear_cpu_flag(CIF_MCCK_PENDING); - local_mcck_enable(); - local_irq_restore(flags); + local_mcck_restore(mflags); if (mcck.channel_report) crw_handle_channel_report(); @@ -167,134 +187,150 @@ void s390_handle_mcck(void) static int mchchk_wng_posted = 0; /* Use single cpu clear, as we cannot handle smp here. */ - __ctl_clear_bit(14, 24); /* Disable WARNING MCH */ + local_ctl_clear_bit(14, CR14_WARNING_SUBMASK_BIT); if (xchg(&mchchk_wng_posted, 1) == 0) kill_cad_pid(SIGPWR, 1); } if (mcck.stp_queue) stp_queue_work(); if (mcck.kill_task) { - local_irq_enable(); printk(KERN_EMERG "mcck: Terminating task because of machine " "malfunction (code 0x%016lx).\n", mcck.mcck_code); printk(KERN_EMERG "mcck: task: %s, pid: %d.\n", current->comm, current->pid); - do_exit(SIGSEGV); + if (is_global_init(current)) + panic("mcck: Attempting to kill init!\n"); + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, current, PIDTYPE_PID); } } -EXPORT_SYMBOL_GPL(s390_handle_mcck); /* - * returns 0 if all required registers are available + * returns 0 if register contents could be validated * returns 1 otherwise */ -static int notrace s390_check_registers(union mci mci, int umode) +static int notrace s390_validate_registers(union mci mci) { + struct mcesa *mcesa; + void *fpt_save_area; union ctlreg2 cr2; int kill_task; + u64 zero; kill_task = 0; + zero = 0; - if (!mci.gr) { - /* - * General purpose registers couldn't be restored and have - * unknown contents. Stop system or terminate process. - */ - if (!umode) - s390_handle_damage(); + if (!mci.gr || !mci.fp) kill_task = 1; - } - /* Check control registers */ - if (!mci.cr) { - /* - * Control registers have unknown contents. - * Can't recover and therefore stopping machine. - */ - s390_handle_damage(); - } - if (!mci.fp) { - /* - * Floating point registers can't be restored. If the - * kernel currently uses floating point registers the - * system is stopped. If the process has its floating - * pointer registers loaded it is terminated. - */ - if (S390_lowcore.fpu_flags & KERNEL_VXR_V0V7) - s390_handle_damage(); - if (!test_cpu_flag(CIF_FPU)) - kill_task = 1; - } + fpt_save_area = &S390_lowcore.floating_pt_save_area; if (!mci.fc) { - /* - * Floating point control register can't be restored. - * If the kernel currently uses the floating pointer - * registers and needs the FPC register the system is - * stopped. If the process has its floating pointer - * registers loaded it is terminated. - */ - if (S390_lowcore.fpu_flags & KERNEL_FPC) - s390_handle_damage(); - if (!test_cpu_flag(CIF_FPU)) - kill_task = 1; + kill_task = 1; + asm volatile( + " lfpc %0\n" + : + : "Q" (zero)); + } else { + asm volatile( + " lfpc %0\n" + : + : "Q" (S390_lowcore.fpt_creg_save_area)); } - if (MACHINE_HAS_VX) { - if (!mci.vr) { - /* - * Vector registers can't be restored. If the kernel - * currently uses vector registers the system is - * stopped. If the process has its vector registers - * loaded it is terminated. 
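s390_handle_mcck() above snapshots the accumulated per-CPU state and clears it while machine checks are held off, then works on the private copy with them re-enabled. The same snapshot-and-clear shape, simulated in userspace with a plain flag standing in for local_mcck_save()/local_mcck_restore():

#include <stdio.h>
#include <string.h>

struct mcck_struct {
	unsigned int kill_task : 1;
	unsigned int channel_report : 1;
	unsigned long mcck_code;
};

static struct mcck_struct cpu_mcck; /* stands in for the per-CPU variable */
static int mcck_enabled = 1;        /* stands in for the machine check mask */

int main(void)
{
	struct mcck_struct mcck;

	cpu_mcck.channel_report = 1;    /* pretend an earlier mcck queued work */

	mcck_enabled = 0;               /* local_mcck_save(): block new mccks */
	mcck = cpu_mcck;                /* private copy */
	memset(&cpu_mcck, 0, sizeof(cpu_mcck));
	mcck_enabled = 1;               /* local_mcck_restore() */

	/* now act on the copy without racing against new machine checks */
	if (mcck.channel_report)
		puts("handling channel report");
	return 0;
}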
- */ - if (S390_lowcore.fpu_flags & KERNEL_VXR) - s390_handle_damage(); - if (!test_cpu_flag(CIF_FPU)) - kill_task = 1; - } - } - /* Check if access registers are valid */ - if (!mci.ar) { + mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); + if (!cpu_has_vx()) { + /* Validate floating point registers */ + asm volatile( + " ld 0,0(%0)\n" + " ld 1,8(%0)\n" + " ld 2,16(%0)\n" + " ld 3,24(%0)\n" + " ld 4,32(%0)\n" + " ld 5,40(%0)\n" + " ld 6,48(%0)\n" + " ld 7,56(%0)\n" + " ld 8,64(%0)\n" + " ld 9,72(%0)\n" + " ld 10,80(%0)\n" + " ld 11,88(%0)\n" + " ld 12,96(%0)\n" + " ld 13,104(%0)\n" + " ld 14,112(%0)\n" + " ld 15,120(%0)\n" + : + : "a" (fpt_save_area) + : "memory"); + } else { + /* Validate vector registers */ + union ctlreg0 cr0; + /* - * Access registers have unknown contents. - * Terminating task. + * The vector validity must only be checked if not running a + * KVM guest. For KVM guests the machine check is forwarded by + * KVM and it is the responsibility of the guest to take + * appropriate actions. The host vector or FPU values have been + * saved by KVM and will be restored by KVM. */ - kill_task = 1; + if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST)) + kill_task = 1; + cr0.reg = S390_lowcore.cregs_save_area[0]; + cr0.afp = cr0.vx = 1; + local_ctl_load(0, &cr0.reg); + asm volatile( + " la 1,%0\n" + " VLM 0,15,0,1\n" + " VLM 16,31,256,1\n" + : + : "Q" (*(struct vx_array *)mcesa->vector_save_area) + : "1"); + local_ctl_load(0, &S390_lowcore.cregs_save_area[0]); } - /* Check guarded storage registers */ - cr2.val = S390_lowcore.cregs_save_area[2]; + /* Validate access registers */ + asm volatile( + " lam 0,15,0(%0)\n" + : + : "a" (&S390_lowcore.access_regs_save_area) + : "memory"); + if (!mci.ar) + kill_task = 1; + /* Validate guarded storage registers */ + cr2.reg = S390_lowcore.cregs_save_area[2]; if (cr2.gse) { if (!mci.gs) { /* - * Guarded storage register can't be restored and - * the current processes uses guarded storage. - * It has to be terminated. + * 2 cases: + * - machine check in kernel or userspace + * - machine check while running SIE (KVM guest) + * For kernel or userspace the userspace values of + * guarded storage control cannot be recreated, the + * process must be terminated. + * For SIE the guest values of guarded storage cannot + * be recreated. This is either due to a bug or due to + * GS being disabled in the guest. The guest will be + * notified by KVM code and the guest's machine check + * handling must take care of this. The host values + * are saved by KVM and are not affected. */ - kill_task = 1; + if (!test_cpu_flag(CIF_MCCK_GUEST)) + kill_task = 1; + } else { + load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area); } } - /* Check if old PSW is valid */ - if (!mci.wp) { - /* - * Can't tell if we come from user or kernel mode - * -> stopping machine. - */ - s390_handle_damage(); - } - /* Check for invalid kernel instruction address */ - if (!mci.ia && !umode) { - /* - * The instruction address got lost while running - * in the kernel -> stopping machine. - */ - s390_handle_damage(); - } + /* + * The getcpu vdso syscall reads CPU number from the programmable + * field of the TOD clock. Disregard the TOD programmable register + * validity bit and load the CPU number into the TOD programmable + * field unconditionally.
+ */ + set_tod_programmable_field(raw_smp_processor_id()); + /* Validate clock comparator register */ + set_clock_comparator(S390_lowcore.clock_comparator); if (!mci.ms || !mci.pm || !mci.ia) kill_task = 1; return kill_task; } -NOKPROBE_SYMBOL(s390_check_registers); +NOKPROBE_SYMBOL(s390_validate_registers); /* * Backup the guest's machine check info to its description block @@ -305,8 +341,7 @@ static void notrace s390_backup_mcck_info(struct pt_regs *regs) struct sie_page *sie_page; /* r14 contains the sie block, which was set in sie64a */ - struct kvm_s390_sie_block *sie_block = - (struct kvm_s390_sie_block *) regs->gprs[14]; + struct kvm_s390_sie_block *sie_block = phys_to_virt(regs->gprs[14]); if (sie_block == NULL) /* Something's seriously wrong, stop system. */ @@ -340,19 +375,19 @@ void notrace s390_do_machine_check(struct pt_regs *regs) static unsigned long long last_ipd; struct mcck_struct *mcck; unsigned long long tmp; + irqentry_state_t irq_state; union mci mci; unsigned long mcck_dam_code; + int mcck_pending = 0; + + irq_state = irqentry_nmi_enter(regs); - nmi_enter(); + if (user_mode(regs)) + update_timer_mcck(); inc_irq_stat(NMI_NMI); mci.val = S390_lowcore.mcck_interruption_code; mcck = this_cpu_ptr(&cpu_mcck); - if (mci.sd) { - /* System damage -> stopping machine */ - s390_handle_damage(); - } - /* * Reinject the instruction processing damages' machine checks * including Delayed Access Exception into the guest @@ -393,14 +428,16 @@ void notrace s390_do_machine_check(struct pt_regs *regs) s390_handle_damage(); } } - if (s390_check_registers(mci, user_mode(regs))) { + if (s390_validate_registers(mci)) { + if (!user_mode(regs)) + s390_handle_damage(); /* * Couldn't restore all register contents for the * user space process -> mark task for termination. */ mcck->kill_task = 1; mcck->mcck_code = mci.val; - set_cpu_flag(CIF_MCCK_PENDING); + mcck_pending = 1; } /* @@ -420,34 +457,32 @@ void notrace s390_do_machine_check(struct pt_regs *regs) mcck->stp_queue |= stp_sync_check(); if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND)) mcck->stp_queue |= stp_island_check(); - if (mcck->stp_queue) - set_cpu_flag(CIF_MCCK_PENDING); + mcck_pending = 1; } - /* * Reinject storage related machine checks into the guest if they * happen when the guest is running. 
*/ if (!test_cpu_flag(CIF_MCCK_GUEST)) { + /* Storage error uncorrected */ if (mci.se) - /* Storage error uncorrected */ s390_handle_damage(); + /* Storage key-error uncorrected */ if (mci.ke) - /* Storage key-error uncorrected */ s390_handle_damage(); + /* Storage degradation */ if (mci.ds && mci.fa) - /* Storage degradation */ s390_handle_damage(); } if (mci.cp) { /* Channel report word pending */ mcck->channel_report = 1; - set_cpu_flag(CIF_MCCK_PENDING); + mcck_pending = 1; } if (mci.w) { /* Warning pending */ mcck->warning = 1; - set_cpu_flag(CIF_MCCK_PENDING); + mcck_pending = 1; } /* @@ -462,15 +497,19 @@ void notrace s390_do_machine_check(struct pt_regs *regs) *((long *)(regs->gprs[15] + __SF_SIE_REASON)) = -EINTR; } clear_cpu_flag(CIF_MCCK_GUEST); - nmi_exit(); + + if (mcck_pending) + schedule_mcck_handler(); + + irqentry_nmi_exit(regs, irq_state); } NOKPROBE_SYMBOL(s390_do_machine_check); static int __init machine_check_init(void) { - ctl_set_bit(14, 25); /* enable external damage MCH */ - ctl_set_bit(14, 27); /* enable system recovery MCH */ - ctl_set_bit(14, 24); /* enable warning MCH */ + system_ctl_set_bit(14, CR14_EXTERNAL_DAMAGE_SUBMASK_BIT); + system_ctl_set_bit(14, CR14_RECOVERY_SUBMASK_BIT); + system_ctl_set_bit(14, CR14_WARNING_SUBMASK_BIT); return 0; } early_initcall(machine_check_init); diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index 29e511f5bf06..d1b16d83e49a 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -14,14 +14,14 @@ static int __init nobp_setup_early(char *str) return rc; if (enabled && test_facility(82)) { /* - * The user explicitely requested nobp=1, enable it and + * The user explicitly requested nobp=1, enable it and * disable the expoline support. */ - __set_facility(82, S390_lowcore.alt_stfle_fac_list); + __set_facility(82, alt_stfle_fac_list); if (IS_ENABLED(CONFIG_EXPOLINE)) nospec_disable = 1; } else { - __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + __clear_facility(82, alt_stfle_fac_list); } return 0; } @@ -29,7 +29,7 @@ early_param("nobp", nobp_setup_early); static int __init nospec_setup_early(char *str) { - __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + __clear_facility(82, alt_stfle_fac_list); return 0; } early_param("nospec", nospec_setup_early); @@ -38,9 +38,9 @@ static int __init nospec_report(void) { if (test_facility(156)) pr_info("Spectre V2 mitigation: etokens\n"); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) + if (nospec_uses_trampoline()) pr_info("Spectre V2 mitigation: execute trampolines\n"); - if (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) + if (__test_facility(82, alt_stfle_fac_list)) pr_info("Spectre V2 mitigation: limited branch prediction\n"); return 0; } @@ -66,14 +66,14 @@ void __init nospec_auto_detect(void) */ if (__is_defined(CC_USING_EXPOLINE)) nospec_disable = 1; - __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + __clear_facility(82, alt_stfle_fac_list); } else if (__is_defined(CC_USING_EXPOLINE)) { /* * The kernel has been compiled with expolines. * Keep expolines enabled and disable nobp. 
*/ nospec_disable = 0; - __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + __clear_facility(82, alt_stfle_fac_list); } /* * If the kernel has not been compiled with expolines the @@ -86,7 +86,7 @@ static int __init spectre_v2_setup_early(char *str) { if (str && !strncmp(str, "on", 2)) { nospec_disable = 0; - __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + __clear_facility(82, alt_stfle_fac_list); } if (str && !strncmp(str, "off", 3)) nospec_disable = 1; @@ -99,11 +99,13 @@ early_param("spectre_v2", spectre_v2_setup_early); static void __init_or_module __nospec_revert(s32 *start, s32 *end) { enum { BRCL_EXPOLINE, BRASL_EXPOLINE } type; + static const u8 branch[] = { 0x47, 0x00, 0x07, 0x00 }; u8 *instr, *thunk, *br; u8 insnbuf[6]; s32 *epo; /* Second part of the instruction replace is always a nop */ + memcpy(insnbuf + 2, branch, sizeof(branch)); for (epo = start; epo < end; epo++) { instr = (u8 *) epo + *epo; if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x04) @@ -116,42 +118,20 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end) if (thunk[0] == 0xc6 && thunk[1] == 0x00) /* exrl %r0,<target-br> */ br = thunk + (*(int *)(thunk + 2)) * 2; - else if (thunk[0] == 0xc0 && (thunk[1] & 0x0f) == 0x00 && - thunk[6] == 0x44 && thunk[7] == 0x00 && - (thunk[8] & 0x0f) == 0x00 && thunk[9] == 0x00 && - (thunk[1] & 0xf0) == (thunk[8] & 0xf0)) - /* larl %rx,<target br> + ex %r0,0(%rx) */ - br = thunk + (*(int *)(thunk + 2)) * 2; else continue; - /* Check for unconditional branch 0x07f? or 0x47f???? */ - if ((br[0] & 0xbf) != 0x07 || (br[1] & 0xf0) != 0xf0) + if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0) continue; - - memcpy(insnbuf + 2, (char[]) { 0x47, 0x00, 0x07, 0x00 }, 4); switch (type) { case BRCL_EXPOLINE: + /* brcl to thunk, replace with br + nop */ insnbuf[0] = br[0]; insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f); - if (br[0] == 0x47) { - /* brcl to b, replace with bc + nopr */ - insnbuf[2] = br[2]; - insnbuf[3] = br[3]; - } else { - /* brcl to br, replace with bcr + nop */ - } break; case BRASL_EXPOLINE: + /* brasl to thunk, replace with basr + nop */ + insnbuf[0] = 0x0d; insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f); - if (br[0] == 0x47) { - /* brasl to b, replace with bas + nopr */ - insnbuf[0] = 0x4d; - insnbuf[2] = br[2]; - insnbuf[3] = br[3]; - } else { - /* brasl to br, replace with basr + nop */ - insnbuf[0] = 0x0d; - } break; } diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c index 48f472bf9290..52d4353188ad 100644 --- a/arch/s390/kernel/nospec-sysfs.c +++ b/arch/s390/kernel/nospec-sysfs.c @@ -15,9 +15,9 @@ ssize_t cpu_show_spectre_v2(struct device *dev, { if (test_facility(156)) return sprintf(buf, "Mitigation: etokens\n"); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) + if (nospec_uses_trampoline()) return sprintf(buf, "Mitigation: execute trampolines\n"); - if (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) + if (__test_facility(82, alt_stfle_fac_list)) return sprintf(buf, "Mitigation: limited branch prediction\n"); return sprintf(buf, "Vulnerable\n"); } diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c new file mode 100644 index 000000000000..23ab9f02f278 --- /dev/null +++ b/arch/s390/kernel/numa.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NUMA support for s390 + * + * Implement NUMA core code. + * + * Copyright IBM Corp. 
2015 + */ + +#include <linux/kernel.h> +#include <linux/mmzone.h> +#include <linux/cpumask.h> +#include <linux/memblock.h> +#include <linux/node.h> +#include <asm/numa.h> + +struct pglist_data *node_data[MAX_NUMNODES]; +EXPORT_SYMBOL(node_data); + +void __init numa_setup(void) +{ + int nid; + + nodes_clear(node_possible_map); + node_set(0, node_possible_map); + node_set_online(0); + for (nid = 0; nid < MAX_NUMNODES; nid++) { + NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8); + if (!NODE_DATA(nid)) + panic("%s: Failed to allocate %zu bytes align=0x%x\n", + __func__, sizeof(pg_data_t), 8); + } + NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; + NODE_DATA(0)->node_id = 0; +} diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index 0a5e4bafb6ad..6e1824141b29 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -13,8 +13,10 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <asm/checksum.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> +#include <asm/maccess.h> +#include <asm/asm-offsets.h> /* * OS info structure has to be page aligned @@ -45,24 +47,26 @@ void os_info_crashkernel_add(unsigned long base, unsigned long size) */ void os_info_entry_add(int nr, void *ptr, u64 size) { - os_info.entry[nr].addr = (u64)(unsigned long)ptr; + os_info.entry[nr].addr = __pa(ptr); os_info.entry[nr].size = size; os_info.entry[nr].csum = (__force u32)csum_partial(ptr, size, 0); os_info.csum = os_info_csum(&os_info); } /* - * Initialize OS info struture and set lowcore pointer + * Initialize OS info structure and set lowcore pointer */ void __init os_info_init(void) { - void *ptr = &os_info; + struct lowcore *abs_lc; os_info.version_major = OS_INFO_VERSION_MAJOR; os_info.version_minor = OS_INFO_VERSION_MINOR; os_info.magic = OS_INFO_MAGIC; os_info.csum = os_info_csum(&os_info); - mem_assign_absolute(S390_lowcore.os_info, (unsigned long) ptr); + abs_lc = get_abs_lowcore(); + abs_lc->os_info = __pa(&os_info); + put_abs_lowcore(abs_lc); } #ifdef CONFIG_CRASH_DUMP @@ -90,7 +94,7 @@ static void os_info_old_alloc(int nr, int align) goto fail; } buf_align = PTR_ALIGN(buf, align); - if (copy_oldmem_kernel(buf_align, (void *) addr, size)) { + if (copy_oldmem_kernel(buf_align, addr, size)) { msg = "copy failed"; goto fail_free; } @@ -121,17 +125,16 @@ static void os_info_old_init(void) if (os_info_init) return; - if (!OLDMEM_BASE) + if (!oldmem_data.start) goto fail; - if (copy_oldmem_kernel(&addr, &S390_lowcore.os_info, sizeof(addr))) + if (copy_oldmem_kernel(&addr, __LC_OS_INFO, sizeof(addr))) goto fail; if (addr == 0 || addr % PAGE_SIZE) goto fail; os_info_old = kzalloc(sizeof(*os_info_old), GFP_KERNEL); if (!os_info_old) goto fail; - if (copy_oldmem_kernel(os_info_old, (void *) addr, - sizeof(*os_info_old))) + if (copy_oldmem_kernel(os_info_old, addr, sizeof(*os_info_old))) goto fail_free; if (os_info_old->magic != OS_INFO_MAGIC) goto fail_free; diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 0eb1d1cc53a8..41ed6e0f0a2a 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -2,8 +2,9 @@ /* * Performance event support for s390x - CPU-measurement Counter Facility * - * Copyright IBM Corp. 2012, 2019 + * Copyright IBM Corp. 
2012, 2023 * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> + * Thomas Richter <tmricht@linux.ibm.com> */ #define KMSG_COMPONENT "cpum_cf" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt @@ -14,7 +15,574 @@ #include <linux/notifier.h> #include <linux/init.h> #include <linux/export.h> -#include <asm/cpu_mcf.h> +#include <linux/miscdevice.h> +#include <linux/perf_event.h> + +#include <asm/cpu_mf.h> +#include <asm/hwctrset.h> +#include <asm/debug.h> + +enum cpumf_ctr_set { + CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */ + CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */ + CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */ + CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */ + CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */ + + /* Maximum number of counter sets */ + CPUMF_CTR_SET_MAX, +}; + +#define CPUMF_LCCTL_ENABLE_SHIFT 16 +#define CPUMF_LCCTL_ACTCTL_SHIFT 0 + +static inline void ctr_set_enable(u64 *state, u64 ctrsets) +{ + *state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT; +} + +static inline void ctr_set_disable(u64 *state, u64 ctrsets) +{ + *state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT); +} + +static inline void ctr_set_start(u64 *state, u64 ctrsets) +{ + *state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT; +} + +static inline void ctr_set_stop(u64 *state, u64 ctrsets) +{ + *state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT); +} + +static inline int ctr_stcctm(enum cpumf_ctr_set set, u64 range, u64 *dest) +{ + switch (set) { + case CPUMF_CTR_SET_BASIC: + return stcctm(BASIC, range, dest); + case CPUMF_CTR_SET_USER: + return stcctm(PROBLEM_STATE, range, dest); + case CPUMF_CTR_SET_CRYPTO: + return stcctm(CRYPTO_ACTIVITY, range, dest); + case CPUMF_CTR_SET_EXT: + return stcctm(EXTENDED, range, dest); + case CPUMF_CTR_SET_MT_DIAG: + return stcctm(MT_DIAG_CLEARING, range, dest); + case CPUMF_CTR_SET_MAX: + return 3; + } + return 3; +} + +struct cpu_cf_events { + refcount_t refcnt; /* Reference count */ + atomic_t ctr_set[CPUMF_CTR_SET_MAX]; + u64 state; /* For perf_event_open SVC */ + u64 dev_state; /* For /dev/hwctr */ + unsigned int flags; + size_t used; /* Bytes used in data */ + size_t usedss; /* Bytes used in start/stop */ + unsigned char start[PAGE_SIZE]; /* Counter set at event add */ + unsigned char stop[PAGE_SIZE]; /* Counter set at event delete */ + unsigned char data[PAGE_SIZE]; /* Counter set at /dev/hwctr */ + unsigned int sets; /* # Counter set saved in memory */ +}; + +static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */ +static debug_info_t *cf_dbg; + +/* + * The CPU Measurement query counter information instruction contains + * information which varies per machine generation, but is constant and + * does not change when running on a particular machine, such as counter + * first and second version number. This is needed to determine the size + * of counter sets. Extract this information at device driver initialization. + */ +static struct cpumf_ctr_info cpumf_ctr_info; + +struct cpu_cf_ptr { + struct cpu_cf_events *cpucf; +}; + +static struct cpu_cf_root { /* Anchor to per CPU data */ + refcount_t refcnt; /* Overall active events */ + struct cpu_cf_ptr __percpu *cfptr; +} cpu_cf_root; + +/* + * Serialize event initialization and event removal. Both are called from + * user space in task context with perf_event_open() and close() + * system calls. 
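+ * (For illustration: without this serialization, two perf_event_open()
+ * calls racing on an idle PMU could both observe cpu_cf_root.refcnt as
+ * zero and each allocate the per CPU pointer array; with it, one
+ * allocates and the other merely takes a reference.)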
+ * + * This mutex serializes functions cpum_cf_alloc_cpu() called at event + * initialization via cpumf_pmu_event_init() and function cpum_cf_free_cpu() + * called at event removal via call back function hw_perf_event_destroy() + * when the event is deleted. They are serialized to enforce correct + * bookkeeping of pointer and reference counts anchored by + * struct cpu_cf_root and the access to cpu_cf_root::refcnt and the + * per CPU pointers stored in cpu_cf_root::cfptr. + */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* + * Get pointer to per-cpu structure. + * + * Function get_cpu_cfhw() is called from + * - cfset_all_copy(): This function is protected by cpus_read_lock(), so + * CPU hotplug remove cannot happen. Event removal requires a close() + * first. + * + * Function this_cpu_cfhw() is called from perf common code functions: + * - pmu_{en|dis}able(), pmu_{add|del}() and pmu_{start|stop}(): + * All functions execute with interrupts disabled on that particular CPU. + * - cfset_ioctl_{on|off}, cfset_cpu_read(): see comment cfset_all_copy(). + * + * Therefore it is safe to access the CPU specific pointer to the event. + */ +static struct cpu_cf_events *get_cpu_cfhw(int cpu) +{ + struct cpu_cf_ptr __percpu *p = cpu_cf_root.cfptr; + + if (p) { + struct cpu_cf_ptr *q = per_cpu_ptr(p, cpu); + + return q->cpucf; + } + return NULL; +} + +static struct cpu_cf_events *this_cpu_cfhw(void) +{ + return get_cpu_cfhw(smp_processor_id()); +} + +/* Disable counter sets on dedicated CPU */ +static void cpum_cf_reset_cpu(void *flags) +{ + lcctl(0); +} + +/* Free per CPU data when the last event is removed. */ +static void cpum_cf_free_root(void) +{ + if (!refcount_dec_and_test(&cpu_cf_root.refcnt)) + return; + free_percpu(cpu_cf_root.cfptr); + cpu_cf_root.cfptr = NULL; + irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); + on_each_cpu(cpum_cf_reset_cpu, NULL, 1); + debug_sprintf_event(cf_dbg, 4, "%s root.refcnt %u cfptr %d\n", + __func__, refcount_read(&cpu_cf_root.refcnt), + !cpu_cf_root.cfptr); +} + +/* + * On initialization of first event also allocate per CPU data dynamically. + * Start with an array of pointers, the array size is the maximum number of + * CPUs possible, which might be larger than the number of CPUs currently + * online. + */ +static int cpum_cf_alloc_root(void) +{ + int rc = 0; + + if (refcount_inc_not_zero(&cpu_cf_root.refcnt)) + return rc; + + /* The memory is already zeroed. */ + cpu_cf_root.cfptr = alloc_percpu(struct cpu_cf_ptr); + if (cpu_cf_root.cfptr) { + refcount_set(&cpu_cf_root.refcnt, 1); + on_each_cpu(cpum_cf_reset_cpu, NULL, 1); + irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); + } else { + rc = -ENOMEM; + } + + return rc; +} + +/* Free CPU counter data structure for a PMU */ +static void cpum_cf_free_cpu(int cpu) +{ + struct cpu_cf_events *cpuhw; + struct cpu_cf_ptr *p; + + mutex_lock(&pmc_reserve_mutex); + /* + * When invoked via CPU hotplug handler, there might be no events + * installed or that particular CPU might not have an + * event installed. This anchor pointer can be NULL! + */ + if (!cpu_cf_root.cfptr) + goto out; + p = per_cpu_ptr(cpu_cf_root.cfptr, cpu); + cpuhw = p->cpucf; + /* + * Might be NULL when called from the CPU hotplug handler if no event + * is installed on this CPU, only on other CPUs. + */ + if (!cpuhw) + goto out; + + if (refcount_dec_and_test(&cpuhw->refcnt)) { + kfree(cpuhw); + p->cpucf = NULL; + } + cpum_cf_free_root(); +out: + mutex_unlock(&pmc_reserve_mutex); +}
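For illustration, a short trace of the reference counting implemented by cpum_cf_free_cpu() above and cpum_cf_alloc_cpu() below; the CPU number is arbitrary and the sequence, two events opened and then closed on one CPU, is a sketch rather than part of the patch:

/*
 *	cpum_cf_alloc_cpu(3);	root.refcnt 0->1, cpuhw allocated, refcnt 1
 *	cpum_cf_alloc_cpu(3);	root.refcnt 1->2, cpuhw->refcnt 1->2
 *	cpum_cf_free_cpu(3);	cpuhw->refcnt 2->1, root.refcnt 2->1
 *	cpum_cf_free_cpu(3);	cpuhw freed; last root reference gone, so
 *				free_percpu(cfptr) and lcctl(0) on all CPUs
 */

+/* Allocate CPU counter data structure for a PMU. Serialized by pmc_reserve_mutex. */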
+static int cpum_cf_alloc_cpu(int cpu) +{ + struct cpu_cf_events *cpuhw; + struct cpu_cf_ptr *p; + int rc; + + mutex_lock(&pmc_reserve_mutex); + rc = cpum_cf_alloc_root(); + if (rc) + goto unlock; + p = per_cpu_ptr(cpu_cf_root.cfptr, cpu); + cpuhw = p->cpucf; + + if (!cpuhw) { + cpuhw = kzalloc(sizeof(*cpuhw), GFP_KERNEL); + if (cpuhw) { + p->cpucf = cpuhw; + refcount_set(&cpuhw->refcnt, 1); + } else { + rc = -ENOMEM; + } + } else { + refcount_inc(&cpuhw->refcnt); + } + if (rc) { + /* + * Error in allocation of event, decrement anchor. Since + * cpu_cf_event is not created, its destroy() function is not + * invoked. Adjust the reference counter for the anchor. + */ + cpum_cf_free_root(); + } +unlock: + mutex_unlock(&pmc_reserve_mutex); + return rc; +} + +/* + * Create/delete per CPU data structures for /dev/hwctr interface and events + * created by perf_event_open(). + * If cpu is -1, track task on all available CPUs. This requires + * allocation of hardware data structures for all CPUs. This setup handles + * perf_event_open() with task context and /dev/hwctr interface. + * If cpu is not -1, install the event on this CPU only. This setup handles + * perf_event_open() with CPU context. + */ +static int cpum_cf_alloc(int cpu) +{ + cpumask_var_t mask; + int rc; + + if (cpu == -1) { + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + for_each_online_cpu(cpu) { + rc = cpum_cf_alloc_cpu(cpu); + if (rc) { + for_each_cpu(cpu, mask) + cpum_cf_free_cpu(cpu); + break; + } + cpumask_set_cpu(cpu, mask); + } + free_cpumask_var(mask); + } else { + rc = cpum_cf_alloc_cpu(cpu); + } + return rc; +} + +static void cpum_cf_free(int cpu) +{ + if (cpu == -1) { + for_each_online_cpu(cpu) + cpum_cf_free_cpu(cpu); + } else { + cpum_cf_free_cpu(cpu); + } +} + +#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ + +/* Counter sets are stored as a data stream in a page sized memory buffer and + * exported to user space via raw data attached to the event sample data. + * Each counter set starts with an eight byte header consisting of: + * - a two byte eye catcher (0xfeef) + * - a two byte counter set number + * - a two byte counter set size (indicates the number of counters in this set) + * - a two byte reserved value (must be zero) to make the header the same + * size as a counter value. + * All counter values are eight bytes in size. + * + * All counter sets are followed by a 64 byte trailer. + * The trailer consists of a: + * - flag field indicating valid fields when corresponding bit set + * - the counter facility first and second version number + * - the CPU speed if nonzero + * - the time stamp the counter sets have been collected + * - the time of day (TOD) base value + * - the machine type. + * + * The counter sets are saved when the process is prepared to be executed on a + * CPU and saved again when the process is going to be removed from a CPU. + * The difference of both counter sets is calculated and stored in the event + * sample data area. + */
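To make the layout concrete, a sketch for a machine whose basic counter set holds six counters, plus a helper restating the size rule used by cfdiag_getctrset() below (the helper name is illustrative and not part of the file):

/*
 *	0x00  cf_ctrset_entry: def = 0xfeef, set, ctr = 6, res1 = 0
 *	0x08  six u64 counter values
 *	0x38  cf_trailer_entry (64 bytes) terminates the stream
 */
static size_t example_ctrset_bytes(u16 counters)
{
	/* eight byte header plus one eight byte value per counter */
	return sizeof(struct cf_ctrset_entry) + counters * sizeof(u64);
}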
+struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 bytes) */ + unsigned int def:16; /* 0-15 Data Entry Format */ + unsigned int set:16; /* 16-31 Counter set identifier */ + unsigned int ctr:16; /* 32-47 Number of stored counters */ + unsigned int res1:16; /* 48-63 Reserved */ +}; + +struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 bytes) */ + /* 0 - 7 */ + union { + struct { + unsigned int clock_base:1; /* TOD clock base set */ + unsigned int speed:1; /* CPU speed set */ + /* Measurement alerts */ + unsigned int mtda:1; /* Loss of MT ctr. data alert */ + unsigned int caca:1; /* Counter auth. change alert */ + unsigned int lcda:1; /* Loss of counter data alert */ + }; + unsigned long flags; /* 0-63 All indicators */ + }; + /* 8 - 15 */ + unsigned int cfvn:16; /* 64-79 Ctr First Version */ + unsigned int csvn:16; /* 80-95 Ctr Second Version */ + unsigned int cpu_speed:32; /* 96-127 CPU speed */ + /* 16 - 23 */ + unsigned long timestamp; /* 128-191 Timestamp (TOD) */ + /* 24 - 55 */ + union { + struct { + unsigned long progusage1; + unsigned long progusage2; + unsigned long progusage3; + unsigned long tod_base; + }; + unsigned long progusage[4]; + }; + /* 56 - 63 */ + unsigned int mach_type:16; /* Machine type */ + unsigned int res1:16; /* Reserved */ + unsigned int res2:32; /* Reserved */ +}; + +/* Create the trailer data at the end of a page. */ +static void cfdiag_trailer(struct cf_trailer_entry *te) +{ + struct cpuid cpuid; + + te->cfvn = cpumf_ctr_info.cfvn; /* Counter version numbers */ + te->csvn = cpumf_ctr_info.csvn; + + get_cpu_id(&cpuid); /* Machine type */ + te->mach_type = cpuid.machine; + te->cpu_speed = cfdiag_cpu_speed; + if (te->cpu_speed) + te->speed = 1; + te->clock_base = 1; /* Save clock base */ + te->tod_base = tod_clock_base.tod; + te->timestamp = get_tod_clock_fast(); +} + +/* + * The number of counters per counter set varies between machine generations, + * but is constant when running on a particular machine generation. + * Determine each counter set size at device driver initialization and + * retrieve it later. + */ +static size_t cpumf_ctr_setsizes[CPUMF_CTR_SET_MAX]; +static void cpum_cf_make_setsize(enum cpumf_ctr_set ctrset) +{ + size_t ctrset_size = 0; + + switch (ctrset) { + case CPUMF_CTR_SET_BASIC: + if (cpumf_ctr_info.cfvn >= 1) + ctrset_size = 6; + break; + case CPUMF_CTR_SET_USER: + if (cpumf_ctr_info.cfvn == 1) + ctrset_size = 6; + else if (cpumf_ctr_info.cfvn >= 3) + ctrset_size = 2; + break; + case CPUMF_CTR_SET_CRYPTO: + if (cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5) + ctrset_size = 16; + else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7) + ctrset_size = 20; + break; + case CPUMF_CTR_SET_EXT: + if (cpumf_ctr_info.csvn == 1) + ctrset_size = 32; + else if (cpumf_ctr_info.csvn == 2) + ctrset_size = 48; + else if (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5) + ctrset_size = 128; + else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7) + ctrset_size = 160; + break; + case CPUMF_CTR_SET_MT_DIAG: + if (cpumf_ctr_info.csvn > 3) + ctrset_size = 48; + break; + case CPUMF_CTR_SET_MAX: + break; + } + cpumf_ctr_setsizes[ctrset] = ctrset_size; +} + +/* + * Return the maximum possible counter set size (in number of 8 byte counters) + * depending on type and model number. + */ +static size_t cpum_cf_read_setsize(enum cpumf_ctr_set ctrset) +{ + return cpumf_ctr_setsizes[ctrset]; +} + +/* Read a counter set. 
The counter set number determines the counter set and + * the CPUM-CF first and second version number determine the number of + * available counters in each counter set. + * Each counter set starts with a header containing the counter set number and + * the number of eight byte counters. + * + * The function returns the number of bytes occupied by this counter set + * including the header. + * If there is no counter in the counter set, this counter set is useless and + * zero is returned in this case. + * + * Note that the counter sets may not be enabled or active and the stcctm + * instruction might return error 3. Depending on the error_ok value this is + * ok, for example when called from the cpumf_pmu_start() call back function. + */ +static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, + size_t room, bool error_ok) +{ + size_t ctrset_size, need = 0; + int rc = 3; /* Assume write failure */ + + ctrdata->def = CF_DIAG_CTRSET_DEF; + ctrdata->set = ctrset; + ctrdata->res1 = 0; + ctrset_size = cpum_cf_read_setsize(ctrset); + + if (ctrset_size) { /* Save data */ + need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); + if (need <= room) { + rc = ctr_stcctm(ctrset, ctrset_size, + (u64 *)(ctrdata + 1)); + } + if (rc != 3 || error_ok) + ctrdata->ctr = ctrset_size; + else + need = 0; + } + + return need; +} + +static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = { + [CPUMF_CTR_SET_BASIC] = 0x02, + [CPUMF_CTR_SET_USER] = 0x04, + [CPUMF_CTR_SET_CRYPTO] = 0x08, + [CPUMF_CTR_SET_EXT] = 0x01, + [CPUMF_CTR_SET_MT_DIAG] = 0x20, +}; + +/* Read out all counter sets and save them in the provided data buffer. + * The last 64 bytes hold an artificial trailer entry. + */ +static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth, + bool error_ok) +{ + struct cf_trailer_entry *trailer; + size_t offset = 0, done; + int i; + + memset(data, 0, sz); + sz -= sizeof(*trailer); /* Always room for trailer */ + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + struct cf_ctrset_entry *ctrdata = data + offset; + + if (!(auth & cpumf_ctr_ctl[i])) + continue; /* Counter set not authorized */ + + done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok); + offset += done; + } + trailer = data + offset; + cfdiag_trailer(trailer); + return offset + sizeof(*trailer); +} + +/* Calculate the difference for each counter in a counter set. */ +static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters) +{ + for (; --counters >= 0; ++pstart, ++pstop) + if (*pstop >= *pstart) + *pstop -= *pstart; + else + *pstop = *pstart - *pstop + 1; +} + +/* Scan the counter sets and calculate the difference of each counter + * in each set. The result is the increment of each counter during the + * period the counter set has been activated. + * + * Return true on success. Zero is returned when the start and stop + * header blocks of a counter set do not match. 
+ */ +static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth) +{ + struct cf_trailer_entry *trailer_start, *trailer_stop; + struct cf_ctrset_entry *ctrstart, *ctrstop; + size_t offset = 0; + + auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; + do { + ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset); + ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset); + + if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { + pr_err_once("cpum_cf_diag counter set compare error " + "in set %i\n", ctrstart->set); + return 0; + } + auth &= ~cpumf_ctr_ctl[ctrstart->set]; + if (ctrstart->def == CF_DIAG_CTRSET_DEF) { + cfdiag_diffctrset((u64 *)(ctrstart + 1), + (u64 *)(ctrstop + 1), ctrstart->ctr); + offset += ctrstart->ctr * sizeof(u64) + + sizeof(*ctrstart); + } + } while (ctrstart->def && auth); + + /* Save time_stamp from start of event in stop's trailer */ + trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset); + trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset); + trailer_stop->progusage[0] = trailer_start->timestamp; + + return 1; +} static enum cpumf_ctr_set get_counter_set(u64 event) { @@ -34,39 +602,35 @@ static enum cpumf_ctr_set get_counter_set(u64 event) return set; } -static int validate_ctr_version(const struct hw_perf_event *hwc) +static int validate_ctr_version(const u64 config, enum cpumf_ctr_set set) { - struct cpu_cf_events *cpuhw; - int err = 0; u16 mtdiag_ctl; - - cpuhw = &get_cpu_var(cpu_cf_events); + int err = 0; /* check required version for counter sets */ - switch (hwc->config_base) { + switch (set) { case CPUMF_CTR_SET_BASIC: case CPUMF_CTR_SET_USER: - if (cpuhw->info.cfvn < 1) + if (cpumf_ctr_info.cfvn < 1) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_CRYPTO: - if ((cpuhw->info.csvn >= 1 && cpuhw->info.csvn <= 5 && - hwc->config > 79) || - (cpuhw->info.csvn >= 6 && hwc->config > 83)) + if ((cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5 && + config > 79) || (cpumf_ctr_info.csvn >= 6 && config > 83)) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_EXT: - if (cpuhw->info.csvn < 1) + if (cpumf_ctr_info.csvn < 1) err = -EOPNOTSUPP; - if ((cpuhw->info.csvn == 1 && hwc->config > 159) || - (cpuhw->info.csvn == 2 && hwc->config > 175) || - (cpuhw->info.csvn >= 3 && cpuhw->info.csvn <= 5 - && hwc->config > 255) || - (cpuhw->info.csvn >= 6 && hwc->config > 287)) + if ((cpumf_ctr_info.csvn == 1 && config > 159) || + (cpumf_ctr_info.csvn == 2 && config > 175) || + (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5 && + config > 255) || + (cpumf_ctr_info.csvn >= 6 && config > 287)) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_MT_DIAG: - if (cpuhw->info.csvn <= 3) + if (cpumf_ctr_info.csvn <= 3) err = -EOPNOTSUPP; /* * MT-diagnostic counters are read-only. The counter set @@ -81,35 +645,15 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) * counter set is enabled and active. 
*/ mtdiag_ctl = cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]; - if (!((cpuhw->info.auth_ctl & mtdiag_ctl) && - (cpuhw->info.enable_ctl & mtdiag_ctl) && - (cpuhw->info.act_ctl & mtdiag_ctl))) + if (!((cpumf_ctr_info.auth_ctl & mtdiag_ctl) && + (cpumf_ctr_info.enable_ctl & mtdiag_ctl) && + (cpumf_ctr_info.act_ctl & mtdiag_ctl))) err = -EOPNOTSUPP; break; + case CPUMF_CTR_SET_MAX: + err = -EOPNOTSUPP; } - put_cpu_var(cpu_cf_events); - return err; -} - -static int validate_ctr_auth(const struct hw_perf_event *hwc) -{ - struct cpu_cf_events *cpuhw; - u64 ctrs_state; - int err = 0; - - cpuhw = &get_cpu_var(cpu_cf_events); - - /* Check authorization for cpu counter sets. - * If the particular CPU counter set is not authorized, - * return with -ENOENT in order to fall back to other - * PMUs that might suffice the event request. - */ - ctrs_state = cpumf_ctr_ctl[hwc->config_base]; - if (!(ctrs_state & cpuhw->info.auth_ctl)) - err = -ENOENT; - - put_cpu_var(cpu_cf_events); return err; } @@ -120,20 +664,17 @@ static int validate_ctr_auth(const struct hw_perf_event *hwc) */ static void cpumf_pmu_enable(struct pmu *pmu) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); int err; - if (cpuhw->flags & PMU_F_ENABLED) - return; - - err = lcctl(cpuhw->state); - if (err) { - pr_err("Enabling the performance measuring unit " - "failed with rc=%x\n", err); + if (!cpuhw || (cpuhw->flags & PMU_F_ENABLED)) return; - } - cpuhw->flags |= PMU_F_ENABLED; + err = lcctl(cpuhw->state | cpuhw->dev_state); + if (err) + pr_err("Enabling the performance measuring unit failed with rc=%x\n", err); + else + cpuhw->flags |= PMU_F_ENABLED; } /* @@ -143,39 +684,26 @@ static void cpumf_pmu_enable(struct pmu *pmu) */ static void cpumf_pmu_disable(struct pmu *pmu) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - int err; + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); u64 inactive; + int err; - if (!(cpuhw->flags & PMU_F_ENABLED)) + if (!cpuhw || !(cpuhw->flags & PMU_F_ENABLED)) return; inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); + inactive |= cpuhw->dev_state; err = lcctl(inactive); - if (err) { - pr_err("Disabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - - cpuhw->flags &= ~PMU_F_ENABLED; + if (err) + pr_err("Disabling the performance measuring unit failed with rc=%x\n", err); + else + cpuhw->flags &= ~PMU_F_ENABLED; } - -/* Number of perf events counting hardware events */ -static atomic_t num_events = ATOMIC_INIT(0); -/* Used to avoid races in calling reserve/release_cpumf_hardware */ -static DEFINE_MUTEX(pmc_reserve_mutex); - /* Release the PMU if event is the last perf event */ static void hw_perf_event_destroy(struct perf_event *event) { - if (!atomic_add_unless(&num_events, -1, 1)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_dec_return(&num_events) == 0) - __kernel_cpumcf_end(); - mutex_unlock(&pmc_reserve_mutex); - } + cpum_cf_free(event->cpu); } /* CPUMF <-> perf event mappings for kernel+userspace (basic set) */ @@ -199,12 +727,17 @@ static const int cpumf_generic_events_user[] = { [PERF_COUNT_HW_BUS_CYCLES] = -1, }; +static int is_userspace_event(u64 ev) +{ + return cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev; +} + static int __hw_perf_event_init(struct perf_event *event, unsigned int type) { struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; enum cpumf_ctr_set set; - int err 
= 0; u64 ev; switch (type) { @@ -221,21 +754,26 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) if (is_sampling_event(event)) /* No sampling support */ return -ENOENT; ev = attr->config; - /* Count user space (problem-state) only */ if (!attr->exclude_user && attr->exclude_kernel) { - if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) - return -EOPNOTSUPP; - ev = cpumf_generic_events_user[ev]; - - /* No support for kernel space counters only */ + /* + * Count user space (problem-state) only + * Handle events 32 and 33 as 0:u and 1:u + */ + if (!is_userspace_event(ev)) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_user[ev]; + } } else if (!attr->exclude_kernel && attr->exclude_user) { + /* No support for kernel space counters only */ return -EOPNOTSUPP; - - /* Count user and kernel space */ } else { - if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) - return -EOPNOTSUPP; - ev = cpumf_generic_events_basic[ev]; + /* Count user and kernel space, incl. events 32 + 33 */ + if (!is_userspace_event(ev)) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_basic[ev]; + } } break; @@ -260,36 +798,51 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) /* * Use the hardware perf event structure to store the * counter number in the 'config' member and the counter - * set number in the 'config_base'. The counter set number - * is then later used to enable/disable the counter(s). + * set number in the 'config_base' as bit mask. + * It is later used to enable/disable the counter(s). */ hwc->config = ev; - hwc->config_base = set; + hwc->config_base = cpumf_ctr_ctl[set]; break; case CPUMF_CTR_SET_MAX: /* The counter could not be associated to a counter set */ return -EINVAL; - }; + } /* Initialize for using the CPU-measurement counter facility */ - if (!atomic_inc_not_zero(&num_events)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_events) == 0 && __kernel_cpumcf_begin()) - err = -EBUSY; - else - atomic_inc(&num_events); - mutex_unlock(&pmc_reserve_mutex); - } - if (err) - return err; + if (cpum_cf_alloc(event->cpu)) + return -ENOMEM; event->destroy = hw_perf_event_destroy; - /* Finally, validate version and authorization of the counter set */ - err = validate_ctr_auth(hwc); - if (!err) - err = validate_ctr_version(hwc); + /* + * Finally, validate version and authorization of the counter set. + * If the particular CPU counter set is not authorized, + * return with -ENOENT in order to fall back to other + * PMUs that might satisfy the event request. + */ + if (!(hwc->config_base & cpumf_ctr_info.auth_ctl)) + return -ENOENT; + return validate_ctr_version(hwc->config, set); +} - return err; +/* Events CPU_CYCLES and INSTRUCTIONS can be submitted with two different + * attribute::type values: + * - PERF_TYPE_HARDWARE: + * - pmu->type: + * Handle both types of invocation identically. They address the same hardware. + * The result is different when event modifiers exclude_kernel and/or + * exclude_user are also set. 
+ */ +static int cpumf_pmu_event_type(struct perf_event *event) +{ + u64 ev = event->attr.config; + + if (cpumf_generic_events_basic[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_basic[PERF_COUNT_HW_INSTRUCTIONS] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev) + return PERF_TYPE_HARDWARE; + return PERF_TYPE_RAW; } static int cpumf_pmu_event_init(struct perf_event *event) @@ -301,7 +854,7 @@ static int cpumf_pmu_event_init(struct perf_event *event) err = __hw_perf_event_init(event, type); else if (event->pmu->type == type) /* Registered as unknown PMU */ - err = __hw_perf_event_init(event, PERF_TYPE_RAW); + err = __hw_perf_event_init(event, cpumf_pmu_event_type(event)); else return -ENOENT; @@ -361,18 +914,13 @@ static void cpumf_pmu_read(struct perf_event *event) static void cpumf_pmu_start(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); struct hw_perf_event *hwc = &event->hw; + int i; - if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) - return; - - if (WARN_ON_ONCE(hwc->config == -1)) + if (!(hwc->state & PERF_HES_STOPPED)) return; - if (flags & PERF_EF_RELOAD) - WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); - hwc->state = 0; /* (Re-)enable and activate the counter set */ @@ -384,45 +932,95 @@ static void cpumf_pmu_start(struct perf_event *event, int flags) * needs to be synchronized. At this point, the counter set can be in * the inactive or disabled state. */ - hw_perf_event_reset(event); + if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { + cpuhw->usedss = cfdiag_getctr(cpuhw->start, + sizeof(cpuhw->start), + hwc->config_base, true); + } else { + hw_perf_event_reset(event); + } - /* increment refcount for this counter set */ - atomic_inc(&cpuhw->ctr_set[hwc->config_base]); + /* Increment refcount for counter sets */ + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) + if ((hwc->config_base & cpumf_ctr_ctl[i])) + atomic_inc(&cpuhw->ctr_set[i]); +} + +/* Create perf event sample with the counter sets as raw data. The sample + * is then pushed to the event subsystem and the function checks for + * possible event overflows. If an event overflow occurs, the PMU is + * stopped. + * + * Return non-zero if an event overflow occurred. 
+ */ +static int cfdiag_push_sample(struct perf_event *event, + struct cpu_cf_events *cpuhw) +{ + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + int overflow; + + /* Setup perf sample */ + perf_sample_data_init(&data, 0, event->hw.last_period); + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + + if (event->attr.sample_type & PERF_SAMPLE_CPU) + data.cpu_entry.cpu = event->cpu; + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = cpuhw->usedss; + raw.frag.data = cpuhw->stop; + perf_sample_save_raw_data(&data, &raw); + } + + overflow = perf_event_overflow(event, &data, ®s); + if (overflow) + event->pmu->stop(event, 0); + + perf_event_update_userpage(event); + return overflow; } static void cpumf_pmu_stop(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); struct hw_perf_event *hwc = &event->hw; + int i; if (!(hwc->state & PERF_HES_STOPPED)) { /* Decrement reference count for this counter set and if this * is the last used counter in the set, clear activation * control and set the counter set state to inactive. */ - if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base])) - ctr_set_stop(&cpuhw->state, hwc->config_base); - event->hw.state |= PERF_HES_STOPPED; + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (!(hwc->config_base & cpumf_ctr_ctl[i])) + continue; + if (!atomic_dec_return(&cpuhw->ctr_set[i])) + ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]); + } + hwc->state |= PERF_HES_STOPPED; } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - hw_perf_event_update(event); - event->hw.state |= PERF_HES_UPTODATE; + if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { + local64_inc(&event->count); + cpuhw->usedss = cfdiag_getctr(cpuhw->stop, + sizeof(cpuhw->stop), + event->hw.config_base, + false); + if (cfdiag_diffctr(cpuhw, event->hw.config_base)) + cfdiag_push_sample(event, cpuhw); + } else { + hw_perf_event_update(event); + } + hwc->state |= PERF_HES_UPTODATE; } } static int cpumf_pmu_add(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - /* Check authorization for the counter set to which this - * counter belongs. - * For group events transaction, the authorization check is - * done in cpumf_pmu_commit_txn(). - */ - if (!(cpuhw->txn_flags & PERF_PMU_TXN_ADD)) - if (validate_ctr_auth(&event->hw)) - return -ENOENT; + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); ctr_set_enable(&cpuhw->state, event->hw.config_base); event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; @@ -430,14 +1028,13 @@ static int cpumf_pmu_add(struct perf_event *event, int flags) if (flags & PERF_EF_START) cpumf_pmu_start(event, PERF_EF_RELOAD); - perf_event_update_userpage(event); - return 0; } static void cpumf_pmu_del(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + int i; cpumf_pmu_stop(event, PERF_EF_UPDATE); @@ -449,112 +1046,905 @@ static void cpumf_pmu_del(struct perf_event *event, int flags) * clear enable control and resets all counters in a set. Therefore, * cpumf_pmu_start() always has to reenable a counter set. 
*/ - if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base])) - ctr_set_disable(&cpuhw->state, event->hw.config_base); - - perf_event_update_userpage(event); + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) + if (!atomic_read(&cpuhw->ctr_set[i])) + ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]); } +/* Performance monitoring unit for s390x */ +static struct pmu cpumf_pmu = { + .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .pmu_enable = cpumf_pmu_enable, + .pmu_disable = cpumf_pmu_disable, + .event_init = cpumf_pmu_event_init, + .add = cpumf_pmu_add, + .del = cpumf_pmu_del, + .start = cpumf_pmu_start, + .stop = cpumf_pmu_stop, + .read = cpumf_pmu_read, +}; + +static struct cfset_session { /* CPUs and counter set bit mask */ + struct list_head head; /* Head of list of active processes */ +} cfset_session = { + .head = LIST_HEAD_INIT(cfset_session.head) +}; + +static refcount_t cfset_opencnt = REFCOUNT_INIT(0); /* Access count */ /* - * Start group events scheduling transaction. - * Set flags to perform a single test at commit time. + * Synchronize access to device /dev/hwctr. This mutex protects against + * concurrent access to functions cfset_open() and cfset_release(). + * Same for CPU hotplug add and remove events triggering + * cpum_cf_online_cpu() and cpum_cf_offline_cpu(). + * It also serializes concurrent device ioctl access from multiple + * processes accessing /dev/hwctr. * - * We only support PERF_PMU_TXN_ADD transactions. Save the - * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD - * transactions. + * The mutex protects concurrent access to the /dev/hwctr session management + * struct cfset_session and reference counting variable cfset_opencnt. */ -static void cpumf_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) +static DEFINE_MUTEX(cfset_ctrset_mutex); + +/* + * CPU hotplug handles only the /dev/hwctr device. + * For perf_event_open() the CPU hotplug handling is done in kernel common + * code: + * - CPU add: Nothing is done since a file descriptor cannot be created + * and returned to the user. + * - CPU delete: Handled by common code via pmu_disable(), pmu_stop() and + * pmu_delete(). The event itself is removed when the file descriptor is + * closed. + */ +static int cfset_online_cpu(unsigned int cpu); + +static int cpum_cf_online_cpu(unsigned int cpu) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + int rc = 0; + + /* + * Ignore notification for perf_event_open(). + * Handle only /dev/hwctr device sessions. + */ + mutex_lock(&cfset_ctrset_mutex); + if (refcount_read(&cfset_opencnt)) { + rc = cpum_cf_alloc_cpu(cpu); + if (!rc) + cfset_online_cpu(cpu); + } + mutex_unlock(&cfset_ctrset_mutex); + return rc; +} + +static int cfset_offline_cpu(unsigned int cpu); + +static int cpum_cf_offline_cpu(unsigned int cpu) +{ + /* + * During task exit processing of grouped perf events triggered by CPU + * hotplug processing, pmu_disable() is called as part of perf context + * removal process. Therefore do not trigger event removal now for + * perf_event_open() created events. Perf common code triggers event + * destruction when the event file descriptor is closed. + * + * Handle only /dev/hwctr device sessions. 
+ */ + mutex_lock(&cfset_ctrset_mutex); + if (refcount_read(&cfset_opencnt)) { + cfset_offline_cpu(cpu); + cpum_cf_free_cpu(cpu); + } + mutex_unlock(&cfset_ctrset_mutex); + return 0; +} + +/* Return true if store counter set multiple instruction is available */ +static inline int stccm_avail(void) +{ + return test_facility(142); +} + +/* CPU-measurement alerts for the counter facility */ +static void cpumf_measurement_alert(struct ext_code ext_code, + unsigned int alert, unsigned long unused) +{ + struct cpu_cf_events *cpuhw; + + if (!(alert & CPU_MF_INT_CF_MASK)) + return; - WARN_ON_ONCE(cpuhw->txn_flags); /* txn already in flight */ + inc_irq_stat(IRQEXT_CMC); - cpuhw->txn_flags = txn_flags; - if (txn_flags & ~PERF_PMU_TXN_ADD) + /* + * Measurement alerts are shared and might happen when the PMU + * is not reserved. Ignore these alerts in this case. + */ + cpuhw = this_cpu_cfhw(); + if (!cpuhw) return; - perf_pmu_disable(pmu); - cpuhw->tx_state = cpuhw->state; + /* counter authorization change alert */ + if (alert & CPU_MF_INT_CF_CACA) + qctri(&cpumf_ctr_info); + + /* loss of counter data alert */ + if (alert & CPU_MF_INT_CF_LCDA) + pr_err("CPU[%i] Counter data was lost\n", smp_processor_id()); + + /* loss of MT counter data alert */ + if (alert & CPU_MF_INT_CF_MTDA) + pr_warn("CPU[%i] MT counter data was lost\n", + smp_processor_id()); } -/* - * Stop and cancel a group events scheduling tranctions. - * Assumes cpumf_pmu_del() is called for each successful added - * cpumf_pmu_add() during the transaction. +static int cfset_init(void); +static int __init cpumf_pmu_init(void) +{ + int rc; + + /* Extract counter measurement facility information */ + if (!cpum_cf_avail() || qctri(&cpumf_ctr_info)) + return -ENODEV; + + /* Determine and store counter set sizes for later reference */ + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + cpum_cf_make_setsize(rc); + + /* + * Clear bit 15 of cr0 to unauthorize problem-state to + * extract measurement counters + */ + system_ctl_clear_bit(0, CR0_CPUMF_EXTRACTION_AUTH_BIT); + + /* register handler for measurement-alert interruptions */ + rc = register_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); + if (rc) { + pr_err("Registering for CPU-measurement alerts failed with rc=%i\n", rc); + return rc; + } + + /* Setup s390dbf facility */ + cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); + if (!cf_dbg) { + pr_err("Registration of s390dbf(cpum_cf) failed\n"); + rc = -ENOMEM; + goto out1; + } + debug_register_view(cf_dbg, &debug_sprintf_view); + + cpumf_pmu.attr_groups = cpumf_cf_event_group(); + rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1); + if (rc) { + pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); + goto out2; + } else if (stccm_avail()) { /* Setup counter set device */ + cfset_init(); + } + + rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE, + "perf/s390/cf:online", + cpum_cf_online_cpu, cpum_cf_offline_cpu); + return rc; + +out2: + debug_unregister_view(cf_dbg, &debug_sprintf_view); + debug_unregister(cf_dbg); +out1: + unregister_external_irq(EXT_IRQ_MEASURE_ALERT, cpumf_measurement_alert); + return rc; +} + +/* Support for the CPU Measurement Facility counter set extraction using + * device /dev/hwctr. This allows user space programs to extract complete + * counter set via normal file operations. 
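+ */

Since this introduces a whole ioctl protocol, a minimal user-space sketch may help. It is a sketch only: the S390_HWCTR_* macros, S390_HWCTR_START_VERSION and the struct layouts are assumed to come from the UAPI header <asm/hwctrset.h>, the field names follow the uses visible in this file, the set mask 0x02 (basic set) is taken from cpumf_ctr_ctl[], and error handling is trimmed.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/hwctrset.h>

int main(void)
{
	struct s390_ctrset_read *buf = NULL;
	struct s390_ctrset_start start;
	__u64 mask = 1;				/* CPU 0 only */
	int fd;

	fd = open("/dev/hwctr", O_RDWR);
	if (fd < 0)
		return 1;
	memset(&start, 0, sizeof(start));
	start.version = S390_HWCTR_START_VERSION;
	start.counter_sets = 0x02;		/* basic counter set */
	start.cpumask_len = sizeof(mask);
	start.cpumask = &mask;
	if (ioctl(fd, S390_HWCTR_START, &start) == 0) {
		/* START reports the buffer size needed for READ */
		buf = malloc(start.data_bytes);
		if (buf && ioctl(fd, S390_HWCTR_READ, buf) == 0)
			printf("counter data for %llu CPU(s)\n",
			       (unsigned long long)buf->no_cpus);
		free(buf);
		ioctl(fd, S390_HWCTR_STOP, 0);
	}
	close(fd);
	return 0;
}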
+struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ + unsigned int sets; /* Counter set bit mask */ + atomic_t cpus_ack; /* # CPUs successfully executed func */ +}; + +struct cfset_request { /* CPUs and counter set bit mask */ + unsigned long ctrset; /* Bit mask of counter set to read */ + cpumask_t mask; /* CPU mask to read from */ + struct list_head node; /* Chain to cfset_session.head */ +}; + +static void cfset_session_init(void) +{ + INIT_LIST_HEAD(&cfset_session.head); +} + +/* Remove current request from global bookkeeping. Maintain a counter set bit + * mask on a per CPU basis. + * Done in process context under mutex protection. */ -static void cpumf_pmu_cancel_txn(struct pmu *pmu) +static void cfset_session_del(struct cfset_request *p) { - unsigned int txn_flags; - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + list_del(&p->node); +} - WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ +/* Add current request to global bookkeeping. Maintain a counter set bit mask + * on a per CPU basis. + * Done in process context under mutex protection. + */ +static void cfset_session_add(struct cfset_request *p) +{ + list_add(&p->node, &cfset_session.head); +} - txn_flags = cpuhw->txn_flags; - cpuhw->txn_flags = 0; - if (txn_flags & ~PERF_PMU_TXN_ADD) - return; +/* The /dev/hwctr device access uses PMU_F_IN_USE to mark that the device + * access path is currently in use. + * The cpu_cf_events::dev_state is used to denote counter sets in use by this + * interface. It is always or'ed in. If this interface is not active, its + * value is zero and no additional counter sets will be included. + * + * The cpu_cf_events::state is used by the perf_event_open SVC and remains + * unchanged. + * + * perf_pmu_enable() and perf_pmu_disable() and their callbacks + * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the + * performance measurement subsystem to enable the per process + * CPU Measurement counter facility. + * The XXX_enable() and XXX_disable() functions are used to turn off + * the x86 performance monitoring interrupt (PMI) during scheduling. + * s390 uses these calls to temporarily stop and resume the active CPU + * counter sets during scheduling. + * + * We do allow concurrent use of the perf_event_open() SVC and the /dev/hwctr + * device. The perf_event_open() SVC interface makes a lot of effort + * to only run the counters while the calling process is actively scheduled + * to run. + * When the /dev/hwctr interface is also used at the same time, the counter + * sets will keep running, even when the process is scheduled off a CPU. + * However this is not a problem and does not lead to wrong counter values + * for the perf_event_open() SVC. The current counter value will be recorded + * during schedule-in. At schedule-out time the current counter value is + * extracted again and the delta is calculated and added to the event. + */
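A compact sketch of how the two state words combine, using ctr_set_enable(), ctr_set_start() and the cpumf_ctr_ctl[] values defined earlier in this file; the numbers follow from the 16-bit distance between the enable and activation bit fields (sketch only, not part of the patch):

static void example_combined_lcctl(void)
{
	u64 state = 0, dev_state = 0;

	/* perf_event_open() event counting the basic set (0x02) */
	ctr_set_enable(&state, cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC]);
	ctr_set_start(&state, cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC]);

	/* /dev/hwctr session reading the crypto set (0x08) */
	ctr_set_enable(&dev_state, cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO]);
	ctr_set_start(&dev_state, cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO]);

	/* state == 0x00020002, dev_state == 0x00080008 */
	lcctl(state | dev_state);	/* 0x000a000a: both sets enabled and active */
}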
+/* Stop all counter sets via ioctl interface */ +static void cfset_ioctl_off(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + struct cfset_call_on_cpu_parm *p = parm; + int rc; + + /* Check if any counter set used by /dev/hwctr */ + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + if ((p->sets & cpumf_ctr_ctl[rc])) { + if (!atomic_dec_return(&cpuhw->ctr_set[rc])) { + ctr_set_disable(&cpuhw->dev_state, + cpumf_ctr_ctl[rc]); + ctr_set_stop(&cpuhw->dev_state, + cpumf_ctr_ctl[rc]); + } + } + /* Keep perf_event_open counter sets */ + rc = lcctl(cpuhw->dev_state | cpuhw->state); + if (rc) + pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n", + cpuhw->state, S390_HWCTR_DEVICE, rc); + if (!cpuhw->dev_state) + cpuhw->flags &= ~PMU_F_IN_USE; +} + +/* Start counter sets on particular CPU */ +static void cfset_ioctl_on(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + struct cfset_call_on_cpu_parm *p = parm; + int rc; + + cpuhw->flags |= PMU_F_IN_USE; + ctr_set_enable(&cpuhw->dev_state, p->sets); + ctr_set_start(&cpuhw->dev_state, p->sets); + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + if ((p->sets & cpumf_ctr_ctl[rc])) + atomic_inc(&cpuhw->ctr_set[rc]); + rc = lcctl(cpuhw->dev_state | cpuhw->state); /* Start counter sets */ + if (!rc) + atomic_inc(&p->cpus_ack); + else + pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n", + cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc); +} + +static void cfset_release_cpu(void *p) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + int rc; + + cpuhw->dev_state = 0; + rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ + if (rc) + pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n", + cpuhw->state, S390_HWCTR_DEVICE, rc); +} + +/* This modifies the process CPU mask to adapt it to the currently online + * CPUs. Offline CPUs cannot be addressed. This call terminates the access + * and is usually followed by close() or a new ioctl(..., START, ...) which + * creates a new request structure. + */ +static void cfset_all_stop(struct cfset_request *req) +{ + struct cfset_call_on_cpu_parm p = { + .sets = req->ctrset, + }; - WARN_ON(cpuhw->tx_state != cpuhw->state); + cpumask_and(&req->mask, &req->mask, cpu_online_mask); + on_each_cpu_mask(&req->mask, cfset_ioctl_off, &p, 1); +} - perf_pmu_enable(pmu); +/* Release function is also called when the application is terminated without + * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command. + */ +static int cfset_release(struct inode *inode, struct file *file) +{ + mutex_lock(&cfset_ctrset_mutex); + /* Open followed by close/exit has no private_data */ + if (file->private_data) { + cfset_all_stop(file->private_data); + cfset_session_del(file->private_data); + kfree(file->private_data); + file->private_data = NULL; + } + if (refcount_dec_and_test(&cfset_opencnt)) { /* Last close */ + on_each_cpu(cfset_release_cpu, NULL, 1); + cpum_cf_free(-1); + } + mutex_unlock(&cfset_ctrset_mutex); + return 0; } /* - * Commit the group events scheduling transaction. On success, the - * transaction is closed. On error, the transaction is kept open - * until cpumf_pmu_cancel_txn() is called. + * Open via /dev/hwctr device. Allocate all per CPU resources on the first + * open of the device. The last close releases all per CPU resources. + * Parallel perf_event_open system calls also use per CPU resources. + * These invocations are handled via reference counting on the per CPU data + * structures. 
*/ -static int cpumf_pmu_commit_txn(struct pmu *pmu) +static int cfset_open(struct inode *inode, struct file *file) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - u64 state; + int rc = 0; + + if (!perfmon_capable()) + return -EPERM; + file->private_data = NULL; + + mutex_lock(&cfset_ctrset_mutex); + if (!refcount_inc_not_zero(&cfset_opencnt)) { /* First open */ + rc = cpum_cf_alloc(-1); + if (!rc) { + cfset_session_init(); + refcount_set(&cfset_opencnt, 1); + } + } + mutex_unlock(&cfset_ctrset_mutex); - WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ + /* nonseekable_open() never fails */ + return rc ?: nonseekable_open(inode, file); +} - if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) { - cpuhw->txn_flags = 0; - return 0; +static int cfset_all_start(struct cfset_request *req) +{ + struct cfset_call_on_cpu_parm p = { + .sets = req->ctrset, + .cpus_ack = ATOMIC_INIT(0), + }; + cpumask_var_t mask; + int rc = 0; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_and(mask, &req->mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1); + if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { + on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); + rc = -EIO; } + free_cpumask_var(mask); + return rc; +} - /* check if the updated state can be scheduled */ - state = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); - state >>= CPUMF_LCCTL_ENABLE_SHIFT; - if ((state & cpuhw->info.auth_ctl) != state) - return -ENOENT; +/* Return the maximum required space for all possible CPUs in case one + * CPU will be onlined during the START, READ, STOP cycles. + * To find out the size of the counter sets, any one CPU will do. They + * all have the same counter sets. + */ +static size_t cfset_needspace(unsigned int sets) +{ + size_t bytes = 0; + int i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (!(sets & cpumf_ctr_ctl[i])) + continue; + bytes += cpum_cf_read_setsize(i) * sizeof(u64) + + sizeof(((struct s390_ctrset_setdata *)0)->set) + + sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); + } + bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids * + (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) + + sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); + return bytes; +} + +static int cfset_all_copy(unsigned long arg, cpumask_t *mask) +{ + struct s390_ctrset_read __user *ctrset_read; + unsigned int cpu, cpus, rc = 0; + void __user *uptr; + + ctrset_read = (struct s390_ctrset_read __user *)arg; + uptr = ctrset_read->data; + for_each_cpu(cpu, mask) { + struct cpu_cf_events *cpuhw = get_cpu_cfhw(cpu); + struct s390_ctrset_cpudata __user *ctrset_cpudata; + + ctrset_cpudata = uptr; + rc = put_user(cpu, &ctrset_cpudata->cpu_nr); + rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets); + rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data, + cpuhw->used); + if (rc) { + rc = -EFAULT; + goto out; + } + uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used; + cond_resched(); + } + cpus = cpumask_weight(mask); + if (put_user(cpus, &ctrset_read->no_cpus)) + rc = -EFAULT; +out: + return rc; +} - cpuhw->txn_flags = 0; - perf_pmu_enable(pmu); +static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, + int ctrset_size, size_t room) +{ + size_t need = 0; + int rc = -1; + + need = sizeof(*p) + sizeof(u64) * ctrset_size; + if (need <= room) { + p->set = cpumf_ctr_ctl[ctrset]; + p->no_cnts = ctrset_size; + rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv); + if (rc == 3) /* Nothing stored */ + 
need = 0; + } + return need; +} + +/* Read all counter sets. */ +static void cfset_cpu_read(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + struct cfset_call_on_cpu_parm *p = parm; + int set, set_size; + size_t space; + + /* No data saved yet */ + cpuhw->used = 0; + cpuhw->sets = 0; + memset(cpuhw->data, 0, sizeof(cpuhw->data)); + + /* Scan the counter sets */ + for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) { + struct s390_ctrset_setdata *sp = (void *)cpuhw->data + + cpuhw->used; + + if (!(p->sets & cpumf_ctr_ctl[set])) + continue; /* Counter set not in list */ + set_size = cpum_cf_read_setsize(set); + space = sizeof(cpuhw->data) - cpuhw->used; + space = cfset_cpuset_read(sp, set, set_size, space); + if (space) { + cpuhw->used += space; + cpuhw->sets += 1; + } + } +} + +static int cfset_all_read(unsigned long arg, struct cfset_request *req) +{ + struct cfset_call_on_cpu_parm p; + cpumask_var_t mask; + int rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + p.sets = req->ctrset; + cpumask_and(mask, &req->mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_cpu_read, &p, 1); + rc = cfset_all_copy(arg, mask); + free_cpumask_var(mask); + return rc; +} + +static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req) +{ + int ret = -ENODATA; + + if (req && req->ctrset) + ret = cfset_all_read(arg, req); + return ret; +} + +static long cfset_ioctl_stop(struct file *file) +{ + struct cfset_request *req = file->private_data; + int ret = -ENXIO; + + if (req) { + cfset_all_stop(req); + cfset_session_del(req); + kfree(req); + file->private_data = NULL; + ret = 0; + } + return ret; +} + +static long cfset_ioctl_start(unsigned long arg, struct file *file) +{ + struct s390_ctrset_start __user *ustart; + struct s390_ctrset_start start; + struct cfset_request *preq; + void __user *umask; + unsigned int len; + int ret = 0; + size_t need; + + if (file->private_data) + return -EBUSY; + ustart = (struct s390_ctrset_start __user *)arg; + if (copy_from_user(&start, ustart, sizeof(start))) + return -EFAULT; + if (start.version != S390_HWCTR_START_VERSION) + return -EINVAL; + if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] | + cpumf_ctr_ctl[CPUMF_CTR_SET_USER] | + cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] | + cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] | + cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG])) + return -EINVAL; /* Invalid counter set */ + if (!start.counter_sets) + return -EINVAL; /* No counter set at all? */ + + preq = kzalloc(sizeof(*preq), GFP_KERNEL); + if (!preq) + return -ENOMEM; + cpumask_clear(&preq->mask); + len = min_t(u64, start.cpumask_len, cpumask_size()); + umask = (void __user *)start.cpumask; + if (copy_from_user(&preq->mask, umask, len)) { + kfree(preq); + return -EFAULT; + } + if (cpumask_empty(&preq->mask)) { + kfree(preq); + return -EINVAL; + } + need = cfset_needspace(start.counter_sets); + if (put_user(need, &ustart->data_bytes)) { + kfree(preq); + return -EFAULT; + } + preq->ctrset = start.counter_sets; + ret = cfset_all_start(preq); + if (!ret) { + cfset_session_add(preq); + file->private_data = preq; + } else { + kfree(preq); + } + return ret; +} + +/* Entry point to the /dev/hwctr device interface. + * The ioctl system call supports three subcommands: + * S390_HWCTR_START: Start the specified counter sets on a CPU list. The + * counter set keeps running until explicitly stopped. Returns the number + * of bytes needed to store the counter values. 
If another S390_HWCTR_START + * ioctl subcommand is called without a previous S390_HWCTR_STOP + * command on the same file descriptor, -EBUSY is returned. + * S390_HWCTR_READ: Read the counter set values from the specified CPU list + * given with the S390_HWCTR_START command. + * S390_HWCTR_STOP: Stop the counter sets on the CPU list given with the + * previous S390_HWCTR_START subcommand. + */ +static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int ret; + + cpus_read_lock(); + mutex_lock(&cfset_ctrset_mutex); + switch (cmd) { + case S390_HWCTR_START: + ret = cfset_ioctl_start(arg, file); + break; + case S390_HWCTR_STOP: + ret = cfset_ioctl_stop(file); + break; + case S390_HWCTR_READ: + ret = cfset_ioctl_read(arg, file->private_data); + break; + default: + ret = -ENOTTY; + break; + } + mutex_unlock(&cfset_ctrset_mutex); + cpus_read_unlock(); + return ret; +} + +static const struct file_operations cfset_fops = { + .owner = THIS_MODULE, + .open = cfset_open, + .release = cfset_release, + .unlocked_ioctl = cfset_ioctl, + .compat_ioctl = cfset_ioctl, + .llseek = no_llseek +}; + +static struct miscdevice cfset_dev = { + .name = S390_HWCTR_DEVICE, + .minor = MISC_DYNAMIC_MINOR, + .fops = &cfset_fops, + .mode = 0666, +}; + +/* Hotplug add of a CPU. Scan through all active processes and add + * that CPU to the list of CPUs supplied with ioctl(..., START, ...). + */ +static int cfset_online_cpu(unsigned int cpu) +{ + struct cfset_call_on_cpu_parm p; + struct cfset_request *rp; + + if (!list_empty(&cfset_session.head)) { + list_for_each_entry(rp, &cfset_session.head, node) { + p.sets = rp->ctrset; + cfset_ioctl_on(&p); + cpumask_set_cpu(cpu, &rp->mask); + } + } return 0; } -/* Performance monitoring unit for s390x */ -static struct pmu cpumf_pmu = { +/* Hotplug remove of a CPU. Scan through all active processes and clear + * that CPU from the list of CPUs supplied with ioctl(..., START, ...). + * Adjust reference counts. + */ +static int cfset_offline_cpu(unsigned int cpu) +{ + struct cfset_call_on_cpu_parm p; + struct cfset_request *rp; + + if (!list_empty(&cfset_session.head)) { + list_for_each_entry(rp, &cfset_session.head, node) { + p.sets = rp->ctrset; + cfset_ioctl_off(&p); + cpumask_clear_cpu(cpu, &rp->mask); + } + } + return 0; +} + +static void cfdiag_read(struct perf_event *event) +{ +} + +static int get_authctrsets(void) +{ + unsigned long auth = 0; + enum cpumf_ctr_set i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (cpumf_ctr_info.auth_ctl & cpumf_ctr_ctl[i]) + auth |= cpumf_ctr_ctl[i]; + } + return auth; +} + +/* Setup the event. Test for authorized counter sets and only include counter + * sets which are authorized at the time of the setup. Including unauthorized + * counter sets results in a specification exception (and panic). + */ +static int cfdiag_event_init2(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + int err = 0; + + /* Set sample_period to indicate sampling */ + event->hw.config = attr->config; + event->hw.sample_period = attr->sample_period; + local64_set(&event->hw.period_left, event->hw.sample_period); + local64_set(&event->count, 0); + event->hw.last_period = event->hw.sample_period; + + /* Add all authorized counter sets to config_base. The + * hardware init function is either called per-cpu or just once + * for all CPUs (event->cpu == -1).
This depends on whether + * counting is started for all CPUs or on a per-workload basis where + * the perf event moves from one CPU to another. + * Checking the authorization on any CPU is fine as the hardware + * applies the same authorization settings to all CPUs. + */ + event->hw.config_base = get_authctrsets(); + + /* No authorized counter sets, nothing to count/sample */ + if (!event->hw.config_base) + err = -EINVAL; + + return err; +} + +static int cfdiag_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + int err = -ENOENT; + + if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG || + event->attr.type != event->pmu->type) + goto out; + + /* Raw events are used to access counters directly, + * hence do not permit excludes. + * This event is useless without PERF_SAMPLE_RAW to return counter set + * values as raw data. + */ + if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv || + !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) { + err = -EOPNOTSUPP; + goto out; + } + + /* Initialize for using the CPU-measurement counter facility */ + if (cpum_cf_alloc(event->cpu)) + return -ENOMEM; + event->destroy = hw_perf_event_destroy; + + err = cfdiag_event_init2(event); + if (unlikely(err)) + event->destroy(event); +out: + return err; +} + +/* Create cf_diag/events/CF_DIAG event sysfs file. This counter is used + * to collect the complete counter sets for a scheduled process. The + * complete counter sets are attached as raw data to the artificial event. + * This results in complete counter sets being available when a process is + * scheduled. The data contains the delta of every counter while the + * process was running. + */ +CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG); + +static struct attribute *cfdiag_events_attr[] = { + CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG), + NULL, +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *cfdiag_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group cfdiag_events_group = { + .name = "events", + .attrs = cfdiag_events_attr, +}; +static struct attribute_group cfdiag_format_group = { + .name = "format", + .attrs = cfdiag_format_attr, +}; +static const struct attribute_group *cfdiag_attr_groups[] = { + &cfdiag_events_group, + &cfdiag_format_group, + NULL, +}; + +/* Performance monitoring unit for event CF_DIAG. Since this event + * is also started and stopped via the perf_event_open() system call, use + * the same event enable/disable callback functions. They do not + * have a pointer to the perf_event structure as first parameter. + * + * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common. + * Reuse them and distinguish the event (always the first parameter) via + * the 'config' member. + */ +static struct pmu cf_diag = { .task_ctx_nr = perf_sw_context, - .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .event_init = cfdiag_event_init, .pmu_enable = cpumf_pmu_enable, .pmu_disable = cpumf_pmu_disable, - .event_init = cpumf_pmu_event_init, .add = cpumf_pmu_add, .del = cpumf_pmu_del, .start = cpumf_pmu_start, .stop = cpumf_pmu_stop, - .read = cpumf_pmu_read, - .start_txn = cpumf_pmu_start_txn, - .commit_txn = cpumf_pmu_commit_txn, - .cancel_txn = cpumf_pmu_cancel_txn, + .read = cfdiag_read, + + .attr_groups = cfdiag_attr_groups }; -static int __init cpumf_pmu_init(void) +/* Calculate memory needed to store all counter sets together with header and + * trailer data.
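+ * For example, assuming cfvn >= 3 and csvn == 6, the five counter sets + * hold 6 + 2 + 20 + 160 + 48 = 236 counters (using the set sizes from + * the removed cf_diag_ctrset_size() table shown further down). The + * worst case is then 236 * 8 bytes of counter data plus 5 * 8 bytes of + * set headers plus the 64 byte trailer, i.e. 1992 bytes, which fits + * into one 4KB page.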
The calculation is independent of the counter set authorization, which + * can vary depending on the configuration. + */ +static size_t cfdiag_maxsize(struct cpumf_ctr_info *info) +{ + size_t max_size = sizeof(struct cf_trailer_entry); + enum cpumf_ctr_set i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + size_t size = cpum_cf_read_setsize(i); + + if (size) + max_size += size * sizeof(u64) + + sizeof(struct cf_ctrset_entry); + } + return max_size; +} + +/* Get the CPU speed, try sampling facility first and CPU attributes second. */ +static void cfdiag_get_cpu_speed(void) +{ + unsigned long mhz; + + if (cpum_sf_avail()) { /* Sampling facility first */ + struct hws_qsi_info_block si; + + memset(&si, 0, sizeof(si)); + if (!qsi(&si)) { + cfdiag_cpu_speed = si.cpu_speed; + return; + } + } + + /* Fallback: extract the static part of the CPU speed. Used in case + * the CPU Measurement Sampling Facility is turned off. + */ + mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); + if (mhz != -1UL) + cfdiag_cpu_speed = mhz & 0xffffffff; +} + +static int cfset_init(void) { + size_t need; int rc; - if (!kernel_cpumcf_avail()) - return -ENODEV; + cfdiag_get_cpu_speed(); + /* Make sure the counter set data fits into predefined buffer. */ + need = cfdiag_maxsize(&cpumf_ctr_info); + if (need > sizeof(((struct cpu_cf_events *)0)->start)) { + pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", + need); + return -ENOMEM; + } - cpumf_pmu.attr_groups = cpumf_cf_event_group(); - rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1); - if (rc) - pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); + rc = misc_register(&cfset_dev); + if (rc) { + pr_err("Registration of /dev/%s failed rc=%i\n", + cfset_dev.name, rc); + goto out; + } + + rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1); + if (rc) { + misc_deregister(&cfset_dev); + pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n", + rc); + } +out: + return rc; } -subsys_initcall(cpumf_pmu_init); + +device_initcall(cpumf_pmu_init); diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c deleted file mode 100644 index 3bced89caffb..000000000000 --- a/arch/s390/kernel/perf_cpum_cf_common.c +++ /dev/null @@ -1,201 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * CPU-Measurement Counter Facility Support - Common Layer - * - * Copyright IBM Corp.
2019 - * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> - */ -#define KMSG_COMPONENT "cpum_cf_common" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/init.h> -#include <linux/export.h> -#include <asm/ctl_reg.h> -#include <asm/irq.h> -#include <asm/cpu_mcf.h> - -/* Per-CPU event structure for the counter facility */ -DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events) = { - .ctr_set = { - [CPUMF_CTR_SET_BASIC] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_USER] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_CRYPTO] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_EXT] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_MT_DIAG] = ATOMIC_INIT(0), - }, - .alert = ATOMIC64_INIT(0), - .state = 0, - .flags = 0, - .txn_flags = 0, -}; -/* Indicator whether the CPU-Measurement Counter Facility Support is ready */ -static bool cpum_cf_initalized; - -/* CPU-measurement alerts for the counter facility */ -static void cpumf_measurement_alert(struct ext_code ext_code, - unsigned int alert, unsigned long unused) -{ - struct cpu_cf_events *cpuhw; - - if (!(alert & CPU_MF_INT_CF_MASK)) - return; - - inc_irq_stat(IRQEXT_CMC); - cpuhw = this_cpu_ptr(&cpu_cf_events); - - /* Measurement alerts are shared and might happen when the PMU - * is not reserved. Ignore these alerts in this case. */ - if (!(cpuhw->flags & PMU_F_RESERVED)) - return; - - /* counter authorization change alert */ - if (alert & CPU_MF_INT_CF_CACA) - qctri(&cpuhw->info); - - /* loss of counter data alert */ - if (alert & CPU_MF_INT_CF_LCDA) - pr_err("CPU[%i] Counter data was lost\n", smp_processor_id()); - - /* loss of MT counter data alert */ - if (alert & CPU_MF_INT_CF_MTDA) - pr_warn("CPU[%i] MT counter data was lost\n", - smp_processor_id()); - - /* store alert for special handling by in-kernel users */ - atomic64_or(alert, &cpuhw->alert); -} - -#define PMC_INIT 0 -#define PMC_RELEASE 1 -static void cpum_cf_setup_cpu(void *flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - switch (*((int *) flags)) { - case PMC_INIT: - memset(&cpuhw->info, 0, sizeof(cpuhw->info)); - qctri(&cpuhw->info); - cpuhw->flags |= PMU_F_RESERVED; - break; - - case PMC_RELEASE: - cpuhw->flags &= ~PMU_F_RESERVED; - break; - } - - /* Disable CPU counter sets */ - lcctl(0); -} - -bool kernel_cpumcf_avail(void) -{ - return cpum_cf_initalized; -} -EXPORT_SYMBOL(kernel_cpumcf_avail); - - -/* Reserve/release functions for sharing perf hardware */ -static DEFINE_SPINLOCK(cpumcf_owner_lock); -static void *cpumcf_owner; - -/* Initialize the CPU-measurement counter facility */ -int __kernel_cpumcf_begin(void) -{ - int flags = PMC_INIT; - int err = 0; - - spin_lock(&cpumcf_owner_lock); - if (cpumcf_owner) - err = -EBUSY; - else - cpumcf_owner = __builtin_return_address(0); - spin_unlock(&cpumcf_owner_lock); - if (err) - return err; - - on_each_cpu(cpum_cf_setup_cpu, &flags, 1); - irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - return 0; -} -EXPORT_SYMBOL(__kernel_cpumcf_begin); - -/* Obtain the CPU-measurement alerts for the counter facility */ -unsigned long kernel_cpumcf_alert(int clear) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - unsigned long alert; - - alert = atomic64_read(&cpuhw->alert); - if (clear) - atomic64_set(&cpuhw->alert, 0); - - return alert; -} -EXPORT_SYMBOL(kernel_cpumcf_alert); - -/* Release the CPU-measurement counter facility */ -void __kernel_cpumcf_end(void) -{ - int flags = PMC_RELEASE; - - 
on_each_cpu(cpum_cf_setup_cpu, &flags, 1); - irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - spin_lock(&cpumcf_owner_lock); - cpumcf_owner = NULL; - spin_unlock(&cpumcf_owner_lock); -} -EXPORT_SYMBOL(__kernel_cpumcf_end); - -static int cpum_cf_setup(unsigned int cpu, int flags) -{ - local_irq_disable(); - cpum_cf_setup_cpu(&flags); - local_irq_enable(); - return 0; -} - -static int cpum_cf_online_cpu(unsigned int cpu) -{ - return cpum_cf_setup(cpu, PMC_INIT); -} - -static int cpum_cf_offline_cpu(unsigned int cpu) -{ - return cpum_cf_setup(cpu, PMC_RELEASE); -} - -static int __init cpum_cf_init(void) -{ - int rc; - - if (!cpum_cf_avail()) - return -ENODEV; - - /* clear bit 15 of cr0 to unauthorize problem-state to - * extract measurement counters */ - ctl_clear_bit(0, 48); - - /* register handler for measurement-alert interruptions */ - rc = register_external_irq(EXT_IRQ_MEASURE_ALERT, - cpumf_measurement_alert); - if (rc) { - pr_err("Registering for CPU-measurement alerts " - "failed with rc=%i\n", rc); - return rc; - } - - rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE, - "perf/s390/cf:online", - cpum_cf_online_cpu, cpum_cf_offline_cpu); - if (!rc) - cpum_cf_initalized = true; - - return rc; -} -early_initcall(cpum_cf_init); diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c deleted file mode 100644 index e949ab832ed7..000000000000 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ /dev/null @@ -1,705 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Performance event support for s390x - CPU-measurement Counter Sets - * - * Copyright IBM Corp. 2019 - * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> - * Thomas Richer <tmricht@linux.ibm.com> - */ -#define KMSG_COMPONENT "cpum_cf_diag" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/slab.h> -#include <linux/processor.h> - -#include <asm/ctl_reg.h> -#include <asm/irq.h> -#include <asm/cpu_mcf.h> -#include <asm/timex.h> -#include <asm/debug.h> - -#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ - -static unsigned int cf_diag_cpu_speed; -static debug_info_t *cf_diag_dbg; - -struct cf_diag_csd { /* Counter set data per CPU */ - size_t used; /* Bytes used in data/start */ - unsigned char start[PAGE_SIZE]; /* Counter set at event start */ - unsigned char data[PAGE_SIZE]; /* Counter set at event delete */ -}; -static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd); - -/* Counter sets are stored as data stream in a page sized memory buffer and - * exported to user space via raw data attached to the event sample data. - * Each counter set starts with an eight byte header consisting of: - * - a two byte eye catcher (0xfeef) - * - a one byte counter set number - * - a two byte counter set size (indicates the number of counters in this set) - * - a three byte reserved value (must be zero) to make the header the same - * size as a counter value. - * All counter values are eight byte in size. - * - * All counter sets are followed by a 64 byte trailer. - * The trailer consists of a: - * - flag field indicating valid fields when corresponding bit set - * - the counter facility first and second version number - * - the CPU speed if nonzero - * - the time stamp the counter sets have been collected - * - the time of day (TOD) base value - * - the machine type. 
- * - * The counter sets are saved when the process is prepared to be executed on a - * CPU and saved again when the process is going to be removed from a CPU. - * The difference of both counter sets are calculated and stored in the event - * sample data area. - */ - -struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */ - unsigned int def:16; /* 0-15 Data Entry Format */ - unsigned int set:16; /* 16-31 Counter set identifier */ - unsigned int ctr:16; /* 32-47 Number of stored counters */ - unsigned int res1:16; /* 48-63 Reserved */ -}; - -struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */ - /* 0 - 7 */ - union { - struct { - unsigned int clock_base:1; /* TOD clock base set */ - unsigned int speed:1; /* CPU speed set */ - /* Measurement alerts */ - unsigned int mtda:1; /* Loss of MT ctr. data alert */ - unsigned int caca:1; /* Counter auth. change alert */ - unsigned int lcda:1; /* Loss of counter data alert */ - }; - unsigned long flags; /* 0-63 All indicators */ - }; - /* 8 - 15 */ - unsigned int cfvn:16; /* 64-79 Ctr First Version */ - unsigned int csvn:16; /* 80-95 Ctr Second Version */ - unsigned int cpu_speed:32; /* 96-127 CPU speed */ - /* 16 - 23 */ - unsigned long timestamp; /* 128-191 Timestamp (TOD) */ - /* 24 - 55 */ - union { - struct { - unsigned long progusage1; - unsigned long progusage2; - unsigned long progusage3; - unsigned long tod_base; - }; - unsigned long progusage[4]; - }; - /* 56 - 63 */ - unsigned int mach_type:16; /* Machine type */ - unsigned int res1:16; /* Reserved */ - unsigned int res2:32; /* Reserved */ -}; - -/* Create the trailer data at the end of a page. */ -static void cf_diag_trailer(struct cf_trailer_entry *te) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cpuid cpuid; - - te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */ - te->csvn = cpuhw->info.csvn; - - get_cpu_id(&cpuid); /* Machine type */ - te->mach_type = cpuid.machine; - te->cpu_speed = cf_diag_cpu_speed; - if (te->cpu_speed) - te->speed = 1; - te->clock_base = 1; /* Save clock base */ - memcpy(&te->tod_base, &tod_clock_base[1], 8); - store_tod_clock((__u64 *)&te->timestamp); -} - -/* - * Change the CPUMF state to active. - * Enable and activate the CPU-counter sets according - * to the per-cpu control state. - */ -static void cf_diag_enable(struct pmu *pmu) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - int err; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s pmu %p cpu %d flags %#x state %#llx\n", - __func__, pmu, smp_processor_id(), cpuhw->flags, - cpuhw->state); - if (cpuhw->flags & PMU_F_ENABLED) - return; - - err = lcctl(cpuhw->state); - if (err) { - pr_err("Enabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - cpuhw->flags |= PMU_F_ENABLED; -} - -/* - * Change the CPUMF state to inactive. - * Disable and enable (inactive) the CPU-counter sets according - * to the per-cpu control state. 
- */ -static void cf_diag_disable(struct pmu *pmu) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - u64 inactive; - int err; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s pmu %p cpu %d flags %#x state %#llx\n", - __func__, pmu, smp_processor_id(), cpuhw->flags, - cpuhw->state); - if (!(cpuhw->flags & PMU_F_ENABLED)) - return; - - inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); - err = lcctl(inactive); - if (err) { - pr_err("Disabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - cpuhw->flags &= ~PMU_F_ENABLED; -} - -/* Number of perf events counting hardware events */ -static atomic_t cf_diag_events = ATOMIC_INIT(0); - -/* Release the PMU if event is the last perf event */ -static void cf_diag_perf_event_destroy(struct perf_event *event) -{ - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d cf_diag_events %d\n", - __func__, event, event->cpu, - atomic_read(&cf_diag_events)); - if (atomic_dec_return(&cf_diag_events) == 0) - __kernel_cpumcf_end(); -} - -/* Setup the event. Test for authorized counter sets and only include counter - * sets which are authorized at the time of the setup. Including unauthorized - * counter sets result in specification exception (and panic). - */ -static int __hw_perf_event_init(struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - struct cpu_cf_events *cpuhw; - enum cpumf_ctr_set i; - int err = 0; - - debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__, - event, event->cpu); - - event->hw.config = attr->config; - event->hw.config_base = 0; - - /* Add all authorized counter sets to config_base. The - * the hardware init function is either called per-cpu or just once - * for all CPUS (event->cpu == -1). This depends on the whether - * counting is started for all CPUs or on a per workload base where - * the perf event moves from one CPU to another CPU. - * Checking the authorization on any CPU is fine as the hardware - * applies the same authorization settings to all CPUs. - */ - cpuhw = &get_cpu_var(cpu_cf_events); - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) - if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i]) - event->hw.config_base |= cpumf_ctr_ctl[i]; - put_cpu_var(cpu_cf_events); - - /* No authorized counter sets, nothing to count/sample */ - if (!event->hw.config_base) { - err = -EINVAL; - goto out; - } - - /* Set sample_period to indicate sampling */ - event->hw.sample_period = attr->sample_period; - local64_set(&event->hw.period_left, event->hw.sample_period); - event->hw.last_period = event->hw.sample_period; -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d config_base %#lx\n", - __func__, err, event->hw.config_base); - return err; -} - -static int cf_diag_event_init(struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - int err = -ENOENT; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d config %#llx type:%u " - "sample_type %#llx cf_diag_events %d\n", __func__, - event, event->cpu, attr->config, event->pmu->type, - attr->sample_type, atomic_read(&cf_diag_events)); - - if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG || - event->attr.type != event->pmu->type) - goto out; - - /* Raw events are used to access counters directly, - * hence do not permit excludes. - * This event is usesless without PERF_SAMPLE_RAW to return counter set - * values as raw data. 
- */ - if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv || - !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) { - err = -EOPNOTSUPP; - goto out; - } - - /* Initialize for using the CPU-measurement counter facility */ - if (atomic_inc_return(&cf_diag_events) == 1) { - if (__kernel_cpumcf_begin()) { - atomic_dec(&cf_diag_events); - err = -EBUSY; - goto out; - } - } - event->destroy = cf_diag_perf_event_destroy; - - err = __hw_perf_event_init(event); - if (unlikely(err)) - event->destroy(event); -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err); - return err; -} - -static void cf_diag_read(struct perf_event *event) -{ - debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event); -} - -/* Return the maximum possible counter set size (in number of 8 byte counters) - * depending on type and model number. - */ -static size_t cf_diag_ctrset_size(enum cpumf_ctr_set ctrset, - struct cpumf_ctr_info *info) -{ - size_t ctrset_size = 0; - - switch (ctrset) { - case CPUMF_CTR_SET_BASIC: - if (info->cfvn >= 1) - ctrset_size = 6; - break; - case CPUMF_CTR_SET_USER: - if (info->cfvn == 1) - ctrset_size = 6; - else if (info->cfvn >= 3) - ctrset_size = 2; - break; - case CPUMF_CTR_SET_CRYPTO: - if (info->csvn >= 1 && info->csvn <= 5) - ctrset_size = 16; - else if (info->csvn == 6) - ctrset_size = 20; - break; - case CPUMF_CTR_SET_EXT: - if (info->csvn == 1) - ctrset_size = 32; - else if (info->csvn == 2) - ctrset_size = 48; - else if (info->csvn >= 3 && info->csvn <= 5) - ctrset_size = 128; - else if (info->csvn == 6) - ctrset_size = 160; - break; - case CPUMF_CTR_SET_MT_DIAG: - if (info->csvn > 3) - ctrset_size = 48; - break; - case CPUMF_CTR_SET_MAX: - break; - } - - return ctrset_size; -} - -/* Calculate memory needed to store all counter sets together with header and - * trailer data. This is independend of the counter set authorization which - * can vary depending on the configuration. - */ -static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info) -{ - size_t max_size = sizeof(struct cf_trailer_entry); - enum cpumf_ctr_set i; - - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - size_t size = cf_diag_ctrset_size(i, info); - - if (size) - max_size += size * sizeof(u64) + - sizeof(struct cf_ctrset_entry); - } - debug_sprintf_event(cf_diag_dbg, 5, "%s max_size %zu\n", __func__, - max_size); - - return max_size; -} - -/* Read a counter set. The counter set number determines which counter set and - * the CPUM-CF first and second version number determine the number of - * available counters in this counter set. - * Each counter set starts with header containing the counter set number and - * the number of 8 byte counters. - * - * The functions returns the number of bytes occupied by this counter set - * including the header. - * If there is no counter in the counter set, this counter set is useless and - * zero is returned on this case. 
- */ -static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, - size_t room) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - size_t ctrset_size, need = 0; - int rc = 3; /* Assume write failure */ - - ctrdata->def = CF_DIAG_CTRSET_DEF; - ctrdata->set = ctrset; - ctrdata->res1 = 0; - ctrset_size = cf_diag_ctrset_size(ctrset, &cpuhw->info); - - if (ctrset_size) { /* Save data */ - need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); - if (need <= room) - rc = ctr_stcctm(ctrset, ctrset_size, - (u64 *)(ctrdata + 1)); - if (rc != 3) - ctrdata->ctr = ctrset_size; - else - need = 0; - } - - debug_sprintf_event(cf_diag_dbg, 6, - "%s ctrset %d ctrset_size %zu cfvn %d csvn %d" - " need %zd rc %d\n", - __func__, ctrset, ctrset_size, cpuhw->info.cfvn, - cpuhw->info.csvn, need, rc); - return need; -} - -/* Read out all counter sets and save them in the provided data buffer. - * The last 64 byte host an artificial trailer entry. - */ -static size_t cf_diag_getctr(void *data, size_t sz, unsigned long auth) -{ - struct cf_trailer_entry *trailer; - size_t offset = 0, done; - int i; - - memset(data, 0, sz); - sz -= sizeof(*trailer); /* Always room for trailer */ - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - struct cf_ctrset_entry *ctrdata = data + offset; - - if (!(auth & cpumf_ctr_ctl[i])) - continue; /* Counter set not authorized */ - - done = cf_diag_getctrset(ctrdata, i, sz - offset); - offset += done; - debug_sprintf_event(cf_diag_dbg, 6, - "%s ctrset %d offset %zu done %zu\n", - __func__, i, offset, done); - } - trailer = data + offset; - cf_diag_trailer(trailer); - return offset + sizeof(*trailer); -} - -/* Calculate the difference for each counter in a counter set. */ -static void cf_diag_diffctrset(u64 *pstart, u64 *pstop, int counters) -{ - for (; --counters >= 0; ++pstart, ++pstop) - if (*pstop >= *pstart) - *pstop -= *pstart; - else - *pstop = *pstart - *pstop; -} - -/* Scan the counter sets and calculate the difference of each counter - * in each set. The result is the increment of each counter during the - * period the counter set has been activated. - * - * Return true on success. - */ -static int cf_diag_diffctr(struct cf_diag_csd *csd, unsigned long auth) -{ - struct cf_trailer_entry *trailer_start, *trailer_stop; - struct cf_ctrset_entry *ctrstart, *ctrstop; - size_t offset = 0; - - auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; - do { - ctrstart = (struct cf_ctrset_entry *)(csd->start + offset); - ctrstop = (struct cf_ctrset_entry *)(csd->data + offset); - - if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { - pr_err("cpum_cf_diag counter set compare error " - "in set %i\n", ctrstart->set); - return 0; - } - auth &= ~cpumf_ctr_ctl[ctrstart->set]; - if (ctrstart->def == CF_DIAG_CTRSET_DEF) { - cf_diag_diffctrset((u64 *)(ctrstart + 1), - (u64 *)(ctrstop + 1), ctrstart->ctr); - offset += ctrstart->ctr * sizeof(u64) + - sizeof(*ctrstart); - } - debug_sprintf_event(cf_diag_dbg, 6, - "%s set %d ctr %d offset %zu auth %lx\n", - __func__, ctrstart->set, ctrstart->ctr, - offset, auth); - } while (ctrstart->def && auth); - - /* Save time_stamp from start of event in stop's trailer */ - trailer_start = (struct cf_trailer_entry *)(csd->start + offset); - trailer_stop = (struct cf_trailer_entry *)(csd->data + offset); - trailer_stop->progusage[0] = trailer_start->timestamp; - - return 1; -} - -/* Create perf event sample with the counter sets as raw data. 
The sample - * is then pushed to the event subsystem and the function checks for - * possible event overflows. If an event overflow occurs, the PMU is - * stopped. - * - * Return non-zero if an event overflow occurred. - */ -static int cf_diag_push_sample(struct perf_event *event, - struct cf_diag_csd *csd) -{ - struct perf_sample_data data; - struct perf_raw_record raw; - struct pt_regs regs; - int overflow; - - /* Setup perf sample */ - perf_sample_data_init(&data, 0, event->hw.last_period); - memset(®s, 0, sizeof(regs)); - memset(&raw, 0, sizeof(raw)); - - if (event->attr.sample_type & PERF_SAMPLE_CPU) - data.cpu_entry.cpu = event->cpu; - if (event->attr.sample_type & PERF_SAMPLE_RAW) { - raw.frag.size = csd->used; - raw.frag.data = csd->data; - raw.size = csd->used; - data.raw = &raw; - } - - overflow = perf_event_overflow(event, &data, ®s); - debug_sprintf_event(cf_diag_dbg, 6, - "%s event %p cpu %d sample_type %#llx raw %d " - "ov %d\n", __func__, event, event->cpu, - event->attr.sample_type, raw.size, overflow); - if (overflow) - event->pmu->stop(event, 0); - - perf_event_update_userpage(event); - return overflow; -} - -static void cf_diag_start(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd); - struct hw_perf_event *hwc = &event->hw; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x hwc-state %#x\n", - __func__, event, event->cpu, flags, hwc->state); - if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) - return; - - /* (Re-)enable and activate all counter sets */ - lcctl(0); /* Reset counter sets */ - hwc->state = 0; - ctr_set_multiple_enable(&cpuhw->state, hwc->config_base); - lcctl(cpuhw->state); /* Enable counter sets */ - csd->used = cf_diag_getctr(csd->start, sizeof(csd->start), - event->hw.config_base); - ctr_set_multiple_start(&cpuhw->state, hwc->config_base); - /* Function cf_diag_enable() starts the counter sets. 
*/ -} - -static void cf_diag_stop(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd); - struct hw_perf_event *hwc = &event->hw; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x hwc-state %#x\n", - __func__, event, event->cpu, flags, hwc->state); - - /* Deactivate all counter sets */ - ctr_set_multiple_stop(&cpuhw->state, hwc->config_base); - local64_inc(&event->count); - csd->used = cf_diag_getctr(csd->data, sizeof(csd->data), - event->hw.config_base); - if (cf_diag_diffctr(csd, event->hw.config_base)) - cf_diag_push_sample(event, csd); - hwc->state |= PERF_HES_STOPPED; -} - -static int cf_diag_add(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - int err = 0; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x cpuhw %p\n", - __func__, event, event->cpu, flags, cpuhw); - - if (cpuhw->flags & PMU_F_IN_USE) { - err = -EAGAIN; - goto out; - } - - event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - cpuhw->flags |= PMU_F_IN_USE; - if (flags & PERF_EF_START) - cf_diag_start(event, PERF_EF_RELOAD); -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err); - return err; -} - -static void cf_diag_del(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x\n", - __func__, event, event->cpu, flags); - - cf_diag_stop(event, PERF_EF_UPDATE); - ctr_set_multiple_stop(&cpuhw->state, event->hw.config_base); - ctr_set_multiple_disable(&cpuhw->state, event->hw.config_base); - cpuhw->flags &= ~PMU_F_IN_USE; -} - -CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG); - -static struct attribute *cf_diag_events_attr[] = { - CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG), - NULL, -}; - -PMU_FORMAT_ATTR(event, "config:0-63"); - -static struct attribute *cf_diag_format_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group cf_diag_events_group = { - .name = "events", - .attrs = cf_diag_events_attr, -}; -static struct attribute_group cf_diag_format_group = { - .name = "format", - .attrs = cf_diag_format_attr, -}; -static const struct attribute_group *cf_diag_attr_groups[] = { - &cf_diag_events_group, - &cf_diag_format_group, - NULL, -}; - -/* Performance monitoring unit for s390x */ -static struct pmu cf_diag = { - .task_ctx_nr = perf_sw_context, - .pmu_enable = cf_diag_enable, - .pmu_disable = cf_diag_disable, - .event_init = cf_diag_event_init, - .add = cf_diag_add, - .del = cf_diag_del, - .start = cf_diag_start, - .stop = cf_diag_stop, - .read = cf_diag_read, - - .attr_groups = cf_diag_attr_groups -}; - -/* Get the CPU speed, try sampling facility first and CPU attributes second. */ -static void cf_diag_get_cpu_speed(void) -{ - if (cpum_sf_avail()) { /* Sampling facility first */ - struct hws_qsi_info_block si; - - memset(&si, 0, sizeof(si)); - if (!qsi(&si)) { - cf_diag_cpu_speed = si.cpu_speed; - return; - } - } - - if (test_facility(34)) { /* CPU speed extract static part */ - unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); - - if (mhz != -1UL) - cf_diag_cpu_speed = mhz & 0xffffffff; - } -} - -/* Initialize the counter set PMU to generate complete counter set data as - * event raw data. This relies on the CPU Measurement Counter Facility device - * already being loaded and initialized. 
- */ -static int __init cf_diag_init(void) -{ - struct cpumf_ctr_info info; - size_t need; - int rc; - - if (!kernel_cpumcf_avail() || !stccm_avail() || qctri(&info)) - return -ENODEV; - cf_diag_get_cpu_speed(); - - /* Make sure the counter set data fits into predefined buffer. */ - need = cf_diag_ctrset_maxsize(&info); - if (need > sizeof(((struct cf_diag_csd *)0)->start)) { - pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", - need); - return -ENOMEM; - } - - /* Setup s390dbf facility */ - cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); - if (!cf_diag_dbg) { - pr_err("Registration of s390dbf(cpum_cf_diag) failed\n"); - return -ENOMEM; - } - debug_register_view(cf_diag_dbg, &debug_sprintf_view); - - rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1); - if (rc) { - debug_unregister_view(cf_diag_dbg, &debug_sprintf_view); - debug_unregister(cf_diag_dbg); - pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n", - rc); - } - return rc; -} -arch_initcall(cf_diag_init); diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index 8b33e03e47b8..0d64aafd158f 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -238,6 +238,134 @@ CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_SPECIAL, 0x00f5); CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z15, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_HPAGE_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z15, L1D_L2D_SOURCED_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_z15, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z15, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z15, L1I_L2I_SOURCED_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z15, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z15, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z15, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z15, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z15, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES, 0x0091); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES, 0x0093); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x0095); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES, 0x0096); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x0097); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x0098); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES, 0x0099); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x009a); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x009b); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x009c); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES, 0x009d); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO, 0x009e); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES, 0x00a3); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a4); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES, 0x00a5); 
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x00a6); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x00a7); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES, 0x00a8); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x00a9); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES, 0x00ab); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x00ac); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x00ad); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00ae); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES, 0x00af); +CPUMF_EVENT_ATTR(cf_z15, BCD_DFP_EXECUTION_SLOTS, 0x00e0); +CPUMF_EVENT_ATTR(cf_z15, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z15, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z15, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z15, TX_NC_TABORT, 0x00f3); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_NO_SPECIAL, 0x00f4); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z15, DFLT_ACCESS, 0x00f7); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CYCLES, 0x00fc); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x00108); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x00109); +CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z16, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z16, CRSTE_1MB_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z16, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z16, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z16, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z16, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z16, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z16, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z16, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ, 0x0091); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_CHIP_HIT, 0x0093); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_DRAWER_HIT, 0x0094); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP, 0x0095); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_IV, 0x0096); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_CHIP_HIT, 0x0097); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT, 0x0098); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE, 0x0099); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER, 0x009a); +CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER, 0x009b); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_MEMORY, 0x009c); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE_MEMORY, 0x009d); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER_MEMORY, 0x009e); +CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER_MEMORY, 0x009f); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_IV, 0x00a0); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT, 0x00a1); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT, 0x00a2); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_IV, 0x00a3); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT, 0x00a4); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT, 0x00a5); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_IV, 0x00a6); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT, 0x00a7); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT, 0x00a8); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ, 0x00a9); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_CHIP_HIT, 
0x00ab); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_DRAWER_HIT, 0x00ac); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP, 0x00ad); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_IV, 0x00ae); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_CHIP_HIT, 0x00af); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT, 0x00b0); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE, 0x00b1); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER, 0x00b2); +CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER, 0x00b3); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_MEMORY, 0x00b4); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE_MEMORY, 0x00b5); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER_MEMORY, 0x00b6); +CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER_MEMORY, 0x00b7); +CPUMF_EVENT_ATTR(cf_z16, BCD_DFP_EXECUTION_SLOTS, 0x00e0); +CPUMF_EVENT_ATTR(cf_z16, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z16, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z16, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z16, TX_NC_TABORT, 0x00f4); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_NO_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_SPECIAL, 0x00f6); +CPUMF_EVENT_ATTR(cf_z16, DFLT_ACCESS, 0x00f8); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CYCLES, 0x00fd); +CPUMF_EVENT_ATTR(cf_z16, SORTL, 0x0100); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CC, 0x0109); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CCFINISH, 0x010a); +CPUMF_EVENT_ATTR(cf_z16, NNPA_INVOCATIONS, 0x010b); +CPUMF_EVENT_ATTR(cf_z16, NNPA_COMPLETIONS, 0x010c); +CPUMF_EVENT_ATTR(cf_z16, NNPA_WAIT_LOCK, 0x010d); +CPUMF_EVENT_ATTR(cf_z16, NNPA_HOLD_LOCK, 0x010e); +CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); + static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES), CPUMF_EVENT_PTR(cf_fvn1, INSTRUCTIONS), @@ -286,7 +414,7 @@ static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = { NULL, }; -static struct attribute *cpumcf_svn_6_pmu_event_attr[] __initdata = { +static struct attribute *cpumcf_svn_67_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), @@ -516,6 +644,141 @@ static struct attribute *cpumcf_z14_pmu_event_attr[] __initdata = { NULL, }; +static struct attribute *cpumcf_z15_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z15, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_L2D_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z15, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, L1I_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z15, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z15, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z15, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z15, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z15, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, 
L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, BCD_DFP_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z15, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z15, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z15, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z15, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z15, DFLT_ACCESS), + CPUMF_EVENT_PTR(cf_z15, DFLT_CYCLES), + CPUMF_EVENT_PTR(cf_z15, DFLT_CC), + CPUMF_EVENT_PTR(cf_z15, DFLT_CCFINISH), + CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + +static struct attribute *cpumcf_z16_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z16, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, CRSTE_1MB_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z16, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z16, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z16, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z16, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z16, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z16, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z16, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_IV), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_IV), + 
CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_IV), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, BCD_DFP_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z16, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z16, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z16, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z16, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z16, DFLT_ACCESS), + CPUMF_EVENT_PTR(cf_z16, DFLT_CYCLES), + CPUMF_EVENT_PTR(cf_z16, SORTL), + CPUMF_EVENT_PTR(cf_z16, DFLT_CC), + CPUMF_EVENT_PTR(cf_z16, DFLT_CCFINISH), + CPUMF_EVENT_PTR(cf_z16, NNPA_INVOCATIONS), + CPUMF_EVENT_PTR(cf_z16, NNPA_COMPLETIONS), + CPUMF_EVENT_PTR(cf_z16, NNPA_WAIT_LOCK), + CPUMF_EVENT_PTR(cf_z16, NNPA_HOLD_LOCK), + CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + /* END: CPUM_CF COUNTER DEFINITIONS ===================================== */ static struct attribute_group cpumcf_pmu_events_group = { @@ -596,8 +859,8 @@ __init const struct attribute_group **cpumf_cf_event_group(void) case 1 ... 5: csvn = cpumcf_svn_12345_pmu_event_attr; break; - case 6: - csvn = cpumcf_svn_6_pmu_event_attr; + case 6 ... 7: + csvn = cpumcf_svn_67_pmu_event_attr; break; default: csvn = none; @@ -624,9 +887,15 @@ __init const struct attribute_group **cpumf_cf_event_group(void) break; case 0x3906: case 0x3907: + model = cpumcf_z14_pmu_event_attr; + break; case 0x8561: case 0x8562: - model = cpumcf_z14_pmu_event_attr; + model = cpumcf_z15_pmu_event_attr; + break; + case 0x3931: + case 0x3932: + model = cpumcf_z16_pmu_event_attr; break; default: model = none; diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index c07fdcd73726..06efad5b4f93 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -22,6 +22,7 @@ #include <asm/irq.h> #include <asm/debug.h> #include <asm/timex.h> +#include <linux/io.h> /* Minimum number of sample-data-block-tables: * At least one table is required for the sampling buffer structure. 
@@ -42,7 +43,7 @@ #define CPUM_SF_SDBT_TL_OFFSET (CPUM_SF_SDB_PER_TABLE * 8) static inline int require_table_link(const void *sdbt) { - return ((unsigned long) sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET; + return ((unsigned long)sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET; } /* Minimum and maximum sampling buffer sizes: @@ -99,6 +100,57 @@ static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf); /* Debug feature */ static debug_info_t *sfdbg; +/* Sampling control helper functions */ +static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi, + unsigned long freq) +{ + return (USEC_PER_SEC / freq) * qsi->cpu_speed; +} + +static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi, + unsigned long rate) +{ + return USEC_PER_SEC * qsi->cpu_speed / rate; +} + +/* Return TOD timestamp contained in a trailer entry */ +static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te) +{ + /* TOD in STCKE format */ + if (te->header.t) + return *((unsigned long long *)&te->timestamp[1]); + + /* TOD in STCK format */ + return *((unsigned long long *)&te->timestamp[0]); +} + +/* Return pointer to trailer entry of a sample data block */ +static inline struct hws_trailer_entry *trailer_entry_ptr(unsigned long v) +{ + void *ret; + + ret = (void *)v; + ret += PAGE_SIZE; + ret -= sizeof(struct hws_trailer_entry); + + return ret; +} + +/* + * Return true if the entry in the sample data block table (sdbt) + * is a link to the next sdbt + */ +static inline int is_link_entry(unsigned long *s) +{ + return *s & 0x1UL ? 1 : 0; +} + +/* Return pointer to the linked sdbt */ +static inline unsigned long *get_next_sdbt(unsigned long *s) +{ + return phys_to_virt(*s & ~0x1UL); +} + /* * sf_disable() - Switch off sampling facility */ @@ -140,7 +192,7 @@ static void free_sampling_buffer(struct sf_buffer *sfb) if (is_link_entry(curr)) { curr = get_next_sdbt(curr); if (sdbt) - free_page((unsigned long) sdbt); + free_page((unsigned long)sdbt); /* If the origin is reached, sampling buffer is freed */ if (curr == sfb->sdbt) @@ -150,7 +202,7 @@ static void free_sampling_buffer(struct sf_buffer *sfb) } else { /* Process SDB pointer */ if (*curr) { - free_page(*curr); + free_page((unsigned long)phys_to_virt(*curr)); curr++; } } @@ -163,17 +215,18 @@ static void free_sampling_buffer(struct sf_buffer *sfb) static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags) { - unsigned long sdb, *trailer; + struct hws_trailer_entry *te; + unsigned long sdb; /* Allocate and initialize sample-data-block */ sdb = get_zeroed_page(gfp_flags); if (!sdb) return -ENOMEM; - trailer = trailer_entry_ptr(sdb); - *trailer = SDB_TE_ALERT_REQ_MASK; + te = trailer_entry_ptr(sdb); + te->header.a = 1; /* Link SDB into the sample-data-block-table */ - *sdbt = sdb; + *sdbt = virt_to_phys((void *)sdb); return 0; } @@ -225,14 +278,14 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, for (i = 0; i < num_sdb; i++) { /* Allocate a new SDB-table if it is full. 
*/ if (require_table_link(tail)) { - new = (unsigned long *) get_zeroed_page(gfp_flags); + new = (unsigned long *)get_zeroed_page(gfp_flags); if (!new) { rc = -ENOMEM; break; } sfb->num_sdbt++; /* Link current page to tail of chain */ - *tail = (unsigned long)(void *) new + 1; + *tail = virt_to_phys((void *)new) + 1; tail_prev = tail; tail = new; } @@ -251,7 +304,7 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, */ if (tail_prev) { sfb->num_sdbt--; - free_page((unsigned long) new); + free_page((unsigned long)new); tail = tail_prev; } break; @@ -262,7 +315,7 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, } /* Link sampling buffer to its origin */ - *tail = (unsigned long) sfb->sdbt + 1; + *tail = virt_to_phys(sfb->sdbt) + 1; sfb->tail = tail; debug_sprintf_event(sfdbg, 4, "%s: new buffer" @@ -290,7 +343,7 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb) return -EINVAL; /* Allocate the sample-data-block-table origin */ - sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL); + sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (!sfb->sdbt) return -ENOMEM; sfb->num_sdb = 0; @@ -300,7 +353,7 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb) * realloc_sampling_buffer() invocation. */ sfb->tail = sfb->sdbt; - *sfb->tail = (unsigned long)(void *) sfb->sdbt + 1; + *sfb->tail = virt_to_phys((void *)sfb->sdbt) + 1; /* Allocate requested number of sample-data-blocks */ rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL); @@ -372,28 +425,33 @@ static void deallocate_buffers(struct cpu_hw_sf *cpuhw) static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) { - unsigned long n_sdb, freq, factor; + unsigned long n_sdb, freq; size_t sample_size; /* Calculate sampling buffers using 4K pages * - * 1. Determine the sample data size which depends on the used - * sampling functions, for example, basic-sampling or - * basic-sampling with diagnostic-sampling. + * 1. The sampling size is 32 bytes for basic sampling. This size + * is the same for all machine types. Diagnostic + * sampling uses the auxiliary data buffer setup which provides the + * memory for SDBs using Linux common code auxiliary trace + * setup. * - * 2. Use the sampling frequency as input. The sampling buffer is - * designed for almost one second. This can be adjusted through - * the "factor" variable. - * In any case, alloc_sampling_buffer() sets the Alert Request + * 2. Function alloc_sampling_buffer() sets the Alert Request * Control indicator to trigger a measurement-alert to harvest - * sample-data-blocks (sdb). + * sample-data-blocks (SDB). This is done per SDB. This + * measurement alert interrupt fires quickly enough to handle + * one SDB; on very high frequencies and workloads there might + * be 2 to 3 SDBs available for sample processing. + * Currently there is no need to set up an alert request on every + * n-th page. This would be counterproductive, as one IRQ would then + * trigger a very high number of samples to be processed at once. * - * 3. Compute the number of sample-data-blocks and ensure a minimum - * of CPUM_SF_MIN_SDB. Also ensure the upper limit does not - * exceed a "calculated" maximum. The symbolic maximum is - * designed for basic-sampling only and needs to be increased if - * diagnostic-sampling is active. - * See also the remarks for these symbolic constants. + * 3. Use the sampling frequency as input. + * Compute the number of SDBs and ensure a minimum + * of CPUM_SF_MIN_SDB. 
Depending on the frequency, add some more + * SDBs to handle a higher sampling rate. + * Use a minimum of CPUM_SF_MIN_SDB and allow for 100 samples + * (one SDB) for every 10000 Hz frequency increment. * * 4. Compute the number of sample-data-block-tables (SDBT) and * ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up @@ -401,10 +459,7 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) */ sample_size = sizeof(struct hws_basic_entry); freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)); - factor = 1; - n_sdb = DIV_ROUND_UP(freq, factor * ((PAGE_SIZE-64) / sample_size)); - if (n_sdb < CPUM_SF_MIN_SDB) - n_sdb = CPUM_SF_MIN_SDB; + n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000); /* If there is already a sampling buffer allocated, it is very likely * that the sampling facility is enabled too. If the event to be @@ -539,11 +594,10 @@ static DEFINE_MUTEX(pmc_reserve_mutex); #define PMC_FAILURE 2 static void setup_pmc_cpu(void *flags) { - int err; struct cpu_hw_sf *cpusf = this_cpu_ptr(&cpu_hw_sf); + int err = 0; - err = 0; - switch (*((int *) flags)) { + switch (*((int *)flags)) { case PMC_INIT: memset(cpusf, 0, sizeof(*cpusf)); err = qsi(&cpusf->qsi); @@ -551,28 +605,18 @@ static void setup_pmc_cpu(void *flags) break; cpusf->flags |= PMU_F_RESERVED; err = sf_disable(); - if (err) - pr_err("Switching off the sampling facility failed " - "with rc %i\n", err); - debug_sprintf_event(sfdbg, 5, - "%s: initialized: cpuhw %p\n", __func__, - cpusf); break; case PMC_RELEASE: cpusf->flags &= ~PMU_F_RESERVED; err = sf_disable(); - if (err) { - pr_err("Switching off the sampling facility failed " - "with rc %i\n", err); - } else + if (!err) deallocate_buffers(cpusf); - debug_sprintf_event(sfdbg, 5, - "%s: released: cpuhw %p\n", __func__, - cpusf); break; } - if (err) - *((int *) flags) |= PMC_FAILURE; + if (err) { + *((int *)flags) |= PMC_FAILURE; + pr_err("Switching off the sampling facility failed with rc %i\n", err); + } } static void release_pmc_hardware(void) @@ -669,8 +713,9 @@ static void cpumsf_output_event_pid(struct perf_event *event, /* Protect callchain buffers, tasks */ rcu_read_lock(); - perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size)) + perf_prepare_sample(data, event, regs); + perf_prepare_header(&header, data, event, regs); + if (perf_output_begin(&handle, data, event, header.size)) goto out; /* Update the process ID (see also kernel/events/core.c) */ @@ -832,10 +877,6 @@ static int __hw_perf_event_init(struct perf_event *event) SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_DIAG_MODE; } - /* Check and set other sampling flags */ - if (attr->config1 & PERF_CPUM_SF_FULL_BLOCKS) - SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FULL_BLOCKS; - err = __hw_perf_event_init_rate(event, &si); if (err) goto out; @@ -879,12 +920,21 @@ out: return err; } +static bool is_callchain_event(struct perf_event *event) +{ + u64 sample_type = event->attr.sample_type; + + return sample_type & (PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_REGS_USER | + PERF_SAMPLE_STACK_USER); +} + static int cpumsf_pmu_event_init(struct perf_event *event) { int err; /* No support for taken branch sampling */ - if (has_branch_stack(event)) + /* No support for callchain, stacks and registers */ + if (has_branch_stack(event) || is_callchain_event(event)) return -EOPNOTSUPP; switch (event->attr.type) { @@ -908,10 +958,6 @@ static int cpumsf_pmu_event_init(struct perf_event *event) return -ENOENT; } - /* Check online status of the CPU to which the event is pinned */ - if 
(event->cpu >= 0 && !cpu_online(event->cpu)) - return -ENODEV; - /* Force reset of idle/hv excludes regardless of what the * user requested. */ @@ -971,8 +1017,7 @@ static void cpumsf_pmu_enable(struct pmu *pmu) err = lsctl(&cpuhw->lsctl); if (err) { cpuhw->flags &= ~PMU_F_ENABLED; - pr_err("Loading sampling controls failed: op %i err %i\n", - 1, err); + pr_err("Loading sampling controls failed: op 1 err %i\n", err); return; } @@ -1006,8 +1051,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu) err = lsctl(&inactive); if (err) { - pr_err("Loading sampling controls failed: op %i err %i\n", - 2, err); + pr_err("Loading sampling controls failed: op 2 err %i\n", err); return; } @@ -1164,11 +1208,11 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, struct hws_trailer_entry *te; struct hws_basic_entry *sample; - te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt); - sample = (struct hws_basic_entry *) *sdbt; - while ((unsigned long *) sample < (unsigned long *) te) { + te = trailer_entry_ptr((unsigned long)sdbt); + sample = (struct hws_basic_entry *)sdbt; + while ((unsigned long *)sample < (unsigned long *)te) { /* Check for an empty sample */ - if (!sample->def) + if (!sample->def || sample->LS) break; /* Update perf event period */ @@ -1195,7 +1239,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, "%s: Found unknown" " sampling data entry: te->f %i" " basic.def %#4x (%p)\n", __func__, - te->f, sample->def, sample); + te->header.f, sample->def, sample); /* Sample slot is not yet written or other record. * * This condition can occur if the buffer was reused @@ -1206,7 +1250,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, * that are not full. Stop processing if the first * invalid format was detected. */ - if (!te->f) + if (!te->header.f) break; } @@ -1224,18 +1268,16 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, * The sampling buffer position are retrieved and saved in the TEAR_REG * register of the specified perf event. * - * Only full sample-data-blocks are processed. Specify the flash_all flag - * to also walk through partially filled sample-data-blocks. It is ignored - * if PERF_CPUM_SF_FULL_BLOCKS is set. The PERF_CPUM_SF_FULL_BLOCKS flag - * enforces the processing of full sample-data-blocks only (trailer entries - * with the block-full-indicator bit set). + * Only full sample-data-blocks are processed. Specify the flush_all flag + * to also walk through partially filled sample-data-blocks. 
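The allocate_buffers() change above replaces the old factor-based sizing with a simple linear rule: n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000), i.e. one extra SDB (roughly 100 basic samples) per 10000 Hz of sampling frequency. A stand-alone sketch of the arithmetic (the value of CPUM_SF_MIN_SDB is an assumption for illustration):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define CPUM_SF_MIN_SDB		15	/* assumed minimum, see perf_cpum_sf.c */

int main(void)
{
	/* One extra SDB (about 100 basic samples) per 10000 Hz */
	for (unsigned long freq = 10000; freq <= 80000; freq *= 2)
		printf("freq %6lu Hz -> n_sdb %lu\n", freq,
		       CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000));
	return 0;
}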
*/ static void hw_perf_event_update(struct perf_event *event, int flush_all) { + unsigned long long event_overflow, sampl_overflow, num_sdb; + union hws_trailer_header old, prev, new; struct hw_perf_event *hwc = &event->hw; struct hws_trailer_entry *te; - unsigned long *sdbt; - unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags; + unsigned long *sdbt, sdb; int done; /* @@ -1245,50 +1287,52 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) if (SAMPL_DIAG_MODE(&event->hw)) return; - if (flush_all && SDB_FULL_BLOCKS(hwc)) - flush_all = 0; - - sdbt = (unsigned long *) TEAR_REG(hwc); + sdbt = (unsigned long *)TEAR_REG(hwc); done = event_overflow = sampl_overflow = num_sdb = 0; while (!done) { /* Get the trailer entry of the sample-data-block */ - te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt); + sdb = (unsigned long)phys_to_virt(*sdbt); + te = trailer_entry_ptr(sdb); /* Leave loop if no more work to do (block full indicator) */ - if (!te->f) { + if (!te->header.f) { done = 1; if (!flush_all) break; } /* Check the sample overflow count */ - if (te->overflow) + if (te->header.overflow) /* Account sample overflows and, if a particular limit * is reached, extend the sampling buffer. * For details, see sfb_account_overflows(). */ - sampl_overflow += te->overflow; + sampl_overflow += te->header.overflow; /* Timestamps are valid for full sample-data-blocks only */ - debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx " + debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx/%#lx " "overflow %llu timestamp %#llx\n", - __func__, (unsigned long)sdbt, te->overflow, - (te->f) ? trailer_timestamp(te) : 0ULL); + __func__, sdb, (unsigned long)sdbt, + te->header.overflow, + (te->header.f) ? trailer_timestamp(te) : 0ULL); /* Collect all samples from a single sample-data-block and * flag if an (perf) event overflow happened. If so, the PMU * is stopped and remaining samples will be discarded. */ - hw_collect_samples(event, sdbt, &event_overflow); + hw_collect_samples(event, (unsigned long *)sdb, &event_overflow); num_sdb++; /* Reset trailer (using compare-double-and-swap) */ + prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK; - te_flags |= SDB_TE_ALERT_REQ_MASK; - } while (!cmpxchg_double(&te->flags, &te->overflow, - te->flags, te->overflow, - te_flags, 0ULL)); + old.val = prev.val; + new.val = prev.val; + new.f = 0; + new.a = 1; + new.overflow = 0; + prev.val = cmpxchg128(&te->header.val, old.val, new.val); + } while (prev.val != old.val); /* Advance to next sample-data-block */ sdbt++; @@ -1303,18 +1347,28 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) */ if (flush_all && done) break; - - /* If an event overflow happened, discard samples by - * processing any remaining sample-data-blocks. - */ - if (event_overflow) - flush_all = 1; } /* Account sample overflows in the event hardware structure */ if (sampl_overflow) OVERFLOW_REG(hwc) = DIV_ROUND_UP(OVERFLOW_REG(hwc) + sampl_overflow, 1 + num_sdb); + + /* perf_event_overflow() and perf_event_account_interrupt() limit + * the interrupt rate to an upper limit. Roughly 1000 samples per + * task tick. + * Hitting this limit results in a large number + * of throttled PERF_RECORD_THROTTLE entries and the samples + * are dropped. + * Slightly increase the interval to avoid hitting this limit. 
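The trailer reset above must not lose an overflow count or a full indicator that the hardware sets concurrently, hence the 16-byte compare-and-swap retry loop around cmpxchg128(). A generic sketch of the same pattern using the GCC/Clang __atomic builtins on __int128 (field names are illustrative, not the real hws_trailer_header; building it may need libatomic or -mcx16 depending on the target):

#include <stdbool.h>
#include <stdint.h>

union hdr {
	unsigned __int128 val;		/* whole 16-byte header */
	struct {
		uint64_t flags;		/* full/alert bits live here */
		uint64_t overflow;	/* lost-sample count */
	};
} __attribute__((aligned(16)));		/* 128-bit CAS needs 16-byte alignment */

/* Clear the full bit, re-arm the alert bit and zero the overflow count,
 * retrying until no concurrent update (e.g. by hardware) intervened. */
static void reset_trailer(union hdr *te, uint64_t full_bit, uint64_t alert_bit)
{
	union hdr old, new;

	old.val = __atomic_load_n(&te->val, __ATOMIC_RELAXED);
	do {
		new = old;
		new.flags &= ~full_bit;
		new.flags |= alert_bit;
		new.overflow = 0;
	} while (!__atomic_compare_exchange_n(&te->val, &old.val, new.val,
					      false, __ATOMIC_SEQ_CST,
					      __ATOMIC_SEQ_CST));
}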
+ */ + if (event_overflow) { + SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10); + debug_sprintf_event(sfdbg, 1, "%s: rate adjustment %ld\n", + __func__, + DIV_ROUND_UP(SAMPL_RATE(hwc), 10)); + } + if (sampl_overflow || event_overflow) debug_sprintf_event(sfdbg, 4, "%s: " "overflows: sample %llu event %llu" @@ -1323,10 +1377,26 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) OVERFLOW_REG(hwc), num_sdb); } -#define AUX_SDB_INDEX(aux, i) ((i) % aux->sfb.num_sdb) -#define AUX_SDB_NUM(aux, start, end) (end >= start ? end - start + 1 : 0) -#define AUX_SDB_NUM_ALERT(aux) AUX_SDB_NUM(aux, aux->head, aux->alert_mark) -#define AUX_SDB_NUM_EMPTY(aux) AUX_SDB_NUM(aux, aux->head, aux->empty_mark) +static inline unsigned long aux_sdb_index(struct aux_buffer *aux, + unsigned long i) +{ + return i % aux->sfb.num_sdb; +} + +static inline unsigned long aux_sdb_num(unsigned long start, unsigned long end) +{ + return end >= start ? end - start + 1 : 0; +} + +static inline unsigned long aux_sdb_num_alert(struct aux_buffer *aux) +{ + return aux_sdb_num(aux->head, aux->alert_mark); +} + +static inline unsigned long aux_sdb_num_empty(struct aux_buffer *aux) +{ + return aux_sdb_num(aux->head, aux->empty_mark); +} /* * Get trailer entry by index of SDB. @@ -1336,9 +1406,9 @@ static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux, { unsigned long sdb; - index = AUX_SDB_INDEX(aux, index); + index = aux_sdb_index(aux, index); sdb = aux->sdb_index[index]; - return (struct hws_trailer_entry *)trailer_entry_ptr(sdb); + return trailer_entry_ptr(sdb); } /* @@ -1360,10 +1430,10 @@ static void aux_output_end(struct perf_output_handle *handle) if (!aux) return; - range_scan = AUX_SDB_NUM_ALERT(aux); + range_scan = aux_sdb_num_alert(aux); for (i = 0, idx = aux->head; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); - if (!(te->flags & SDB_TE_BUFFER_FULL_MASK)) + if (!te->header.f) break; } /* i is num of SDBs which are full */ @@ -1371,9 +1441,10 @@ static void aux_output_end(struct perf_output_handle *handle) /* Remove alert indicators in the buffer */ te = aux_sdb_trailer(aux, aux->alert_mark); - te->flags &= ~SDB_TE_ALERT_REQ_MASK; + te->header.a = 0; - debug_sprintf_event(sfdbg, 6, "%s: collect %#lx SDBs\n", __func__, i); + debug_sprintf_event(sfdbg, 6, "%s: SDBs %ld range %ld head %ld\n", + __func__, i, range_scan, aux->head); } /* @@ -1389,9 +1460,7 @@ static int aux_output_begin(struct perf_output_handle *handle, struct aux_buffer *aux, struct cpu_hw_sf *cpuhw) { - unsigned long range; - unsigned long i, range_scan, idx; - unsigned long head, base, offset; + unsigned long range, i, range_scan, idx, head, base, offset; struct hws_trailer_entry *te; if (WARN_ON_ONCE(handle->head & ~PAGE_MASK)) @@ -1406,14 +1475,18 @@ static int aux_output_begin(struct perf_output_handle *handle, * SDBs between aux->head and aux->empty_mark are already ready * for new data. range_scan is num of SDBs not within them. 
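The aux_sdb_* helpers above describe the AUX buffer as a ring of SDBs tracked by three monotonically increasing marks; only the index into the ring wraps, the marks themselves never do. A small sketch of this bookkeeping, with the mark placement used by aux_output_begin():

#include <stdio.h>

/* Monotonic marks as in struct aux_buffer; indices wrap, marks do not. */
struct marks {
	unsigned long head;	   /* first SDB perf may still read */
	unsigned long alert_mark;  /* SDB whose trailer raises the IRQ */
	unsigned long empty_mark;  /* last SDB known to be empty */
	unsigned long num_sdb;	   /* ring size */
};

static unsigned long sdb_index(struct marks *m, unsigned long i)
{
	return i % m->num_sdb;				/* aux_sdb_index() */
}

static unsigned long sdb_num(unsigned long start, unsigned long end)
{
	return end >= start ? end - start + 1 : 0;	/* aux_sdb_num() */
}

int main(void)
{
	struct marks m = { .head = 250, .num_sdb = 64 };
	unsigned long range = 32;

	/* Mirrors aux_output_begin(): alert in the middle of the range */
	m.alert_mark = m.head + range / 2 - 1;
	m.empty_mark = m.head + range - 1;
	printf("head %lu -> ring slot %lu, alert %lu, empty %lu\n",
	       m.head, sdb_index(&m, m.head), m.alert_mark, m.empty_mark);
	printf("SDBs up to alert: %lu\n", sdb_num(m.head, m.alert_mark));
	return 0;
}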
*/ - if (range > AUX_SDB_NUM_EMPTY(aux)) { - range_scan = range - AUX_SDB_NUM_EMPTY(aux); + debug_sprintf_event(sfdbg, 6, + "%s: range %ld head %ld alert %ld empty %ld\n", + __func__, range, aux->head, aux->alert_mark, + aux->empty_mark); + if (range > aux_sdb_num_empty(aux)) { + range_scan = range - aux_sdb_num_empty(aux); idx = aux->empty_mark + 1; for (i = 0; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); - te->flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK; - te->flags = te->flags & ~SDB_TE_ALERT_REQ_MASK; - te->overflow = 0; + te->header.f = 0; + te->header.a = 0; + te->header.overflow = 0; } /* Save the position of empty SDBs */ aux->empty_mark = aux->head + range - 1; @@ -1422,24 +1495,20 @@ static int aux_output_begin(struct perf_output_handle *handle, /* Set alert indicator */ aux->alert_mark = aux->head + range/2 - 1; te = aux_sdb_trailer(aux, aux->alert_mark); - te->flags = te->flags | SDB_TE_ALERT_REQ_MASK; + te->header.a = 1; /* Reset hardware buffer head */ - head = AUX_SDB_INDEX(aux, aux->head); + head = aux_sdb_index(aux, aux->head); base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE]; offset = head % CPUM_SF_SDB_PER_TABLE; - cpuhw->lsctl.tear = base + offset * sizeof(unsigned long); - cpuhw->lsctl.dear = aux->sdb_index[head]; + cpuhw->lsctl.tear = virt_to_phys((void *)base) + offset * sizeof(unsigned long); + cpuhw->lsctl.dear = virt_to_phys((void *)aux->sdb_index[head]); - debug_sprintf_event(sfdbg, 6, "%s: " - "head->alert_mark->empty_mark (num_alert, range)" - "[%#lx -> %#lx -> %#lx] (%#lx, %#lx) " - "tear index %#lx, tear %#lx dear %#lx\n", __func__, + debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld empty %ld " + "index %ld tear %#lx dear %#lx\n", __func__, aux->head, aux->alert_mark, aux->empty_mark, - AUX_SDB_NUM_ALERT(aux), range, head / CPUM_SF_SDB_PER_TABLE, - cpuhw->lsctl.tear, - cpuhw->lsctl.dear); + cpuhw->lsctl.tear, cpuhw->lsctl.dear); return 0; } @@ -1453,15 +1522,16 @@ static int aux_output_begin(struct perf_output_handle *handle, static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, unsigned long long *overflow) { - unsigned long long orig_overflow, orig_flags, new_flags; + union hws_trailer_header old, prev, new; struct hws_trailer_entry *te; te = aux_sdb_trailer(aux, alert_index); + prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - orig_flags = te->flags; - orig_overflow = te->overflow; - *overflow = orig_overflow; - if (orig_flags & SDB_TE_BUFFER_FULL_MASK) { + old.val = prev.val; + new.val = prev.val; + *overflow = old.overflow; + if (old.f) { /* * SDB is already set by hardware. 
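To program the hardware, aux_output_begin() has to translate the flat SDB index at aux->head into an SDBT page plus a slot within that page, because the TEAR register points into the table while DEAR points at the SDB itself. The split is plain div/mod arithmetic; a sketch (the 511 slots-per-table figure assumes 4 KB tables whose last slot is the link entry):

#define SDB_PER_TABLE 511UL	/* 4 KB table: 511 SDB slots + 1 link slot */

/* Split a flat SDB index into (table, slot) the way aux_output_begin()
 * does before programming the TEAR/DEAR hardware registers. */
static void tear_position(unsigned long head,
			  unsigned long *table, unsigned long *slot)
{
	*table = head / SDB_PER_TABLE;	/* which SDBT page */
	*slot = head % SDB_PER_TABLE;	/* 8-byte entry inside it */
}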
* Abort and try to set somewhere @@ -1469,10 +1539,10 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, */ return false; } - new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK; - } while (!cmpxchg_double(&te->flags, &te->overflow, - orig_flags, orig_overflow, - new_flags, 0ULL)); + new.a = 1; + new.overflow = 0; + prev.val = cmpxchg128(&te->header.val, old.val, new.val); + } while (prev.val != old.val); return true; } @@ -1501,11 +1571,15 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, unsigned long long *overflow) { - unsigned long long orig_overflow, orig_flags, new_flags; - unsigned long i, range_scan, idx; + unsigned long i, range_scan, idx, idx_old; + union hws_trailer_header old, prev, new; + unsigned long long orig_overflow; struct hws_trailer_entry *te; - if (range <= AUX_SDB_NUM_EMPTY(aux)) + debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld " + "empty %ld\n", __func__, range, aux->head, + aux->alert_mark, aux->empty_mark); + if (range <= aux_sdb_num_empty(aux)) /* * No need to scan. All SDBs in range are marked as empty. * Just set alert indicator. Should check race with hardware @@ -1526,27 +1600,32 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, * Start scanning from one SDB behind empty_mark. If the new alert * indicator fall into this range, set it. */ - range_scan = range - AUX_SDB_NUM_EMPTY(aux); - idx = aux->empty_mark + 1; + range_scan = range - aux_sdb_num_empty(aux); + idx_old = idx = aux->empty_mark + 1; for (i = 0; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); + prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - orig_flags = te->flags; - orig_overflow = te->overflow; - new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK; + old.val = prev.val; + new.val = prev.val; + orig_overflow = old.overflow; + new.f = 0; + new.overflow = 0; if (idx == aux->alert_mark) - new_flags |= SDB_TE_ALERT_REQ_MASK; + new.a = 1; else - new_flags &= ~SDB_TE_ALERT_REQ_MASK; - } while (!cmpxchg_double(&te->flags, &te->overflow, - orig_flags, orig_overflow, - new_flags, 0ULL)); + new.a = 0; + prev.val = cmpxchg128(&te->header.val, old.val, new.val); + } while (prev.val != old.val); *overflow += orig_overflow; } /* Update empty_mark to new position */ aux->empty_mark = aux->head + range - 1; + debug_sprintf_event(sfdbg, 6, "%s: range_scan %ld idx %ld..%ld " + "empty %ld\n", __func__, range_scan, idx_old, + idx - 1, aux->empty_mark); return true; } @@ -1567,10 +1646,12 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) return; /* Inform user space new data arrived */ - size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; + size = aux_sdb_num_alert(aux) << PAGE_SHIFT; + debug_sprintf_event(sfdbg, 6, "%s: #alert %ld\n", __func__, + size >> PAGE_SHIFT); perf_aux_output_end(handle, size); - num_sdb = aux->sfb.num_sdb; + num_sdb = aux->sfb.num_sdb; while (!done) { /* Get an output handle */ aux = perf_aux_output_begin(handle, cpuhw->event); @@ -1578,9 +1659,6 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) pr_err("The AUX buffer with %lu pages for the " "diagnostic-sampling mode is full\n", num_sdb); - debug_sprintf_event(sfdbg, 1, - "%s: AUX buffer used up\n", - __func__); break; } if (WARN_ON_ONCE(!aux)) @@ -1602,14 +1680,14 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) size = range << PAGE_SHIFT; perf_aux_output_end(&cpuhw->handle, size); pr_err("Sample data caused the AUX buffer with %lu " - 
"pages to overflow\n", num_sdb); - debug_sprintf_event(sfdbg, 1, "%s: head %#lx range %#lx " - "overflow %#llx\n", __func__, + "pages to overflow\n", aux->sfb.num_sdb); + debug_sprintf_event(sfdbg, 1, "%s: head %ld range %ld " + "overflow %lld\n", __func__, aux->head, range, overflow); } else { - size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; + size = aux_sdb_num_alert(aux) << PAGE_SHIFT; perf_aux_output_end(&cpuhw->handle, size); - debug_sprintf_event(sfdbg, 6, "%s: head %#lx alert %#lx " + debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " "already full, try another\n", __func__, aux->head, aux->alert_mark); @@ -1617,11 +1695,9 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) } if (done) - debug_sprintf_event(sfdbg, 6, "%s: aux_reset_buffer " - "[%#lx -> %#lx -> %#lx] (%#lx, %#lx)\n", - __func__, aux->head, aux->alert_mark, - aux->empty_mark, AUX_SDB_NUM_ALERT(aux), - range); + debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " + "empty %ld\n", __func__, aux->head, + aux->alert_mark, aux->empty_mark); } /* @@ -1644,19 +1720,18 @@ static void aux_buffer_free(void *data) kfree(aux->sdb_index); kfree(aux); - debug_sprintf_event(sfdbg, 4, "%s: free " - "%lu SDBTs\n", __func__, num_sdbt); + debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu\n", __func__, num_sdbt); } static void aux_sdb_init(unsigned long sdb) { struct hws_trailer_entry *te; - te = (struct hws_trailer_entry *)trailer_entry_ptr(sdb); + te = trailer_entry_ptr(sdb); /* Save clock base */ te->clock_base = 1; - memcpy(&te->progusage2, &tod_clock_base[1], 8); + te->progusage2 = tod_clock_base.tod; } /* @@ -1697,13 +1772,13 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, } /* Allocate aux_buffer struct for the event */ - aux = kmalloc(sizeof(struct aux_buffer), GFP_KERNEL); + aux = kzalloc(sizeof(struct aux_buffer), GFP_KERNEL); if (!aux) goto no_aux; sfb = &aux->sfb; /* Allocate sdbt_index for fast reference */ - n_sdbt = (nr_pages + CPUM_SF_SDB_PER_TABLE - 1) / CPUM_SF_SDB_PER_TABLE; + n_sdbt = DIV_ROUND_UP(nr_pages, CPUM_SF_SDB_PER_TABLE); aux->sdbt_index = kmalloc_array(n_sdbt, sizeof(void *), GFP_KERNEL); if (!aux->sdbt_index) goto no_sdbt_index; @@ -1715,7 +1790,7 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, /* Allocate the first SDBT */ sfb->num_sdbt = 0; - sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL); + sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (!sfb->sdbt) goto no_sdbt; aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)sfb->sdbt; @@ -1727,23 +1802,23 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, */ for (i = 0; i < nr_pages; i++, tail++) { if (require_table_link(tail)) { - new = (unsigned long *) get_zeroed_page(GFP_KERNEL); + new = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (!new) goto no_sdbt; aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new; /* Link current page to tail of chain */ - *tail = (unsigned long)(void *) new + 1; + *tail = virt_to_phys(new) + 1; tail = new; } /* Tail is the entry in a SDBT */ - *tail = (unsigned long)pages[i]; + *tail = virt_to_phys(pages[i]); aux->sdb_index[i] = (unsigned long)pages[i]; aux_sdb_init((unsigned long)pages[i]); } sfb->num_sdb = nr_pages; /* Link the last entry in the SDBT to the first SDBT */ - *tail = (unsigned long) sfb->sdbt + 1; + *tail = virt_to_phys(sfb->sdbt) + 1; sfb->tail = tail; /* @@ -1753,8 +1828,8 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, */ aux->empty_mark = sfb->num_sdb - 1; - debug_sprintf_event(sfdbg, 4, 
"%s: setup %lu SDBTs and %lu SDBs\n", - __func__, sfb->num_sdbt, sfb->num_sdb); + debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu SDBs %lu\n", __func__, + sfb->num_sdbt, sfb->num_sdb); return aux; @@ -1776,7 +1851,7 @@ static void cpumsf_pmu_read(struct perf_event *event) /* Nothing to do ... updates are interrupt-driven */ } -/* Check if the new sampling period/freqeuncy is appropriate. +/* Check if the new sampling period/frequency is appropriate. * * Return non-zero on error and zero on passed checks. */ @@ -1883,9 +1958,9 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags) cpuhw->lsctl.h = 1; cpuhw->lsctl.interval = SAMPL_RATE(&event->hw); if (!SAMPL_DIAG_MODE(&event->hw)) { - cpuhw->lsctl.tear = (unsigned long) cpuhw->sfb.sdbt; - cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt; - TEAR_REG(&event->hw) = (unsigned long) cpuhw->sfb.sdbt; + cpuhw->lsctl.tear = virt_to_phys(cpuhw->sfb.sdbt); + cpuhw->lsctl.dear = *(unsigned long *)cpuhw->sfb.sdbt; + TEAR_REG(&event->hw) = (unsigned long)cpuhw->sfb.sdbt; } /* Ensure sampling functions are in the disabled state. If disabled, @@ -2202,4 +2277,4 @@ out: } arch_initcall(init_cpum_sampling_pmu); -core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0640); +core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0644); diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index 1e75cc983546..dfa77da2fd2e 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -15,7 +15,10 @@ #include <linux/export.h> #include <linux/seq_file.h> #include <linux/spinlock.h> +#include <linux/uaccess.h> +#include <linux/compat.h> #include <linux/sysfs.h> +#include <asm/stacktrace.h> #include <asm/irq.h> #include <asm/cpu_mf.h> #include <asm/lowcore.h> @@ -23,27 +26,6 @@ #include <asm/sysinfo.h> #include <asm/unwind.h> -const char *perf_pmu_name(void) -{ - if (cpum_cf_avail() || cpum_sf_avail()) - return "CPU-Measurement Facilities (CPU-MF)"; - return "pmu"; -} -EXPORT_SYMBOL(perf_pmu_name); - -int perf_num_counters(void) -{ - int num = 0; - - if (cpum_cf_avail()) - num += PERF_CPUM_CF_MAX_CTR; - if (cpum_sf_avail()) - num += PERF_CPUM_SF_MAX_CTR; - - return num; -} -EXPORT_SYMBOL(perf_num_counters); - static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs) { struct stack_frame *stack = (struct stack_frame *) regs->gprs[15]; @@ -51,7 +33,7 @@ static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs) if (!stack) return NULL; - return (struct kvm_s390_sie_block *) stack->empty1[0]; + return (struct kvm_s390_sie_block *)stack->sie_control_block; } static bool is_in_guest(struct pt_regs *regs) @@ -233,6 +215,44 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, } } +void perf_callchain_user(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + struct stack_frame_user __user *sf; + unsigned long ip, sp; + bool first = true; + + if (is_compat_task()) + return; + perf_callchain_store(entry, instruction_pointer(regs)); + sf = (void __user *)user_stack_pointer(regs); + pagefault_disable(); + while (entry->nr < entry->max_stack) { + if (__get_user(sp, &sf->back_chain)) + break; + if (__get_user(ip, &sf->gprs[8])) + break; + if (ip & 0x1) { + /* + * If the instruction address is invalid, and this + * is the first stack frame, assume r14 has not + * been written to the stack yet. Otherwise exit. 
+ */ + if (first && !(regs->gprs[14] & 0x1)) + ip = regs->gprs[14]; + else + break; + } + perf_callchain_store(entry, ip); + /* Sanity check: ABI requires SP to be 8-byte aligned. */ + if (!sp || sp & 0x7) + break; + sf = (void __user *)sp; + first = false; + } + pagefault_enable(); +} + /* Perf definitions for PMU event attributes in sysfs */ ssize_t cpumf_events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c new file mode 100644 index 000000000000..bf8a672b15a4 --- /dev/null +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -0,0 +1,773 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support - Processor Activity Instrumentation Facility + * + * Copyright IBM Corp. 2022 + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + */ +#define KMSG_COMPONENT "pai_crypto" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/io.h> +#include <linux/perf_event.h> +#include <asm/ctlreg.h> +#include <asm/pai.h> +#include <asm/debug.h> + +static debug_info_t *cfm_dbg; +static unsigned int paicrypt_cnt; /* Size of the mapped counter sets */ + /* extracted with QPACI instruction */ + +DEFINE_STATIC_KEY_FALSE(pai_key); + +struct pai_userdata { + u16 num; + u64 value; +} __packed; + +struct paicrypt_map { + unsigned long *page; /* Page for CPU to store counters */ + struct pai_userdata *save; /* Page to store non-zero counters */ + unsigned int active_events; /* # of PAI crypto users */ + refcount_t refcnt; /* Reference count mapped buffers */ + enum paievt_mode mode; /* Type of event */ + struct perf_event *event; /* Perf event for sampling */ +}; + +struct paicrypt_mapptr { + struct paicrypt_map *mapptr; +}; + +static struct paicrypt_root { /* Anchor to per CPU data */ + refcount_t refcnt; /* Overall active events */ + struct paicrypt_mapptr __percpu *mapptr; +} paicrypt_root; + +/* Free per CPU data when the last event is removed. */ +static void paicrypt_root_free(void) +{ + if (refcount_dec_and_test(&paicrypt_root.refcnt)) { + free_percpu(paicrypt_root.mapptr); + paicrypt_root.mapptr = NULL; + } + debug_sprintf_event(cfm_dbg, 5, "%s root.refcount %d\n", __func__, + refcount_read(&paicrypt_root.refcnt)); +} + +/* + * On initialization of first event also allocate per CPU data dynamically. + * Start with an array of pointers, the array size is the maximum number of + * CPUs possible, which might be larger than the number of CPUs currently + * online. + */ +static int paicrypt_root_alloc(void) +{ + if (!refcount_inc_not_zero(&paicrypt_root.refcnt)) { + /* The memory is already zeroed. */ + paicrypt_root.mapptr = alloc_percpu(struct paicrypt_mapptr); + if (!paicrypt_root.mapptr) + return -ENOMEM; + refcount_set(&paicrypt_root.refcnt, 1); + } + return 0; +} + +/* Release the PMU if event is the last perf event */ +static DEFINE_MUTEX(pai_reserve_mutex); + +/* Adjust usage counters and remove allocated memory when all users are + * gone. 
+ */ +static void paicrypt_event_destroy(struct perf_event *event) +{ + struct paicrypt_mapptr *mp = per_cpu_ptr(paicrypt_root.mapptr, + event->cpu); + struct paicrypt_map *cpump = mp->mapptr; + + cpump->event = NULL; + static_branch_dec(&pai_key); + mutex_lock(&pai_reserve_mutex); + debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d" + " mode %d refcnt %u\n", __func__, + event->attr.config, event->cpu, + cpump->active_events, cpump->mode, + refcount_read(&cpump->refcnt)); + if (refcount_dec_and_test(&cpump->refcnt)) { + debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n", + __func__, (unsigned long)cpump->page, + cpump->save); + free_page((unsigned long)cpump->page); + kvfree(cpump->save); + kfree(cpump); + mp->mapptr = NULL; + } + paicrypt_root_free(); + mutex_unlock(&pai_reserve_mutex); +} + +static u64 paicrypt_getctr(unsigned long *page, int nr, bool kernel) +{ + if (kernel) + nr += PAI_CRYPTO_MAXCTR; + return page[nr]; +} + +/* Read the counter values. Return value from location in CMP. For event + * CRYPTO_ALL sum up all events. + */ +static u64 paicrypt_getdata(struct perf_event *event, bool kernel) +{ + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + u64 sum = 0; + int i; + + if (event->attr.config != PAI_CRYPTO_BASE) { + return paicrypt_getctr(cpump->page, + event->attr.config - PAI_CRYPTO_BASE, + kernel); + } + + for (i = 1; i <= paicrypt_cnt; i++) { + u64 val = paicrypt_getctr(cpump->page, i, kernel); + + if (!val) + continue; + sum += val; + } + return sum; +} + +static u64 paicrypt_getall(struct perf_event *event) +{ + u64 sum = 0; + + if (!event->attr.exclude_kernel) + sum += paicrypt_getdata(event, true); + if (!event->attr.exclude_user) + sum += paicrypt_getdata(event, false); + + return sum; +} + +/* Used to avoid races in checking concurrent access of counting and + * sampling for crypto events. + * + * Only one instance of event pai_crypto/CRYPTO_ALL/ for sampling is + * allowed and when this event is running, no counting event is allowed. + * Several counting events are allowed in parallel, but no sampling event + * is allowed while one (or more) counting events are running. + * + * This function is called in process context and it is safe to block. + * When the event initialization function fails, no other callback will + * be invoked. + * + * Allocate the memory for the event. + */ +static struct paicrypt_map *paicrypt_busy(struct perf_event *event) +{ + struct perf_event_attr *a = &event->attr; + struct paicrypt_map *cpump = NULL; + struct paicrypt_mapptr *mp; + int rc; + + mutex_lock(&pai_reserve_mutex); + + /* Allocate root node */ + rc = paicrypt_root_alloc(); + if (rc) + goto unlock; + + /* Allocate node for this event */ + mp = per_cpu_ptr(paicrypt_root.mapptr, event->cpu); + cpump = mp->mapptr; + if (!cpump) { /* Paicrypt_map allocated? */ + cpump = kzalloc(sizeof(*cpump), GFP_KERNEL); + if (!cpump) { + rc = -ENOMEM; + goto free_root; + } + } + + if (a->sample_period) { /* Sampling requested */ + if (cpump->mode != PAI_MODE_NONE) + rc = -EBUSY; /* ... sampling/counting active */ + } else { /* Counting requested */ + if (cpump->mode == PAI_MODE_SAMPLING) + rc = -EBUSY; /* ... and sampling active */ + } + /* + * This error case triggers when there is a conflict: + * Either sampling requested and counting already active, or vice + * versa. Therefore the struct paicrypt_map for this CPU is + * needed or the error could not have occurred. Only adjust root + * node refcount. 
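paicrypt_getctr() above relies on the counter page being split into two halves of PAI_CRYPTO_MAXCTR 8-byte slots, user counters first and kernel counters after them, so the kernel instance of counter nr lives at index nr + PAI_CRYPTO_MAXCTR. A sketch of that addressing (the constant's value is an assumption for illustration):

#include <stdint.h>

#define PAI_CRYPTO_MAXCTR 256	/* assumed slots per half, see asm/pai.h */

/* The counter page: page[0..MAXCTR-1] user counters,
 * page[MAXCTR..2*MAXCTR-1] kernel counters. */
static uint64_t read_ctr(const uint64_t *page, unsigned int nr, int kernel)
{
	if (kernel)
		nr += PAI_CRYPTO_MAXCTR;	/* jump to kernel half */
	return page[nr];
}

/* Counting both address spaces means summing the two halves, as
 * paicrypt_getall() does for events without exclude_* bits set. */
static uint64_t read_ctr_all(const uint64_t *page, unsigned int nr)
{
	return read_ctr(page, nr, 0) + read_ctr(page, nr, 1);
}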
+ */ + if (rc) + goto free_root; + + /* Allocate memory for counter page and counter extraction. + * Only the first counting event has to allocate a page. + */ + if (cpump->page) { + refcount_inc(&cpump->refcnt); + goto unlock; + } + + rc = -ENOMEM; + cpump->page = (unsigned long *)get_zeroed_page(GFP_KERNEL); + if (!cpump->page) + goto free_paicrypt_map; + cpump->save = kvmalloc_array(paicrypt_cnt + 1, + sizeof(struct pai_userdata), GFP_KERNEL); + if (!cpump->save) { + free_page((unsigned long)cpump->page); + cpump->page = NULL; + goto free_paicrypt_map; + } + + /* Set mode and reference count */ + rc = 0; + refcount_set(&cpump->refcnt, 1); + cpump->mode = a->sample_period ? PAI_MODE_SAMPLING : PAI_MODE_COUNTING; + mp->mapptr = cpump; + debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx users %d" + " mode %d refcnt %u page %#lx save %p rc %d\n", + __func__, a->sample_period, cpump->active_events, + cpump->mode, refcount_read(&cpump->refcnt), + (unsigned long)cpump->page, cpump->save, rc); + goto unlock; + +free_paicrypt_map: + kfree(cpump); + mp->mapptr = NULL; +free_root: + paicrypt_root_free(); + +unlock: + mutex_unlock(&pai_reserve_mutex); + return rc ? ERR_PTR(rc) : cpump; +} + +/* Might be called on different CPU than the one the event is intended for. */ +static int paicrypt_event_init(struct perf_event *event) +{ + struct perf_event_attr *a = &event->attr; + struct paicrypt_map *cpump; + + /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */ + if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) + return -ENOENT; + /* PAI crypto event must be in valid range */ + if (a->config < PAI_CRYPTO_BASE || + a->config > PAI_CRYPTO_BASE + paicrypt_cnt) + return -EINVAL; + /* Allow only CPU wide operation, no process context for now. */ + if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1) + return -ENOENT; + /* Allow only CRYPTO_ALL for sampling. */ + if (a->sample_period && a->config != PAI_CRYPTO_BASE) + return -EINVAL; + + cpump = paicrypt_busy(event); + if (IS_ERR(cpump)) + return PTR_ERR(cpump); + + event->destroy = paicrypt_event_destroy; + + if (a->sample_period) { + a->sample_period = 1; + a->freq = 0; + /* Register for paicrypt_sched_task() to be called */ + event->attach_state |= PERF_ATTACH_SCHED_CB; + /* Add raw data which contain the memory mapped counters */ + a->sample_type |= PERF_SAMPLE_RAW; + /* Turn off inheritance */ + a->inherit = 0; + } + + static_branch_inc(&pai_key); + return 0; +} + +static void paicrypt_read(struct perf_event *event) +{ + u64 prev, new, delta; + + prev = local64_read(&event->hw.prev_count); + new = paicrypt_getall(event); + local64_set(&event->hw.prev_count, new); + delta = (prev <= new) ? new - prev + : (-1ULL - prev) + new + 1; /* overflow */ + local64_add(delta, &event->count); +} + +static void paicrypt_start(struct perf_event *event, int flags) +{ + u64 sum; + + /* Event initialization sets last_tag to 0. When later on the events + * are deleted and re-added, do not reset the event count value to zero. + * Events are added, deleted and re-added when 2 or more events + * are active at the same time. 
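The delta computation in paicrypt_read() above handles a counter that wrapped past 2^64 - 1: when new < prev, the distance is the remainder up to -1ULL plus the wrapped part plus one. In plain unsigned 64-bit arithmetic both branches collapse into a single subtraction, as this small check demonstrates:

#include <assert.h>
#include <stdint.h>

static uint64_t delta(uint64_t prev, uint64_t new)
{
	/* Same result as the two-branch form in paicrypt_read():
	 * (prev <= new) ? new - prev : (-1ULL - prev) + new + 1 */
	return new - prev;	/* unsigned arithmetic wraps mod 2^64 */
}

int main(void)
{
	assert(delta(100, 250) == 150);		/* no wrap */
	assert(delta(UINT64_MAX - 1, 3) == 5);	/* wrapped counter */
	return 0;
}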
+ */ + if (!event->attr.sample_period) { /* Counting */ + if (!event->hw.last_tag) { + event->hw.last_tag = 1; + sum = paicrypt_getall(event); /* Get current value */ + local64_set(&event->hw.prev_count, sum); + } + } else { /* Sampling */ + perf_sched_cb_inc(event->pmu); + } +} + +static int paicrypt_add(struct perf_event *event, int flags) +{ + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + unsigned long ccd; + + if (++cpump->active_events == 1) { + ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET; + WRITE_ONCE(S390_lowcore.ccd, ccd); + local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); + } + cpump->event = event; + if (flags & PERF_EF_START) + paicrypt_start(event, PERF_EF_RELOAD); + event->hw.state = 0; + return 0; +} + +static void paicrypt_stop(struct perf_event *event, int flags) +{ + if (!event->attr.sample_period) /* Counting */ + paicrypt_read(event); + else /* Sampling */ + perf_sched_cb_dec(event->pmu); + event->hw.state = PERF_HES_STOPPED; +} + +static void paicrypt_del(struct perf_event *event, int flags) +{ + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + + paicrypt_stop(event, PERF_EF_UPDATE); + if (--cpump->active_events == 0) { + local_ctl_clear_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); + WRITE_ONCE(S390_lowcore.ccd, 0); + } +} + +/* Create raw data and save it in buffer. Returns number of bytes copied. + * Saves only positive counter entries of the form + * 2 bytes: Number of counter + * 8 bytes: Value of counter + */ +static size_t paicrypt_copy(struct pai_userdata *userdata, unsigned long *page, + bool exclude_user, bool exclude_kernel) +{ + int i, outidx = 0; + + for (i = 1; i <= paicrypt_cnt; i++) { + u64 val = 0; + + if (!exclude_kernel) + val += paicrypt_getctr(page, i, true); + if (!exclude_user) + val += paicrypt_getctr(page, i, false); + if (val) { + userdata[outidx].num = i; + userdata[outidx].value = val; + outidx++; + } + } + return outidx * sizeof(struct pai_userdata); +} + +static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump, + struct perf_event *event) +{ + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + int overflow; + + /* Setup perf sample */ + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + memset(&data, 0, sizeof(data)); + perf_sample_data_init(&data, 0, event->hw.last_period); + if (event->attr.sample_type & PERF_SAMPLE_TID) { + data.tid_entry.pid = task_tgid_nr(current); + data.tid_entry.tid = task_pid_nr(current); + } + if (event->attr.sample_type & PERF_SAMPLE_TIME) + data.time = event->clock(); + if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) + data.id = event->id; + if (event->attr.sample_type & PERF_SAMPLE_CPU) { + data.cpu_entry.cpu = smp_processor_id(); + data.cpu_entry.reserved = 0; + } + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = rawsize; + raw.frag.data = cpump->save; + perf_sample_save_raw_data(&data, &raw); + } + + overflow = perf_event_overflow(event, &data, ®s); + perf_event_update_userpage(event); + /* Clear lowcore page after read */ + memset(cpump->page, 0, PAGE_SIZE); + return overflow; +} + +/* Check if there is data to be saved on schedule out of a task. 
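The raw payload built by paicrypt_copy() above is a packed array of (counter number, value) records covering only the counters that were non-zero. A consumer-side sketch that decodes such a PERF_SAMPLE_RAW payload (the struct mirrors pai_userdata; how the buffer is obtained is up to the reader tool):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct pai_userdata {
	uint16_t num;	/* counter number, 1-based */
	uint64_t value;	/* summed user+kernel count */
} __attribute__((packed));

/* Walk the raw sample area record by record. */
static void decode_raw(const void *buf, size_t rawsize)
{
	const struct pai_userdata *rec = buf;
	size_t n = rawsize / sizeof(*rec);

	for (size_t i = 0; i < n; i++)
		printf("counter %u = %llu\n", rec[i].num,
		       (unsigned long long)rec[i].value);
}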
*/ +static int paicrypt_have_sample(void) +{ + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + struct perf_event *event = cpump->event; + size_t rawsize; + int rc = 0; + + if (!event) /* No event active */ + return 0; + rawsize = paicrypt_copy(cpump->save, cpump->page, + cpump->event->attr.exclude_user, + cpump->event->attr.exclude_kernel); + if (rawsize) /* At least one counter incremented */ + rc = paicrypt_push_sample(rawsize, cpump, event); + return rc; +} + +/* Called on schedule-in and schedule-out. No access to event structure, + * but for sampling only event CRYPTO_ALL is allowed. + */ +static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) +{ + /* We started with a clean page on event installation. So read out + * results on schedule_out and if page was dirty, clear values. + */ + if (!sched_in) + paicrypt_have_sample(); +} + +/* Attribute definitions for paicrypt interface. As with other CPU + * Measurement Facilities, there is one attribute per mapped counter. + * The number of mapped counters may vary per machine generation. Use + * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction + * to determine the number of mapped counters. The instruction returns + * a positive number, which is the highest number of supported counters. + * All counters less than this number are also supported, there are no + * holes. A returned number of zero means no support for mapped counters. + * + * The identification of the counter is a unique number. The chosen range + * is 0x1000 + offset in mapped kernel page. + * All CPU Measurement Facility counter identifiers must be unique and + * the numbers from 0 to 496 are already used for the CPU Measurement + * Counter facility. Numbers 0xb0000, 0xbc000 and 0xbd000 are already + * used for the CPU Measurement Sampling facility. + */ +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *paicrypt_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paicrypt_events_group = { + .name = "events", + .attrs = NULL /* Filled in attr_event_init() */ +}; + +static struct attribute_group paicrypt_format_group = { + .name = "format", + .attrs = paicrypt_format_attr, +}; + +static const struct attribute_group *paicrypt_attr_groups[] = { + &paicrypt_events_group, + &paicrypt_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paicrypt = { + .task_ctx_nr = perf_invalid_context, + .event_init = paicrypt_event_init, + .add = paicrypt_add, + .del = paicrypt_del, + .start = paicrypt_start, + .stop = paicrypt_stop, + .read = paicrypt_read, + .sched_task = paicrypt_sched_task, + .attr_groups = paicrypt_attr_groups +}; + +/* List of symbolic PAI counter names. 
*/ +static const char * const paicrypt_ctrnames[] = { + [0] = "CRYPTO_ALL", + [1] = "KM_DEA", + [2] = "KM_TDEA_128", + [3] = "KM_TDEA_192", + [4] = "KM_ENCRYPTED_DEA", + [5] = "KM_ENCRYPTED_TDEA_128", + [6] = "KM_ENCRYPTED_TDEA_192", + [7] = "KM_AES_128", + [8] = "KM_AES_192", + [9] = "KM_AES_256", + [10] = "KM_ENCRYPTED_AES_128", + [11] = "KM_ENCRYPTED_AES_192", + [12] = "KM_ENCRYPTED_AES_256", + [13] = "KM_XTS_AES_128", + [14] = "KM_XTS_AES_256", + [15] = "KM_XTS_ENCRYPTED_AES_128", + [16] = "KM_XTS_ENCRYPTED_AES_256", + [17] = "KMC_DEA", + [18] = "KMC_TDEA_128", + [19] = "KMC_TDEA_192", + [20] = "KMC_ENCRYPTED_DEA", + [21] = "KMC_ENCRYPTED_TDEA_128", + [22] = "KMC_ENCRYPTED_TDEA_192", + [23] = "KMC_AES_128", + [24] = "KMC_AES_192", + [25] = "KMC_AES_256", + [26] = "KMC_ENCRYPTED_AES_128", + [27] = "KMC_ENCRYPTED_AES_192", + [28] = "KMC_ENCRYPTED_AES_256", + [29] = "KMC_PRNG", + [30] = "KMA_GCM_AES_128", + [31] = "KMA_GCM_AES_192", + [32] = "KMA_GCM_AES_256", + [33] = "KMA_GCM_ENCRYPTED_AES_128", + [34] = "KMA_GCM_ENCRYPTED_AES_192", + [35] = "KMA_GCM_ENCRYPTED_AES_256", + [36] = "KMF_DEA", + [37] = "KMF_TDEA_128", + [38] = "KMF_TDEA_192", + [39] = "KMF_ENCRYPTED_DEA", + [40] = "KMF_ENCRYPTED_TDEA_128", + [41] = "KMF_ENCRYPTED_TDEA_192", + [42] = "KMF_AES_128", + [43] = "KMF_AES_192", + [44] = "KMF_AES_256", + [45] = "KMF_ENCRYPTED_AES_128", + [46] = "KMF_ENCRYPTED_AES_192", + [47] = "KMF_ENCRYPTED_AES_256", + [48] = "KMCTR_DEA", + [49] = "KMCTR_TDEA_128", + [50] = "KMCTR_TDEA_192", + [51] = "KMCTR_ENCRYPTED_DEA", + [52] = "KMCTR_ENCRYPTED_TDEA_128", + [53] = "KMCTR_ENCRYPTED_TDEA_192", + [54] = "KMCTR_AES_128", + [55] = "KMCTR_AES_192", + [56] = "KMCTR_AES_256", + [57] = "KMCTR_ENCRYPTED_AES_128", + [58] = "KMCTR_ENCRYPTED_AES_192", + [59] = "KMCTR_ENCRYPTED_AES_256", + [60] = "KMO_DEA", + [61] = "KMO_TDEA_128", + [62] = "KMO_TDEA_192", + [63] = "KMO_ENCRYPTED_DEA", + [64] = "KMO_ENCRYPTED_TDEA_128", + [65] = "KMO_ENCRYPTED_TDEA_192", + [66] = "KMO_AES_128", + [67] = "KMO_AES_192", + [68] = "KMO_AES_256", + [69] = "KMO_ENCRYPTED_AES_128", + [70] = "KMO_ENCRYPTED_AES_192", + [71] = "KMO_ENCRYPTED_AES_256", + [72] = "KIMD_SHA_1", + [73] = "KIMD_SHA_256", + [74] = "KIMD_SHA_512", + [75] = "KIMD_SHA3_224", + [76] = "KIMD_SHA3_256", + [77] = "KIMD_SHA3_384", + [78] = "KIMD_SHA3_512", + [79] = "KIMD_SHAKE_128", + [80] = "KIMD_SHAKE_256", + [81] = "KIMD_GHASH", + [82] = "KLMD_SHA_1", + [83] = "KLMD_SHA_256", + [84] = "KLMD_SHA_512", + [85] = "KLMD_SHA3_224", + [86] = "KLMD_SHA3_256", + [87] = "KLMD_SHA3_384", + [88] = "KLMD_SHA3_512", + [89] = "KLMD_SHAKE_128", + [90] = "KLMD_SHAKE_256", + [91] = "KMAC_DEA", + [92] = "KMAC_TDEA_128", + [93] = "KMAC_TDEA_192", + [94] = "KMAC_ENCRYPTED_DEA", + [95] = "KMAC_ENCRYPTED_TDEA_128", + [96] = "KMAC_ENCRYPTED_TDEA_192", + [97] = "KMAC_AES_128", + [98] = "KMAC_AES_192", + [99] = "KMAC_AES_256", + [100] = "KMAC_ENCRYPTED_AES_128", + [101] = "KMAC_ENCRYPTED_AES_192", + [102] = "KMAC_ENCRYPTED_AES_256", + [103] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_DEA", + [104] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_128", + [105] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_192", + [106] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_DEA", + [107] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_128", + [108] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_192", + [109] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_128", + [110] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_192", + [111] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_256", + [112] = 
"PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_128", + [113] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_192", + [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256A", + [115] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_128", + [116] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_256", + [117] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_128", + [118] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_256", + [119] = "PCC_SCALAR_MULTIPLY_P256", + [120] = "PCC_SCALAR_MULTIPLY_P384", + [121] = "PCC_SCALAR_MULTIPLY_P521", + [122] = "PCC_SCALAR_MULTIPLY_ED25519", + [123] = "PCC_SCALAR_MULTIPLY_ED448", + [124] = "PCC_SCALAR_MULTIPLY_X25519", + [125] = "PCC_SCALAR_MULTIPLY_X448", + [126] = "PRNO_SHA_512_DRNG", + [127] = "PRNO_TRNG_QUERY_RAW_TO_CONDITIONED_RATIO", + [128] = "PRNO_TRNG", + [129] = "KDSA_ECDSA_VERIFY_P256", + [130] = "KDSA_ECDSA_VERIFY_P384", + [131] = "KDSA_ECDSA_VERIFY_P521", + [132] = "KDSA_ECDSA_SIGN_P256", + [133] = "KDSA_ECDSA_SIGN_P384", + [134] = "KDSA_ECDSA_SIGN_P521", + [135] = "KDSA_ENCRYPTED_ECDSA_SIGN_P256", + [136] = "KDSA_ENCRYPTED_ECDSA_SIGN_P384", + [137] = "KDSA_ENCRYPTED_ECDSA_SIGN_P521", + [138] = "KDSA_EDDSA_VERIFY_ED25519", + [139] = "KDSA_EDDSA_VERIFY_ED448", + [140] = "KDSA_EDDSA_SIGN_ED25519", + [141] = "KDSA_EDDSA_SIGN_ED448", + [142] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED25519", + [143] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED448", + [144] = "PCKMO_ENCRYPT_DEA_KEY", + [145] = "PCKMO_ENCRYPT_TDEA_128_KEY", + [146] = "PCKMO_ENCRYPT_TDEA_192_KEY", + [147] = "PCKMO_ENCRYPT_AES_128_KEY", + [148] = "PCKMO_ENCRYPT_AES_192_KEY", + [149] = "PCKMO_ENCRYPT_AES_256_KEY", + [150] = "PCKMO_ENCRYPT_ECC_P256_KEY", + [151] = "PCKMO_ENCRYPT_ECC_P384_KEY", + [152] = "PCKMO_ENCRYPT_ECC_P521_KEY", + [153] = "PCKMO_ENCRYPT_ECC_ED25519_KEY", + [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY", + [155] = "IBM_RESERVED_155", + [156] = "IBM_RESERVED_156", +}; + +static void __init attr_event_free(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + int i; + + for (i = 0; i < num; i++) { + struct device_attribute *dap; + + dap = container_of(attrs[i], struct device_attribute, attr); + pa = container_of(dap, struct perf_pmu_events_attr, attr); + kfree(pa); + } + kfree(attrs); +} + +static int __init attr_event_init_one(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + + pa = kzalloc(sizeof(*pa), GFP_KERNEL); + if (!pa) + return -ENOMEM; + + sysfs_attr_init(&pa->attr.attr); + pa->id = PAI_CRYPTO_BASE + num; + pa->attr.attr.name = paicrypt_ctrnames[num]; + pa->attr.attr.mode = 0444; + pa->attr.show = cpumf_events_sysfs_show; + pa->attr.store = NULL; + attrs[num] = &pa->attr.attr; + return 0; +} + +/* Create PMU sysfs event attributes on the fly. 
*/ +static int __init attr_event_init(void) +{ + struct attribute **attrs; + int ret, i; + + attrs = kmalloc_array(ARRAY_SIZE(paicrypt_ctrnames) + 1, sizeof(*attrs), + GFP_KERNEL); + if (!attrs) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(paicrypt_ctrnames); i++) { + ret = attr_event_init_one(attrs, i); + if (ret) { + attr_event_free(attrs, i); + return ret; + } + } + attrs[i] = NULL; + paicrypt_events_group.attrs = attrs; + return 0; +} + +static int __init paicrypt_init(void) +{ + struct qpaci_info_block ib; + int rc; + + if (!test_facility(196)) + return 0; + + qpaci(&ib); + paicrypt_cnt = ib.num_cc; + if (paicrypt_cnt == 0) + return 0; + if (paicrypt_cnt >= PAI_CRYPTO_MAXCTR) + paicrypt_cnt = PAI_CRYPTO_MAXCTR - 1; + + rc = attr_event_init(); /* Export known PAI crypto events */ + if (rc) { + pr_err("Creation of PMU pai_crypto /sysfs failed\n"); + return rc; + } + + /* Setup s390dbf facility */ + cfm_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); + if (!cfm_dbg) { + pr_err("Registration of s390dbf pai_crypto failed\n"); + return -ENOMEM; + } + debug_register_view(cfm_dbg, &debug_sprintf_view); + + rc = perf_pmu_register(&paicrypt, "pai_crypto", -1); + if (rc) { + pr_err("Registering the pai_crypto PMU failed with rc=%i\n", + rc); + debug_unregister_view(cfm_dbg, &debug_sprintf_view); + debug_unregister(cfm_dbg); + return rc; + } + return 0; +} + +device_initcall(paicrypt_init); diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c new file mode 100644 index 000000000000..af7f2b538c8f --- /dev/null +++ b/arch/s390/kernel/perf_pai_ext.c @@ -0,0 +1,671 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support - Processor Activity Instrumentation Extension + * Facility + * + * Copyright IBM Corp. 2022 + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + */ +#define KMSG_COMPONENT "pai_ext" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/io.h> +#include <linux/perf_event.h> +#include <asm/ctlreg.h> +#include <asm/pai.h> +#include <asm/debug.h> + +#define PAIE1_CB_SZ 0x200 /* Size of PAIE1 control block */ +#define PAIE1_CTRBLOCK_SZ 0x400 /* Size of PAIE1 counter blocks */ + +static debug_info_t *paiext_dbg; +static unsigned int paiext_cnt; /* Extracted with QPACI instruction */ + +struct pai_userdata { + u16 num; + u64 value; +} __packed; + +/* Create the PAI extension 1 control block area. + * The PAI extension control block 1 is pointed to by lowcore + * address 0x1508 for each CPU. This control block is 512 bytes in size + * and requires a 512 byte boundary alignment. 
+ */
+struct paiext_cb {		/* PAI extension 1 control block */
+	u64 header;		/* Not used */
+	u64 reserved1;
+	u64 acc;		/* Addr to analytics counter control block */
+	u8 reserved2[488];
+} __packed;
+
+struct paiext_map {
+	unsigned long *area;		/* Area for CPU to store counters */
+	struct pai_userdata *save;	/* Area to store non-zero counters */
+	enum paievt_mode mode;		/* Type of event */
+	unsigned int active_events;	/* # of PAI Extension users */
+	refcount_t refcnt;
+	struct perf_event *event;	/* Perf event for sampling */
+	struct paiext_cb *paiext_cb;	/* PAI extension control block area */
+};
+
+struct paiext_mapptr {
+	struct paiext_map *mapptr;
+};
+
+static struct paiext_root {		/* Anchor to per CPU data */
+	refcount_t refcnt;		/* Overall active events */
+	struct paiext_mapptr __percpu *mapptr;
+} paiext_root;
+
+/* Free per CPU data when the last event is removed. */
+static void paiext_root_free(void)
+{
+	if (refcount_dec_and_test(&paiext_root.refcnt)) {
+		free_percpu(paiext_root.mapptr);
+		paiext_root.mapptr = NULL;
+	}
+}
+
+/* On initialization of first event also allocate per CPU data dynamically.
+ * Start with an array of pointers, the array size is the maximum number of
+ * CPUs possible, which might be larger than the number of CPUs currently
+ * online.
+ */
+static int paiext_root_alloc(void)
+{
+	if (!refcount_inc_not_zero(&paiext_root.refcnt)) {
+		/* The memory is already zeroed. */
+		paiext_root.mapptr = alloc_percpu(struct paiext_mapptr);
+		if (!paiext_root.mapptr) {
+			/* Returning without refcnt adjustment is ok. The
+			 * error code is handled by paiext_alloc() which
+			 * decrements refcnt when an event cannot be
+			 * created.
+			 */
+			return -ENOMEM;
+		}
+		refcount_set(&paiext_root.refcnt, 1);
+	}
+	return 0;
+}
+
+/* Protects against concurrent increments of the sampler and counter
+ * members and prohibits concurrent execution of counting and sampling
+ * events.
+ * Ensures that the analytics counter block is deallocated only when the
+ * sampling and counting counts on that cpu are zero.
+ * For details see paiext_alloc().
+ */
+static DEFINE_MUTEX(paiext_reserve_mutex);
+
+/* Free all memory allocated for event counting/sampling setup */
+static void paiext_free(struct paiext_mapptr *mp)
+{
+	kfree(mp->mapptr->area);
+	kfree(mp->mapptr->paiext_cb);
+	kvfree(mp->mapptr->save);
+	kfree(mp->mapptr);
+	mp->mapptr = NULL;
+}
+
+/* Release the PMU if event is the last perf event */
+static void paiext_event_destroy(struct perf_event *event)
+{
+	struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
+	struct paiext_map *cpump = mp->mapptr;
+
+	mutex_lock(&paiext_reserve_mutex);
+	cpump->event = NULL;
+	if (refcount_dec_and_test(&cpump->refcnt))	/* Last reference gone */
+		paiext_free(mp);
+	paiext_root_free();
+	mutex_unlock(&paiext_reserve_mutex);
+	debug_sprintf_event(paiext_dbg, 4, "%s cpu %d mapptr %p\n", __func__,
+			    event->cpu, mp->mapptr);
+
+}
+
+/* Used to avoid races in checking concurrent access of counting and
+ * sampling for pai_extension events.
+ *
+ * Only one instance of event pai_ext/NNPA_ALL/ for sampling is
+ * allowed and when this event is running, no counting event is allowed.
+ * Several counting events are allowed in parallel, but no sampling event
+ * is allowed while one (or more) counting events are running.
+ *
+ * This function is called in process context and it is safe to block.
+ * When the event initialization function fails, no other callback will
+ * be invoked.
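+ *
+ * Admission rules as implemented in paiext_alloc() (sketch):
+ *	existing \ new	counting	sampling
+ *	none		ok		ok
+ *	counting	ok		-EBUSY
+ *	sampling	-EBUSY		-EBUSY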
+ *
+ * Allocate the memory for the event.
+ */
+static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event)
+{
+	struct paiext_mapptr *mp;
+	struct paiext_map *cpump;
+	int rc;
+
+	mutex_lock(&paiext_reserve_mutex);
+
+	rc = paiext_root_alloc();
+	if (rc)
+		goto unlock;
+
+	mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
+	cpump = mp->mapptr;
+	if (!cpump) {			/* Paiext_map allocated? */
+		rc = -ENOMEM;
+		cpump = kzalloc(sizeof(*cpump), GFP_KERNEL);
+		if (!cpump)
+			goto undo;
+
+		/* Allocate memory for counter area and counter extraction.
+		 * These are
+		 * - a 512 byte block and requires 512 byte boundary alignment.
+		 * - a 1 KB block and requires 1 KB boundary alignment.
+		 * Only the first counting event has to allocate the area.
+		 *
+		 * Note: This works with commit 59bb47985c1d by default.
+		 * Backporting this to kernels without this commit might
+		 * need adjustment.
+		 */
+		mp->mapptr = cpump;
+		cpump->area = kzalloc(PAIE1_CTRBLOCK_SZ, GFP_KERNEL);
+		cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL);
+		cpump->save = kvmalloc_array(paiext_cnt + 1,
+					     sizeof(struct pai_userdata),
+					     GFP_KERNEL);
+		if (!cpump->save || !cpump->area || !cpump->paiext_cb) {
+			paiext_free(mp);
+			goto undo;
+		}
+		refcount_set(&cpump->refcnt, 1);
+		cpump->mode = a->sample_period ? PAI_MODE_SAMPLING
+					       : PAI_MODE_COUNTING;
+	} else {
+		/* Multiple invocation, check what is active.
+		 * Supported are multiple counter events or only one sampling
+		 * event concurrently at any one time.
+		 */
+		if (cpump->mode == PAI_MODE_SAMPLING ||
+		    (cpump->mode == PAI_MODE_COUNTING && a->sample_period)) {
+			rc = -EBUSY;
+			goto undo;
+		}
+		refcount_inc(&cpump->refcnt);
+	}
+
+	rc = 0;
+	cpump->event = event;
+
+undo:
+	if (rc) {
+		/* Error in allocation of event, decrement anchor. Since
+		 * the event is not created, its destroy() function is never
+		 * invoked. Adjust the reference counter for the anchor.
+		 */
+		paiext_root_free();
+	}
+unlock:
+	mutex_unlock(&paiext_reserve_mutex);
+	/* If rc is non-zero, no increment of counter/sampler was done. */
+	return rc;
+}
+
+/* The PAI extension 1 control block supports up to 128 entries. Return
+ * the index within PAIE1_CB given the event number. Also validate event
+ * number.
+ */
+static int paiext_event_valid(struct perf_event *event)
+{
+	u64 cfg = event->attr.config;
+
+	if (cfg >= PAI_NNPA_BASE && cfg <= PAI_NNPA_BASE + paiext_cnt) {
+		/* Offset NNPA in paiext_cb */
+		event->hw.config_base = offsetof(struct paiext_cb, acc);
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/* Might be called on a different CPU than the one the event is intended for. */
+static int paiext_event_init(struct perf_event *event)
+{
+	struct perf_event_attr *a = &event->attr;
+	int rc;
+
+	/* PMU pai_ext registered as PERF_TYPE_RAW, check event type */
+	if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type)
+		return -ENOENT;
+	/* PAI extension event must be valid and in supported range */
+	rc = paiext_event_valid(event);
+	if (rc)
+		return rc;
+	/* Allow only CPU wide operation, no process context for now. */
+	if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1)
+		return -ENOENT;
+	/* Allow only event NNPA_ALL for sampling.
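+ * An illustrative invocation from the perf tool, using the event names
+ * exported in sysfs (exact syntax depends on the perf version):
+ *	perf stat -e pai_ext/NNPA_MUL/ -a -- sleep 1	# counting
+ *	perf record -e pai_ext/NNPA_ALL/ -a -- sleep 1	# sampling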
 */
+	if (a->sample_period && a->config != PAI_NNPA_BASE)
+		return -EINVAL;
+	/* Prohibit exclude_user event selection */
+	if (a->exclude_user)
+		return -EINVAL;
+
+	rc = paiext_alloc(a, event);
+	if (rc)
+		return rc;
+	event->destroy = paiext_event_destroy;
+
+	if (a->sample_period) {
+		a->sample_period = 1;
+		a->freq = 0;
+		/* Register for paiext_sched_task() to be called */
+		event->attach_state |= PERF_ATTACH_SCHED_CB;
+		/* Add raw data which are the memory mapped counters */
+		a->sample_type |= PERF_SAMPLE_RAW;
+		/* Turn off inheritance */
+		a->inherit = 0;
+	}
+
+	return 0;
+}
+
+static u64 paiext_getctr(unsigned long *area, int nr)
+{
+	return area[nr];
+}
+
+/* Read the counter values. Return value from location in buffer. For event
+ * NNPA_ALL sum up all events.
+ */
+static u64 paiext_getdata(struct perf_event *event)
+{
+	struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+	struct paiext_map *cpump = mp->mapptr;
+	u64 sum = 0;
+	int i;
+
+	if (event->attr.config != PAI_NNPA_BASE)
+		return paiext_getctr(cpump->area,
+				     event->attr.config - PAI_NNPA_BASE);
+
+	for (i = 1; i <= paiext_cnt; i++)
+		sum += paiext_getctr(cpump->area, i);
+
+	return sum;
+}
+
+static u64 paiext_getall(struct perf_event *event)
+{
+	return paiext_getdata(event);
+}
+
+static void paiext_read(struct perf_event *event)
+{
+	u64 prev, new, delta;
+
+	prev = local64_read(&event->hw.prev_count);
+	new = paiext_getall(event);
+	local64_set(&event->hw.prev_count, new);
+	delta = new - prev;
+	local64_add(delta, &event->count);
+}
+
+static void paiext_start(struct perf_event *event, int flags)
+{
+	u64 sum;
+
+	if (!event->attr.sample_period) {	/* Counting */
+		if (!event->hw.last_tag) {
+			event->hw.last_tag = 1;
+			sum = paiext_getall(event);	/* Get current value */
+			local64_set(&event->hw.prev_count, sum);
+		}
+	} else {				/* Sampling */
+		perf_sched_cb_inc(event->pmu);
+	}
+}
+
+static int paiext_add(struct perf_event *event, int flags)
+{
+	struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+	struct paiext_map *cpump = mp->mapptr;
+	struct paiext_cb *pcb = cpump->paiext_cb;
+
+	if (++cpump->active_events == 1) {
+		S390_lowcore.aicd = virt_to_phys(cpump->paiext_cb);
+		pcb->acc = virt_to_phys(cpump->area) | 0x1;
+		/* Enable CPU instruction lookup for PAIE1 control block */
+		local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT);
+		debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
+				    __func__, S390_lowcore.aicd, pcb->acc);
+	}
+	cpump->event = event;
+	if (flags & PERF_EF_START)
+		paiext_start(event, PERF_EF_RELOAD);
+	event->hw.state = 0;
+	return 0;
+}
+
+static void paiext_stop(struct perf_event *event, int flags)
+{
+	if (!event->attr.sample_period)	/* Counting */
+		paiext_read(event);
+	else				/* Sampling */
+		perf_sched_cb_dec(event->pmu);
+	event->hw.state = PERF_HES_STOPPED;
+}
+
+static void paiext_del(struct perf_event *event, int flags)
+{
+	struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+	struct paiext_map *cpump = mp->mapptr;
+	struct paiext_cb *pcb = cpump->paiext_cb;
+
+	paiext_stop(event, PERF_EF_UPDATE);
+	if (--cpump->active_events == 0) {
+		/* Disable CPU instruction lookup for PAIE1 control block */
+		local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT);
+		pcb->acc = 0;
+		S390_lowcore.aicd = 0;
+		debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
+				    __func__, S390_lowcore.aicd, pcb->acc);
+	}
+}
+
+/* Create raw data and save it in buffer. Returns number of bytes copied.
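+ * For example (sketch): if only counters 2 and 5 are nonzero, two packed
+ * pai_userdata records { .num = 2, .value = v2 } { .num = 5, .value = v5 }
+ * are written and 2 * sizeof(struct pai_userdata) = 20 bytes is returned.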
+ * Saves only positive counter entries of the form
+ * 2 bytes: Number of counter
+ * 8 bytes: Value of counter
+ */
+static size_t paiext_copy(struct pai_userdata *userdata, unsigned long *area)
+{
+	int i, outidx = 0;
+
+	for (i = 1; i <= paiext_cnt; i++) {
+		u64 val = paiext_getctr(area, i);
+
+		if (val) {
+			userdata[outidx].num = i;
+			userdata[outidx].value = val;
+			outidx++;
+		}
+	}
+	return outidx * sizeof(*userdata);
+}
+
+/* Write sample when one or more counter values are nonzero.
+ *
+ * Note: The functions paiext_sched_task() and paiext_push_sample() are not
+ * invoked after function paiext_del() has been called because of function
+ * perf_sched_cb_dec().
+ * The functions paiext_sched_task() and paiext_push_sample() are only
+ * called when sampling is active. Function perf_sched_cb_inc()
+ * has been invoked to install function paiext_sched_task() as callback
+ * to run at context switch time (see paiext_add()).
+ *
+ * This causes function perf_event_context_sched_out() and
+ * perf_event_context_sched_in() to check whether the PMU has installed a
+ * sched_task() callback. That callback is not active after paiext_del()
+ * returns and has deleted the event on that CPU.
+ */
+static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump,
+			      struct perf_event *event)
+{
+	struct perf_sample_data data;
+	struct perf_raw_record raw;
+	struct pt_regs regs;
+	int overflow;
+
+	/* Setup perf sample */
+	memset(&regs, 0, sizeof(regs));
+	memset(&raw, 0, sizeof(raw));
+	memset(&data, 0, sizeof(data));
+	perf_sample_data_init(&data, 0, event->hw.last_period);
+	if (event->attr.sample_type & PERF_SAMPLE_TID) {
+		data.tid_entry.pid = task_tgid_nr(current);
+		data.tid_entry.tid = task_pid_nr(current);
+	}
+	if (event->attr.sample_type & PERF_SAMPLE_TIME)
+		data.time = event->clock();
+	if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
+		data.id = event->id;
+	if (event->attr.sample_type & PERF_SAMPLE_CPU)
+		data.cpu_entry.cpu = smp_processor_id();
+	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+		raw.frag.size = rawsize;
+		raw.frag.data = cpump->save;
+		perf_sample_save_raw_data(&data, &raw);
+	}
+
+	overflow = perf_event_overflow(event, &data, &regs);
+	perf_event_update_userpage(event);
+	/* Clear lowcore area after read */
+	memset(cpump->area, 0, PAIE1_CTRBLOCK_SZ);
+	return overflow;
+}
+
+/* Check if there is data to be saved on schedule out of a task. */
+static int paiext_have_sample(void)
+{
+	struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+	struct paiext_map *cpump = mp->mapptr;
+	struct perf_event *event = cpump->event;
+	size_t rawsize;
+	int rc = 0;
+
+	if (!event)
+		return 0;
+	rawsize = paiext_copy(cpump->save, cpump->area);
+	if (rawsize)			/* Incremented counters */
+		rc = paiext_push_sample(rawsize, cpump, event);
+	return rc;
+}
+
+/* Called on schedule-in and schedule-out. No access to event structure,
+ * but for sampling only event NNPA_ALL is allowed.
+ */
+static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+{
+	/* We started with a clean page on event installation. So read out
+	 * results on schedule_out and if page was dirty, clear values.
+	 */
+	if (!sched_in)
+		paiext_have_sample();
+}
+
+/* Attribute definitions for PAI extension 1 interface. As with other CPU
+ * Measurement Facilities, there is one attribute per mapped counter.
+ * The number of mapped counters may vary per machine generation.
Use the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction
+ * to determine the number of mapped counters. The instruction returns
+ * a positive number, which is the highest number of supported counters.
+ * All counters less than this number are also supported; there are no
+ * holes. A returned number of zero means no support for mapped counters.
+ *
+ * The identification of the counter is a unique number. The chosen range
+ * is 0x1800 + offset in mapped kernel page.
+ * All CPU Measurement Facility counter identifiers must be unique and
+ * the numbers from 0 to 496 are already used for the CPU Measurement
+ * Counter facility. Numbers 0x1000 to 0x103e are used for PAI cryptography
+ * counters.
+ * Numbers 0xb0000, 0xbc000 and 0xbd000 are already
+ * used for the CPU Measurement Sampling facility.
+ */
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+static struct attribute *paiext_format_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group paiext_events_group = {
+	.name = "events",
+	.attrs = NULL,			/* Filled in attr_event_init() */
+};
+
+static struct attribute_group paiext_format_group = {
+	.name = "format",
+	.attrs = paiext_format_attr,
+};
+
+static const struct attribute_group *paiext_attr_groups[] = {
+	&paiext_events_group,
+	&paiext_format_group,
+	NULL,
+};
+
+/* Performance monitoring unit for mapped counters */
+static struct pmu paiext = {
+	.task_ctx_nr = perf_invalid_context,
+	.event_init = paiext_event_init,
+	.add = paiext_add,
+	.del = paiext_del,
+	.start = paiext_start,
+	.stop = paiext_stop,
+	.read = paiext_read,
+	.sched_task = paiext_sched_task,
+	.attr_groups = paiext_attr_groups,
+};
+
+/* List of symbolic PAI extension 1 NNPA counter names. */
+static const char * const paiext_ctrnames[] = {
+	[0] = "NNPA_ALL",
+	[1] = "NNPA_ADD",
+	[2] = "NNPA_SUB",
+	[3] = "NNPA_MUL",
+	[4] = "NNPA_DIV",
+	[5] = "NNPA_MIN",
+	[6] = "NNPA_MAX",
+	[7] = "NNPA_LOG",
+	[8] = "NNPA_EXP",
+	[9] = "NNPA_IBM_RESERVED_9",
+	[10] = "NNPA_RELU",
+	[11] = "NNPA_TANH",
+	[12] = "NNPA_SIGMOID",
+	[13] = "NNPA_SOFTMAX",
+	[14] = "NNPA_BATCHNORM",
+	[15] = "NNPA_MAXPOOL2D",
+	[16] = "NNPA_AVGPOOL2D",
+	[17] = "NNPA_LSTMACT",
+	[18] = "NNPA_GRUACT",
+	[19] = "NNPA_CONVOLUTION",
+	[20] = "NNPA_MATMUL_OP",
+	[21] = "NNPA_MATMUL_OP_BCAST23",
+	[22] = "NNPA_SMALLBATCH",
+	[23] = "NNPA_LARGEDIM",
+	[24] = "NNPA_SMALLTENSOR",
+	[25] = "NNPA_1MFRAME",
+	[26] = "NNPA_2GFRAME",
+	[27] = "NNPA_ACCESSEXCEPT",
+};
+
+static void __init attr_event_free(struct attribute **attrs, int num)
+{
+	struct perf_pmu_events_attr *pa;
+	struct device_attribute *dap;
+	int i;
+
+	for (i = 0; i < num; i++) {
+		dap = container_of(attrs[i], struct device_attribute, attr);
+		pa = container_of(dap, struct perf_pmu_events_attr, attr);
+		kfree(pa);
+	}
+	kfree(attrs);
+}
+
+static int __init attr_event_init_one(struct attribute **attrs, int num)
+{
+	struct perf_pmu_events_attr *pa;
+
+	pa = kzalloc(sizeof(*pa), GFP_KERNEL);
+	if (!pa)
+		return -ENOMEM;
+
+	sysfs_attr_init(&pa->attr.attr);
+	pa->id = PAI_NNPA_BASE + num;
+	pa->attr.attr.name = paiext_ctrnames[num];
+	pa->attr.attr.mode = 0444;
+	pa->attr.show = cpumf_events_sysfs_show;
+	pa->attr.store = NULL;
+	attrs[num] = &pa->attr.attr;
+	return 0;
+}
+
+/* Create PMU sysfs event attributes on the fly.
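+ * The id stored in each attribute follows the 0x1800 + offset scheme
+ * described above, e.g. NNPA_ALL maps to event=0x1800 and NNPA_ADD to
+ * event=0x1801 (illustrative values derived from PAI_NNPA_BASE).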
*/ +static int __init attr_event_init(void) +{ + struct attribute **attrs; + int ret, i; + + attrs = kmalloc_array(ARRAY_SIZE(paiext_ctrnames) + 1, sizeof(*attrs), + GFP_KERNEL); + if (!attrs) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(paiext_ctrnames); i++) { + ret = attr_event_init_one(attrs, i); + if (ret) { + attr_event_free(attrs, i - 1); + return ret; + } + } + attrs[i] = NULL; + paiext_events_group.attrs = attrs; + return 0; +} + +static int __init paiext_init(void) +{ + struct qpaci_info_block ib; + int rc = -ENOMEM; + + if (!test_facility(197)) + return 0; + + qpaci(&ib); + paiext_cnt = ib.num_nnpa; + if (paiext_cnt >= PAI_NNPA_MAXCTR) + paiext_cnt = PAI_NNPA_MAXCTR; + if (!paiext_cnt) + return 0; + + rc = attr_event_init(); + if (rc) { + pr_err("Creation of PMU " KMSG_COMPONENT " /sysfs failed\n"); + return rc; + } + + /* Setup s390dbf facility */ + paiext_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); + if (!paiext_dbg) { + pr_err("Registration of s390dbf " KMSG_COMPONENT " failed\n"); + rc = -ENOMEM; + goto out_init; + } + debug_register_view(paiext_dbg, &debug_sprintf_view); + + rc = perf_pmu_register(&paiext, KMSG_COMPONENT, -1); + if (rc) { + pr_err("Registration of " KMSG_COMPONENT " PMU failed with " + "rc=%i\n", rc); + goto out_pmu; + } + + return 0; + +out_pmu: + debug_unregister_view(paiext_dbg, &debug_sprintf_view); + debug_unregister(paiext_dbg); +out_init: + attr_event_free(paiext_events_group.attrs, + ARRAY_SIZE(paiext_ctrnames) + 1); + return rc; +} + +device_initcall(paiext_init); diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c index 4352a504f235..3d93656bd948 100644 --- a/arch/s390/kernel/perf_regs.c +++ b/arch/s390/kernel/perf_regs.c @@ -20,8 +20,10 @@ u64 perf_reg_value(struct pt_regs *regs, int idx) return 0; idx -= PERF_REG_S390_FP0; - fp = MACHINE_HAS_VX ? *(freg_t *)(current->thread.fpu.vxrs + idx) - : current->thread.fpu.fprs[idx]; + if (cpu_has_vx()) + fp = *(freg_t *)(current->thread.fpu.vxrs + idx); + else + fp = current->thread.fpu.fprs[idx]; return fp.ui; } @@ -53,8 +55,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { /* * Use the regs from the first interruption and let diff --git a/arch/s390/kernel/pgm_check.S b/arch/s390/kernel/pgm_check.S deleted file mode 100644 index 59dee9d3bebf..000000000000 --- a/arch/s390/kernel/pgm_check.S +++ /dev/null @@ -1,147 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Program check table. - * - * Copyright IBM Corp. 2012 - */ - -#include <linux/linkage.h> - -#define PGM_CHECK(handler) .quad handler -#define PGM_CHECK_DEFAULT PGM_CHECK(default_trap_handler) - -/* - * The program check table contains exactly 128 (0x00-0x7f) entries. Each - * line defines the function to be called corresponding to the program check - * interruption code. 
- */ -.section .rodata, "a" -ENTRY(pgm_check_table) -PGM_CHECK_DEFAULT /* 00 */ -PGM_CHECK(illegal_op) /* 01 */ -PGM_CHECK(privileged_op) /* 02 */ -PGM_CHECK(execute_exception) /* 03 */ -PGM_CHECK(do_protection_exception) /* 04 */ -PGM_CHECK(addressing_exception) /* 05 */ -PGM_CHECK(specification_exception) /* 06 */ -PGM_CHECK(data_exception) /* 07 */ -PGM_CHECK(overflow_exception) /* 08 */ -PGM_CHECK(divide_exception) /* 09 */ -PGM_CHECK(overflow_exception) /* 0a */ -PGM_CHECK(divide_exception) /* 0b */ -PGM_CHECK(hfp_overflow_exception) /* 0c */ -PGM_CHECK(hfp_underflow_exception) /* 0d */ -PGM_CHECK(hfp_significance_exception) /* 0e */ -PGM_CHECK(hfp_divide_exception) /* 0f */ -PGM_CHECK(do_dat_exception) /* 10 */ -PGM_CHECK(do_dat_exception) /* 11 */ -PGM_CHECK(translation_exception) /* 12 */ -PGM_CHECK(special_op_exception) /* 13 */ -PGM_CHECK_DEFAULT /* 14 */ -PGM_CHECK(operand_exception) /* 15 */ -PGM_CHECK_DEFAULT /* 16 */ -PGM_CHECK_DEFAULT /* 17 */ -PGM_CHECK(transaction_exception) /* 18 */ -PGM_CHECK_DEFAULT /* 19 */ -PGM_CHECK_DEFAULT /* 1a */ -PGM_CHECK(vector_exception) /* 1b */ -PGM_CHECK(space_switch_exception) /* 1c */ -PGM_CHECK(hfp_sqrt_exception) /* 1d */ -PGM_CHECK_DEFAULT /* 1e */ -PGM_CHECK_DEFAULT /* 1f */ -PGM_CHECK_DEFAULT /* 20 */ -PGM_CHECK_DEFAULT /* 21 */ -PGM_CHECK_DEFAULT /* 22 */ -PGM_CHECK_DEFAULT /* 23 */ -PGM_CHECK_DEFAULT /* 24 */ -PGM_CHECK_DEFAULT /* 25 */ -PGM_CHECK_DEFAULT /* 26 */ -PGM_CHECK_DEFAULT /* 27 */ -PGM_CHECK_DEFAULT /* 28 */ -PGM_CHECK_DEFAULT /* 29 */ -PGM_CHECK_DEFAULT /* 2a */ -PGM_CHECK_DEFAULT /* 2b */ -PGM_CHECK_DEFAULT /* 2c */ -PGM_CHECK_DEFAULT /* 2d */ -PGM_CHECK_DEFAULT /* 2e */ -PGM_CHECK_DEFAULT /* 2f */ -PGM_CHECK_DEFAULT /* 30 */ -PGM_CHECK_DEFAULT /* 31 */ -PGM_CHECK_DEFAULT /* 32 */ -PGM_CHECK_DEFAULT /* 33 */ -PGM_CHECK_DEFAULT /* 34 */ -PGM_CHECK_DEFAULT /* 35 */ -PGM_CHECK_DEFAULT /* 36 */ -PGM_CHECK_DEFAULT /* 37 */ -PGM_CHECK(do_dat_exception) /* 38 */ -PGM_CHECK(do_dat_exception) /* 39 */ -PGM_CHECK(do_dat_exception) /* 3a */ -PGM_CHECK(do_dat_exception) /* 3b */ -PGM_CHECK_DEFAULT /* 3c */ -PGM_CHECK_DEFAULT /* 3d */ -PGM_CHECK_DEFAULT /* 3e */ -PGM_CHECK_DEFAULT /* 3f */ -PGM_CHECK_DEFAULT /* 40 */ -PGM_CHECK_DEFAULT /* 41 */ -PGM_CHECK_DEFAULT /* 42 */ -PGM_CHECK_DEFAULT /* 43 */ -PGM_CHECK_DEFAULT /* 44 */ -PGM_CHECK_DEFAULT /* 45 */ -PGM_CHECK_DEFAULT /* 46 */ -PGM_CHECK_DEFAULT /* 47 */ -PGM_CHECK_DEFAULT /* 48 */ -PGM_CHECK_DEFAULT /* 49 */ -PGM_CHECK_DEFAULT /* 4a */ -PGM_CHECK_DEFAULT /* 4b */ -PGM_CHECK_DEFAULT /* 4c */ -PGM_CHECK_DEFAULT /* 4d */ -PGM_CHECK_DEFAULT /* 4e */ -PGM_CHECK_DEFAULT /* 4f */ -PGM_CHECK_DEFAULT /* 50 */ -PGM_CHECK_DEFAULT /* 51 */ -PGM_CHECK_DEFAULT /* 52 */ -PGM_CHECK_DEFAULT /* 53 */ -PGM_CHECK_DEFAULT /* 54 */ -PGM_CHECK_DEFAULT /* 55 */ -PGM_CHECK_DEFAULT /* 56 */ -PGM_CHECK_DEFAULT /* 57 */ -PGM_CHECK_DEFAULT /* 58 */ -PGM_CHECK_DEFAULT /* 59 */ -PGM_CHECK_DEFAULT /* 5a */ -PGM_CHECK_DEFAULT /* 5b */ -PGM_CHECK_DEFAULT /* 5c */ -PGM_CHECK_DEFAULT /* 5d */ -PGM_CHECK_DEFAULT /* 5e */ -PGM_CHECK_DEFAULT /* 5f */ -PGM_CHECK_DEFAULT /* 60 */ -PGM_CHECK_DEFAULT /* 61 */ -PGM_CHECK_DEFAULT /* 62 */ -PGM_CHECK_DEFAULT /* 63 */ -PGM_CHECK_DEFAULT /* 64 */ -PGM_CHECK_DEFAULT /* 65 */ -PGM_CHECK_DEFAULT /* 66 */ -PGM_CHECK_DEFAULT /* 67 */ -PGM_CHECK_DEFAULT /* 68 */ -PGM_CHECK_DEFAULT /* 69 */ -PGM_CHECK_DEFAULT /* 6a */ -PGM_CHECK_DEFAULT /* 6b */ -PGM_CHECK_DEFAULT /* 6c */ -PGM_CHECK_DEFAULT /* 6d */ -PGM_CHECK_DEFAULT /* 6e */ -PGM_CHECK_DEFAULT /* 6f */ -PGM_CHECK_DEFAULT /* 70 
*/ -PGM_CHECK_DEFAULT /* 71 */ -PGM_CHECK_DEFAULT /* 72 */ -PGM_CHECK_DEFAULT /* 73 */ -PGM_CHECK_DEFAULT /* 74 */ -PGM_CHECK_DEFAULT /* 75 */ -PGM_CHECK_DEFAULT /* 76 */ -PGM_CHECK_DEFAULT /* 77 */ -PGM_CHECK_DEFAULT /* 78 */ -PGM_CHECK_DEFAULT /* 79 */ -PGM_CHECK_DEFAULT /* 7a */ -PGM_CHECK_DEFAULT /* 7b */ -PGM_CHECK_DEFAULT /* 7c */ -PGM_CHECK_DEFAULT /* 7d */ -PGM_CHECK_DEFAULT /* 7e */ -PGM_CHECK_DEFAULT /* 7f */ diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 6ccef5f29761..4e3b366589fb 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -29,8 +29,9 @@ #include <linux/random.h> #include <linux/export.h> #include <linux/init_task.h> +#include <linux/entry-common.h> +#include <linux/io.h> #include <asm/cpu_mf.h> -#include <asm/io.h> #include <asm/processor.h> #include <asm/vtimer.h> #include <asm/exec.h> @@ -43,9 +44,22 @@ #include <asm/unwind.h> #include "entry.h" -asmlinkage void ret_from_fork(void) asm ("ret_from_fork"); +void ret_from_fork(void) asm("ret_from_fork"); -extern void kernel_thread_starter(void); +void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs) +{ + void (*func)(void *arg); + + schedule_tail(prev); + + if (!user_mode(regs)) { + /* Kernel thread */ + func = (void *)regs->gprs[9]; + func((void *)regs->gprs[10]); + } + clear_pt_regs_flag(regs, PIF_SYSCALL); + syscall_exit_to_user_mode(regs); +} void flush_thread(void) { @@ -75,14 +89,28 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) */ save_fpu_regs(); - memcpy(dst, src, arch_task_struct_size); + *dst = *src; dst->thread.fpu.regs = dst->thread.fpu.fprs; + + /* + * Don't transfer over the runtime instrumentation or the guarded + * storage control block pointers. These fields are cleared here instead + * of in copy_thread() to avoid premature freeing of associated memory + * on fork() failure. Wait to clear the RI flag because ->stack still + * refers to the source thread. + */ + dst->thread.ri_cb = NULL; + dst->thread.gs_cb = NULL; + dst->thread.gs_bc_cb = NULL; + return 0; } -int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long new_stackp = args->stack; + unsigned long tls = args->tls; struct fake_frame { struct stack_frame sf; @@ -94,7 +122,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, /* Save access registers to new thread structure. 
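+	 * (A note on the fake frame initialized below: stack_frame.gprs[]
+	 * holds %r6-%r15, hence the gprs[11 - 6] style indices. The slots
+	 * park &frame->childregs in %r11, the new task pointer in %r12,
+	 * ret_from_fork in %r14 and the frame itself as stack pointer in
+	 * %r15, so that ret_from_fork can hand prev and regs to
+	 * __ret_from_fork() shown above.)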
*/ save_access_regs(&p->thread.acrs[0]); /* start new process with ar4 pointing to the correct address space */ - p->thread.mm_segment = get_fs(); /* Don't copy debug registers */ memset(&p->thread.per_user, 0, sizeof(p->thread.per_user)); memset(&p->thread.per_event, 0, sizeof(p->thread.per_event)); @@ -106,26 +133,26 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, p->thread.system_timer = 0; p->thread.hardirq_timer = 0; p->thread.softirq_timer = 0; + p->thread.last_break = 1; frame->sf.back_chain = 0; + frame->sf.gprs[11 - 6] = (unsigned long)&frame->childregs; + frame->sf.gprs[12 - 6] = (unsigned long)p; /* new return point is ret_from_fork */ - frame->sf.gprs[8] = (unsigned long) ret_from_fork; + frame->sf.gprs[14 - 6] = (unsigned long)ret_from_fork; /* fake return stack for resume(), don't go back to schedule */ - frame->sf.gprs[9] = (unsigned long) frame; + frame->sf.gprs[15 - 6] = (unsigned long)frame; /* Store access registers to kernel stack of new process. */ - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(args->fn)) { /* kernel thread */ memset(&frame->childregs, 0, sizeof(struct pt_regs)); - frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; - frame->childregs.psw.addr = - (unsigned long) kernel_thread_starter; - frame->childregs.gprs[9] = new_stackp; /* function */ - frame->childregs.gprs[10] = arg; - frame->childregs.gprs[11] = (unsigned long) do_exit; + frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO | + PSW_MASK_EXT | PSW_MASK_MCHECK; + frame->childregs.gprs[9] = (unsigned long)args->fn; + frame->childregs.gprs[10] = (unsigned long)args->fn_arg; frame->childregs.orig_gpr2 = -1; - + frame->childregs.last_break = 1; return 0; } frame->childregs = *current_pt_regs(); @@ -133,13 +160,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, frame->childregs.flags = 0; if (new_stackp) frame->childregs.gprs[15] = new_stackp; - - /* Don't copy runtime instrumentation info */ - p->thread.ri_cb = NULL; + /* + * Clear the runtime instrumentation flag after the above childregs + * copy. The CB pointer was already cleared in arch_dup_task_struct(). + */ frame->childregs.psw.mask &= ~PSW_MASK_RI; - /* Don't copy guarded storage control block */ - p->thread.gs_cb = NULL; - p->thread.gs_bc_cb = NULL; /* Set a new TLS ? */ if (clone_flags & CLONE_SETTLS) { @@ -150,39 +175,27 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, p->thread.acrs[1] = (unsigned int)tls; } } + /* + * s390 stores the svc return address in arch_data when calling + * sigreturn()/restart_syscall() via vdso. 1 means no valid address + * stored. + */ + p->restart_block.arch_data = 1; return 0; } -asmlinkage void execve_tail(void) +void execve_tail(void) { current->thread.fpu.fpc = 0; asm volatile("sfpc %0" : : "d" (0)); } -/* - * fill in the FPU structure for a core dump. 
- */ -int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs) -{ - save_fpu_regs(); - fpregs->fpc = current->thread.fpu.fpc; - fpregs->pad = 0; - if (MACHINE_HAS_VX) - convert_vx_to_fp((freg_t *)&fpregs->fprs, - current->thread.fpu.vxrs); - else - memcpy(&fpregs->fprs, current->thread.fpu.fprs, - sizeof(fpregs->fprs)); - return 1; -} -EXPORT_SYMBOL(dump_fpu); - -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { struct unwind_state state; unsigned long ip = 0; - if (!p || p == current || p->state == TASK_RUNNING || !task_stack_page(p)) + if (!task_stack_page(p)) return 0; if (!try_get_task_stack(p)) @@ -209,13 +222,13 @@ unsigned long get_wchan(struct task_struct *p) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() & ~PAGE_MASK; + sp -= get_random_u32_below(PAGE_SIZE); return sp & ~0xf; } static inline unsigned long brk_rnd(void) { - return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT; + return (get_random_u16() & BRK_RND_MASK) << PAGE_SHIFT; } unsigned long arch_randomize_brk(struct mm_struct *mm) @@ -225,16 +238,3 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) ret = PAGE_ALIGN(mm->brk + brk_rnd()); return (ret > mm->brk) ? ret : mm->brk; } - -void set_fs_fixup(void) -{ - struct pt_regs *regs = current_pt_regs(); - static bool warned; - - set_fs(USER_DS); - if (warned) - return; - WARN(1, "Unbalanced set_fs - int code: 0x%x\n", regs->int_code); - show_registers(regs); - warned = true; -} diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 6ebc2117c66c..65c1464eea4f 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -8,9 +8,9 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/stop_machine.h> -#include <linux/cpufeature.h> #include <linux/bitops.h> #include <linux/kernel.h> +#include <linux/random.h> #include <linux/sched/mm.h> #include <linux/init.h> #include <linux/seq_file.h> @@ -23,8 +23,12 @@ #include <asm/elf.h> #include <asm/lowcore.h> #include <asm/param.h> +#include <asm/sclp.h> #include <asm/smp.h> +unsigned long __read_mostly elf_hwcap; +char elf_platform[ELF_PLATFORM_SIZE]; + struct cpu_info { unsigned int cpu_mhz_dynamic; unsigned int cpu_mhz_static; @@ -91,23 +95,12 @@ void cpu_init(void) enter_lazy_tlb(&init_mm, current); } -/* - * cpu_have_feature - Test CPU features on module initialization - */ -int cpu_have_feature(unsigned int num) -{ - return elf_hwcap & (1UL << num); -} -EXPORT_SYMBOL(cpu_have_feature); - static void show_facilities(struct seq_file *m) { unsigned int bit; - long *facilities; - facilities = (long *)&S390_lowcore.stfle_fac_list; seq_puts(m, "facilities :"); - for_each_set_bit_inv(bit, facilities, MAX_FACILITY_BIT) + for_each_set_bit_inv(bit, (long *)&stfle_fac_list, MAX_FACILITY_BIT) seq_printf(m, " %d", bit); seq_putc(m, '\n'); } @@ -115,15 +108,33 @@ static void show_facilities(struct seq_file *m) static void show_cpu_summary(struct seq_file *m, void *v) { static const char *hwcap_str[] = { - "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", - "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs", - "vxe2", "vxp", "sort", "dflt" - }; - static const char * const int_hwcap_str[] = { - "sie" + [HWCAP_NR_ESAN3] = "esan3", + [HWCAP_NR_ZARCH] = "zarch", + [HWCAP_NR_STFLE] = "stfle", + [HWCAP_NR_MSA] = "msa", + [HWCAP_NR_LDISP] = "ldisp", + [HWCAP_NR_EIMM] = "eimm", + [HWCAP_NR_DFP] = "dfp", + [HWCAP_NR_HPAGE] = 
"edat", + [HWCAP_NR_ETF3EH] = "etf3eh", + [HWCAP_NR_HIGH_GPRS] = "highgprs", + [HWCAP_NR_TE] = "te", + [HWCAP_NR_VXRS] = "vx", + [HWCAP_NR_VXRS_BCD] = "vxd", + [HWCAP_NR_VXRS_EXT] = "vxe", + [HWCAP_NR_GS] = "gs", + [HWCAP_NR_VXRS_EXT2] = "vxe2", + [HWCAP_NR_VXRS_PDE] = "vxp", + [HWCAP_NR_SORT] = "sort", + [HWCAP_NR_DFLT] = "dflt", + [HWCAP_NR_VXRS_PDE2] = "vxp2", + [HWCAP_NR_NNPA] = "nnpa", + [HWCAP_NR_PCI_MIO] = "pcimio", + [HWCAP_NR_SIE] = "sie", }; int i, cpu; + BUILD_BUG_ON(ARRAY_SIZE(hwcap_str) != HWCAP_NR_MAX); seq_printf(m, "vendor_id : IBM/S390\n" "# processors : %i\n" "bogomips per cpu: %lu.%02lu\n", @@ -134,9 +145,6 @@ static void show_cpu_summary(struct seq_file *m, void *v) for (i = 0; i < ARRAY_SIZE(hwcap_str); i++) if (hwcap_str[i] && (elf_hwcap & (1UL << i))) seq_printf(m, "%s ", hwcap_str[i]); - for (i = 0; i < ARRAY_SIZE(int_hwcap_str); i++) - if (int_hwcap_str[i] && (int_hwcap & (1UL << i))) - seq_printf(m, "%s ", int_hwcap_str[i]); seq_puts(m, "\n"); show_facilities(m); show_cacheinfo(m); @@ -151,10 +159,155 @@ static void show_cpu_summary(struct seq_file *m, void *v) } } +static int __init setup_hwcaps(void) +{ + /* instructions named N3, "backported" to esa-mode */ + elf_hwcap |= HWCAP_ESAN3; + + /* z/Architecture mode active */ + elf_hwcap |= HWCAP_ZARCH; + + /* store-facility-list-extended */ + if (test_facility(7)) + elf_hwcap |= HWCAP_STFLE; + + /* message-security assist */ + if (test_facility(17)) + elf_hwcap |= HWCAP_MSA; + + /* long-displacement */ + if (test_facility(19)) + elf_hwcap |= HWCAP_LDISP; + + /* extended-immediate */ + elf_hwcap |= HWCAP_EIMM; + + /* extended-translation facility 3 enhancement */ + if (test_facility(22) && test_facility(30)) + elf_hwcap |= HWCAP_ETF3EH; + + /* decimal floating point & perform floating point operation */ + if (test_facility(42) && test_facility(44)) + elf_hwcap |= HWCAP_DFP; + + /* huge page support */ + if (MACHINE_HAS_EDAT1) + elf_hwcap |= HWCAP_HPAGE; + + /* 64-bit register support for 31-bit processes */ + elf_hwcap |= HWCAP_HIGH_GPRS; + + /* transactional execution */ + if (MACHINE_HAS_TE) + elf_hwcap |= HWCAP_TE; + + /* vector */ + if (test_facility(129)) { + elf_hwcap |= HWCAP_VXRS; + if (test_facility(134)) + elf_hwcap |= HWCAP_VXRS_BCD; + if (test_facility(135)) + elf_hwcap |= HWCAP_VXRS_EXT; + if (test_facility(148)) + elf_hwcap |= HWCAP_VXRS_EXT2; + if (test_facility(152)) + elf_hwcap |= HWCAP_VXRS_PDE; + if (test_facility(192)) + elf_hwcap |= HWCAP_VXRS_PDE2; + } + + if (test_facility(150)) + elf_hwcap |= HWCAP_SORT; + + if (test_facility(151)) + elf_hwcap |= HWCAP_DFLT; + + if (test_facility(165)) + elf_hwcap |= HWCAP_NNPA; + + /* guarded storage */ + if (MACHINE_HAS_GS) + elf_hwcap |= HWCAP_GS; + + if (MACHINE_HAS_PCI_MIO) + elf_hwcap |= HWCAP_PCI_MIO; + + /* virtualization support */ + if (sclp.has_sief2) + elf_hwcap |= HWCAP_SIE; + + return 0; +} +arch_initcall(setup_hwcaps); + +static int __init setup_elf_platform(void) +{ + struct cpuid cpu_id; + + get_cpu_id(&cpu_id); + add_device_randomness(&cpu_id, sizeof(cpu_id)); + switch (cpu_id.machine) { + default: /* Use "z10" as default. 
*/ + strcpy(elf_platform, "z10"); + break; + case 0x2817: + case 0x2818: + strcpy(elf_platform, "z196"); + break; + case 0x2827: + case 0x2828: + strcpy(elf_platform, "zEC12"); + break; + case 0x2964: + case 0x2965: + strcpy(elf_platform, "z13"); + break; + case 0x3906: + case 0x3907: + strcpy(elf_platform, "z14"); + break; + case 0x8561: + case 0x8562: + strcpy(elf_platform, "z15"); + break; + case 0x3931: + case 0x3932: + strcpy(elf_platform, "z16"); + break; + } + return 0; +} +arch_initcall(setup_elf_platform); + +static void show_cpu_topology(struct seq_file *m, unsigned long n) +{ +#ifdef CONFIG_SCHED_TOPOLOGY + seq_printf(m, "physical id : %d\n", topology_physical_package_id(n)); + seq_printf(m, "core id : %d\n", topology_core_id(n)); + seq_printf(m, "book id : %d\n", topology_book_id(n)); + seq_printf(m, "drawer id : %d\n", topology_drawer_id(n)); + seq_printf(m, "dedicated : %d\n", topology_cpu_dedicated(n)); + seq_printf(m, "address : %d\n", smp_cpu_get_cpu_address(n)); + seq_printf(m, "siblings : %d\n", cpumask_weight(topology_core_cpumask(n))); + seq_printf(m, "cpu cores : %d\n", topology_booted_cores(n)); +#endif /* CONFIG_SCHED_TOPOLOGY */ +} + +static void show_cpu_ids(struct seq_file *m, unsigned long n) +{ + struct cpuid *id = &per_cpu(cpu_info.cpu_id, n); + + seq_printf(m, "version : %02X\n", id->version); + seq_printf(m, "identification : %06X\n", id->ident); + seq_printf(m, "machine : %04X\n", id->machine); +} + static void show_cpu_mhz(struct seq_file *m, unsigned long n) { struct cpu_info *c = per_cpu_ptr(&cpu_info, n); + if (!machine_has_cpu_mhz) + return; seq_printf(m, "cpu MHz dynamic : %d\n", c->cpu_mhz_dynamic); seq_printf(m, "cpu MHz static : %d\n", c->cpu_mhz_static); } @@ -165,12 +318,13 @@ static void show_cpu_mhz(struct seq_file *m, unsigned long n) static int show_cpuinfo(struct seq_file *m, void *v) { unsigned long n = (unsigned long) v - 1; + unsigned long first = cpumask_first(cpu_online_mask); - if (!n) + if (n == first) show_cpu_summary(m, v); - if (!machine_has_cpu_mhz) - return 0; seq_printf(m, "\ncpu number : %ld\n", n); + show_cpu_topology(m, n); + show_cpu_ids(m, n); show_cpu_mhz(m, n); return 0; } @@ -179,12 +333,14 @@ static inline void *c_update(loff_t *pos) { if (*pos) *pos = cpumask_next(*pos - 1, cpu_online_mask); + else + *pos = cpumask_first(cpu_online_mask); return *pos < nr_cpu_ids ? 
(void *)*pos + 1 : NULL; } static void *c_start(struct seq_file *m, loff_t *pos) { - get_online_cpus(); + cpus_read_lock(); return c_update(pos); } @@ -196,7 +352,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos) static void c_stop(struct seq_file *m, void *v) { - put_online_cpus(); + cpus_read_unlock(); } const struct seq_operations cpuinfo_op = { @@ -205,21 +361,3 @@ const struct seq_operations cpuinfo_op = { .stop = c_stop, .show = show_cpuinfo, }; - -int s390_isolate_bp(void) -{ - if (!test_facility(82)) - return -EOPNOTSUPP; - set_thread_flag(TIF_ISOLATE_BP); - return 0; -} -EXPORT_SYMBOL(s390_isolate_bp); - -int s390_isolate_bp_guest(void) -{ - if (!test_facility(82)) - return -EOPNOTSUPP; - set_thread_flag(TIF_ISOLATE_BP_GUEST); - return 0; -} -EXPORT_SYMBOL(s390_isolate_bp_guest); diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 58faa12542a1..f1897a8bb221 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -7,6 +7,7 @@ * Martin Schwidefsky (schwidefsky@de.ibm.com) */ +#include "asm/ptrace.h" #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> @@ -20,18 +21,16 @@ #include <linux/signal.h> #include <linux/elf.h> #include <linux/regset.h> -#include <linux/tracehook.h> #include <linux/seccomp.h> #include <linux/compat.h> #include <trace/syscall.h> #include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/switch_to.h> #include <asm/runtime_instr.h> #include <asm/facility.h> +#include <asm/fpu/api.h> #include "entry.h" @@ -39,20 +38,24 @@ #include "compat_ptrace.h" #endif -#define CREATE_TRACE_POINTS -#include <trace/events/syscalls.h> - void update_cr_regs(struct task_struct *task) { struct pt_regs *regs = task_pt_regs(task); struct thread_struct *thread = &task->thread; - struct per_regs old, new; union ctlreg0 cr0_old, cr0_new; union ctlreg2 cr2_old, cr2_new; int cr0_changed, cr2_changed; - - __ctl_store(cr0_old.val, 0, 0); - __ctl_store(cr2_old.val, 2, 2); + union { + struct ctlreg regs[3]; + struct { + struct ctlreg control; + struct ctlreg start; + struct ctlreg end; + }; + } old, new; + + local_ctl_store(0, &cr0_old.reg); + local_ctl_store(2, &cr2_old.reg); cr0_new = cr0_old; cr2_new = cr2_old; /* Take care of the enable/disable of transactional execution. */ @@ -80,38 +83,38 @@ void update_cr_regs(struct task_struct *task) cr0_changed = cr0_new.val != cr0_old.val; cr2_changed = cr2_new.val != cr2_old.val; if (cr0_changed) - __ctl_load(cr0_new.val, 0, 0); + local_ctl_load(0, &cr0_new.reg); if (cr2_changed) - __ctl_load(cr2_new.val, 2, 2); + local_ctl_load(2, &cr2_new.reg); /* Copy user specified PER registers */ - new.control = thread->per_user.control; - new.start = thread->per_user.start; - new.end = thread->per_user.end; + new.control.val = thread->per_user.control; + new.start.val = thread->per_user.start; + new.end.val = thread->per_user.end; /* merge TIF_SINGLE_STEP into user specified PER registers. 
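	 * PER (Program Event Recording) is driven by control registers 9-11:
	 * CR9 holds the event mask, CR10 and CR11 the start and end of the
	 * monitored address range. Single-stepping is emulated below by
	 * requesting instruction-fetch events over the whole address space,
	 * in effect: control |= PER_EVENT_IFETCH; start = 0; end = -1UL;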
*/ if (test_tsk_thread_flag(task, TIF_SINGLE_STEP) || test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP)) { if (test_tsk_thread_flag(task, TIF_BLOCK_STEP)) - new.control |= PER_EVENT_BRANCH; + new.control.val |= PER_EVENT_BRANCH; else - new.control |= PER_EVENT_IFETCH; - new.control |= PER_CONTROL_SUSPENSION; - new.control |= PER_EVENT_TRANSACTION_END; + new.control.val |= PER_EVENT_IFETCH; + new.control.val |= PER_CONTROL_SUSPENSION; + new.control.val |= PER_EVENT_TRANSACTION_END; if (test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP)) - new.control |= PER_EVENT_IFETCH; - new.start = 0; - new.end = -1UL; + new.control.val |= PER_EVENT_IFETCH; + new.start.val = 0; + new.end.val = -1UL; } /* Take care of the PER enablement bit in the PSW. */ - if (!(new.control & PER_EVENT_MASK)) { + if (!(new.control.val & PER_EVENT_MASK)) { regs->psw.mask &= ~PSW_MASK_PER; return; } regs->psw.mask |= PSW_MASK_PER; - __ctl_store(old, 9, 11); + __local_ctl_store(9, 11, old.regs); if (memcmp(&new, &old, sizeof(struct per_regs)) != 0) - __ctl_load(new, 9, 11); + __local_ctl_load(9, 11, new.regs); } void user_enable_single_step(struct task_struct *task) @@ -142,7 +145,7 @@ void ptrace_disable(struct task_struct *task) memset(&task->thread.per_user, 0, sizeof(task->thread.per_user)); memset(&task->thread.per_event, 0, sizeof(task->thread.per_event)); clear_tsk_thread_flag(task, TIF_SINGLE_STEP); - clear_pt_regs_flag(task_pt_regs(task), PIF_PER_TRAP); + clear_tsk_thread_flag(task, TIF_PER_TRAP); task->thread.per_flags = 0; } @@ -151,38 +154,36 @@ void ptrace_disable(struct task_struct *task) static inline unsigned long __peek_user_per(struct task_struct *child, addr_t addr) { - struct per_struct_kernel *dummy = NULL; - - if (addr == (addr_t) &dummy->cr9) + if (addr == offsetof(struct per_struct_kernel, cr9)) /* Control bits of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? PER_EVENT_IFETCH : child->thread.per_user.control; - else if (addr == (addr_t) &dummy->cr10) + else if (addr == offsetof(struct per_struct_kernel, cr10)) /* Start address of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? 0 : child->thread.per_user.start; - else if (addr == (addr_t) &dummy->cr11) + else if (addr == offsetof(struct per_struct_kernel, cr11)) /* End address of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? -1UL : child->thread.per_user.end; - else if (addr == (addr_t) &dummy->bits) + else if (addr == offsetof(struct per_struct_kernel, bits)) /* Single-step bit. */ return test_thread_flag(TIF_SINGLE_STEP) ? (1UL << (BITS_PER_LONG - 1)) : 0; - else if (addr == (addr_t) &dummy->starting_addr) + else if (addr == offsetof(struct per_struct_kernel, starting_addr)) /* Start address of the user specified per set. */ return child->thread.per_user.start; - else if (addr == (addr_t) &dummy->ending_addr) + else if (addr == offsetof(struct per_struct_kernel, ending_addr)) /* End address of the user specified per set. 
*/ return child->thread.per_user.end; - else if (addr == (addr_t) &dummy->perc_atmid) + else if (addr == offsetof(struct per_struct_kernel, perc_atmid)) /* PER code, ATMID and AI of the last PER trap */ return (unsigned long) child->thread.per_event.cause << (BITS_PER_LONG - 16); - else if (addr == (addr_t) &dummy->address) + else if (addr == offsetof(struct per_struct_kernel, address)) /* Address of the last PER trap */ return child->thread.per_event.address; - else if (addr == (addr_t) &dummy->access_id) + else if (addr == offsetof(struct per_struct_kernel, access_id)) /* Access id of the last PER trap */ return (unsigned long) child->thread.per_event.paid << (BITS_PER_LONG - 8); @@ -200,73 +201,72 @@ static inline unsigned long __peek_user_per(struct task_struct *child, */ static unsigned long __peek_user(struct task_struct *child, addr_t addr) { - struct user *dummy = NULL; addr_t offset, tmp; - if (addr < (addr_t) &dummy->regs.acrs) { + if (addr < offsetof(struct user, regs.acrs)) { /* * psw and gprs are stored on the stack */ tmp = *(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr); - if (addr == (addr_t) &dummy->regs.psw.mask) { + if (addr == offsetof(struct user, regs.psw.mask)) { /* Return a clean psw mask. */ tmp &= PSW_MASK_USER | PSW_MASK_RI; tmp |= PSW_USER_BITS; } - } else if (addr < (addr_t) &dummy->regs.orig_gpr2) { + } else if (addr < offsetof(struct user, regs.orig_gpr2)) { /* * access registers are stored in the thread structure */ - offset = addr - (addr_t) &dummy->regs.acrs; + offset = addr - offsetof(struct user, regs.acrs); /* * Very special case: old & broken 64 bit gdb reading * from acrs[15]. Result is a 64 bit value. Read the * 32 bit acrs[15] value and shift it by 32. Sick... */ - if (addr == (addr_t) &dummy->regs.acrs[15]) + if (addr == offsetof(struct user, regs.acrs[15])) tmp = ((unsigned long) child->thread.acrs[15]) << 32; else tmp = *(addr_t *)((addr_t) &child->thread.acrs + offset); - } else if (addr == (addr_t) &dummy->regs.orig_gpr2) { + } else if (addr == offsetof(struct user, regs.orig_gpr2)) { /* * orig_gpr2 is stored on the kernel stack */ tmp = (addr_t) task_pt_regs(child)->orig_gpr2; - } else if (addr < (addr_t) &dummy->regs.fp_regs) { + } else if (addr < offsetof(struct user, regs.fp_regs)) { /* * prevent reads of padding hole between * orig_gpr2 and fp_regs on s390. */ tmp = 0; - } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) { + } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) { /* * floating point control reg. is in the thread structure */ tmp = child->thread.fpu.fpc; tmp <<= BITS_PER_LONG - 32; - } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { + } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* * floating point regs. are either in child->thread.fpu * or the child->thread.fpu.vxrs array */ - offset = addr - (addr_t) &dummy->regs.fp_regs.fprs; - if (MACHINE_HAS_VX) + offset = addr - offsetof(struct user, regs.fp_regs.fprs); + if (cpu_has_vx()) tmp = *(addr_t *) ((addr_t) child->thread.fpu.vxrs + 2*offset); else tmp = *(addr_t *) ((addr_t) child->thread.fpu.fprs + offset); - } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) { + } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) { /* * Handle access to the per_info structure. 
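+	 * (Throughout this file the old '(addr_t) &dummy->field' idiom,
+	 * which computed field offsets by arithmetic on a NULL dummy
+	 * pointer, is replaced by offsetof(); both yield the same offset,
+	 * but offsetof() avoids the undefined behavior.)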
+	 */
-	addr -= (addr_t) &dummy->regs.per_info;
+	addr -= offsetof(struct user, regs.per_info);
	tmp = __peek_user_per(child, addr);
	} else
@@ -285,8 +285,8 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
	 * an alignment of 4. Programmers from hell...
	 */
	mask = __ADDR_MASK;
-	if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs &&
-	    addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2)
+	if (addr >= offsetof(struct user, regs.acrs) &&
+	    addr < offsetof(struct user, regs.orig_gpr2))
		mask = 3;
	if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK)
		return -EIO;
@@ -298,8 +298,6 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
static inline void __poke_user_per(struct task_struct *child,
				   addr_t addr, addr_t data)
{
-	struct per_struct_kernel *dummy = NULL;
-
	/*
	 * There are only three fields in the per_info struct that the
	 * debugger user can write to.
@@ -312,14 +310,14 @@ static inline void __poke_user_per(struct task_struct *child,
	 * addresses are used only if single stepping is not in effect.
	 * Writes to any other field in per_info are ignored.
	 */
-	if (addr == (addr_t) &dummy->cr9)
+	if (addr == offsetof(struct per_struct_kernel, cr9))
		/* PER event mask of the user specified per set. */
		child->thread.per_user.control =
			data & (PER_EVENT_MASK | PER_CONTROL_MASK);
-	else if (addr == (addr_t) &dummy->starting_addr)
+	else if (addr == offsetof(struct per_struct_kernel, starting_addr))
		/* Starting address of the user specified per set. */
		child->thread.per_user.start = data;
-	else if (addr == (addr_t) &dummy->ending_addr)
+	else if (addr == offsetof(struct per_struct_kernel, ending_addr))
		/* Ending address of the user specified per set. */
		child->thread.per_user.end = data;
}
@@ -332,14 +330,15 @@ static inline void __poke_user_per(struct task_struct *child,
 */
static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
{
-	struct user *dummy = NULL;
	addr_t offset;
-	if (addr < (addr_t) &dummy->regs.acrs) {
+
+	if (addr < offsetof(struct user, regs.acrs)) {
+		struct pt_regs *regs = task_pt_regs(child);
		/*
		 * psw and gprs are stored on the stack
		 */
-		if (addr == (addr_t) &dummy->regs.psw.mask) {
+		if (addr == offsetof(struct user, regs.psw.mask)) {
			unsigned long mask = PSW_MASK_USER;

			mask |= is_ri_task(child) ? PSW_MASK_RI : 0;
@@ -353,64 +352,69 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
				/* Invalid addressing mode bits */
				return -EINVAL;
			}
-		*(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr) = data;
-	} else if (addr < (addr_t) (&dummy->regs.orig_gpr2)) {
+		if (test_pt_regs_flag(regs, PIF_SYSCALL) &&
+		    addr == offsetof(struct user, regs.gprs[2])) {
+			struct pt_regs *regs = task_pt_regs(child);
+
+			regs->int_code = 0x20000 | (data & 0xffff);
+		}
+		*(addr_t *)((addr_t) &regs->psw + addr) = data;
+	} else if (addr < offsetof(struct user, regs.orig_gpr2)) {
		/*
		 * access registers are stored in the thread structure
		 */
-		offset = addr - (addr_t) &dummy->regs.acrs;
+		offset = addr - offsetof(struct user, regs.acrs);
		/*
		 * Very special case: old & broken 64 bit gdb writing
		 * to acrs[15] with a 64 bit value. Ignore the lower
		 * half of the value and write the upper 32 bit to
		 * acrs[15]. Sick...
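		 * Worked example: poking the value 0x0000000100000000 at
		 * acrs[15] stores just 0x1, since only data >> 32 is kept.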
*/ - if (addr == (addr_t) &dummy->regs.acrs[15]) + if (addr == offsetof(struct user, regs.acrs[15])) child->thread.acrs[15] = (unsigned int) (data >> 32); else *(addr_t *)((addr_t) &child->thread.acrs + offset) = data; - } else if (addr == (addr_t) &dummy->regs.orig_gpr2) { + } else if (addr == offsetof(struct user, regs.orig_gpr2)) { /* * orig_gpr2 is stored on the kernel stack */ task_pt_regs(child)->orig_gpr2 = data; - } else if (addr < (addr_t) &dummy->regs.fp_regs) { + } else if (addr < offsetof(struct user, regs.fp_regs)) { /* * prevent writes of padding hole between * orig_gpr2 and fp_regs on s390. */ return 0; - } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) { + } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) { /* * floating point control reg. is in the thread structure */ - if ((unsigned int) data != 0 || - test_fp_ctl(data >> (BITS_PER_LONG - 32))) + if ((unsigned int)data != 0) return -EINVAL; child->thread.fpu.fpc = data >> (BITS_PER_LONG - 32); - } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { + } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* * floating point regs. are either in child->thread.fpu * or the child->thread.fpu.vxrs array */ - offset = addr - (addr_t) &dummy->regs.fp_regs.fprs; - if (MACHINE_HAS_VX) + offset = addr - offsetof(struct user, regs.fp_regs.fprs); + if (cpu_has_vx()) *(addr_t *)((addr_t) child->thread.fpu.vxrs + 2*offset) = data; else *(addr_t *)((addr_t) child->thread.fpu.fprs + offset) = data; - } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) { + } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) { /* * Handle access to the per_info structure. */ - addr -= (addr_t) &dummy->regs.per_info; + addr -= offsetof(struct user, regs.per_info); __poke_user_per(child, addr, data); } @@ -427,8 +431,8 @@ static int poke_user(struct task_struct *child, addr_t addr, addr_t data) * an alignment of 4. Programmers from hell indeed... */ mask = __ADDR_MASK; - if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs && - addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2) + if (addr >= offsetof(struct user, regs.acrs) && + addr < offsetof(struct user, regs.orig_gpr2)) mask = 3; if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK) return -EIO; @@ -477,9 +481,7 @@ long arch_ptrace(struct task_struct *child, long request, } return 0; case PTRACE_GET_LAST_BREAK: - put_user(child->thread.last_break, - (unsigned long __user *) data); - return 0; + return put_user(child->thread.last_break, (unsigned long __user *)data); case PTRACE_ENABLE_TE: if (!MACHINE_HAS_TE) return -EIO; @@ -536,37 +538,35 @@ long arch_ptrace(struct task_struct *child, long request, static inline __u32 __peek_user_per_compat(struct task_struct *child, addr_t addr) { - struct compat_per_struct_kernel *dummy32 = NULL; - - if (addr == (addr_t) &dummy32->cr9) + if (addr == offsetof(struct compat_per_struct_kernel, cr9)) /* Control bits of the active per set. */ return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? PER_EVENT_IFETCH : child->thread.per_user.control; - else if (addr == (addr_t) &dummy32->cr10) + else if (addr == offsetof(struct compat_per_struct_kernel, cr10)) /* Start address of the active per set. */ return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? 0 : child->thread.per_user.start; - else if (addr == (addr_t) &dummy32->cr11) + else if (addr == offsetof(struct compat_per_struct_kernel, cr11)) /* End address of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? 
PSW32_ADDR_INSN : child->thread.per_user.end;
-	else if (addr == (addr_t) &dummy32->bits)
+	else if (addr == offsetof(struct compat_per_struct_kernel, bits))
		/* Single-step bit. */
		return (__u32) test_thread_flag(TIF_SINGLE_STEP) ?
			0x80000000 : 0;
-	else if (addr == (addr_t) &dummy32->starting_addr)
+	else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr))
		/* Start address of the user specified per set. */
		return (__u32) child->thread.per_user.start;
-	else if (addr == (addr_t) &dummy32->ending_addr)
+	else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr))
		/* End address of the user specified per set. */
		return (__u32) child->thread.per_user.end;
-	else if (addr == (addr_t) &dummy32->perc_atmid)
+	else if (addr == offsetof(struct compat_per_struct_kernel, perc_atmid))
		/* PER code, ATMID and AI of the last PER trap */
		return (__u32) child->thread.per_event.cause << 16;
-	else if (addr == (addr_t) &dummy32->address)
+	else if (addr == offsetof(struct compat_per_struct_kernel, address))
		/* Address of the last PER trap */
		return (__u32) child->thread.per_event.address;
-	else if (addr == (addr_t) &dummy32->access_id)
+	else if (addr == offsetof(struct compat_per_struct_kernel, access_id))
		/* Access id of the last PER trap */
		return (__u32) child->thread.per_event.paid << 24;
	return 0;
@@ -577,21 +577,20 @@
 */
static u32 __peek_user_compat(struct task_struct *child, addr_t addr)
{
-	struct compat_user *dummy32 = NULL;
	addr_t offset;
	__u32 tmp;

-	if (addr < (addr_t) &dummy32->regs.acrs) {
+	if (addr < offsetof(struct compat_user, regs.acrs)) {
		struct pt_regs *regs = task_pt_regs(child);
		/*
		 * psw and gprs are stored on the stack
		 */
-		if (addr == (addr_t) &dummy32->regs.psw.mask) {
+		if (addr == offsetof(struct compat_user, regs.psw.mask)) {
			/* Fake a 31 bit psw mask. */
			tmp = (__u32)(regs->psw.mask >> 32);
			tmp &= PSW32_MASK_USER | PSW32_MASK_RI;
			tmp |= PSW32_USER_BITS;
-		} else if (addr == (addr_t) &dummy32->regs.psw.addr) {
+		} else if (addr == offsetof(struct compat_user, regs.psw.addr)) {
			/* Fake a 31 bit psw address. */
			tmp = (__u32) regs->psw.addr |
				(__u32)(regs->psw.mask & PSW_MASK_BA);
@@ -599,50 +598,50 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr)
			/* gpr 0-15 */
			tmp = *(__u32 *)((addr_t) &regs->psw + addr*2 + 4);
		}
-	} else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) {
+	} else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) {
		/*
		 * access registers are stored in the thread structure
		 */
-		offset = addr - (addr_t) &dummy32->regs.acrs;
+		offset = addr - offsetof(struct compat_user, regs.acrs);
		tmp = *(__u32*)((addr_t) &child->thread.acrs + offset);
-	} else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) {
+	} else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) {
		/*
		 * orig_gpr2 is stored on the kernel stack
		 */
		tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4);
-	} else if (addr < (addr_t) &dummy32->regs.fp_regs) {
+	} else if (addr < offsetof(struct compat_user, regs.fp_regs)) {
		/*
		 * prevent reads of padding hole between
		 * orig_gpr2 and fp_regs on s390.
		 */
		tmp = 0;
-	} else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) {
+	} else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) {
		/*
		 * floating point control reg.
is in the thread structure */ tmp = child->thread.fpu.fpc; - } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { + } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* * floating point regs. are either in child->thread.fpu * or the child->thread.fpu.vxrs array */ - offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs; - if (MACHINE_HAS_VX) + offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs); + if (cpu_has_vx()) tmp = *(__u32 *) ((addr_t) child->thread.fpu.vxrs + 2*offset); else tmp = *(__u32 *) ((addr_t) child->thread.fpu.fprs + offset); - } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) { + } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) { /* * Handle access to the per_info structure. */ - addr -= (addr_t) &dummy32->regs.per_info; + addr -= offsetof(struct compat_user, regs.per_info); tmp = __peek_user_per_compat(child, addr); } else @@ -669,16 +668,14 @@ static int peek_user_compat(struct task_struct *child, static inline void __poke_user_per_compat(struct task_struct *child, addr_t addr, __u32 data) { - struct compat_per_struct_kernel *dummy32 = NULL; - - if (addr == (addr_t) &dummy32->cr9) + if (addr == offsetof(struct compat_per_struct_kernel, cr9)) /* PER event mask of the user specified per set. */ child->thread.per_user.control = data & (PER_EVENT_MASK | PER_CONTROL_MASK); - else if (addr == (addr_t) &dummy32->starting_addr) + else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr)) /* Starting address of the user specified per set. */ child->thread.per_user.start = data; - else if (addr == (addr_t) &dummy32->ending_addr) + else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr)) /* Ending address of the user specified per set. */ child->thread.per_user.end = data; } @@ -689,16 +686,15 @@ static inline void __poke_user_per_compat(struct task_struct *child, static int __poke_user_compat(struct task_struct *child, addr_t addr, addr_t data) { - struct compat_user *dummy32 = NULL; __u32 tmp = (__u32) data; addr_t offset; - if (addr < (addr_t) &dummy32->regs.acrs) { + if (addr < offsetof(struct compat_user, regs.acrs)) { struct pt_regs *regs = task_pt_regs(child); /* * psw, gprs, acrs and orig_gpr2 are stored on the stack */ - if (addr == (addr_t) &dummy32->regs.psw.mask) { + if (addr == offsetof(struct compat_user, regs.psw.mask)) { __u32 mask = PSW32_MASK_USER; mask |= is_ri_task(child) ? PSW32_MASK_RI : 0; @@ -712,62 +708,66 @@ static int __poke_user_compat(struct task_struct *child, regs->psw.mask = (regs->psw.mask & ~PSW_MASK_USER) | (regs->psw.mask & PSW_MASK_BA) | (__u64)(tmp & mask) << 32; - } else if (addr == (addr_t) &dummy32->regs.psw.addr) { + } else if (addr == offsetof(struct compat_user, regs.psw.addr)) { /* Build a 64 bit psw address from 31 bit address. */ regs->psw.addr = (__u64) tmp & PSW32_ADDR_INSN; /* Transfer 31 bit amode bit to psw mask. 
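As an aside on the 31-bit PSW handling in __peek_user_compat()/__poke_user_compat() above: the addressing-mode bit and the 31-bit instruction address share one 32-bit word, and the two directions simply split and recombine it. A minimal stand-alone sketch in plain C; the two mask values mirror PSW32_ADDR_AMODE and PSW32_ADDR_INSN from the s390 uapi headers, everything else is invented for illustration:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ADDR_AMODE 0x80000000UL /* stand-in for PSW32_ADDR_AMODE */
#define ADDR_INSN  0x7fffffffUL /* stand-in for PSW32_ADDR_INSN */

int main(void)
{
	uint32_t user_view = 0x80123456; /* amode bit set, address 0x123456 */
	/* poke direction: split the user word into 64-bit PSW state */
	uint64_t psw_addr = user_view & ADDR_INSN;  /* instruction address */
	uint64_t ba_bit   = user_view & ADDR_AMODE; /* kept in the PSW mask */
	/* peek direction: fake the 31-bit view back from the 64-bit state */
	uint32_t faked = (uint32_t)psw_addr | (uint32_t)ba_bit;

	assert(faked == user_view); /* the round trip is lossless */
	printf("addr=%#lx amode=%d\n", (unsigned long)psw_addr, ba_bit != 0);
	return 0;
}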
*/ regs->psw.mask = (regs->psw.mask & ~PSW_MASK_BA) | (__u64)(tmp & PSW32_ADDR_AMODE); } else { + if (test_pt_regs_flag(regs, PIF_SYSCALL) && + addr == offsetof(struct compat_user, regs.gprs[2])) { + struct pt_regs *regs = task_pt_regs(child); + + regs->int_code = 0x20000 | (data & 0xffff); + } /* gpr 0-15 */ *(__u32*)((addr_t) &regs->psw + addr*2 + 4) = tmp; - } else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) { + } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) { /* * access registers are stored in the thread structure */ - offset = addr - (addr_t) &dummy32->regs.acrs; + offset = addr - offsetof(struct compat_user, regs.acrs); *(__u32*)((addr_t) &child->thread.acrs + offset) = tmp; - } else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) { + } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) { /* * orig_gpr2 is stored on the kernel stack */ *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp; - } else if (addr < (addr_t) &dummy32->regs.fp_regs) { + } else if (addr < offsetof(struct compat_user, regs.fp_regs)) { /* * prevent writes of padding hole between * orig_gpr2 and fp_regs on s390. */ return 0; - } else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) { + } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) { /* * floating point control reg. is in the thread structure */ - if (test_fp_ctl(tmp)) - return -EINVAL; child->thread.fpu.fpc = data; - } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { + } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* * floating point regs. are either in child->thread.fpu * or the child->thread.fpu.vxrs array */ - offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs; + offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs); - if (MACHINE_HAS_VX) + if (cpu_has_vx()) *(__u32 *)((addr_t) child->thread.fpu.vxrs + 2*offset) = tmp; else *(__u32 *)((addr_t) child->thread.fpu.fprs + offset) = tmp; - } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) { + } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) { /* * Handle access to the per_info structure. */ - addr -= (addr_t) &dummy32->regs.per_info; + addr -= offsetof(struct compat_user, regs.per_info); __poke_user_per_compat(child, addr, data); } @@ -827,92 +827,26 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, } return 0; case PTRACE_GET_LAST_BREAK: - put_user(child->thread.last_break, - (unsigned int __user *) data); - return 0; + return put_user(child->thread.last_break, (unsigned int __user *)data); } return compat_ptrace_request(child, request, addr, data); } #endif -asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) -{ - unsigned long mask = -1UL; - - /* - * The sysc_tracesys code in entry.S stored the system - * call number to gprs[2]. - */ - if (test_thread_flag(TIF_SYSCALL_TRACE) && - (tracehook_report_syscall_entry(regs) || - regs->gprs[2] >= NR_syscalls)) { - /* - * Tracing decided this syscall should not happen or the - * debugger stored an invalid system call number. Skip - * the system call and the system call restart handling. - */ - clear_pt_regs_flag(regs, PIF_SYSCALL); - return -1; - } - - /* Do the secure computing check after ptrace. */ - if (secure_computing()) { - /* seccomp failures shouldn't expose any additional code.
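The conversion running through all of these ptrace hunks swaps the old `(addr_t) &((struct user *) NULL)->member` trick for `offsetof()`. Both yield the member's byte offset within the structure, but only the latter avoids formally undefined arithmetic on a null pointer. A stand-alone comparison with a toy struct (hypothetical layout, for illustration only):

#include <stddef.h>
#include <stdio.h>

struct toy_user {
	long gprs[16];
	unsigned int acrs[16];
	long orig_gpr2;
};

int main(void)
{
	/* old idiom: fabricate the offset from a NULL pointer */
	size_t old_way = (size_t)&((struct toy_user *)NULL)->orig_gpr2;
	/* new idiom: let the compiler compute it directly */
	size_t new_way = offsetof(struct toy_user, orig_gpr2);

	printf("%zu == %zu\n", old_way, new_way);
	return 0;
}

The generated constant is identical either way; the offsetof() form is simply the well-defined spelling of the same thing.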
*/ - return -1; - } - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_enter(regs, regs->gprs[2]); - - if (is_compat_task()) - mask = 0xffffffff; - - audit_syscall_entry(regs->gprs[2], regs->orig_gpr2 & mask, - regs->gprs[3] &mask, regs->gprs[4] &mask, - regs->gprs[5] &mask); - - return regs->gprs[2]; -} - -asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) -{ - audit_syscall_exit(regs); - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_exit(regs, regs->gprs[2]); - - if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, 0); -} - /* * user_regset definitions. */ static int s390_regs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { + unsigned pos; if (target == current) save_access_regs(target->thread.acrs); - if (kbuf) { - unsigned long *k = kbuf; - while (count > 0) { - *k++ = __peek_user(target, pos); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - unsigned long __user *u = ubuf; - while (count > 0) { - if (__put_user(__peek_user(target, pos), u++)) - return -EFAULT; - count -= sizeof(*u); - pos += sizeof(*u); - } - } + for (pos = 0; pos < sizeof(s390_regs); pos += sizeof(long)) + membuf_store(&to, __peek_user(target, pos)); return 0; } @@ -953,8 +887,8 @@ static int s390_regs_set(struct task_struct *target, } static int s390_fpregs_get(struct task_struct *target, - const struct user_regset *regset, unsigned int pos, - unsigned int count, void *kbuf, void __user *ubuf) + const struct user_regset *regset, + struct membuf to) { _s390_fp_regs fp_regs; @@ -964,8 +898,7 @@ static int s390_fpregs_get(struct task_struct *target, fp_regs.fpc = target->thread.fpu.fpc; fpregs_store(&fp_regs, &target->thread.fpu); - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &fp_regs, 0, -1); + return membuf_write(&to, &fp_regs, sizeof(fp_regs)); } static int s390_fpregs_set(struct task_struct *target, @@ -979,19 +912,18 @@ static int s390_fpregs_set(struct task_struct *target, if (target == current) save_fpu_regs(); - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_vx_to_fp(fprs, target->thread.fpu.vxrs); else memcpy(&fprs, target->thread.fpu.fprs, sizeof(fprs)); - /* If setting FPC, must validate it first. 
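The regset conversion visible in s390_regs_get() above replaces the kbuf/ubuf pointer pair with a single struct membuf cursor, so the per-word copy loop no longer needs two code paths. A toy analogue of that interface, simplified and not the kernel's exact definition (the real helpers live in linux/regset.h):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct membuf {
	void *p;     /* write cursor */
	size_t left; /* space remaining */
};

static int membuf_write(struct membuf *to, const void *src, size_t n)
{
	if (n > to->left)
		n = to->left; /* silently stop at the end of the buffer */
	memcpy(to->p, src, n);
	to->p = (char *)to->p + n;
	to->left -= n;
	return 0;
}

#define membuf_store(to, v) membuf_write(to, &(v), sizeof(v))

int main(void)
{
	long regs[4] = { 1, 2, 3, 4 }; /* pretend register block */
	char out[sizeof(regs)];
	struct membuf to = { out, sizeof(out) };
	size_t i;

	for (i = 0; i < 4; i++) /* cf. the __peek_user() loop above */
		membuf_store(&to, regs[i]);
	printf("%zu bytes left\n", to.left);
	return 0;
}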
*/ if (count > 0 && pos < offsetof(s390_fp_regs, fprs)) { u32 ufpc[2] = { target->thread.fpu.fpc, 0 }; rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ufpc, 0, offsetof(s390_fp_regs, fprs)); if (rc) return rc; - if (ufpc[1] != 0 || test_fp_ctl(ufpc[0])) + if (ufpc[1] != 0) return -EINVAL; target->thread.fpu.fpc = ufpc[0]; } @@ -1002,7 +934,7 @@ static int s390_fpregs_set(struct task_struct *target, if (rc) return rc; - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_fp_to_vx(target->thread.fpu.vxrs, fprs); else memcpy(target->thread.fpu.fprs, &fprs, sizeof(fprs)); @@ -1012,20 +944,9 @@ static int s390_fpregs_set(struct task_struct *target, static int s390_last_break_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - if (count > 0) { - if (kbuf) { - unsigned long *k = kbuf; - *k = target->thread.last_break; - } else { - unsigned long __user *u = ubuf; - if (__put_user(target->thread.last_break, u)) - return -EFAULT; - } - } - return 0; + return membuf_store(&to, target->thread.last_break); } static int s390_last_break_set(struct task_struct *target, @@ -1038,16 +959,15 @@ static int s390_last_break_set(struct task_struct *target, static int s390_tdb_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct pt_regs *regs = task_pt_regs(target); - unsigned char *data; + size_t size; if (!(regs->int_code & 0x200)) return -ENODATA; - data = target->thread.trap_tdb; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, data, 0, 256); + size = sizeof(target->thread.trap_tdb.data); + return membuf_write(&to, target->thread.trap_tdb.data, size); } static int s390_tdb_set(struct task_struct *target, @@ -1060,19 +980,18 @@ static int s390_tdb_set(struct task_struct *target, static int s390_vxrs_low_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { __u64 vxrs[__NUM_VXRS_LOW]; int i; - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) save_fpu_regs(); for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1); - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); + vxrs[i] = target->thread.fpu.vxrs[i].low; + return membuf_write(&to, vxrs, sizeof(vxrs)); } static int s390_vxrs_low_set(struct task_struct *target, @@ -1083,36 +1002,32 @@ static int s390_vxrs_low_set(struct task_struct *target, __u64 vxrs[__NUM_VXRS_LOW]; int i, rc; - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) save_fpu_regs(); for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1); + vxrs[i] = target->thread.fpu.vxrs[i].low; rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); if (rc == 0) for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(target->thread.fpu.vxrs + i) + 1) = vxrs[i]; + target->thread.fpu.vxrs[i].low = vxrs[i]; return rc; } static int s390_vxrs_high_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - __vector128 vxrs[__NUM_VXRS_HIGH]; - - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) save_fpu_regs(); - memcpy(vxrs, target->thread.fpu.vxrs + __NUM_VXRS_LOW, sizeof(vxrs)); - - 
return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); + return membuf_write(&to, target->thread.fpu.vxrs + __NUM_VXRS_LOW, + __NUM_VXRS_HIGH * sizeof(__vector128)); } static int s390_vxrs_high_set(struct task_struct *target, @@ -1122,7 +1037,7 @@ static int s390_vxrs_high_set(struct task_struct *target, { int rc; - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) save_fpu_regs(); @@ -1134,12 +1049,9 @@ static int s390_vxrs_high_set(struct task_struct *target, static int s390_system_call_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - unsigned int *data = &target->thread.system_call; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - data, 0, sizeof(unsigned int)); + return membuf_store(&to, target->thread.system_call); } static int s390_system_call_set(struct task_struct *target, @@ -1154,8 +1066,7 @@ static int s390_system_call_set(struct task_struct *target, static int s390_gs_cb_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct gs_cb *data = target->thread.gs_cb; @@ -1165,8 +1076,7 @@ static int s390_gs_cb_get(struct task_struct *target, return -ENODATA; if (target == current) save_gs_cb(data); - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - data, 0, sizeof(struct gs_cb)); + return membuf_write(&to, data, sizeof(struct gs_cb)); } static int s390_gs_cb_set(struct task_struct *target, @@ -1201,7 +1111,7 @@ static int s390_gs_cb_set(struct task_struct *target, target->thread.gs_cb = data; *target->thread.gs_cb = gs_cb; if (target == current) { - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); restore_gs_cb(target->thread.gs_cb); } preempt_enable(); @@ -1210,8 +1120,7 @@ static int s390_gs_cb_set(struct task_struct *target, static int s390_gs_bc_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct gs_cb *data = target->thread.gs_bc_cb; @@ -1219,8 +1128,7 @@ static int s390_gs_bc_get(struct task_struct *target, return -ENODEV; if (!data) return -ENODATA; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - data, 0, sizeof(struct gs_cb)); + return membuf_write(&to, data, sizeof(struct gs_cb)); } static int s390_gs_bc_set(struct task_struct *target, @@ -1256,7 +1164,6 @@ static bool is_ri_cb_valid(struct runtime_instr_cb *cb) cb->pc == 1 && cb->qc == 0 && cb->reserved2 == 0 && - cb->key == PAGE_DEFAULT_KEY && cb->reserved3 == 0 && cb->reserved4 == 0 && cb->reserved5 == 0 && @@ -1271,8 +1178,7 @@ static bool is_ri_cb_valid(struct runtime_instr_cb *cb) static int s390_runtime_instr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct runtime_instr_cb *data = target->thread.ri_cb; @@ -1281,8 +1187,7 @@ static int s390_runtime_instr_get(struct task_struct *target, if (!data) return -ENODATA; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - data, 0, sizeof(struct runtime_instr_cb)); + return membuf_write(&to, data, sizeof(struct runtime_instr_cb)); } static int s390_runtime_instr_set(struct task_struct *target, @@ -1320,7 +1225,11 @@ static int s390_runtime_instr_set(struct task_struct *target, kfree(data); return -EINVAL; } - + /* + * 
Override access key in any case, since user space should + * not be able to set it, nor should it care about it. + */ + ri_cb.key = PAGE_DEFAULT_KEY >> 4; preempt_disable(); if (!target->thread.ri_cb) target->thread.ri_cb = data; @@ -1338,7 +1247,7 @@ static const struct user_regset s390_regsets[] = { .n = sizeof(s390_regs) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .get = s390_regs_get, + .regset_get = s390_regs_get, .set = s390_regs_set, }, { @@ -1346,7 +1255,7 @@ static const struct user_regset s390_regsets[] = { .n = sizeof(s390_fp_regs) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .get = s390_fpregs_get, + .regset_get = s390_fpregs_get, .set = s390_fpregs_set, }, { @@ -1354,7 +1263,7 @@ static const struct user_regset s390_regsets[] = { .n = 1, .size = sizeof(unsigned int), .align = sizeof(unsigned int), - .get = s390_system_call_get, + .regset_get = s390_system_call_get, .set = s390_system_call_set, }, { @@ -1362,7 +1271,7 @@ static const struct user_regset s390_regsets[] = { .n = 1, .size = sizeof(long), .align = sizeof(long), - .get = s390_last_break_get, + .regset_get = s390_last_break_get, .set = s390_last_break_set, }, { @@ -1370,7 +1279,7 @@ static const struct user_regset s390_regsets[] = { .n = 1, .size = 256, .align = 1, - .get = s390_tdb_get, + .regset_get = s390_tdb_get, .set = s390_tdb_set, }, { @@ -1378,7 +1287,7 @@ static const struct user_regset s390_regsets[] = { .n = __NUM_VXRS_LOW, .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_vxrs_low_get, + .regset_get = s390_vxrs_low_get, .set = s390_vxrs_low_set, }, { @@ -1386,7 +1295,7 @@ static const struct user_regset s390_regsets[] = { .n = __NUM_VXRS_HIGH, .size = sizeof(__vector128), .align = sizeof(__vector128), - .get = s390_vxrs_high_get, + .regset_get = s390_vxrs_high_get, .set = s390_vxrs_high_set, }, { @@ -1394,7 +1303,7 @@ static const struct user_regset s390_regsets[] = { .n = sizeof(struct gs_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_gs_cb_get, + .regset_get = s390_gs_cb_get, .set = s390_gs_cb_set, }, { @@ -1402,7 +1311,7 @@ static const struct user_regset s390_regsets[] = { .n = sizeof(struct gs_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_gs_bc_get, + .regset_get = s390_gs_bc_get, .set = s390_gs_bc_set, }, { @@ -1410,13 +1319,13 @@ static const struct user_regset s390_regsets[] = { .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_runtime_instr_get, + .regset_get = s390_runtime_instr_get, .set = s390_runtime_instr_set, }, }; static const struct user_regset_view user_s390_view = { - .name = UTS_MACHINE, + .name = "s390x", .e_machine = EM_S390, .regsets = s390_regsets, .n = ARRAY_SIZE(s390_regsets) @@ -1425,28 +1334,15 @@ static const struct user_regset_view user_s390_view = { #ifdef CONFIG_COMPAT static int s390_compat_regs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { + unsigned n; + if (target == current) save_access_regs(target->thread.acrs); - if (kbuf) { - compat_ulong_t *k = kbuf; - while (count > 0) { - *k++ = __peek_user_compat(target, pos); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - compat_ulong_t __user *u = ubuf; - while (count > 0) { - if (__put_user(__peek_user_compat(target, pos), u++)) - return -EFAULT; - count -= sizeof(*u); - pos += sizeof(*u); - } - } + for (n = 0; n < 
sizeof(s390_compat_regs); n += sizeof(compat_ulong_t)) + membuf_store(&to, __peek_user_compat(target, n)); return 0; } @@ -1488,29 +1384,14 @@ static int s390_compat_regs_set(struct task_struct *target, static int s390_compat_regs_high_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { compat_ulong_t *gprs_high; + int i; - gprs_high = (compat_ulong_t *) - &task_pt_regs(target)->gprs[pos / sizeof(compat_ulong_t)]; - if (kbuf) { - compat_ulong_t *k = kbuf; - while (count > 0) { - *k++ = *gprs_high; - gprs_high += 2; - count -= sizeof(*k); - } - } else { - compat_ulong_t __user *u = ubuf; - while (count > 0) { - if (__put_user(*gprs_high, u++)) - return -EFAULT; - gprs_high += 2; - count -= sizeof(*u); - } - } + gprs_high = (compat_ulong_t *)task_pt_regs(target)->gprs; + for (i = 0; i < NUM_GPRS; i++, gprs_high += 2) + membuf_store(&to, *gprs_high); return 0; } @@ -1549,23 +1430,11 @@ static int s390_compat_regs_high_set(struct task_struct *target, static int s390_compat_last_break_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - compat_ulong_t last_break; + compat_ulong_t last_break = target->thread.last_break; - if (count > 0) { - last_break = target->thread.last_break; - if (kbuf) { - unsigned long *k = kbuf; - *k = last_break; - } else { - unsigned long __user *u = ubuf; - if (__put_user(last_break, u)) - return -EFAULT; - } - } - return 0; + return membuf_store(&to, (unsigned long)last_break); } static int s390_compat_last_break_set(struct task_struct *target, @@ -1582,7 +1451,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = sizeof(s390_compat_regs) / sizeof(compat_long_t), .size = sizeof(compat_long_t), .align = sizeof(compat_long_t), - .get = s390_compat_regs_get, + .regset_get = s390_compat_regs_get, .set = s390_compat_regs_set, }, { @@ -1590,7 +1459,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = sizeof(s390_fp_regs) / sizeof(compat_long_t), .size = sizeof(compat_long_t), .align = sizeof(compat_long_t), - .get = s390_fpregs_get, + .regset_get = s390_fpregs_get, .set = s390_fpregs_set, }, { @@ -1598,7 +1467,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = 1, .size = sizeof(compat_uint_t), .align = sizeof(compat_uint_t), - .get = s390_system_call_get, + .regset_get = s390_system_call_get, .set = s390_system_call_set, }, { @@ -1606,7 +1475,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = 1, .size = sizeof(long), .align = sizeof(long), - .get = s390_compat_last_break_get, + .regset_get = s390_compat_last_break_get, .set = s390_compat_last_break_set, }, { @@ -1614,7 +1483,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = 1, .size = 256, .align = 1, - .get = s390_tdb_get, + .regset_get = s390_tdb_get, .set = s390_tdb_set, }, { @@ -1622,7 +1491,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = __NUM_VXRS_LOW, .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_vxrs_low_get, + .regset_get = s390_vxrs_low_get, .set = s390_vxrs_low_set, }, { @@ -1630,7 +1499,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = __NUM_VXRS_HIGH, .size = sizeof(__vector128), .align = sizeof(__vector128), - .get = s390_vxrs_high_get, + .regset_get = s390_vxrs_high_get, .set = s390_vxrs_high_set, }, { @@ -1638,7 +1507,7 @@ static const struct user_regset 
s390_compat_regsets[] = { .n = sizeof(s390_compat_regs_high) / sizeof(compat_long_t), .size = sizeof(compat_long_t), .align = sizeof(compat_long_t), - .get = s390_compat_regs_high_get, + .regset_get = s390_compat_regs_high_get, .set = s390_compat_regs_high_set, }, { @@ -1646,7 +1515,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = sizeof(struct gs_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_gs_cb_get, + .regset_get = s390_gs_cb_get, .set = s390_gs_cb_set, }, { @@ -1654,7 +1523,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = sizeof(struct gs_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_gs_bc_get, + .regset_get = s390_gs_bc_get, .set = s390_gs_bc_set, }, { @@ -1662,7 +1531,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_runtime_instr_get, + .regset_get = s390_runtime_instr_get, .set = s390_runtime_instr_set, }, }; diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S index 4a22163962eb..88087a32ebc6 100644 --- a/arch/s390/kernel/reipl.S +++ b/arch/s390/kernel/reipl.S @@ -19,7 +19,7 @@ # r2 = Function to be called after store status # r3 = Parameter for function # -ENTRY(store_status) +SYM_CODE_START(store_status) /* Save register one and load save area base */ stg %r1,__LC_SAVE_AREA_RESTART /* General purpose registers */ @@ -61,7 +61,7 @@ ENTRY(store_status) stpx 0(%r1) /* Clock comparator - seven bytes */ lghi %r1,__LC_CLOCK_COMP_SAVE_AREA - larl %r4,.Lclkcmp + larl %r4,clkcmp stckc 0(%r4) mvc 1(7,%r1),1(%r4) /* Program status word */ @@ -73,9 +73,9 @@ ENTRY(store_status) lgr %r9,%r2 lgr %r2,%r3 BR_EX %r9 -ENDPROC(store_status) +SYM_CODE_END(store_status) .section .bss - .align 8 -.Lclkcmp: .quad 0x0000000000000000 + .balign 8 +SYM_DATA_LOCAL(clkcmp, .quad 0x0000000000000000) .previous diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index fe396673e8a6..0ae297c82afd 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -2,8 +2,7 @@ /* * Copyright IBM Corp. 2005 * - * Author(s): Rolf Adelsberger, - * Heiko Carstens <heiko.carstens@de.ibm.com> + * Author(s): Rolf Adelsberger * */ @@ -15,6 +14,7 @@ * moves the new kernel to its destination... * %r2 = pointer to first kimage_entry_t * %r3 = start address - where to jump to after the job is done... + * %r4 = subcode * * %r5 will be used as temp. storage * %r6 holds the destination address @@ -26,53 +26,51 @@ */ .text -ENTRY(relocate_kernel) - basr %r13,0 # base address - .base: - lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7 - lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9 - lg %r5,0(%r2) # read another word for indirection page - aghi %r2,8 # increment pointer - tml %r5,0x1 # is it a destination page? - je .indir_check # NO, goto "indir_check" - lgr %r6,%r5 # r6 = r5 - nill %r6,0xf000 # mask it out and... - j .base # ...next iteration - .indir_check: - tml %r5,0x2 # is it a indirection page? - je .done_test # NO, goto "done_test" - nill %r5,0xf000 # YES, mask out, - lgr %r2,%r5 # move it into the right register, - j .base # and read next... - .done_test: - tml %r5,0x4 # is it the done indicator? - je .source_test # NO! Well, then it should be the source indicator... - j .done # ok, lets finish it here... - .source_test: - tml %r5,0x8 # it should be a source indicator... - je .base # NO, ignore it... 
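relocate_kernel above walks the kimage entry list: each 8-byte entry is a page address with flag bits in its low bits (0x1 destination page, 0x2 indirection page, 0x4 done, 0x8 source page, matching the generic IND_* kexec flags). The same state machine restated in C, as a rough stand-alone model; addresses here are ordinary heap memory, not physical pages:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define IND_DESTINATION 0x1
#define IND_INDIRECTION 0x2
#define IND_DONE        0x4
#define IND_SOURCE      0x8
#define PAGE_SZ         4096UL
#define PAGE_MASK_LO    (PAGE_SZ - 1)

static void walk(const unsigned long *entry)
{
	unsigned char *dst = NULL;

	for (;;) {
		unsigned long e = *entry++; /* lg %r5,0(%r2); aghi %r2,8 */

		if (e & IND_DESTINATION)        /* tml %r5,0x1 */
			dst = (unsigned char *)(e & ~PAGE_MASK_LO);
		else if (e & IND_INDIRECTION)   /* tml %r5,0x2 */
			entry = (const unsigned long *)(e & ~PAGE_MASK_LO);
		else if (e & IND_DONE)          /* tml %r5,0x4 */
			return;
		else if (e & IND_SOURCE) {      /* tml %r5,0x8: copy a page */
			memcpy(dst, (void *)(e & ~PAGE_MASK_LO), PAGE_SZ);
			dst += PAGE_SZ;
		}
	}
}

int main(void)
{
	unsigned char *src = aligned_alloc(PAGE_SZ, PAGE_SZ);
	unsigned char *dst = aligned_alloc(PAGE_SZ, PAGE_SZ);
	unsigned long list[3];

	memset(src, 0xab, PAGE_SZ);
	list[0] = (unsigned long)dst | IND_DESTINATION;
	list[1] = (unsigned long)src | IND_SOURCE;
	list[2] = IND_DONE;
	walk(list);
	printf("page copied: %s\n", dst[0] == 0xab ? "yes" : "no");
	free(src);
	free(dst);
	return 0;
}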
- lgr %r8,%r5 # r8 = r5 - nill %r8,0xf000 # masking - 0: mvcle %r6,%r8,0x0 # copy PAGE_SIZE bytes from r8 to r6 - pad with 0 - jo 0b - j .base - .done: - sgr %r0,%r0 # clear register r0 - cghi %r3,0 - je .diag - la %r4,load_psw-.base(%r13) # load psw-address into the register - o %r3,4(%r4) # or load address into psw - st %r3,4(%r4) - mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0 - .diag: - diag %r0,%r0,0x308 -ENDPROC(relocate_kernel) +SYM_CODE_START(relocate_kernel) + basr %r13,0 # base address +.base: + lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7 + lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9 + lg %r5,0(%r2) # read another word for indirection page + aghi %r2,8 # increment pointer + tml %r5,0x1 # is it a destination page? + je .indir_check # NO, goto "indir_check" + lgr %r6,%r5 # r6 = r5 + nill %r6,0xf000 # mask it out and... + j .base # ...next iteration +.indir_check: + tml %r5,0x2 # is it a indirection page? + je .done_test # NO, goto "done_test" + nill %r5,0xf000 # YES, mask out, + lgr %r2,%r5 # move it into the right register, + j .base # and read next... +.done_test: + tml %r5,0x4 # is it the done indicator? + je .source_test # NO! Well, then it should be the source indicator... + j .done # ok, lets finish it here... +.source_test: + tml %r5,0x8 # it should be a source indicator... + je .base # NO, ignore it... + lgr %r8,%r5 # r8 = r5 + nill %r8,0xf000 # masking +0: mvcle %r6,%r8,0x0 # copy PAGE_SIZE bytes from r8 to r6 - pad with 0 + jo 0b + j .base +.done: + lgr %r0,%r4 # subcode + cghi %r3,0 + je .diag + la %r4,load_psw-.base(%r13) # load psw-address into the register + o %r3,4(%r4) # or load address into psw + st %r3,4(%r4) + mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0 +.diag: + diag %r0,%r0,0x308 +SYM_CODE_END(relocate_kernel) - .align 8 - load_psw: - .long 0x00080000,0x80000000 - relocate_kernel_end: - .align 8 - .globl relocate_kernel_len - relocate_kernel_len: - .quad relocate_kernel_end - relocate_kernel + .balign 8 +SYM_DATA_START_LOCAL(load_psw) + .long 0x00080000,0x80000000 +SYM_DATA_END_LABEL(load_psw, SYM_L_LOCAL, relocate_kernel_end) + .balign 8 +SYM_DATA(relocate_kernel_len, .quad relocate_kernel_end - relocate_kernel) diff --git a/arch/s390/kernel/rethook.c b/arch/s390/kernel/rethook.c new file mode 100644 index 000000000000..af10e6bdd34e --- /dev/null +++ b/arch/s390/kernel/rethook.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/rethook.h> +#include <linux/kprobes.h> +#include "rethook.h" + +void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount) +{ + rh->ret_addr = regs->gprs[14]; + rh->frame = regs->gprs[15]; + + /* Replace the return addr with trampoline addr */ + regs->gprs[14] = (unsigned long)&arch_rethook_trampoline; +} +NOKPROBE_SYMBOL(arch_rethook_prepare); + +void arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + /* Replace fake return address with real one. 
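The new rethook code above hooks function returns by swapping the saved return address. On s390, gpr 14 carries the return address and gpr 15 the stack pointer, which is why exactly those two registers are recorded. The bookkeeping, modeled with toy types; nothing below is the kernel API:

#include <stdio.h>

struct toy_regs { unsigned long gprs[16]; }; /* stand-in for pt_regs */
struct toy_node { unsigned long ret_addr, frame; }; /* cf. rethook_node */

static void trampoline(void) { /* exit handler would run here */ }

static void prepare(struct toy_node *rh, struct toy_regs *regs)
{
	rh->ret_addr = regs->gprs[14]; /* remember the real caller */
	rh->frame    = regs->gprs[15]; /* identify the stack frame */
	regs->gprs[14] = (unsigned long)&trampoline; /* divert the return */
}

static void fixup_return(struct toy_regs *regs, unsigned long correct)
{
	regs->gprs[14] = correct; /* hand the real address back */
}

int main(void)
{
	struct toy_regs regs = { .gprs = { [14] = 0x1000, [15] = 0x2000 } };
	struct toy_node rh;

	prepare(&rh, &regs);
	printf("diverted: %d\n",
	       regs.gprs[14] == (unsigned long)&trampoline);
	fixup_return(&regs, rh.ret_addr);
	printf("restored: %#lx\n", regs.gprs[14]);
	return 0;
}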
*/ + regs->gprs[14] = correct_ret_addr; +} +NOKPROBE_SYMBOL(arch_rethook_fixup_return); + +/* + * Called from arch_rethook_trampoline + */ +unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs) +{ + return rethook_trampoline_handler(regs, regs->gprs[15]); +} +NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); + +/* assembler function that handles the rethook must not be probed itself */ +NOKPROBE_SYMBOL(arch_rethook_trampoline); diff --git a/arch/s390/kernel/rethook.h b/arch/s390/kernel/rethook.h new file mode 100644 index 000000000000..32f069eed3f3 --- /dev/null +++ b/arch/s390/kernel/rethook.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __S390_RETHOOK_H +#define __S390_RETHOOK_H + +unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs); + +#endif diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c index 125c7f6e8715..1788a5454b6f 100644 --- a/arch/s390/kernel/runtime_instr.c +++ b/arch/s390/kernel/runtime_instr.c @@ -57,7 +57,7 @@ static void init_runtime_instr_cb(struct runtime_instr_cb *cb) cb->k = 1; cb->ps = 1; cb->pc = 1; - cb->key = PAGE_DEFAULT_KEY; + cb->key = PAGE_DEFAULT_KEY >> 4; cb->v = 1; } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 9cbf490fd162..d1f3b56e7afc 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -37,7 +37,7 @@ #include <linux/root_dev.h> #include <linux/console.h> #include <linux/kernel_stat.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/device.h> #include <linux/notifier.h> #include <linux/pfn.h> @@ -49,14 +49,17 @@ #include <linux/memory.h> #include <linux/compat.h> #include <linux/start_kernel.h> +#include <linux/hugetlb.h> +#include <linux/kmemleak.h> +#include <asm/archrandom.h> #include <asm/boot_data.h> #include <asm/ipl.h> #include <asm/facility.h> #include <asm/smp.h> #include <asm/mmu_context.h> #include <asm/cpcmd.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/nmi.h> #include <asm/irq.h> #include <asm/page.h> @@ -71,8 +74,10 @@ #include <asm/numa.h> #include <asm/alternative.h> #include <asm/nospec-branch.h> -#include <asm/mem_detect.h> +#include <asm/physmem_info.h> +#include <asm/maccess.h> #include <asm/uv.h> +#include <asm/asm-offsets.h> #include "entry.h" /* @@ -87,30 +92,71 @@ EXPORT_SYMBOL(console_devno); unsigned int console_irq = -1; EXPORT_SYMBOL(console_irq); -unsigned long elf_hwcap __read_mostly = 0; -char elf_platform[ELF_PLATFORM_SIZE]; +/* + * Some code and data needs to stay below 2 GB, even when the kernel would be + * relocated above 2 GB, because it has to use 31 bit addresses. + * Such code and data is part of the .amode31 section. + */ +char __amode31_ref *__samode31 = _samode31; +char __amode31_ref *__eamode31 = _eamode31; +char __amode31_ref *__stext_amode31 = _stext_amode31; +char __amode31_ref *__etext_amode31 = _etext_amode31; +struct exception_table_entry __amode31_ref *__start_amode31_ex_table = _start_amode31_ex_table; +struct exception_table_entry __amode31_ref *__stop_amode31_ex_table = _stop_amode31_ex_table; -unsigned long int_hwcap = 0; +/* + * Control registers CR2, CR5 and CR15 are initialized with addresses + * of tables that must be placed below 2G which is handled by the AMODE31 + * sections. + * Because the AMODE31 sections are relocated below 2G at startup, + * the content of control registers CR2, CR5 and CR15 must be updated + * with new addresses after the relocation. 
The initial initialization of + * control registers occurs in head64.S and then gets updated again after AMODE31 + * relocation. We must access the relevant AMODE31 tables indirectly via + * pointers placed in the .amode31.refs linker section. Those pointers get + * updated automatically during AMODE31 relocation and always contain a valid + * address within AMODE31 sections. + */ -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST -int __bootdata_preserved(prot_virt_guest); -#endif +static __amode31_data u32 __ctl_duct_amode31[16] __aligned(64); + +static __amode31_data u64 __ctl_aste_amode31[8] __aligned(64) = { + [1] = 0xffffffffffffffff +}; + +static __amode31_data u32 __ctl_duald_amode31[32] __aligned(128) = { + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0 +}; + +static __amode31_data u32 __ctl_linkage_stack_amode31[8] __aligned(64) = { + 0, 0, 0x89000000, 0, + 0, 0, 0x8a000000, 0 +}; + +static u64 __amode31_ref *__ctl_aste = __ctl_aste_amode31; +static u32 __amode31_ref *__ctl_duald = __ctl_duald_amode31; +static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31; +static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31; + +unsigned long __bootdata_preserved(max_mappable); +unsigned long __bootdata(ident_map_size); +struct physmem_info __bootdata(physmem_info); -int __bootdata(noexec_disabled); -int __bootdata(memory_end_set); -unsigned long __bootdata(memory_end); -unsigned long __bootdata(vmalloc_size); -unsigned long __bootdata(max_physmem_end); -struct mem_detect_info __bootdata(mem_detect); - -struct exception_table_entry *__bootdata_preserved(__start_dma_ex_table); -struct exception_table_entry *__bootdata_preserved(__stop_dma_ex_table); -unsigned long __bootdata_preserved(__swsusp_reset_dma); -unsigned long __bootdata_preserved(__stext_dma); -unsigned long __bootdata_preserved(__etext_dma); -unsigned long __bootdata_preserved(__sdma); -unsigned long __bootdata_preserved(__edma); unsigned long __bootdata_preserved(__kaslr_offset); +int __bootdata_preserved(__kaslr_enabled); +unsigned int __bootdata_preserved(zlib_dfltcc_support); +EXPORT_SYMBOL(zlib_dfltcc_support); +u64 __bootdata_preserved(stfle_fac_list[16]); +EXPORT_SYMBOL(stfle_fac_list); +u64 __bootdata_preserved(alt_stfle_fac_list[16]); +struct oldmem_data __bootdata_preserved(oldmem_data); unsigned long VMALLOC_START; EXPORT_SYMBOL(VMALLOC_START); @@ -120,6 +166,7 @@ EXPORT_SYMBOL(VMALLOC_END); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); +unsigned long vmemmap_size; unsigned long MODULES_VADDR; unsigned long MODULES_END; @@ -128,6 +175,14 @@ unsigned long MODULES_END; struct lowcore *lowcore_ptr[NR_CPUS]; EXPORT_SYMBOL(lowcore_ptr); +DEFINE_STATIC_KEY_FALSE(cpu_has_bear); + +/* + * The Write Back bit position in the physaddr is given by the SLPC PCI. 
+ * Leaving the mask zero always uses write through which is safe + */ +unsigned long mio_wb_bit_mask __ro_after_init; + /* * This is set up by the setup-routine at boot-time * for S390 need to find out, what we have to setup @@ -161,7 +216,7 @@ static void __init set_preferred_console(void) else if (CONSOLE_IS_3270) add_preferred_console("tty3270", 0, NULL); else if (CONSOLE_IS_VT220) - add_preferred_console("ttyS", 1, NULL); + add_preferred_console("ttysclp", 0, NULL); else if (CONSOLE_IS_HVC) add_preferred_console("hvc", 0, NULL); } @@ -241,18 +296,16 @@ static void __init conmode_default(void) SET_CONSOLE_SCLP; #endif } - if (IS_ENABLED(CONFIG_VT) && IS_ENABLED(CONFIG_DUMMY_CONSOLE)) - conswitchp = &dummy_con; } #ifdef CONFIG_CRASH_DUMP static void __init setup_zfcpdump(void) { - if (ipl_info.type != IPL_TYPE_FCP_DUMP) + if (!is_ipl_type_dump()) return; - if (OLDMEM_BASE) + if (oldmem_data.start) return; - strcat(boot_command_line, " cio_ignore=all,!ipldev,!condev"); + strlcat(boot_command_line, " cio_ignore=all,!ipldev,!condev", COMMAND_LINE_SIZE); console_loglevel = 2; } #else @@ -303,17 +356,17 @@ void machine_power_off(void) void (*pm_power_off)(void) = machine_power_off; EXPORT_SYMBOL_GPL(pm_power_off); -void *restart_stack __section(.data); +void *restart_stack; unsigned long stack_alloc(void) { #ifdef CONFIG_VMAP_STACK - return (unsigned long) - __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + void *ret; + + ret = __vmalloc_node(THREAD_SIZE, THREAD_SIZE, THREADINFO_GFP, + NUMA_NO_NODE, __builtin_return_address(0)); + kmemleak_not_leak(ret); + return (unsigned long)ret; #else return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); #endif @@ -328,51 +381,21 @@ void stack_free(unsigned long stack) #endif } -int __init arch_early_irq_init(void) -{ - unsigned long stack; - - stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); - if (!stack) - panic("Couldn't allocate async stack"); - S390_lowcore.async_stack = stack + STACK_INIT_OFFSET; - return 0; -} - -static int __init async_stack_realloc(void) -{ - unsigned long old, new; - - old = S390_lowcore.async_stack - STACK_INIT_OFFSET; - new = stack_alloc(); - if (!new) - panic("Couldn't allocate async stack"); - S390_lowcore.async_stack = new + STACK_INIT_OFFSET; - free_pages(old, THREAD_SIZE_ORDER); - return 0; -} -early_initcall(async_stack_realloc); - -void __init arch_call_rest_init(void) +static unsigned long __init stack_alloc_early(void) { unsigned long stack; - stack = stack_alloc(); - if (!stack) - panic("Couldn't allocate kernel stack"); - current->stack = (void *) stack; -#ifdef CONFIG_VMAP_STACK - current->stack_vm_area = (void *) stack; -#endif - set_task_stack_end_magic(current); - stack += STACK_INIT_OFFSET; - S390_lowcore.kernel_stack = stack; - CALL_ON_STACK_NORETURN(rest_init, stack); + stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE); + if (!stack) { + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, THREAD_SIZE, THREAD_SIZE); + } + return stack; } -static void __init setup_lowcore_dat_off(void) +static void __init setup_lowcore(void) { - struct lowcore *lc; + struct lowcore *lc, *abs_lc; /* * Setup lowcore for boot cpu @@ -383,52 +406,40 @@ static void __init setup_lowcore_dat_off(void) panic("%s: Failed to allocate %zu bytes align=%zx\n", __func__, sizeof(*lc), sizeof(*lc)); - lc->restart_psw.mask = PSW_KERNEL_BITS; - lc->restart_psw.addr = (unsigned long) 
restart_int_handler; - lc->external_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->restart_psw.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; + lc->restart_psw.addr = __pa(restart_int_handler); + lc->external_new_psw.mask = PSW_KERNEL_BITS; lc->external_new_psw.addr = (unsigned long) ext_int_handler; - lc->svc_new_psw.mask = PSW_KERNEL_BITS | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + lc->svc_new_psw.mask = PSW_KERNEL_BITS; lc->svc_new_psw.addr = (unsigned long) system_call; - lc->program_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->program_new_psw.mask = PSW_KERNEL_BITS; lc->program_new_psw.addr = (unsigned long) pgm_check_handler; lc->mcck_new_psw.mask = PSW_KERNEL_BITS; lc->mcck_new_psw.addr = (unsigned long) mcck_int_handler; - lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->io_new_psw.mask = PSW_KERNEL_BITS; lc->io_new_psw.addr = (unsigned long) io_int_handler; lc->clock_comparator = clock_comparator_max; - lc->nodat_stack = ((unsigned long) &init_thread_union) - + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); lc->current_task = (unsigned long)&init_task; lc->lpp = LPP_MAGIC; lc->machine_flags = S390_lowcore.machine_flags; lc->preempt_count = S390_lowcore.preempt_count; - lc->stfl_fac_list = S390_lowcore.stfl_fac_list; - memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list, - sizeof(lc->stfle_fac_list)); - memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list, - sizeof(lc->alt_stfle_fac_list)); - nmi_alloc_boot_cpu(lc); - vdso_alloc_boot_cpu(lc); - lc->sync_enter_timer = S390_lowcore.sync_enter_timer; - lc->async_enter_timer = S390_lowcore.async_enter_timer; + nmi_alloc_mcesa_early(&lc->mcesad); + lc->sys_enter_timer = S390_lowcore.sys_enter_timer; lc->exit_timer = S390_lowcore.exit_timer; lc->user_timer = S390_lowcore.user_timer; lc->system_timer = S390_lowcore.system_timer; lc->steal_timer = S390_lowcore.steal_timer; lc->last_update_timer = S390_lowcore.last_update_timer; lc->last_update_clock = S390_lowcore.last_update_clock; - /* * Allocate the global restart stack which is the same for - * all CPUs in cast *one* of them does a PSW restart. + * all CPUs in case *one* of them does a PSW restart. */ - restart_stack = memblock_alloc(THREAD_SIZE, THREAD_SIZE); - if (!restart_stack) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, THREAD_SIZE, THREAD_SIZE); - restart_stack += STACK_INIT_OFFSET; - + restart_stack = (void *)(stack_alloc_early() + STACK_INIT_OFFSET); + lc->mcck_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->async_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->nodat_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->kernel_stack = S390_lowcore.kernel_stack; /* * Set up PSW restart to call ipl.c:do_restart(). Copy the relevant * restart data to the absolute zero lowcore. 
This is necessary if @@ -437,32 +448,32 @@ static void __init setup_lowcore_dat_off(void) lc->restart_stack = (unsigned long) restart_stack; lc->restart_fn = (unsigned long) do_restart; lc->restart_data = 0; - lc->restart_source = -1UL; - - /* Setup absolute zero lowcore */ - mem_assign_absolute(S390_lowcore.restart_stack, lc->restart_stack); - mem_assign_absolute(S390_lowcore.restart_fn, lc->restart_fn); - mem_assign_absolute(S390_lowcore.restart_data, lc->restart_data); - mem_assign_absolute(S390_lowcore.restart_source, lc->restart_source); - mem_assign_absolute(S390_lowcore.restart_psw, lc->restart_psw); - + lc->restart_source = -1U; lc->spinlock_lockval = arch_spin_lockval(0); lc->spinlock_index = 0; arch_spin_lock_setup(0); - lc->br_r1_trampoline = 0x07f1; /* br %r1 */ - - set_prefix((u32)(unsigned long) lc); + lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); + lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); + lc->preempt_count = PREEMPT_DISABLED; + lc->kernel_asce = S390_lowcore.kernel_asce; + lc->user_asce = S390_lowcore.user_asce; + + system_ctlreg_init_save_area(lc); + abs_lc = get_abs_lowcore(); + abs_lc->restart_stack = lc->restart_stack; + abs_lc->restart_fn = lc->restart_fn; + abs_lc->restart_data = lc->restart_data; + abs_lc->restart_source = lc->restart_source; + abs_lc->restart_psw = lc->restart_psw; + abs_lc->restart_flags = RESTART_FLAG_CTLREGS; + abs_lc->program_new_psw = lc->program_new_psw; + abs_lc->mcesad = lc->mcesad; + put_abs_lowcore(abs_lc); + + set_prefix(__pa(lc)); lowcore_ptr[0] = lc; -} - -static void __init setup_lowcore_dat_on(void) -{ - __ctl_clear_bit(0, 28); - S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.svc_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.program_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.io_new_psw.mask |= PSW_MASK_DAT; - __ctl_set_bit(0, 28); + if (abs_lowcore_map(0, lowcore_ptr[0], false)) + panic("Couldn't setup absolute lowcore"); } static struct resource code_resource = { @@ -489,8 +500,9 @@ static struct resource __initdata *standard_resources[] = { static void __init setup_resources(void) { struct resource *res, *std_res, *sub_res; - struct memblock_region *reg; + phys_addr_t start, end; int j; + u64 i; code_resource.start = (unsigned long) _text; code_resource.end = (unsigned long) _etext - 1; @@ -499,7 +511,7 @@ static void __init setup_resources(void) bss_resource.start = (unsigned long) __bss_start; bss_resource.end = (unsigned long) __bss_stop - 1; - for_each_memblock(memory, reg) { + for_each_mem_range(i, &start, &end) { res = memblock_alloc(sizeof(*res), 8); if (!res) panic("%s: Failed to allocate %zu bytes align=0x%x\n", @@ -507,8 +519,13 @@ static void __init setup_resources(void) res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; res->name = "System RAM"; - res->start = reg->base; - res->end = reg->base + reg->size - 1; + res->start = start; + /* + * In memblock, end points to the first byte after the + * range while in resources, end points to the last byte in + * the range. + */ + res->end = end - 1; request_resource(&iomem_resource, res); for (j = 0; j < ARRAY_SIZE(standard_resources); j++) { @@ -539,7 +556,8 @@ static void __init setup_resources(void) * part of the System RAM resource. 
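The comment added in setup_resources() above is worth keeping in mind wherever memblock and struct resource meet: memblock ranges are half-open, resources are closed intervals, hence the `res->end = end - 1`. The off-by-one in one self-checking snippet (values invented):

#include <assert.h>

int main(void)
{
	/* memblock style: [start, end), end is one past the last byte */
	unsigned long start = 0x100000, end = 0x200000;
	/* resource style: [res->start, res->end], res->end is the last byte */
	unsigned long res_start = start, res_end = end - 1;

	assert(res_end - res_start + 1 == end - start); /* same byte count */
	return 0;
}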
*/ if (crashk_res.end) { - memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0); + memblock_add_node(crashk_res.start, resource_size(&crashk_res), + 0, MEMBLOCK_NONE); memblock_reserve(crashk_res.start, resource_size(&crashk_res)); insert_resource(&iomem_resource, &crashk_res); } @@ -548,56 +566,17 @@ static void __init setup_resources(void) static void __init setup_memory_end(void) { - unsigned long vmax, tmp; - - /* Choose kernel address space layout: 3 or 4 levels. */ - if (IS_ENABLED(CONFIG_KASAN)) { - vmax = IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING) - ? _REGION1_SIZE - : _REGION2_SIZE; - } else { - tmp = (memory_end ?: max_physmem_end) / PAGE_SIZE; - tmp = tmp * (sizeof(struct page) + PAGE_SIZE); - if (tmp + vmalloc_size + MODULES_LEN <= _REGION2_SIZE) - vmax = _REGION2_SIZE; /* 3-level kernel page table */ - else - vmax = _REGION1_SIZE; /* 4-level kernel page table */ - } - - /* module area is at the end of the kernel address space. */ - MODULES_END = vmax; - MODULES_VADDR = MODULES_END - MODULES_LEN; - VMALLOC_END = MODULES_VADDR; - VMALLOC_START = VMALLOC_END - vmalloc_size; - - /* Split remaining virtual space between 1:1 mapping & vmemmap array */ - tmp = VMALLOC_START / (PAGE_SIZE + sizeof(struct page)); - /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */ - tmp = SECTION_ALIGN_UP(tmp); - tmp = VMALLOC_START - tmp * sizeof(struct page); - tmp &= ~((vmax >> 11) - 1); /* align to page table level */ - tmp = min(tmp, 1UL << MAX_PHYSMEM_BITS); - vmemmap = (struct page *) tmp; - - /* Take care that memory_end is set and <= vmemmap */ - memory_end = min(memory_end ?: max_physmem_end, (unsigned long)vmemmap); -#ifdef CONFIG_KASAN - /* fit in kasan shadow memory region between 1:1 and vmemmap */ - memory_end = min(memory_end, KASAN_SHADOW_START); - vmemmap = max(vmemmap, (struct page *)KASAN_SHADOW_END); -#endif - max_pfn = max_low_pfn = PFN_DOWN(memory_end); - memblock_remove(memory_end, ULONG_MAX); - - pr_notice("The maximum memory size is %luMB\n", memory_end >> 20); + max_pfn = max_low_pfn = PFN_DOWN(ident_map_size); + pr_notice("The maximum memory size is %luMB\n", ident_map_size >> 20); } #ifdef CONFIG_CRASH_DUMP /* - * When kdump is enabled, we have to ensure that no memory from - * the area [0 - crashkernel memory size] and - * [crashk_res.start - crashk_res.end] is set offline. + * When kdump is enabled, we have to ensure that no memory from the area + * [0 - crashkernel memory size] is set offline - it will be exchanged with + * the crashkernel memory region when kdump is triggered. The crashkernel + * memory region can never get offlined (pages are unmovable). 
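The simplified kdump_mem_notifier above reduces to a single predicate: page frames below the crashkernel reservation size may never go offline, because exactly that low range is exchanged with the crashkernel region when kdump fires. A sketch of the check, with PFN_DOWN and the 4K page shift stubbed in the usual way:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

/* refuse to offline any page frame below the crashkernel size */
static int may_offline(unsigned long start_pfn, unsigned long crash_size)
{
	return start_pfn >= PFN_DOWN(crash_size);
}

int main(void)
{
	unsigned long crash_size = 256UL << 20; /* assume a 256 MB reservation */

	printf("%d %d\n",
	       may_offline(PFN_DOWN(128UL << 20), crash_size),  /* 0: refused */
	       may_offline(PFN_DOWN(512UL << 20), crash_size)); /* 1: allowed */
	return 0;
}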
*/ static int kdump_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) @@ -608,11 +587,7 @@ static int kdump_mem_notifier(struct notifier_block *nb, return NOTIFY_OK; if (arg->start_pfn < PFN_DOWN(resource_size(&crashk_res))) return NOTIFY_BAD; - if (arg->start_pfn > PFN_DOWN(crashk_res.end)) - return NOTIFY_OK; - if (arg->start_pfn + arg->nr_pages - 1 < PFN_DOWN(crashk_res.start)) - return NOTIFY_OK; - return NOTIFY_BAD; + return NOTIFY_OK; } static struct notifier_block kdump_mem_nb = { @@ -622,36 +597,15 @@ static struct notifier_block kdump_mem_nb = { #endif /* - * Make sure that the area behind memory_end is protected - */ -static void reserve_memory_end(void) -{ - if (memory_end_set) - memblock_reserve(memory_end, ULONG_MAX); -} - -/* - * Make sure that oldmem, where the dump is stored, is protected + * Reserve page tables created by decompressor */ -static void reserve_oldmem(void) +static void __init reserve_pgtables(void) { -#ifdef CONFIG_CRASH_DUMP - if (OLDMEM_BASE) - /* Forget all memory above the running kdump system */ - memblock_reserve(OLDMEM_SIZE, (phys_addr_t)ULONG_MAX); -#endif -} + unsigned long start, end; + struct reserved_range *range; -/* - * Make sure that oldmem, where the dump is stored, is protected - */ -static void remove_oldmem(void) -{ -#ifdef CONFIG_CRASH_DUMP - if (OLDMEM_BASE) - /* Forget all memory above the running kdump system */ - memblock_remove(OLDMEM_SIZE, (phys_addr_t)ULONG_MAX); -#endif + for_each_physmem_reserved_type_range(RR_VMEM, range, &start, &end) + memblock_reserve(start, end - start); } /* @@ -664,8 +618,8 @@ static void __init reserve_crashkernel(void) phys_addr_t low, high; int rc; - rc = parse_crashkernel(boot_command_line, memory_end, &crash_size, - &crash_base); + rc = parse_crashkernel(boot_command_line, ident_map_size, + &crash_size, &crash_base, NULL, NULL); crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN); crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN); @@ -678,9 +632,9 @@ static void __init reserve_crashkernel(void) return; } - low = crash_base ?: OLDMEM_BASE; + low = crash_base ?: oldmem_data.start; high = low + crash_size; - if (low >= OLDMEM_BASE && high <= OLDMEM_BASE + OLDMEM_SIZE) { + if (low >= oldmem_data.start && high <= oldmem_data.start + oldmem_data.size) { /* The crashkernel fits into OLDMEM, reuse OLDMEM */ crash_base = low; } else { @@ -694,8 +648,9 @@ static void __init reserve_crashkernel(void) return; } low = crash_base ?: low; - crash_base = memblock_find_in_range(low, high, crash_size, - KEXEC_CRASH_MEM_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, + KEXEC_CRASH_MEM_ALIGN, + low, high); } if (!crash_base) { @@ -704,10 +659,12 @@ static void __init reserve_crashkernel(void) return; } - if (register_memory_notifier(&kdump_mem_nb)) + if (register_memory_notifier(&kdump_mem_nb)) { + memblock_phys_free(crash_base, crash_size); return; + } - if (!OLDMEM_BASE && MACHINE_IS_VM) + if (!oldmem_data.start && MACHINE_IS_VM) diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size)); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; @@ -725,13 +682,13 @@ static void __init reserve_crashkernel(void) */ static void __init reserve_initrd(void) { -#ifdef CONFIG_BLK_DEV_INITRD - if (!INITRD_START || !INITRD_SIZE) + unsigned long addr, size; + + if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD) || !get_physmem_reserved(RR_INITRD, &addr, &size)) return; - initrd_start = INITRD_START; - initrd_end = initrd_start + INITRD_SIZE; - memblock_reserve(INITRD_START, 
INITRD_SIZE); -#endif + initrd_start = (unsigned long)__va(addr); + initrd_end = initrd_start + size; + memblock_reserve(addr, size); } /* @@ -743,75 +700,37 @@ static void __init reserve_certificate_list(void) memblock_reserve(ipl_cert_list_addr, ipl_cert_list_size); } -static void __init reserve_mem_detect_info(void) +static void __init reserve_physmem_info(void) { - unsigned long start, size; + unsigned long addr, size; - get_mem_detect_reserved(&start, &size); - if (size) - memblock_reserve(start, size); + if (get_physmem_reserved(RR_MEM_DETECT_EXTENDED, &addr, &size)) + memblock_reserve(addr, size); } -static void __init free_mem_detect_info(void) +static void __init free_physmem_info(void) { - unsigned long start, size; + unsigned long addr, size; - get_mem_detect_reserved(&start, &size); - if (size) - memblock_free(start, size); + if (get_physmem_reserved(RR_MEM_DETECT_EXTENDED, &addr, &size)) + memblock_phys_free(addr, size); } -static void __init memblock_physmem_add(phys_addr_t start, phys_addr_t size) -{ - memblock_dbg("memblock_physmem_add: [%#016llx-%#016llx]\n", - start, start + size - 1); - memblock_add_range(&memblock.memory, start, size, 0, 0); - memblock_add_range(&memblock.physmem, start, size, 0, 0); -} - -static const char * __init get_mem_info_source(void) -{ - switch (mem_detect.info_source) { - case MEM_DETECT_SCLP_STOR_INFO: - return "sclp storage info"; - case MEM_DETECT_DIAG260: - return "diag260"; - case MEM_DETECT_SCLP_READ_INFO: - return "sclp read info"; - case MEM_DETECT_BIN_SEARCH: - return "binary search"; - } - return "none"; -} - -static void __init memblock_add_mem_detect_info(void) +static void __init memblock_add_physmem_info(void) { unsigned long start, end; int i; - memblock_dbg("physmem info source: %s (%hhd)\n", - get_mem_info_source(), mem_detect.info_source); + pr_debug("physmem info source: %s (%hhd)\n", + get_physmem_info_source(), physmem_info.info_source); /* keep memblock lists close to the kernel */ memblock_set_bottom_up(true); - for_each_mem_detect_block(i, &start, &end) + for_each_physmem_usable_range(i, &start, &end) + memblock_add(start, end - start); + for_each_physmem_online_range(i, &start, &end) memblock_physmem_add(start, end - start); memblock_set_bottom_up(false); - memblock_dump_all(); -} - -/* - * Check for initrd being in usable memory - */ -static void __init check_initrd(void) -{ -#ifdef CONFIG_BLK_DEV_INITRD - if (INITRD_START && INITRD_SIZE && - !memblock_is_region_memory(INITRD_START, INITRD_SIZE)) { - pr_err("The initial RAM disk does not fit into the memory\n"); - memblock_free(INITRD_START, INITRD_SIZE); - initrd_start = initrd_end = 0; - } -#endif + memblock_set_node(0, ULONG_MAX, &memblock.memory, 0); } /* @@ -819,176 +738,68 @@ static void __init check_initrd(void) */ static void __init reserve_kernel(void) { - unsigned long start_pfn = PFN_UP(__pa(_end)); - - memblock_reserve(0, HEAD_END); - memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn) - - (unsigned long)_stext); - memblock_reserve(__sdma, __edma - __sdma); + memblock_reserve(0, STARTUP_NORMAL_OFFSET); + memblock_reserve(OLDMEM_BASE, sizeof(unsigned long)); + memblock_reserve(OLDMEM_SIZE, sizeof(unsigned long)); + memblock_reserve(physmem_info.reserved[RR_AMODE31].start, __eamode31 - __samode31); + memblock_reserve(__pa(sclp_early_sccb), EXT_SCCB_READ_SCP); + memblock_reserve(__pa(_stext), _end - _stext); } static void __init setup_memory(void) { - struct memblock_region *reg; + phys_addr_t start, end; + u64 i; /* * Init storage key for present 
memory */ - for_each_memblock(memory, reg) { - storage_key_init_range(reg->base, reg->base + reg->size); - } - psw_set_key(PAGE_DEFAULT_KEY); + for_each_mem_range(i, &start, &end) + storage_key_init_range(start, end); - /* Only cosmetics */ - memblock_enforce_memory_limit(memblock_end_of_DRAM()); + psw_set_key(PAGE_DEFAULT_KEY); } -/* - * Setup hardware capabilities. - */ -static int __init setup_hwcaps(void) +static void __init relocate_amode31_section(void) { - static const int stfl_bits[6] = { 0, 2, 7, 17, 19, 21 }; - struct cpuid cpu_id; - int i; + unsigned long amode31_size = __eamode31 - __samode31; + long amode31_offset, *ptr; - /* - * The store facility list bits numbers as found in the principles - * of operation are numbered with bit 1UL<<31 as number 0 to - * bit 1UL<<0 as number 31. - * Bit 0: instructions named N3, "backported" to esa-mode - * Bit 2: z/Architecture mode is active - * Bit 7: the store-facility-list-extended facility is installed - * Bit 17: the message-security assist is installed - * Bit 19: the long-displacement facility is installed - * Bit 21: the extended-immediate facility is installed - * Bit 22: extended-translation facility 3 is installed - * Bit 30: extended-translation facility 3 enhancement facility - * These get translated to: - * HWCAP_S390_ESAN3 bit 0, HWCAP_S390_ZARCH bit 1, - * HWCAP_S390_STFLE bit 2, HWCAP_S390_MSA bit 3, - * HWCAP_S390_LDISP bit 4, HWCAP_S390_EIMM bit 5 and - * HWCAP_S390_ETF3EH bit 8 (22 && 30). - */ - for (i = 0; i < 6; i++) - if (test_facility(stfl_bits[i])) - elf_hwcap |= 1UL << i; - - if (test_facility(22) && test_facility(30)) - elf_hwcap |= HWCAP_S390_ETF3EH; - - /* - * Check for additional facilities with store-facility-list-extended. - * stfle stores doublewords (8 byte) with bit 1ULL<<63 as bit 0 - * and 1ULL<<0 as bit 63. Bits 0-31 contain the same information - * as stored by stfl, bits 32-xxx contain additional facilities. - * How many facility words are stored depends on the number of - * doublewords passed to the instruction. The additional facilities - * are: - * Bit 42: decimal floating point facility is installed - * Bit 44: perform floating point operation facility is installed - * translated to: - * HWCAP_S390_DFP bit 6 (42 && 44). - */ - if ((elf_hwcap & (1UL << 2)) && test_facility(42) && test_facility(44)) - elf_hwcap |= HWCAP_S390_DFP; + amode31_offset = physmem_info.reserved[RR_AMODE31].start - (unsigned long)__samode31; + pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size); - /* - * Huge page support HWCAP_S390_HPAGE is bit 7. - */ - if (MACHINE_HAS_EDAT1) - elf_hwcap |= HWCAP_S390_HPAGE; + /* Move original AMODE31 section to the new one */ + memmove((void *)physmem_info.reserved[RR_AMODE31].start, __samode31, amode31_size); + /* Zero out the old AMODE31 section to catch invalid accesses within it */ + memset(__samode31, 0, amode31_size); - /* - * 64-bit register support for 31-bit processes - * HWCAP_S390_HIGH_GPRS is bit 9. - */ - elf_hwcap |= HWCAP_S390_HIGH_GPRS; - - /* - * Transactional execution support HWCAP_S390_TE is bit 10. - */ - if (MACHINE_HAS_TE) - elf_hwcap |= HWCAP_S390_TE; - - /* - * Vector extension HWCAP_S390_VXRS is bit 11. The Vector extension - * can be disabled with the "novx" parameter. Use MACHINE_HAS_VX - * instead of facility bit 129. 
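The setup_hwcaps() code removed above translated installed facility bits (0, 2, 7, 17, 19 and 21, per its own comment) one-to-one into the low ELF hwcap bits. Its core loop, restated as a small stand-alone program with test_facility() stubbed out; the stub's answers are arbitrary:

#include <stdio.h>

static const int stfl_bits[6] = { 0, 2, 7, 17, 19, 21 };

/* stub: pretend facilities 0, 2 and 19 are installed */
static int test_facility(int nr)
{
	return nr == 0 || nr == 2 || nr == 19;
}

int main(void)
{
	unsigned long hwcap = 0;
	int i;

	/* facility bit stfl_bits[i] installed -> hwcap bit i set */
	for (i = 0; i < 6; i++)
		if (test_facility(stfl_bits[i]))
			hwcap |= 1UL << i;
	printf("elf_hwcap = %#lx\n", hwcap); /* 0x13: ESAN3, ZARCH, LDISP */
	return 0;
}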
- */ - if (MACHINE_HAS_VX) { - elf_hwcap |= HWCAP_S390_VXRS; - if (test_facility(134)) - elf_hwcap |= HWCAP_S390_VXRS_EXT; - if (test_facility(135)) - elf_hwcap |= HWCAP_S390_VXRS_BCD; - if (test_facility(148)) - elf_hwcap |= HWCAP_S390_VXRS_EXT2; - if (test_facility(152)) - elf_hwcap |= HWCAP_S390_VXRS_PDE; - } - if (test_facility(150)) - elf_hwcap |= HWCAP_S390_SORT; - if (test_facility(151)) - elf_hwcap |= HWCAP_S390_DFLT; + /* Update all AMODE31 region references */ + for (ptr = _start_amode31_refs; ptr != _end_amode31_refs; ptr++) + *ptr += amode31_offset; +} - /* - * Guarded storage support HWCAP_S390_GS is bit 12. - */ - if (MACHINE_HAS_GS) - elf_hwcap |= HWCAP_S390_GS; - - get_cpu_id(&cpu_id); - add_device_randomness(&cpu_id, sizeof(cpu_id)); - switch (cpu_id.machine) { - case 0x2064: - case 0x2066: - default: /* Use "z900" as default for 64 bit kernels. */ - strcpy(elf_platform, "z900"); - break; - case 0x2084: - case 0x2086: - strcpy(elf_platform, "z990"); - break; - case 0x2094: - case 0x2096: - strcpy(elf_platform, "z9-109"); - break; - case 0x2097: - case 0x2098: - strcpy(elf_platform, "z10"); - break; - case 0x2817: - case 0x2818: - strcpy(elf_platform, "z196"); - break; - case 0x2827: - case 0x2828: - strcpy(elf_platform, "zEC12"); - break; - case 0x2964: - case 0x2965: - strcpy(elf_platform, "z13"); - break; - case 0x3906: - case 0x3907: - strcpy(elf_platform, "z14"); - break; - case 0x8561: - case 0x8562: - strcpy(elf_platform, "z15"); - break; - } +/* This must be called after AMODE31 relocation */ +static void __init setup_cr(void) +{ + union ctlreg2 cr2; + union ctlreg5 cr5; + union ctlreg15 cr15; - /* - * Virtualization support HWCAP_INT_SIE is bit 0. - */ - if (sclp.has_sief2) - int_hwcap |= HWCAP_INT_SIE; + __ctl_duct[1] = (unsigned long)__ctl_aste; + __ctl_duct[2] = (unsigned long)__ctl_aste; + __ctl_duct[4] = (unsigned long)__ctl_duald; - return 0; + /* Update control registers CR2, CR5 and CR15 */ + local_ctl_store(2, &cr2.reg); + local_ctl_store(5, &cr5.reg); + local_ctl_store(15, &cr15.reg); + cr2.ducto = (unsigned long)__ctl_duct >> 6; + cr5.pasteo = (unsigned long)__ctl_duct >> 6; + cr15.lsea = (unsigned long)__ctl_linkage_stack >> 3; + system_ctl_load(2, &cr2.reg); + system_ctl_load(5, &cr5.reg); + system_ctl_load(15, &cr15.reg); } -arch_initcall(setup_hwcaps); /* * Add system information as device randomness @@ -997,30 +808,15 @@ static void __init setup_randomness(void) { struct sysinfo_3_2_2 *vmms; - vmms = (struct sysinfo_3_2_2 *) memblock_phys_alloc(PAGE_SIZE, - PAGE_SIZE); + vmms = memblock_alloc(PAGE_SIZE, PAGE_SIZE); if (!vmms) panic("Failed to allocate memory for sysinfo structure\n"); - if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); - memblock_free((unsigned long) vmms, PAGE_SIZE); -} - -/* - * Find the correct size for the task_struct. This depends on - * the size of the struct fpu at the end of the thread_struct - * which is embedded in the task_struct. 
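/*
 * A minimal sketch of the relocation pattern used by
 * relocate_amode31_section() above: copy the section to its reserved
 * range, poison the old copy so stale accesses fault visibly, then walk
 * the table of recorded pointer locations and rebase each one by the move
 * offset. The _start/_end_amode31_refs names come from the diff; this
 * generic helper is illustrative.
 */
static void sketch_relocate_section(void *new_base, void *old_base,
				    unsigned long size,
				    long *refs_start, long *refs_end)
{
	long offset = (long)new_base - (long)old_base;
	long *ptr;

	memmove(new_base, old_base, size);	/* move the section */
	memset(old_base, 0, size);		/* catch stale references */
	for (ptr = refs_start; ptr != refs_end; ptr++)
		*ptr += offset;			/* rebase recorded pointers */
}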
- */ -static void __init setup_task_size(void) -{ - int task_size = sizeof(struct task_struct); + memblock_free(vmms, PAGE_SIZE); - if (!MACHINE_HAS_VX) { - task_size -= sizeof(__vector128) * __NUM_VXRS; - task_size += sizeof(freg_t) * __NUM_FPRS; - } - arch_task_struct_size = task_size; + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + static_branch_enable(&s390_arch_random_available); } /* @@ -1031,8 +827,7 @@ static void __init setup_control_program_code(void) { union diag318_info diag318_info = { .cpnc = CPNC_LINUX, - .cpvc_linux = 0, - .cpvc_distro = {0}, + .cpvc = 0, }; if (!sclp.has_diag318) @@ -1052,11 +847,11 @@ static void __init log_component_list(void) if (!early_ipl_comp_list_addr) return; - if (ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR) + if (ipl_block.hdr.flags & IPL_PL_FLAG_SIPL) pr_info("Linux is running with Secure-IPL enabled\n"); else pr_info("Linux is running with Secure-IPL disabled\n"); - ptr = (void *) early_ipl_comp_list_addr; + ptr = __va(early_ipl_comp_list_addr); end = (void *) ptr + early_ipl_comp_list_size; pr_info("The IPL report contains the following components:\n"); while (ptr < end) { @@ -1102,14 +897,12 @@ void __init setup_arch(char **cmdline_p) ROOT_DEV = Root_RAM0; - init_mm.start_code = (unsigned long) _text; - init_mm.end_code = (unsigned long) _etext; - init_mm.end_data = (unsigned long) _edata; - init_mm.brk = (unsigned long) _end; + setup_initial_init_mm(_text, _etext, _edata, _end); if (IS_ENABLED(CONFIG_EXPOLINE_AUTO)) nospec_auto_detect(); + jump_label_init(); parse_early_param(); #ifdef CONFIG_CRASH_DUMP /* Deactivate elfcorehdr= kernel parameter */ @@ -1118,49 +911,44 @@ void __init setup_arch(char **cmdline_p) os_info_init(); setup_ipl(); - setup_task_size(); setup_control_program_code(); /* Do some memory reservations *before* memory is added to memblock */ - reserve_memory_end(); - reserve_oldmem(); + reserve_pgtables(); reserve_kernel(); reserve_initrd(); reserve_certificate_list(); - reserve_mem_detect_info(); + reserve_physmem_info(); + memblock_set_current_limit(ident_map_size); memblock_allow_resize(); /* Get information about *all* installed memory */ - memblock_add_mem_detect_info(); - - free_mem_detect_info(); - remove_oldmem(); - - /* - * Make sure all chunks are MAX_ORDER aligned so we don't need the - * extra checks that HOLES_IN_ZONE would require. - * - * Is this still required? - */ - memblock_trim_memory(1UL << (MAX_ORDER - 1 + PAGE_SHIFT)); + memblock_add_physmem_info(); + free_physmem_info(); setup_memory_end(); + memblock_dump_all(); setup_memory(); - dma_contiguous_reserve(memory_end); + + relocate_amode31_section(); + setup_cr(); + setup_uv(); + dma_contiguous_reserve(ident_map_size); vmcp_cma_reserve(); + if (MACHINE_HAS_EDAT2) + hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); - check_initrd(); reserve_crashkernel(); #ifdef CONFIG_CRASH_DUMP /* - * Be aware that smp_save_dump_cpus() triggers a system reset. + * Be aware that smp_save_dump_secondary_cpus() triggers a system reset. * Therefore CPU and device initialization should be done afterwards. */ - smp_save_dump_cpus(); + smp_save_dump_secondary_cpus(); #endif setup_resources(); - setup_lowcore_dat_off(); + setup_lowcore(); smp_fill_possible_mask(); cpu_detect_mhz_feature(); cpu_init(); @@ -1168,8 +956,11 @@ void __init setup_arch(char **cmdline_p) smp_detect_cpus(); topology_init_early(); + if (test_facility(193)) + static_branch_enable(&cpu_has_bear); + /* - * Create kernel page tables and switch to virtual addressing. + * Create kernel page tables. 
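/*
 * A minimal sketch of the control-register idiom behind the setup_cr()
 * call above: read the live value, rewrite one field, and reinstall the
 * register system-wide. The ctlreg2 union, its ducto field and the
 * 64-byte-origin shift are taken from the diff; the standalone helper is
 * illustrative only.
 */
static void __init sketch_point_cr2_at_duct(unsigned long duct)
{
	union ctlreg2 cr2;

	local_ctl_store(2, &cr2.reg);	/* read current CR2 */
	cr2.ducto = duct >> 6;		/* DUCT origin is 64-byte aligned */
	system_ctl_load(2, &cr2.reg);	/* reinstall for the whole system */
}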
*/ paging_init(); @@ -1177,7 +968,9 @@ void __init setup_arch(char **cmdline_p) * After paging_init created the kernel page table, the new PSWs * in lowcore can now run with DAT enabled. */ - setup_lowcore_dat_on(); +#ifdef CONFIG_CRASH_DUMP + smp_save_dump_ipl_cpu(); +#endif /* Setup default console */ conmode_default(); @@ -1187,7 +980,7 @@ void __init setup_arch(char **cmdline_p) if (IS_ENABLED(CONFIG_EXPOLINE)) nospec_init_branches(); - /* Setup zfcpdump support */ + /* Setup zfcp/nvme dump support */ setup_zfcpdump(); /* Add system specific data to the random pool */ diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index e6fca5498e1f..43e9661cd715 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -12,10 +12,12 @@ #include <linux/sched.h> #include <linux/sched/task_stack.h> +#include <linux/rseq.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> #include <linux/signal.h> +#include <linux/entry-common.h> #include <linux/errno.h> #include <linux/wait.h> #include <linux/ptrace.h> @@ -24,13 +26,13 @@ #include <linux/tty.h> #include <linux/personality.h> #include <linux/binfmts.h> -#include <linux/tracehook.h> #include <linux/syscalls.h> #include <linux/compat.h> #include <asm/ucontext.h> #include <linux/uaccess.h> #include <asm/lowcore.h> #include <asm/switch_to.h> +#include <asm/vdso.h> #include "entry.h" /* @@ -139,7 +141,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) { _sigregs user_sregs; - /* Alwys make any pending restarted system call return -EINTR */ + /* Always make any pending restarted system call return -EINTR */ current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs))) @@ -148,10 +150,6 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW_MASK_RI)) return -EINVAL; - /* Test the floating-point-control word. */ - if (test_fp_ctl(user_sregs.fpregs.fpc)) - return -EINVAL; - /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) | (user_sregs.regs.psw.mask & (PSW_MASK_USER | PSW_MASK_RI)); @@ -181,9 +179,9 @@ static int save_sigregs_ext(struct pt_regs *regs, int i; /* Save vector registers to signal stack */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1); + vxrs[i] = current->thread.fpu.vxrs[i].low; if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, sizeof(sregs_ext->vxrs_low)) || __copy_to_user(&sregs_ext->vxrs_high, @@ -201,7 +199,7 @@ static int restore_sigregs_ext(struct pt_regs *regs, int i; /* Restore vector registers from signal stack */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, sizeof(sregs_ext->vxrs_low)) || __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW, @@ -209,7 +207,7 @@ static int restore_sigregs_ext(struct pt_regs *regs, sizeof(sregs_ext->vxrs_high))) return -EFAULT; for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i]; + current->thread.fpu.vxrs[i].low = vxrs[i]; } return 0; } @@ -299,7 +297,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, * included in the signal frame on a 31-bit system. 
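/*
 * A minimal sketch of the vector-register spill shown in the signal.c
 * hunks above. The leftmost halves of V0-V15 overlap the floating-point
 * registers and are saved with them, so the signal frame only needs the
 * remaining low doublewords (vxrs_low) plus V16-V31 in full (vxrs_high).
 * The .low accessor follows the diff; the helper itself is illustrative.
 */
static int sketch_save_vxrs_low(_sigregs_ext __user *sregs_ext)
{
	__u64 vxrs[__NUM_VXRS_LOW];
	int i;

	for (i = 0; i < __NUM_VXRS_LOW; i++)
		vxrs[i] = current->thread.fpu.vxrs[i].low;
	if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, sizeof(vxrs)))
		return -EFAULT;
	return 0;
}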
*/ frame_size = sizeof(*frame) - sizeof(frame->sregs_ext); - if (MACHINE_HAS_VX) + if (cpu_has_vx()) frame_size += sizeof(frame->sregs_ext); frame = get_sigframe(ka, regs, frame_size); if (frame == (void __user *) -1UL) @@ -332,15 +330,10 @@ static int setup_frame(int sig, struct k_sigaction *ka, /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ka->sa.sa_flags & SA_RESTORER) { + if (ka->sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ka->sa.sa_restorer; - } else { - /* Signal frame without vector registers are short ! */ - __u16 __user *svc = (void __user *) frame + frame_size - 2; - if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long) svc; - } + else + restorer = VDSO64_SYMBOL(current, sigreturn); /* Set up registers for signal handler */ regs->gprs[14] = restorer; @@ -381,7 +374,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, * included in the signal frame on a 31-bit system. */ uc_flags = 0; - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { frame_size += sizeof(_sigregs_ext); uc_flags |= UC_VXRS; } @@ -395,14 +388,10 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { + if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ksig->ka.sa.sa_restorer; - } else { - __u16 __user *svc = &frame->svc_insn; - if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long) svc; - } + else + restorer = VDSO64_SYMBOL(current, rt_sigreturn); /* Create siginfo on the signal stack */ if (copy_siginfo_to_user(&frame->info, &ksig->info)) @@ -459,7 +448,8 @@ static void handle_signal(struct ksignal *ksig, sigset_t *oldset, * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ -void do_signal(struct pt_regs *regs) + +void arch_do_signal_or_restart(struct pt_regs *regs) { struct ksignal ksig; sigset_t *oldset = sigmask_to_save(); @@ -487,7 +477,7 @@ void do_signal(struct pt_regs *regs) regs->gprs[2] = -EINTR; break; } - /* fallthrough */ + fallthrough; case -ERESTARTNOINTR: regs->gprs[2] = regs->orig_gpr2; regs->psw.addr = @@ -498,6 +488,7 @@ void do_signal(struct pt_regs *regs) } /* No longer in a system call */ clear_pt_regs_flag(regs, PIF_SYSCALL); + rseq_signal_deliver(&ksig, regs); if (is_compat_task()) handle_signal32(&ksig, oldset, regs); @@ -513,16 +504,22 @@ void do_signal(struct pt_regs *regs) switch (regs->gprs[2]) { case -ERESTART_RESTARTBLOCK: /* Restart with sys_restart_syscall */ - regs->int_code = __NR_restart_syscall; - /* fallthrough */ + regs->gprs[2] = regs->orig_gpr2; + current->restart_block.arch_data = regs->psw.addr; + if (is_compat_task()) + regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall); + else + regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall); + if (test_thread_flag(TIF_SINGLE_STEP)) + clear_thread_flag(TIF_PER_TRAP); + break; case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: - /* Restart system call with magic TIF bit. 
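/*
 * A minimal sketch of the restorer selection introduced in setup_frame()
 * and setup_rt_frame() above: when the handler does not supply
 * SA_RESTORER, the return trampoline now comes from the vDSO instead of
 * an svc stub written onto the signal stack. VDSO64_SYMBOL() usage
 * follows the diff; the helper is illustrative.
 */
static unsigned long sketch_pick_restorer(struct ksignal *ksig)
{
	if (ksig->ka.sa.sa_flags & SA_RESTORER)
		return (unsigned long)ksig->ka.sa.sa_restorer;
	/* fall back to the vDSO-provided trampoline */
	return VDSO64_SYMBOL(current, rt_sigreturn);
}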
*/ regs->gprs[2] = regs->orig_gpr2; - set_pt_regs_flag(regs, PIF_SYSCALL); + regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); if (test_thread_flag(TIF_SINGLE_STEP)) - clear_pt_regs_flag(regs, PIF_PER_TRAP); + clear_thread_flag(TIF_PER_TRAP); break; } } @@ -532,10 +529,3 @@ void do_signal(struct pt_regs *regs) */ restore_saved_sigmask(); } - -void do_notify_resume(struct pt_regs *regs) -{ - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); -} diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 2794cad9312e..c39d9f0d4b1c 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -5,7 +5,6 @@ * Copyright IBM Corp. 1999, 2012 * Author(s): Denis Joseph Barrow, * Martin Schwidefsky <schwidefsky@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, * * based on other smp stuff by * (c) 1995 Alan Cox, CymruNET Ltd <alan@cymru.net> @@ -30,6 +29,7 @@ #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/irqflags.h> +#include <linux/irq_work.h> #include <linux/cpu.h> #include <linux/slab.h> #include <linux/sched/hotplug.h> @@ -37,6 +37,8 @@ #include <linux/crash_dump.h> #include <linux/kprobes.h> #include <asm/asm-offsets.h> +#include <asm/ctlreg.h> +#include <asm/pfault.h> #include <asm/diag.h> #include <asm/switch_to.h> #include <asm/facility.h> @@ -45,9 +47,8 @@ #include <asm/irq.h> #include <asm/tlbflush.h> #include <asm/vtimer.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/sclp.h> -#include <asm/vdso.h> #include <asm/debug.h> #include <asm/os_info.h> #include <asm/sigp.h> @@ -55,12 +56,16 @@ #include <asm/nmi.h> #include <asm/stacktrace.h> #include <asm/topology.h> +#include <asm/vdso.h> +#include <asm/maccess.h> #include "entry.h" enum { ec_schedule = 0, ec_call_function_single, ec_stop_cpu, + ec_mcck_pending, + ec_irq_work, }; enum { @@ -71,7 +76,6 @@ enum { static DEFINE_PER_CPU(struct cpu *, cpu_device); struct pcpu { - struct lowcore *lowcore; /* lowcore page(s) for the cpu */ unsigned long ec_mask; /* bit mask for ec_xxx functions */ unsigned long ec_clk; /* sigp timestamp for ec_xxx */ signed char state; /* physical cpu state */ @@ -93,6 +97,7 @@ __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS]; #endif static unsigned int smp_max_threads __initdata = -1U; +cpumask_t cpu_setup_mask; static int __init early_nosmt(char *s) { @@ -110,7 +115,7 @@ early_param("smt", early_smt); /* * The smp_cpu_state_mutex must be held when changing the state or polarization - * member of a pcpu data structure within the pcpu_devices arreay. + * member of a pcpu data structure within the pcpu_devices array. 
*/ DEFINE_MUTEX(smp_cpu_state_mutex); @@ -145,7 +150,7 @@ static int pcpu_sigp_retry(struct pcpu *pcpu, u8 order, u32 parm) static inline int pcpu_stopped(struct pcpu *pcpu) { - u32 uninitialized_var(status); + u32 status; if (__pcpu_sigp(pcpu->address, SIGP_SENSE, 0, &status) != SIGP_CC_STATUS_STORED) @@ -188,102 +193,100 @@ static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit) static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) { - unsigned long async_stack, nodat_stack; + unsigned long async_stack, nodat_stack, mcck_stack; struct lowcore *lc; - if (pcpu != &pcpu_devices[0]) { - pcpu->lowcore = (struct lowcore *) - __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER); - nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); - if (!pcpu->lowcore || !nodat_stack) - goto out; - } else { - nodat_stack = pcpu->lowcore->nodat_stack - STACK_INIT_OFFSET; - } + lc = (struct lowcore *) __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER); + nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); async_stack = stack_alloc(); - if (!async_stack) + mcck_stack = stack_alloc(); + if (!lc || !nodat_stack || !async_stack || !mcck_stack) goto out; - lc = pcpu->lowcore; memcpy(lc, &S390_lowcore, 512); memset((char *) lc + 512, 0, sizeof(*lc) - 512); lc->async_stack = async_stack + STACK_INIT_OFFSET; lc->nodat_stack = nodat_stack + STACK_INIT_OFFSET; + lc->mcck_stack = mcck_stack + STACK_INIT_OFFSET; lc->cpu_nr = cpu; lc->spinlock_lockval = arch_spin_lockval(cpu); lc->spinlock_index = 0; - lc->br_r1_trampoline = 0x07f1; /* br %r1 */ - if (nmi_alloc_per_cpu(lc)) - goto out_async; - if (vdso_alloc_per_cpu(lc)) + lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); + lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); + lc->preempt_count = PREEMPT_DISABLED; + if (nmi_alloc_mcesa(&lc->mcesad)) + goto out; + if (abs_lowcore_map(cpu, lc, true)) goto out_mcesa; lowcore_ptr[cpu] = lc; - pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, (u32)(unsigned long) lc); + pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, __pa(lc)); return 0; out_mcesa: - nmi_free_per_cpu(lc); -out_async: - stack_free(async_stack); + nmi_free_mcesa(&lc->mcesad); out: - if (pcpu != &pcpu_devices[0]) { - free_pages(nodat_stack, THREAD_SIZE_ORDER); - free_pages((unsigned long) pcpu->lowcore, LC_ORDER); - } + stack_free(mcck_stack); + stack_free(async_stack); + free_pages(nodat_stack, THREAD_SIZE_ORDER); + free_pages((unsigned long) lc, LC_ORDER); return -ENOMEM; } static void pcpu_free_lowcore(struct pcpu *pcpu) { - unsigned long async_stack, nodat_stack, lowcore; - - nodat_stack = pcpu->lowcore->nodat_stack - STACK_INIT_OFFSET; - async_stack = pcpu->lowcore->async_stack - STACK_INIT_OFFSET; - lowcore = (unsigned long) pcpu->lowcore; + unsigned long async_stack, nodat_stack, mcck_stack; + struct lowcore *lc; + int cpu; + cpu = pcpu - pcpu_devices; + lc = lowcore_ptr[cpu]; + nodat_stack = lc->nodat_stack - STACK_INIT_OFFSET; + async_stack = lc->async_stack - STACK_INIT_OFFSET; + mcck_stack = lc->mcck_stack - STACK_INIT_OFFSET; pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0); - lowcore_ptr[pcpu - pcpu_devices] = NULL; - vdso_free_per_cpu(pcpu->lowcore); - nmi_free_per_cpu(pcpu->lowcore); + lowcore_ptr[cpu] = NULL; + abs_lowcore_unmap(cpu); + nmi_free_mcesa(&lc->mcesad); stack_free(async_stack); - if (pcpu == &pcpu_devices[0]) - return; + stack_free(mcck_stack); free_pages(nodat_stack, THREAD_SIZE_ORDER); - free_pages(lowcore, LC_ORDER); + free_pages((unsigned long) lc, LC_ORDER); } static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) { - struct 
lowcore *lc = pcpu->lowcore; + struct lowcore *lc, *abs_lc; + lc = lowcore_ptr[cpu]; cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); lc->cpu_nr = cpu; + lc->restart_flags = RESTART_FLAG_CTLREGS; lc->spinlock_lockval = arch_spin_lockval(cpu); lc->spinlock_index = 0; lc->percpu_offset = __per_cpu_offset[cpu]; lc->kernel_asce = S390_lowcore.kernel_asce; - lc->user_asce = S390_lowcore.kernel_asce; + lc->user_asce = s390_invalid_asce; lc->machine_flags = S390_lowcore.machine_flags; lc->user_timer = lc->system_timer = lc->steal_timer = lc->avg_steal_timer = 0; - __ctl_store(lc->cregs_save_area, 0, 15); + abs_lc = get_abs_lowcore(); + memcpy(lc->cregs_save_area, abs_lc->cregs_save_area, sizeof(lc->cregs_save_area)); + put_abs_lowcore(abs_lc); lc->cregs_save_area[1] = lc->kernel_asce; - lc->cregs_save_area[7] = lc->vdso_asce; + lc->cregs_save_area[7] = lc->user_asce; save_access_regs((unsigned int *) lc->access_regs_save_area); - memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list, - sizeof(lc->stfle_fac_list)); - memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list, - sizeof(lc->alt_stfle_fac_list)); arch_spin_lock_setup(cpu); } static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) { - struct lowcore *lc = pcpu->lowcore; + struct lowcore *lc; + int cpu; - lc->kernel_stack = (unsigned long) task_stack_page(tsk) - + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); - lc->current_task = (unsigned long) tsk; + cpu = pcpu - pcpu_devices; + lc = lowcore_ptr[cpu]; + lc->kernel_stack = (unsigned long)task_stack_page(tsk) + STACK_INIT_OFFSET; + lc->current_task = (unsigned long)tsk; lc->lpp = LPP_MAGIC; lc->current_pid = tsk->pid; lc->user_timer = tsk->thread.user_timer; @@ -296,41 +299,59 @@ static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data) { - struct lowcore *lc = pcpu->lowcore; + struct lowcore *lc; + int cpu; - lc->restart_stack = lc->nodat_stack; + cpu = pcpu - pcpu_devices; + lc = lowcore_ptr[cpu]; + lc->restart_stack = lc->kernel_stack; lc->restart_fn = (unsigned long) func; lc->restart_data = (unsigned long) data; - lc->restart_source = -1UL; + lc->restart_source = -1U; pcpu_sigp_retry(pcpu, SIGP_RESTART, 0); } +typedef void (pcpu_delegate_fn)(void *); + /* * Call function via PSW restart on pcpu and stop the current cpu. */ -static void __pcpu_delegate(void (*func)(void*), void *data) +static void __pcpu_delegate(pcpu_delegate_fn *func, void *data) { func(data); /* should not return */ } -static void __no_sanitize_address pcpu_delegate(struct pcpu *pcpu, - void (*func)(void *), - void *data, unsigned long stack) +static void pcpu_delegate(struct pcpu *pcpu, + pcpu_delegate_fn *func, + void *data, unsigned long stack) { - struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices]; - unsigned long source_cpu = stap(); + struct lowcore *lc, *abs_lc; + unsigned int source_cpu; - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); - if (pcpu->address == source_cpu) - CALL_ON_STACK(__pcpu_delegate, stack, 2, func, data); + lc = lowcore_ptr[pcpu - pcpu_devices]; + source_cpu = stap(); + + if (pcpu->address == source_cpu) { + call_on_stack(2, stack, void, __pcpu_delegate, + pcpu_delegate_fn *, func, void *, data); + } /* Stop target cpu (if func returns this stops the current cpu). 
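/*
 * A minimal sketch of the "run a function via PSW restart" mechanism used
 * by pcpu_start_fn() above: the target CPU's lowcore restart fields are
 * filled in, and a SIGP RESTART order makes that CPU load its restart PSW
 * and call func(data) on the given stack. All names come from the diff;
 * the helper is illustrative.
 */
static void sketch_start_fn(struct pcpu *pcpu, struct lowcore *lc,
			    void (*func)(void *), void *data)
{
	lc->restart_stack = lc->kernel_stack;	/* stack func() runs on */
	lc->restart_fn = (unsigned long)func;
	lc->restart_data = (unsigned long)data;
	lc->restart_source = -1U;		/* no requester to answer */
	pcpu_sigp_retry(pcpu, SIGP_RESTART, 0);	/* kick the target CPU */
}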
*/ pcpu_sigp_retry(pcpu, SIGP_STOP, 0); + pcpu_sigp_retry(pcpu, SIGP_CPU_RESET, 0); /* Restart func on the target cpu and stop the current cpu. */ - mem_assign_absolute(lc->restart_stack, stack); - mem_assign_absolute(lc->restart_fn, (unsigned long) func); - mem_assign_absolute(lc->restart_data, (unsigned long) data); - mem_assign_absolute(lc->restart_source, source_cpu); - __bpon(); + if (lc) { + lc->restart_stack = stack; + lc->restart_fn = (unsigned long)func; + lc->restart_data = (unsigned long)data; + lc->restart_source = source_cpu; + } else { + abs_lc = get_abs_lowcore(); + abs_lc->restart_stack = stack; + abs_lc->restart_fn = (unsigned long)func; + abs_lc->restart_data = (unsigned long)data; + abs_lc->restart_source = source_cpu; + put_abs_lowcore(abs_lc); + } asm volatile( "0: sigp 0,%0,%2 # sigp restart to target cpu\n" " brc 2,0b # busy, try again\n" @@ -382,7 +403,7 @@ void smp_call_online_cpu(void (*func)(void *), void *data) */ void smp_call_ipl_cpu(void (*func)(void *), void *data) { - struct lowcore *lc = pcpu_devices->lowcore; + struct lowcore *lc = lowcore_ptr[0]; if (pcpu_devices[0].address == stap()) lc = &S390_lowcore; @@ -401,7 +422,12 @@ int smp_find_processor_id(u16 address) return -1; } -bool arch_vcpu_is_preempted(int cpu) +void schedule_mcck_handler(void) +{ + pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_mcck_pending); +} + +bool notrace arch_vcpu_is_preempted(int cpu) { if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu)) return false; @@ -411,17 +437,15 @@ bool arch_vcpu_is_preempted(int cpu) } EXPORT_SYMBOL(arch_vcpu_is_preempted); -void smp_yield_cpu(int cpu) +void notrace smp_yield_cpu(int cpu) { - if (MACHINE_HAS_DIAG9C) { - diag_stat_inc_norecursion(DIAG_STAT_X09C); - asm volatile("diag %0,0,0x9c" - : : "d" (pcpu_devices[cpu].address)); - } else if (MACHINE_HAS_DIAG44 && !smp_cpu_mtid) { - diag_stat_inc_norecursion(DIAG_STAT_X044); - asm volatile("diag 0,0,0x44"); - } + if (!MACHINE_HAS_DIAG9C) + return; + diag_stat_inc_norecursion(DIAG_STAT_X09C); + asm volatile("diag %0,0,0x9c" + : : "d" (pcpu_devices[cpu].address)); } +EXPORT_SYMBOL_GPL(smp_yield_cpu); /* * Send cpus emergency shutdown signal. This gives the cpus the @@ -429,10 +453,12 @@ void smp_yield_cpu(int cpu) */ void notrace smp_emergency_stop(void) { - cpumask_t cpumask; + static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; + static cpumask_t cpumask; u64 end; int cpu; + arch_spin_lock(&lock); cpumask_copy(&cpumask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &cpumask); @@ -453,6 +479,7 @@ void notrace smp_emergency_stop(void) break; cpu_relax(); } + arch_spin_unlock(&lock); } NOKPROBE_SYMBOL(smp_emergency_stop); @@ -464,7 +491,7 @@ void smp_send_stop(void) int cpu; /* Disable all interrupts/machine checks */ - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); + __load_psw_mask(PSW_KERNEL_BITS); trace_hardirqs_off(); debug_set_critical(); @@ -498,6 +525,10 @@ static void smp_handle_ext_call(void) scheduler_ipi(); if (test_bit(ec_call_function_single, &bits)) generic_smp_call_function_single_interrupt(); + if (test_bit(ec_mcck_pending, &bits)) + s390_handle_mcck(); + if (test_bit(ec_irq_work, &bits)) + irq_work_run(); } static void do_ext_call_interrupt(struct ext_code ext_code, @@ -525,71 +556,37 @@ void arch_send_call_function_single_ipi(int cpu) * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... 
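/*
 * A minimal sketch of the dispatch pattern in smp_handle_ext_call() above:
 * all pending ec_* bits are claimed atomically, then each requested
 * service runs at most once per interrupt. The xchg()-based fetch-and-
 * clear is an assumption made for illustration; the bit names come from
 * the enum at the top of this file.
 */
static void sketch_handle_ext_call(struct pcpu *pcpu)
{
	unsigned long bits = xchg(&pcpu->ec_mask, 0);	/* claim all bits */

	if (test_bit(ec_schedule, &bits))
		scheduler_ipi();
	if (test_bit(ec_mcck_pending, &bits))
		s390_handle_mcck();
	if (test_bit(ec_irq_work, &bits))
		irq_work_run();
}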
*/ -void smp_send_reschedule(int cpu) +void arch_smp_send_reschedule(int cpu) { pcpu_ec_call(pcpu_devices + cpu, ec_schedule); } -/* - * parameter area for the set/clear control bit callbacks - */ -struct ec_creg_mask_parms { - unsigned long orval; - unsigned long andval; - int cr; -}; - -/* - * callback for setting/clearing control bits - */ -static void smp_ctl_bit_callback(void *info) -{ - struct ec_creg_mask_parms *pp = info; - unsigned long cregs[16]; - - __ctl_store(cregs, 0, 15); - cregs[pp->cr] = (cregs[pp->cr] & pp->andval) | pp->orval; - __ctl_load(cregs, 0, 15); -} - -/* - * Set a bit in a control register of all cpus - */ -void smp_ctl_set_bit(int cr, int bit) -{ - struct ec_creg_mask_parms parms = { 1UL << bit, -1UL, cr }; - - on_each_cpu(smp_ctl_bit_callback, &parms, 1); -} -EXPORT_SYMBOL(smp_ctl_set_bit); - -/* - * Clear a bit in a control register of all cpus - */ -void smp_ctl_clear_bit(int cr, int bit) +#ifdef CONFIG_IRQ_WORK +void arch_irq_work_raise(void) { - struct ec_creg_mask_parms parms = { 0, ~(1UL << bit), cr }; - - on_each_cpu(smp_ctl_bit_callback, &parms, 1); + pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_irq_work); } -EXPORT_SYMBOL(smp_ctl_clear_bit); +#endif #ifdef CONFIG_CRASH_DUMP int smp_store_status(int cpu) { - struct pcpu *pcpu = pcpu_devices + cpu; + struct lowcore *lc; + struct pcpu *pcpu; unsigned long pa; - pa = __pa(&pcpu->lowcore->floating_pt_save_area); + pcpu = pcpu_devices + cpu; + lc = lowcore_ptr[cpu]; + pa = __pa(&lc->floating_pt_save_area); if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS, pa) != SIGP_CC_ORDER_CODE_ACCEPTED) return -EIO; - if (!MACHINE_HAS_VX && !MACHINE_HAS_GS) + if (!cpu_has_vx() && !MACHINE_HAS_GS) return 0; - pa = __pa(pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK); + pa = lc->mcesad & MCESA_ORIGIN_MASK; if (MACHINE_HAS_GS) - pa |= pcpu->lowcore->mcesad & MCESA_LC_MASK; + pa |= lc->mcesad & MCESA_LC_MASK; if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_ADDITIONAL_STATUS, pa) != SIGP_CC_ORDER_CODE_ACCEPTED) return -EIO; @@ -599,14 +596,14 @@ int smp_store_status(int cpu) /* * Collect CPU state of the previous, crashed system. * There are four cases: - * 1) standard zfcp dump - * condition: OLDMEM_BASE == NULL && ipl_info.type == IPL_TYPE_FCP_DUMP + * 1) standard zfcp/nvme dump + * condition: OLDMEM_BASE == NULL && is_ipl_type_dump() == true * The state for all CPUs except the boot CPU needs to be collected * with sigp stop-and-store-status. The boot CPU state is located in * the absolute lowcore of the memory stored in the HSA. The zcore code * will copy the boot CPU state from the HSA. - * 2) stand-alone kdump for SCSI (zfcp dump with swapped memory) - * condition: OLDMEM_BASE != NULL && ipl_info.type == IPL_TYPE_FCP_DUMP + * 2) stand-alone kdump for SCSI/NVMe (zfcp/nvme dump with swapped memory) + * condition: OLDMEM_BASE != NULL && is_ipl_type_dump() == true * The state for all CPUs except the boot CPU needs to be collected * with sigp stop-and-store-status. 
The firmware or the boot-loader * stored the registers of the boot CPU in the absolute lowcore in the @@ -622,42 +619,39 @@ int smp_store_status(int cpu) * This case does not exist for s390 anymore, setup_arch explicitly * deactivates the elfcorehdr= kernel parameter */ -static __init void smp_save_cpu_vxrs(struct save_area *sa, u16 addr, - bool is_boot_cpu, unsigned long page) +static bool dump_available(void) { - __vector128 *vxrs = (__vector128 *) page; - - if (is_boot_cpu) - vxrs = boot_cpu_vector_save_area; - else - __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, page); - save_area_add_vxrs(sa, vxrs); + return oldmem_data.start || is_ipl_type_dump(); } -static __init void smp_save_cpu_regs(struct save_area *sa, u16 addr, - bool is_boot_cpu, unsigned long page) +void __init smp_save_dump_ipl_cpu(void) { - void *regs = (void *) page; + struct save_area *sa; + void *regs; - if (is_boot_cpu) - copy_oldmem_kernel(regs, (void *) __LC_FPREGS_SAVE_AREA, 512); - else - __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, page); + if (!dump_available()) + return; + sa = save_area_alloc(true); + regs = memblock_alloc(512, 8); + if (!sa || !regs) + panic("could not allocate memory for boot CPU save area\n"); + copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512); save_area_add_regs(sa, regs); + memblock_free(regs, 512); + if (cpu_has_vx()) + save_area_add_vxrs(sa, boot_cpu_vector_save_area); } -void __init smp_save_dump_cpus(void) +void __init smp_save_dump_secondary_cpus(void) { int addr, boot_cpu_addr, max_cpu_addr; struct save_area *sa; - unsigned long page; - bool is_boot_cpu; + void *page; - if (!(OLDMEM_BASE || ipl_info.type == IPL_TYPE_FCP_DUMP)) - /* No previous system present, normal boot. */ + if (!dump_available()) return; /* Allocate a page as dumping area for the store status sigps */ - page = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, 1UL << 31); + page = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); if (!page) panic("ERROR: Failed to allocate %lx bytes below %lx\n", PAGE_SIZE, 1UL << 31); @@ -667,29 +661,23 @@ void __init smp_save_dump_cpus(void) boot_cpu_addr = stap(); max_cpu_addr = SCLP_MAX_CORES << sclp.mtid_prev; for (addr = 0; addr <= max_cpu_addr; addr++) { + if (addr == boot_cpu_addr) + continue; if (__pcpu_sigp_relax(addr, SIGP_SENSE, 0) == SIGP_CC_NOT_OPERATIONAL) continue; - is_boot_cpu = (addr == boot_cpu_addr); - /* Allocate save area */ - sa = save_area_alloc(is_boot_cpu); + sa = save_area_alloc(false); if (!sa) panic("could not allocate memory for save area\n"); - if (MACHINE_HAS_VX) - /* Get the vector registers */ - smp_save_cpu_vxrs(sa, addr, is_boot_cpu, page); - /* - * For a zfcp dump OLDMEM_BASE == NULL and the registers - * of the boot CPU are stored in the HSA. 
To retrieve - * these registers an SCLP request is required which is - * done by drivers/s390/char/zcore.c:init_cpu_info() - */ - if (!is_boot_cpu || OLDMEM_BASE) - /* Get the CPU registers */ - smp_save_cpu_regs(sa, addr, is_boot_cpu, page); + __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page)); + save_area_add_regs(sa, page); + if (cpu_has_vx()) { + __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, __pa(page)); + save_area_add_vxrs(sa, page); + } } memblock_free(page, PAGE_SIZE); - diag_dma_ops.diag308_reset(); + diag_amode31_ops.diag308_reset(); pcpu_set_smt(0); } #endif /* CONFIG_CRASH_DUMP */ @@ -704,6 +692,11 @@ int smp_cpu_get_polarization(int cpu) return pcpu_devices[cpu].polarization; } +int smp_cpu_get_cpu_address(int cpu) +{ + return pcpu_devices[cpu].address; +} + static void __ref smp_get_core_info(struct sclp_core_info *info, int early) { static int use_sigp_detection; @@ -763,11 +756,13 @@ static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, static int __smp_rescan_cpus(struct sclp_core_info *info, bool early) { struct sclp_core_entry *core; - cpumask_t avail; + static cpumask_t avail; bool configured; u16 core_id; int nr, i; + cpus_read_lock(); + mutex_lock(&smp_cpu_state_mutex); nr = 0; cpumask_xor(&avail, cpu_possible_mask, cpu_present_mask); /* @@ -788,6 +783,8 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, bool early) configured = i < info->configured; nr += smp_add_core(&info->core[i], &avail, configured, early); } + mutex_unlock(&smp_cpu_state_mutex); + cpus_read_unlock(); return nr; } @@ -835,83 +832,70 @@ void __init smp_detect_cpus(void) pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus); /* Add CPUs present at boot */ - get_online_cpus(); __smp_rescan_cpus(info, true); - put_online_cpus(); - memblock_free_early((unsigned long)info, sizeof(*info)); + memblock_free(info, sizeof(*info)); } -static void smp_init_secondary(void) +/* + * Activate a secondary processor. + */ +static void smp_start_secondary(void *cpuvoid) { - int cpu = smp_processor_id(); + int cpu = raw_smp_processor_id(); S390_lowcore.last_update_clock = get_tod_clock(); + S390_lowcore.restart_stack = (unsigned long)restart_stack; + S390_lowcore.restart_fn = (unsigned long)do_restart; + S390_lowcore.restart_data = 0; + S390_lowcore.restart_source = -1U; + S390_lowcore.restart_flags = 0; restore_access_regs(S390_lowcore.access_regs_save_area); - set_cpu_flag(CIF_ASCE_PRIMARY); - set_cpu_flag(CIF_ASCE_SECONDARY); cpu_init(); - preempt_disable(); + rcutree_report_cpu_starting(cpu); init_cpu_timer(); vtime_init(); + vdso_getcpu_init(); pfault_init(); - notify_cpu_starting(smp_processor_id()); + cpumask_set_cpu(cpu, &cpu_setup_mask); + update_cpu_masks(); + notify_cpu_starting(cpu); if (topology_cpu_dedicated(cpu)) set_cpu_flag(CIF_DEDICATED_CPU); else clear_cpu_flag(CIF_DEDICATED_CPU); - set_cpu_online(smp_processor_id(), true); + set_cpu_online(cpu, true); inc_irq_stat(CPU_RST); local_irq_enable(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -/* - * Activate a secondary processor. 
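/*
 * A minimal sketch of the per-CPU dump collection loop above: each
 * operational CPU stores its status into a scratch page via SIGP, the page
 * is appended to a save area, and with vector support a second SIGP order
 * fetches the VX state as well. Names follow the diff; error handling is
 * reduced to the panic shown there.
 */
static void __init sketch_save_one_cpu(int addr, void *page)
{
	struct save_area *sa = save_area_alloc(false);

	if (!sa)
		panic("could not allocate memory for save area\n");
	__pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page));
	save_area_add_regs(sa, page);
	if (cpu_has_vx()) {
		__pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, __pa(page));
		save_area_add_vxrs(sa, page);
	}
}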
- */ -static void __no_sanitize_address smp_start_secondary(void *cpuvoid) -{ - S390_lowcore.restart_stack = (unsigned long) restart_stack; - S390_lowcore.restart_fn = (unsigned long) do_restart; - S390_lowcore.restart_data = 0; - S390_lowcore.restart_source = -1UL; - __ctl_load(S390_lowcore.cregs_save_area, 0, 15); - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); - CALL_ON_STACK_NORETURN(smp_init_secondary, S390_lowcore.kernel_stack); -} - /* Upping and downing of CPUs */ int __cpu_up(unsigned int cpu, struct task_struct *tidle) { - struct pcpu *pcpu; - int base, i, rc; + struct pcpu *pcpu = pcpu_devices + cpu; + int rc; - pcpu = pcpu_devices + cpu; if (pcpu->state != CPU_STATE_CONFIGURED) return -EIO; - base = smp_get_base_cpu(cpu); - for (i = 0; i <= smp_cpu_mtid; i++) { - if (base + i < nr_cpu_ids) - if (cpu_online(base + i)) - break; - } - /* - * If this is the first CPU of the core to get online - * do an initial CPU reset. - */ - if (i > smp_cpu_mtid && - pcpu_sigp_retry(pcpu_devices + base, SIGP_INITIAL_CPU_RESET, 0) != + if (pcpu_sigp_retry(pcpu, SIGP_INITIAL_CPU_RESET, 0) != SIGP_CC_ORDER_CODE_ACCEPTED) return -EIO; rc = pcpu_alloc_lowcore(pcpu, cpu); if (rc) return rc; + /* + * Make sure global control register contents do not change + * until new CPU has initialized control registers. + */ + system_ctlreg_lock(); pcpu_prepare_secondary(pcpu, cpu); pcpu_attach_task(pcpu, tidle); pcpu_start_fn(pcpu, smp_start_secondary, NULL); /* Wait until cpu puts itself in the online & active maps */ while (!cpu_online(cpu)) cpu_relax(); + system_ctlreg_unlock(); return 0; } @@ -926,19 +910,23 @@ early_param("possible_cpus", _setup_possible_cpus); int __cpu_disable(void) { - unsigned long cregs[16]; + struct ctlreg cregs[16]; + int cpu; /* Handle possible pending IPIs */ smp_handle_ext_call(); - set_cpu_online(smp_processor_id(), false); + cpu = smp_processor_id(); + set_cpu_online(cpu, false); + cpumask_clear_cpu(cpu, &cpu_setup_mask); + update_cpu_masks(); /* Disable pseudo page faults on this cpu. */ pfault_fini(); /* Disable interrupt sources via control register. 
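/*
 * A minimal sketch of the bring-up window that __cpu_up() above guards
 * with system_ctlreg_lock(): between snapshotting the control registers
 * into the new CPU's lowcore and that CPU loading them, no other CPU may
 * broadcast control-register changes, or the newcomer would start with
 * stale values. Condensed from the diff; illustrative only.
 */
static int sketch_cpu_up_locked(struct pcpu *pcpu, int cpu,
				struct task_struct *tidle)
{
	system_ctlreg_lock();			/* freeze global ctlregs */
	pcpu_prepare_secondary(pcpu, cpu);	/* snapshot cregs to lowcore */
	pcpu_attach_task(pcpu, tidle);
	pcpu_start_fn(pcpu, smp_start_secondary, NULL);
	while (!cpu_online(cpu))		/* wait for cregs to be loaded */
		cpu_relax();
	system_ctlreg_unlock();
	return 0;
}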
*/ - __ctl_store(cregs, 0, 15); - cregs[0] &= ~0x0000ee70UL; /* disable all external interrupts */ - cregs[6] &= ~0xff000000UL; /* disable all I/O interrupts */ - cregs[14] &= ~0x1f000000UL; /* disable most machine checks */ - __ctl_load(cregs, 0, 15); + __local_ctl_store(0, 15, cregs); + cregs[0].val &= ~0x0000ee70UL; /* disable all external interrupts */ + cregs[6].val &= ~0xff000000UL; /* disable all I/O interrupts */ + cregs[14].val &= ~0x1f000000UL; /* disable most machine checks */ + __local_ctl_load(0, 15, cregs); clear_cpu_flag(CIF_NOHZ_DELAY); return 0; } @@ -959,7 +947,6 @@ void __cpu_die(unsigned int cpu) void __noreturn cpu_die(void) { idle_task_exit(); - __bpon(); pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0); for (;;) ; } @@ -979,12 +966,12 @@ void __init smp_fill_possible_mask(void) void __init smp_prepare_cpus(unsigned int max_cpus) { - /* request the 0x1201 emergency signal external interrupt */ if (register_external_irq(EXT_IRQ_EMERGENCY_SIG, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1201"); - /* request the 0x1202 external call external interrupt */ + system_ctl_set_bit(0, 14); if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1202"); + system_ctl_set_bit(0, 13); } void __init smp_prepare_boot_cpu(void) @@ -993,15 +980,10 @@ void __init smp_prepare_boot_cpu(void) WARN_ON(!cpu_present(0) || !cpu_online(0)); pcpu->state = CPU_STATE_CONFIGURED; - pcpu->lowcore = (struct lowcore *)(unsigned long) store_prefix(); S390_lowcore.percpu_offset = __per_cpu_offset[0]; smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN); } -void __init smp_cpus_done(unsigned int max_cpus) -{ -} - void __init smp_setup_processor_id(void) { pcpu_devices[0].address = stap(); @@ -1044,14 +1026,12 @@ static ssize_t cpu_configure_store(struct device *dev, return -EINVAL; if (val != 0 && val != 1) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&smp_cpu_state_mutex); rc = -EBUSY; - /* disallow configuration changes of online cpus and cpu 0 */ + /* disallow configuration changes of online cpus */ cpu = dev->id; cpu = smp_get_base_cpu(cpu); - if (cpu == 0) - goto out; for (i = 0; i <= smp_cpu_mtid; i++) if (cpu_online(cpu + i)) goto out; @@ -1093,7 +1073,7 @@ static ssize_t cpu_configure_store(struct device *dev, } out: mutex_unlock(&smp_cpu_state_mutex); - put_online_cpus(); + cpus_read_unlock(); return rc ? 
rc : count; } static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store); @@ -1131,6 +1111,7 @@ static int smp_cpu_online(unsigned int cpu) return sysfs_create_group(&s->kobj, &cpu_online_attr_group); } + static int smp_cpu_pre_down(unsigned int cpu) { struct device *s = &per_cpu(cpu_device, cpu)->dev; @@ -1150,7 +1131,7 @@ static int smp_add_present_cpu(int cpu) return -ENOMEM; per_cpu(cpu_device, cpu) = c; s = &c->dev; - c->hotpluggable = 1; + c->hotpluggable = !!cpu; rc = register_cpu(c, cpu); if (rc) goto out; @@ -1179,11 +1160,7 @@ int __ref smp_rescan_cpus(void) if (!info) return -ENOMEM; smp_get_core_info(info, 0); - get_online_cpus(); - mutex_lock(&smp_cpu_state_mutex); nr = __smp_rescan_cpus(info, false); - mutex_unlock(&smp_cpu_state_mutex); - put_online_cpus(); kfree(info); if (nr) topology_schedule_update(); @@ -1208,11 +1185,17 @@ static DEVICE_ATTR_WO(rescan); static int __init s390_smp_init(void) { + struct device *dev_root; int cpu, rc = 0; - rc = device_create_file(cpu_subsys.dev_root, &dev_attr_rescan); - if (rc) - return rc; + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + rc = device_create_file(dev_root, &dev_attr_rescan); + put_device(dev_root); + if (rc) + return rc; + } + for_each_present_cpu(cpu) { rc = smp_add_present_cpu(cpu); if (rc) diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index fc5419ac64c8..94f440e38303 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -3,13 +3,15 @@ * Stack trace management functions * * Copyright IBM Corp. 2006 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ #include <linux/stacktrace.h> +#include <linux/uaccess.h> +#include <linux/compat.h> #include <asm/stacktrace.h> #include <asm/unwind.h> #include <asm/kprobes.h> +#include <asm/ptrace.h> void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task, struct pt_regs *regs) @@ -19,17 +21,11 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, unwind_for_each_frame(&state, task, regs, 0) { addr = unwind_get_return_address(&state); - if (!addr || !consume_entry(cookie, addr, false)) + if (!addr || !consume_entry(cookie, addr)) break; } } -/* - * This function returns an error if it detects any unreliable features of the - * stack. Otherwise it guarantees that the stack trace is reliable. - * - * If the task is not 'current', the caller *must* ensure the task is inactive. - */ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task) { @@ -47,16 +43,16 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (!addr) return -EINVAL; -#ifdef CONFIG_KPROBES +#ifdef CONFIG_RETHOOK /* - * Mark stacktraces with kretprobed functions on them + * Mark stacktraces with krethook functions on them * as unreliable. 
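/*
 * A minimal sketch of the bus_get_dev_root() pattern adopted by
 * s390_smp_init() above: look up the subsystem root device, use it, and
 * drop the reference with put_device() instead of dereferencing
 * cpu_subsys.dev_root directly. All names are taken from the diff.
 */
static int sketch_add_rescan_attr(void)
{
	struct device *dev_root = bus_get_dev_root(&cpu_subsys);
	int rc = 0;

	if (dev_root) {
		rc = device_create_file(dev_root, &dev_attr_rescan);
		put_device(dev_root);	/* release bus_get_dev_root() ref */
	}
	return rc;
}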
*/ - if (state.ip == (unsigned long)kretprobe_trampoline) + if (state.ip == (unsigned long)arch_rethook_trampoline) return -EINVAL; #endif - if (!consume_entry(cookie, addr, false)) + if (!consume_entry(cookie, addr)) return -EINVAL; } @@ -65,3 +61,43 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, return -EINVAL; return 0; } + +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) +{ + struct stack_frame_user __user *sf; + unsigned long ip, sp; + bool first = true; + + if (is_compat_task()) + return; + if (!consume_entry(cookie, instruction_pointer(regs))) + return; + sf = (void __user *)user_stack_pointer(regs); + pagefault_disable(); + while (1) { + if (__get_user(sp, &sf->back_chain)) + break; + if (__get_user(ip, &sf->gprs[8])) + break; + if (ip & 0x1) { + /* + * If the instruction address is invalid, and this + * is the first stack frame, assume r14 has not + * been written to the stack yet. Otherwise exit. + */ + if (first && !(regs->gprs[14] & 0x1)) + ip = regs->gprs[14]; + else + break; + } + if (!consume_entry(cookie, ip)) + break; + /* Sanity check: ABI requires SP to be aligned 8 bytes. */ + if (!sp || sp & 0x7) + break; + sf = (void __user *)sp; + first = false; + } + pagefault_enable(); +} diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c index 888cc2f166db..30bb20461db4 100644 --- a/arch/s390/kernel/sthyi.c +++ b/arch/s390/kernel/sthyi.c @@ -317,7 +317,9 @@ static void fill_diag(struct sthyi_sctns *sctns) if (pages <= 0) return; - diag204_buf = vmalloc(array_size(pages, PAGE_SIZE)); + diag204_buf = __vmalloc_node(array_size(pages, PAGE_SIZE), + PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); if (!diag204_buf) return; @@ -395,19 +397,18 @@ out: static int sthyi(u64 vaddr, u64 *rc) { - register u64 code asm("0") = 0; - register u64 addr asm("2") = vaddr; - register u64 rcode asm("3"); + union register_pair r1 = { .even = 0, }; /* subcode */ + union register_pair r2 = { .even = vaddr, }; int cc; asm volatile( - ".insn rre,0xB2560000,%[code],%[addr]\n" + ".insn rre,0xB2560000,%[r1],%[r2]\n" "ipm %[cc]\n" "srl %[cc],28\n" - : [cc] "=d" (cc), "=d" (rcode) - : [code] "d" (code), [addr] "a" (addr) + : [cc] "=&d" (cc), [r2] "+&d" (r2.pair) + : [r1] "d" (r1.pair) : "memory", "cc"); - *rc = rcode; + *rc = r2.odd; return cc; } @@ -460,9 +461,9 @@ static int sthyi_update_cache(u64 *rc) * * Fills the destination with system information returned by the STHYI * instruction. The data is generated by emulation or execution of STHYI, - * if available. The return value is the condition code that would be - * returned, the rc parameter is the return code which is passed in - * register R2 + 1. + * if available. The return value is either a negative error value or + * the condition code that would be returned, the rc parameter is the + * return code which is passed in register R2 + 1. */ int sthyi_fill(void *dst, u64 *rc) { diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c deleted file mode 100644 index 75b7b307946e..000000000000 --- a/arch/s390/kernel/suspend.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Suspend support specific for s390. - * - * Copyright IBM Corp. 
2009 - * - * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com> - */ - -#include <linux/pfn.h> -#include <linux/suspend.h> -#include <linux/mm.h> -#include <linux/pci.h> -#include <asm/ctl_reg.h> -#include <asm/ipl.h> -#include <asm/cio.h> -#include <asm/sections.h> -#include "entry.h" - -/* - * The restore of the saved pages in an hibernation image will set - * the change and referenced bits in the storage key for each page. - * Overindication of the referenced bits after an hibernation cycle - * does not cause any harm but the overindication of the change bits - * would cause trouble. - * Use the ARCH_SAVE_PAGE_KEYS hooks to save the storage key of each - * page to the most significant byte of the associated page frame - * number in the hibernation image. - */ - -/* - * Key storage is allocated as a linked list of pages. - * The size of the keys array is (PAGE_SIZE - sizeof(long)) - */ -struct page_key_data { - struct page_key_data *next; - unsigned char data[]; -}; - -#define PAGE_KEY_DATA_SIZE (PAGE_SIZE - sizeof(struct page_key_data *)) - -static struct page_key_data *page_key_data; -static struct page_key_data *page_key_rp, *page_key_wp; -static unsigned long page_key_rx, page_key_wx; -unsigned long suspend_zero_pages; - -/* - * For each page in the hibernation image one additional byte is - * stored in the most significant byte of the page frame number. - * On suspend no additional memory is required but on resume the - * keys need to be memorized until the page data has been restored. - * Only then can the storage keys be set to their old state. - */ -unsigned long page_key_additional_pages(unsigned long pages) -{ - return DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE); -} - -/* - * Free page_key_data list of arrays. - */ -void page_key_free(void) -{ - struct page_key_data *pkd; - - while (page_key_data) { - pkd = page_key_data; - page_key_data = pkd->next; - free_page((unsigned long) pkd); - } -} - -/* - * Allocate page_key_data list of arrays with enough room to store - * one byte for each page in the hibernation image. - */ -int page_key_alloc(unsigned long pages) -{ - struct page_key_data *pk; - unsigned long size; - - size = DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE); - while (size--) { - pk = (struct page_key_data *) get_zeroed_page(GFP_KERNEL); - if (!pk) { - page_key_free(); - return -ENOMEM; - } - pk->next = page_key_data; - page_key_data = pk; - } - page_key_rp = page_key_wp = page_key_data; - page_key_rx = page_key_wx = 0; - return 0; -} - -/* - * Save the storage key into the upper 8 bits of the page frame number. - */ -void page_key_read(unsigned long *pfn) -{ - struct page *page; - unsigned long addr; - unsigned char key; - - page = pfn_to_page(*pfn); - addr = (unsigned long) page_address(page); - key = (unsigned char) page_get_storage_key(addr) & 0x7f; - if (arch_test_page_nodat(page)) - key |= 0x80; - *(unsigned char *) pfn = key; -} - -/* - * Extract the storage key from the upper 8 bits of the page frame number - * and store it in the page_key_data list of arrays. - */ -void page_key_memorize(unsigned long *pfn) -{ - page_key_wp->data[page_key_wx] = *(unsigned char *) pfn; - *(unsigned char *) pfn = 0; - if (++page_key_wx < PAGE_KEY_DATA_SIZE) - return; - page_key_wp = page_key_wp->next; - page_key_wx = 0; -} - -/* - * Get the next key from the page_key_data list of arrays and set the - * storage key of the page referred by @address. 
If @address refers to - * a "safe" page the swsusp_arch_resume code will transfer the storage - * key from the buffer page to the original page. - */ -void page_key_write(void *address) -{ - struct page *page; - unsigned char key; - - key = page_key_rp->data[page_key_rx]; - page_set_storage_key((unsigned long) address, key & 0x7f, 0); - page = virt_to_page(address); - if (key & 0x80) - arch_set_page_nodat(page, 0); - else - arch_set_page_dat(page, 0); - if (++page_key_rx >= PAGE_KEY_DATA_SIZE) - return; - page_key_rp = page_key_rp->next; - page_key_rx = 0; -} - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = PFN_DOWN(__pa(&__nosave_begin)); - unsigned long nosave_end_pfn = PFN_DOWN(__pa(&__nosave_end)); - unsigned long end_rodata_pfn = PFN_DOWN(__pa(__end_rodata)) - 1; - unsigned long stext_pfn = PFN_DOWN(__pa(_stext)); - - /* Always save lowcore pages (LC protection might be enabled). */ - if (pfn <= LC_PAGES) - return 0; - if (pfn >= nosave_begin_pfn && pfn < nosave_end_pfn) - return 1; - /* Skip memory holes and read-only pages (DCSS, ...). */ - if (pfn >= stext_pfn && pfn <= end_rodata_pfn) - return 0; - if (tprot(PFN_PHYS(pfn))) - return 1; - return 0; -} - -/* - * PM notifier callback for suspend - */ -static int suspend_pm_cb(struct notifier_block *nb, unsigned long action, - void *ptr) -{ - switch (action) { - case PM_SUSPEND_PREPARE: - case PM_HIBERNATION_PREPARE: - suspend_zero_pages = __get_free_pages(GFP_KERNEL, LC_ORDER); - if (!suspend_zero_pages) - return NOTIFY_BAD; - break; - case PM_POST_SUSPEND: - case PM_POST_HIBERNATION: - free_pages(suspend_zero_pages, LC_ORDER); - break; - default: - return NOTIFY_DONE; - } - return NOTIFY_OK; -} - -static int __init suspend_pm_init(void) -{ - pm_notifier(suspend_pm_cb, 0); - return 0; -} -arch_initcall(suspend_pm_init); - -void save_processor_state(void) -{ - /* swsusp_arch_suspend() actually saves all cpu register contents. - * Machine checks must be disabled since swsusp_arch_suspend() stores - * register contents to their lowcore save areas. That's the same - * place where register contents on machine checks would be saved. - * To avoid register corruption disable machine checks. - * We must also disable machine checks in the new psw mask for - * program checks, since swsusp_arch_suspend() may generate program - * checks. Disabling machine checks for all other new psw masks is - * just paranoia. - */ - local_mcck_disable(); - /* Disable lowcore protection */ - __ctl_clear_bit(0,28); - S390_lowcore.external_new_psw.mask &= ~PSW_MASK_MCHECK; - S390_lowcore.svc_new_psw.mask &= ~PSW_MASK_MCHECK; - S390_lowcore.io_new_psw.mask &= ~PSW_MASK_MCHECK; - S390_lowcore.program_new_psw.mask &= ~PSW_MASK_MCHECK; -} - -void restore_processor_state(void) -{ - S390_lowcore.external_new_psw.mask |= PSW_MASK_MCHECK; - S390_lowcore.svc_new_psw.mask |= PSW_MASK_MCHECK; - S390_lowcore.io_new_psw.mask |= PSW_MASK_MCHECK; - S390_lowcore.program_new_psw.mask |= PSW_MASK_MCHECK; - /* Enable lowcore protection */ - __ctl_set_bit(0,28); - local_mcck_enable(); -} - -/* Called at the end of swsusp_arch_resume */ -void s390_early_resume(void) -{ - lgr_info_log(); - channel_subsystem_reinit(); - zpci_rescan(); -} diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S deleted file mode 100644 index a7baf0b5f818..000000000000 --- a/arch/s390/kernel/swsusp.S +++ /dev/null @@ -1,276 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * S390 64-bit swsusp implementation - * - * Copyright IBM Corp. 
2009 - * - * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com> - * Michael Holzheu <holzheu@linux.vnet.ibm.com> - */ - -#include <linux/linkage.h> -#include <asm/page.h> -#include <asm/ptrace.h> -#include <asm/thread_info.h> -#include <asm/asm-offsets.h> -#include <asm/nospec-insn.h> -#include <asm/sigp.h> - -/* - * Save register context in absolute 0 lowcore and call swsusp_save() to - * create in-memory kernel image. The context is saved in the designated - * "store status" memory locations (see POP). - * We return from this function twice. The first time during the suspend to - * disk process. The second time via the swsusp_arch_resume() function - * (see below) in the resume process. - * This function runs with disabled interrupts. - */ - GEN_BR_THUNK %r14 - - .section .text -ENTRY(swsusp_arch_suspend) - lg %r1,__LC_NODAT_STACK - stmg %r6,%r15,__SF_GPRS(%r1) - aghi %r1,-STACK_FRAME_OVERHEAD - stg %r15,__SF_BACKCHAIN(%r1) - lgr %r15,%r1 - - /* Store FPU registers */ - brasl %r14,save_fpu_regs - - /* Deactivate DAT */ - stnsm __SF_EMPTY(%r15),0xfb - - /* Store prefix register on stack */ - stpx __SF_EMPTY(%r15) - - /* Save prefix register contents for lowcore copy */ - llgf %r10,__SF_EMPTY(%r15) - - /* Get pointer to save area */ - lghi %r1,0x1000 - - /* Save CPU address */ - stap __LC_EXT_CPU_ADDR(%r0) - - /* Store registers */ - mvc 0x318(4,%r1),__SF_EMPTY(%r15) /* move prefix to lowcore */ - stam %a0,%a15,0x340(%r1) /* store access registers */ - stctg %c0,%c15,0x380(%r1) /* store control registers */ - stmg %r0,%r15,0x280(%r1) /* store general registers */ - - stpt 0x328(%r1) /* store timer */ - stck __SF_EMPTY(%r15) /* store clock */ - stckc 0x330(%r1) /* store clock comparator */ - - /* Update cputime accounting before going to sleep */ - lg %r0,__LC_LAST_UPDATE_TIMER - slg %r0,0x328(%r1) - alg %r0,__LC_SYSTEM_TIMER - stg %r0,__LC_SYSTEM_TIMER - mvc __LC_LAST_UPDATE_TIMER(8),0x328(%r1) - lg %r0,__LC_LAST_UPDATE_CLOCK - slg %r0,__SF_EMPTY(%r15) - alg %r0,__LC_STEAL_TIMER - stg %r0,__LC_STEAL_TIMER - mvc __LC_LAST_UPDATE_CLOCK(8),__SF_EMPTY(%r15) - - /* Activate DAT */ - stosm __SF_EMPTY(%r15),0x04 - - /* Set prefix page to zero */ - xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15) - spx __SF_EMPTY(%r15) - - /* Save absolute zero pages */ - larl %r2,suspend_zero_pages - lg %r2,0(%r2) - lghi %r4,0 - lghi %r3,2*PAGE_SIZE - lghi %r5,2*PAGE_SIZE -1: mvcle %r2,%r4,0 - jo 1b - - /* Copy lowcore to absolute zero lowcore */ - lghi %r2,0 - lgr %r4,%r10 - lghi %r3,2*PAGE_SIZE - lghi %r5,2*PAGE_SIZE -1: mvcle %r2,%r4,0 - jo 1b - - /* Save image */ - brasl %r14,swsusp_save - - /* Restore prefix register and return */ - lghi %r1,0x1000 - spx 0x318(%r1) - lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) - lghi %r2,0 - BR_EX %r14 -ENDPROC(swsusp_arch_suspend) - -/* - * Restore saved memory image to correct place and restore register context. - * Then we return to the function that called swsusp_arch_suspend(). - * swsusp_arch_resume() runs with disabled interrupts. 
- */ -ENTRY(swsusp_arch_resume) - stmg %r6,%r15,__SF_GPRS(%r15) - lgr %r1,%r15 - aghi %r15,-STACK_FRAME_OVERHEAD - stg %r1,__SF_BACKCHAIN(%r15) - - /* Make all free pages stable */ - lghi %r2,1 - brasl %r14,arch_set_page_states - - /* Set prefix page to zero */ - xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15) - spx __SF_EMPTY(%r15) - - /* Deactivate DAT */ - stnsm __SF_EMPTY(%r15),0xfb - - /* Restore saved image */ - larl %r1,restore_pblist - lg %r1,0(%r1) - ltgr %r1,%r1 - jz 2f -0: - lg %r2,8(%r1) - lg %r4,0(%r1) - iske %r0,%r4 - lghi %r3,PAGE_SIZE - lghi %r5,PAGE_SIZE -1: - mvcle %r2,%r4,0 - jo 1b - lg %r2,8(%r1) - sske %r0,%r2 - lg %r1,16(%r1) - ltgr %r1,%r1 - jnz 0b -2: - ptlb /* flush tlb */ - - /* Reset System */ - larl %r1,.Lnew_pgm_check_psw - epsw %r2,%r3 - stm %r2,%r3,0(%r1) - mvc __LC_PGM_NEW_PSW(16,%r0),0(%r1) - larl %r1,__swsusp_reset_dma - lg %r1,0(%r1) - BASR_EX %r14,%r1 - larl %r1,smp_cpu_mt_shift - icm %r1,15,0(%r1) - jz smt_done - llgfr %r1,%r1 -smt_loop: - sigp %r1,%r0,SIGP_SET_MULTI_THREADING - brc 8,smt_done /* accepted */ - brc 2,smt_loop /* busy, try again */ -smt_done: - larl %r1,.Lnew_pgm_check_psw - lpswe 0(%r1) -pgm_check_entry: - - /* Switch to original suspend CPU */ - larl %r1,.Lresume_cpu /* Resume CPU address: r2 */ - stap 0(%r1) - llgh %r2,0(%r1) - llgh %r1,__LC_EXT_CPU_ADDR(%r0) /* Suspend CPU address: r1 */ - cgr %r1,%r2 - je restore_registers /* r1 = r2 -> nothing to do */ - larl %r4,.Lrestart_suspend_psw /* Set new restart PSW */ - mvc __LC_RST_NEW_PSW(16,%r0),0(%r4) -3: - sigp %r9,%r1,SIGP_INITIAL_CPU_RESET /* sigp initial cpu reset */ - brc 8,4f /* accepted */ - brc 2,3b /* busy, try again */ - - /* Suspend CPU not available -> panic */ - larl %r15,init_thread_union+THREAD_SIZE-STACK_FRAME_OVERHEAD - larl %r2,.Lpanic_string - brasl %r14,sclp_early_printk_force - larl %r3,.Ldisabled_wait_31 - lpsw 0(%r3) -4: - /* Switch to suspend CPU */ - sigp %r9,%r1,SIGP_RESTART /* sigp restart to suspend CPU */ - brc 2,4b /* busy, try again */ -5: - sigp %r9,%r2,SIGP_STOP /* sigp stop to current resume CPU */ - brc 2,5b /* busy, try again */ -6: j 6b - -restart_suspend: - larl %r1,.Lresume_cpu - llgh %r2,0(%r1) -7: - sigp %r9,%r2,SIGP_SENSE /* sigp sense, wait for resume CPU */ - brc 8,7b /* accepted, status 0, still running */ - brc 2,7b /* busy, try again */ - tmll %r9,0x40 /* Test if resume CPU is stopped */ - jz 7b - -restore_registers: - /* Restore registers */ - lghi %r13,0x1000 /* %r1 = pointer to save area */ - - /* Ignore time spent in suspended state. 
*/ - llgf %r1,0x318(%r13) - stck __LC_LAST_UPDATE_CLOCK(%r1) - spt 0x328(%r13) /* reprogram timer */ - //sckc 0x330(%r13) /* set clock comparator */ - - lctlg %c0,%c15,0x380(%r13) /* load control registers */ - lam %a0,%a15,0x340(%r13) /* load access registers */ - - /* Load old stack */ - lg %r15,0x2f8(%r13) - - /* Save prefix register */ - mvc __SF_EMPTY(4,%r15),0x318(%r13) - - /* Restore absolute zero pages */ - lghi %r2,0 - larl %r4,suspend_zero_pages - lg %r4,0(%r4) - lghi %r3,2*PAGE_SIZE - lghi %r5,2*PAGE_SIZE -1: mvcle %r2,%r4,0 - jo 1b - - /* Restore prefix register */ - spx __SF_EMPTY(%r15) - - /* Activate DAT */ - stosm __SF_EMPTY(%r15),0x04 - - /* Make all free pages unstable */ - lghi %r2,0 - brasl %r14,arch_set_page_states - - /* Call arch specific early resume code */ - brasl %r14,s390_early_resume - - /* Return 0 */ - lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) - lghi %r2,0 - BR_EX %r14 -ENDPROC(swsusp_arch_resume) - - .section .data..nosave,"aw",@progbits - .align 8 -.Ldisabled_wait_31: - .long 0x000a0000,0x00000000 -.Lpanic_string: - .asciz "Resume not possible because suspend CPU is no longer available\n" - .align 8 -.Lrestart_suspend_psw: - .quad 0x0000000180000000,restart_suspend -.Lnew_pgm_check_psw: - .quad 0,pgm_check_entry -.Lresume_cpu: - .byte 0,0 diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/syscall.c index 202fa73ac167..dc2355c623d6 100644 --- a/arch/s390/kernel/sys_s390.c +++ b/arch/s390/kernel/syscall.c @@ -29,6 +29,13 @@ #include <linux/unistd.h> #include <linux/ipc.h> #include <linux/uaccess.h> +#include <linux/string.h> +#include <linux/thread_info.h> +#include <linux/entry-common.h> + +#include <asm/ptrace.h> +#include <asm/vtime.h> + #include "entry.h" /* @@ -100,3 +107,64 @@ SYSCALL_DEFINE0(ni_syscall) { return -ENOSYS; } + +static void do_syscall(struct pt_regs *regs) +{ + unsigned long nr; + + nr = regs->int_code & 0xffff; + if (!nr) { + nr = regs->gprs[1] & 0xffff; + regs->int_code &= ~0xffffUL; + regs->int_code |= nr; + } + + regs->gprs[2] = nr; + + if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) { + regs->psw.addr = current->restart_block.arch_data; + current->restart_block.arch_data = 1; + } + nr = syscall_enter_from_user_mode_work(regs, nr); + + /* + * In the s390 ptrace ABI, both the syscall number and the return value + * use gpr2. However, userspace puts the syscall number either in the + * svc instruction itself, or uses gpr1. To make at least skipping syscalls + * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here + * and if set, the syscall will be skipped. 
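
The number decoding at the top of do_syscall() can be checked in isolation: "svc N" carries the syscall number in the instruction itself (and hence in int_code), while "svc 0" passes a number too large for the immediate field in gpr1. A minimal sketch; the sample int_code values are made up, the real ones come from the lowcore SVC interruption code:

    #include <stdio.h>

    /* Mirrors the decoding in do_syscall(). */
    static unsigned long decode_svc_nr(unsigned long int_code, unsigned long gpr1)
    {
            unsigned long nr = int_code & 0xffff;

            if (!nr)                /* "svc 0": number was passed in gpr1 */
                    nr = gpr1 & 0xffff;
            return nr;
    }

    int main(void)
    {
            printf("svc 4 -> nr %lu\n", decode_svc_nr(0x00020004UL, 0));   /* direct */
            printf("svc 0 -> nr %lu\n", decode_svc_nr(0x00020000UL, 435)); /* via gpr1 */
            return 0;
    }
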
+ */ + + if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET))) + goto out; + regs->gprs[2] = -ENOSYS; + if (likely(nr >= NR_syscalls)) + goto out; + do { + regs->gprs[2] = current->thread.sys_call_table[nr](regs); + } while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART)); +out: + syscall_exit_to_user_mode_work(regs); +} + +void noinstr __do_syscall(struct pt_regs *regs, int per_trap) +{ + add_random_kstack_offset(); + enter_from_user_mode(regs); + regs->psw = S390_lowcore.svc_old_psw; + regs->int_code = S390_lowcore.svc_int_code; + update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; + + local_irq_enable(); + regs->orig_gpr2 = regs->gprs[2]; + + if (per_trap) + set_thread_flag(TIF_PER_TRAP); + + regs->flags = 0; + set_pt_regs_flag(regs, PIF_SYSCALL); + do_syscall(regs); + exit_to_user_mode(); +} diff --git a/arch/s390/kernel/syscalls/Makefile b/arch/s390/kernel/syscalls/Makefile index b98f25029b8e..fb85e797946d 100644 --- a/arch/s390/kernel/syscalls/Makefile +++ b/arch/s390/kernel/syscalls/Makefile @@ -21,8 +21,7 @@ uapi: $(uapi-hdrs-y) # Create output directory if not already present -_dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ - $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') +$(shell mkdir -p $(uapi) $(kapi)) filechk_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$2" < $< diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 3054e9c035a3..095bb86339a7 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -26,7 +26,7 @@ 16 32 lchown - sys_lchown16 19 common lseek sys_lseek compat_sys_lseek 20 common getpid sys_getpid sys_getpid -21 common mount sys_mount compat_sys_mount +21 common mount sys_mount sys_mount 22 common umount sys_oldumount sys_oldumount 23 32 setuid - sys_setuid16 24 32 getuid - sys_getuid16 @@ -100,7 +100,7 @@ 106 common stat sys_newstat compat_sys_newstat 107 common lstat sys_newlstat compat_sys_newlstat 108 common fstat sys_newfstat compat_sys_newfstat -110 common lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie +110 common lookup_dcookie - - 111 common vhangup sys_vhangup sys_vhangup 112 common idle - - 114 common wait4 sys_wait4 compat_sys_wait4 @@ -122,7 +122,7 @@ 131 common quotactl sys_quotactl sys_quotactl 132 common getpgid sys_getpgid sys_getpgid 133 common fchdir sys_fchdir sys_fchdir -134 common bdflush sys_bdflush sys_bdflush +134 common bdflush sys_ni_syscall sys_ni_syscall 135 common sysfs sys_sysfs sys_sysfs 136 common personality sys_s390_personality sys_s390_personality 137 common afs_syscall - - @@ -134,11 +134,11 @@ 142 64 select sys_select - 143 common flock sys_flock sys_flock 144 common msync sys_msync sys_msync -145 common readv sys_readv compat_sys_readv -146 common writev sys_writev compat_sys_writev +145 common readv sys_readv sys_readv +146 common writev sys_writev sys_writev 147 common getsid sys_getsid sys_getsid 148 common fdatasync sys_fdatasync sys_fdatasync -149 common _sysctl sys_sysctl compat_sys_sysctl +149 common _sysctl - - 150 common mlock sys_mlock sys_mlock 151 common munlock sys_munlock sys_munlock 152 common mlockall sys_mlockall sys_mlockall @@ -274,9 +274,9 @@ 265 common statfs64 sys_statfs64 compat_sys_statfs64 266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages -268 common mbind sys_mbind compat_sys_mbind -269 common 
get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +268 common mbind sys_mbind sys_mbind +269 common get_mempolicy sys_get_mempolicy sys_get_mempolicy +270 common set_mempolicy sys_set_mempolicy sys_set_mempolicy 271 common mq_open sys_mq_open compat_sys_mq_open 272 common mq_unlink sys_mq_unlink sys_mq_unlink 273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 @@ -293,7 +293,7 @@ 284 common inotify_init sys_inotify_init sys_inotify_init 285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch 286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch -287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages +287 common migrate_pages sys_migrate_pages sys_migrate_pages 288 common openat sys_openat compat_sys_openat 289 common mkdirat sys_mkdirat sys_mkdirat 290 common mknodat sys_mknodat sys_mknodat @@ -316,8 +316,8 @@ 306 common splice sys_splice sys_splice 307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range 308 common tee sys_tee sys_tee -309 common vmsplice sys_vmsplice compat_sys_vmsplice -310 common move_pages sys_move_pages compat_sys_move_pages +309 common vmsplice sys_vmsplice sys_vmsplice +310 common move_pages sys_move_pages sys_move_pages 311 common getcpu sys_getcpu sys_getcpu 312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 313 common utimes sys_utimes sys_utimes_time32 @@ -347,8 +347,8 @@ 337 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32 338 common syncfs sys_syncfs sys_syncfs 339 common setns sys_setns sys_setns -340 common process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv -341 common process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev +340 common process_vm_readv sys_process_vm_readv sys_process_vm_readv +341 common process_vm_writev sys_process_vm_writev sys_process_vm_writev 342 common s390_runtime_instr sys_s390_runtime_instr sys_s390_runtime_instr 343 common kcmp sys_kcmp sys_kcmp 344 common finit_module sys_finit_module sys_finit_module @@ -372,8 +372,8 @@ 362 common connect sys_connect sys_connect 363 common listen sys_listen sys_listen 364 common accept4 sys_accept4 sys_accept4 -365 common getsockopt sys_getsockopt compat_sys_getsockopt -366 common setsockopt sys_setsockopt compat_sys_setsockopt +365 common getsockopt sys_getsockopt sys_getsockopt +366 common setsockopt sys_setsockopt sys_setsockopt 367 common getsockname sys_getsockname sys_getsockname 368 common getpeername sys_getpeername sys_getpeername 369 common sendto sys_sendto sys_sendto @@ -438,3 +438,29 @@ 433 common fspick sys_fspick sys_fspick 434 common pidfd_open sys_pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 sys_clone3 +436 common close_range sys_close_range sys_close_range +437 common openat2 sys_openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr sys_mount_setattr +443 common quotactl_fd sys_quotactl_fd sys_quotactl_fd +444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self +447 common memfd_secret sys_memfd_secret sys_memfd_secret 
+448 common process_mrelease sys_process_mrelease sys_process_mrelease +449 common futex_waitv sys_futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat sys_cachestat +452 common fchmodat2 sys_fchmodat2 sys_fchmodat2 +453 common map_shadow_stack sys_map_shadow_stack sys_map_shadow_stack +454 common futex_wake sys_futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue sys_futex_requeue +457 common statmount sys_statmount sys_statmount +458 common listmount sys_listmount sys_listmount +459 common lsm_get_self_attr sys_lsm_get_self_attr sys_lsm_get_self_attr +460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr +461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c index 2ac3c9b56a13..f6f8f498c9be 100644 --- a/arch/s390/kernel/sysinfo.c +++ b/arch/s390/kernel/sysinfo.c @@ -14,6 +14,7 @@ #include <linux/delay.h> #include <linux/export.h> #include <linux/slab.h> +#include <asm/asm-extable.h> #include <asm/ebcdic.h> #include <asm/debug.h> #include <asm/sysinfo.h> @@ -25,19 +26,22 @@ int topology_max_mnest; static inline int __stsi(void *sysinfo, int fc, int sel1, int sel2, int *lvl) { - register int r0 asm("0") = (fc << 28) | sel1; - register int r1 asm("1") = sel2; + int r0 = (fc << 28) | sel1; int rc = 0; asm volatile( - " stsi 0(%3)\n" + " lr 0,%[r0]\n" + " lr 1,%[r1]\n" + " stsi 0(%[sysinfo])\n" "0: jz 2f\n" - "1: lhi %1,%4\n" - "2:\n" + "1: lhi %[rc],%[retval]\n" + "2: lr %[r0],0\n" EX_TABLE(0b, 1b) - : "+d" (r0), "+d" (rc) - : "d" (r1), "a" (sysinfo), "K" (-EOPNOTSUPP) - : "cc", "memory"); + : [r0] "+d" (r0), [rc] "+d" (rc) + : [r1] "d" (sel2), + [sysinfo] "a" (sysinfo), + [retval] "K" (-EOPNOTSUPP) + : "cc", "0", "1", "memory"); *lvl = ((unsigned int) r0) >> 28; return rc; } @@ -77,10 +81,12 @@ static bool convert_ext_name(unsigned char encoding, char *name, size_t len) static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) { + bool has_var_cap; int i; if (stsi(info, 1, 1, 1)) return; + has_var_cap = !!info->model_var_cap[0]; EBCASC(info->manufacturer, sizeof(info->manufacturer)); EBCASC(info->type, sizeof(info->type)); EBCASC(info->model, sizeof(info->model)); @@ -89,6 +95,8 @@ static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) EBCASC(info->model_capacity, sizeof(info->model_capacity)); EBCASC(info->model_perm_cap, sizeof(info->model_perm_cap)); EBCASC(info->model_temp_cap, sizeof(info->model_temp_cap)); + if (has_var_cap) + EBCASC(info->model_var_cap, sizeof(info->model_var_cap)); seq_printf(m, "Manufacturer: %-16.16s\n", info->manufacturer); seq_printf(m, "Type: %-4.4s\n", info->type); if (info->lic) @@ -116,12 +124,18 @@ static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) seq_printf(m, "Model Temp. Capacity: %-16.16s %08u\n", info->model_temp_cap, info->model_temp_cap_rating); + if (has_var_cap && info->model_var_cap_rating) + seq_printf(m, "Model Var. Capacity: %-16.16s %08u\n", + info->model_var_cap, + info->model_var_cap_rating); if (info->ncr) seq_printf(m, "Nominal Cap. Rating: %08u\n", info->ncr); if (info->npr) seq_printf(m, "Nominal Perm. Rating: %08u\n", info->npr); if (info->ntr) seq_printf(m, "Nominal Temp. Rating: %08u\n", info->ntr); + if (has_var_cap && info->nvr) + seq_printf(m, "Nominal Var. 
Rating: %08u\n", info->nvr); if (info->cai) { seq_printf(m, "Capacity Adj. Ind.: %d\n", info->cai); seq_printf(m, "Capacity Ch. Reason: %d\n", info->ccr); diff --git a/arch/s390/kernel/text_amode31.S b/arch/s390/kernel/text_amode31.S new file mode 100644 index 000000000000..14c6d25c035f --- /dev/null +++ b/arch/s390/kernel/text_amode31.S @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code that needs to run below 2 GB. + * + * Copyright IBM Corp. 2019 + */ + +#include <linux/linkage.h> +#include <asm/asm-extable.h> +#include <asm/errno.h> +#include <asm/sigp.h> + + .section .amode31.text,"ax" +/* + * Simplified version of expoline thunk. The normal thunks can not be used here, + * because they might be more than 2 GB away, and not reachable by the relative + * branch. No comdat, exrl, etc. optimizations used here, because it only + * affects a few functions that are not performance-relevant. + */ + .macro BR_EX_AMODE31_r14 + larl %r1,0f + ex 0,0(%r1) + j . +0: br %r14 + .endm + +/* + * int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode) + */ +SYM_FUNC_START(_diag14_amode31) + lgr %r1,%r2 + lgr %r2,%r3 + lgr %r3,%r4 + lhi %r5,-EIO + sam31 + diag %r1,%r2,0x14 +.Ldiag14_ex: + ipm %r5 + srl %r5,28 +.Ldiag14_fault: + sam64 + lgfr %r2,%r5 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag14_ex, .Ldiag14_fault) +SYM_FUNC_END(_diag14_amode31) + +/* + * int _diag210_amode31(struct diag210 *addr) + */ +SYM_FUNC_START(_diag210_amode31) + lgr %r1,%r2 + lhi %r2,-1 + sam31 + diag %r1,%r0,0x210 +.Ldiag210_ex: + ipm %r2 + srl %r2,28 +.Ldiag210_fault: + sam64 + lgfr %r2,%r2 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag210_ex, .Ldiag210_fault) +SYM_FUNC_END(_diag210_amode31) + +/* + * int diag8c(struct diag8c *addr, struct ccw_dev_id *devno, size_t len) +*/ +SYM_FUNC_START(_diag8c_amode31) + llgf %r3,0(%r3) + sam31 + diag %r2,%r4,0x8c +.Ldiag8c_ex: + sam64 + lgfr %r2,%r3 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag8c_ex, .Ldiag8c_ex) +SYM_FUNC_END(_diag8c_amode31) +/* + * int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode) + */ +SYM_FUNC_START(_diag26c_amode31) + lghi %r5,-EOPNOTSUPP + sam31 + diag %r2,%r4,0x26c +.Ldiag26c_ex: + sam64 + lgfr %r2,%r5 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag26c_ex, .Ldiag26c_ex) +SYM_FUNC_END(_diag26c_amode31) + +/* + * void _diag0c_amode31(struct hypfs_diag0c_entry *entry) + */ +SYM_FUNC_START(_diag0c_amode31) + sam31 + diag %r2,%r2,0x0c + sam64 + BR_EX_AMODE31_r14 +SYM_FUNC_END(_diag0c_amode31) + +/* + * void _diag308_reset_amode31(void) + * + * Calls diag 308 subcode 1 and continues execution + */ +SYM_FUNC_START(_diag308_reset_amode31) + larl %r4,ctlregs # Save control registers + stctg %c0,%c15,0(%r4) + lg %r2,0(%r4) # Disable lowcore protection + nilh %r2,0xefff + larl %r4,ctlreg0 + stg %r2,0(%r4) + lctlg %c0,%c0,0(%r4) + larl %r4,fpctl # Floating point control register + stfpc 0(%r4) + larl %r4,prefix # Save prefix register + stpx 0(%r4) + larl %r4,prefix_zero # Set prefix register to 0 + spx 0(%r4) + larl %r4,continue_psw # Save PSW flags + epsw %r2,%r3 + stm %r2,%r3,0(%r4) + larl %r4,.Lrestart_part2 # Setup restart PSW at absolute 0 + larl %r3,restart_diag308_psw + og %r4,0(%r3) # Save PSW + lghi %r3,0 + sturg %r4,%r3 # Use sturg, because of large pages + lghi %r1,1 + lghi %r0,0 + diag %r0,%r1,0x308 +.Lrestart_part2: + lhi %r0,0 # Load r0 with zero + lhi %r1,2 # Use mode 2 = ESAME (dump) + sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode + sam64 # Switch to 64 bit addressing mode + larl 
%r4,ctlregs # Restore control registers + lctlg %c0,%c15,0(%r4) + larl %r4,fpctl # Restore floating point ctl register + lfpc 0(%r4) + larl %r4,prefix # Restore prefix register + spx 0(%r4) + larl %r4,continue_psw # Restore PSW flags + larl %r2,.Lcontinue + stg %r2,8(%r4) + lpswe 0(%r4) +.Lcontinue: + BR_EX_AMODE31_r14 +SYM_FUNC_END(_diag308_reset_amode31) + + .section .amode31.data,"aw",@progbits + .balign 8 +SYM_DATA_LOCAL(restart_diag308_psw, .long 0x00080000,0x80000000) +SYM_DATA_LOCAL(continue_psw, .quad 0,0) +SYM_DATA_LOCAL(ctlreg0, .quad 0) +SYM_DATA_LOCAL(ctlregs, .fill 16,8,0) +SYM_DATA_LOCAL(fpctl, .long 0) +SYM_DATA_LOCAL(prefix, .long 0) +SYM_DATA_LOCAL(prefix_zero, .long 0) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index f9d070d016e3..14abad953c02 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -41,6 +41,9 @@ #include <linux/gfp.h> #include <linux/kprobes.h> #include <linux/uaccess.h> +#include <vdso/vsyscall.h> +#include <vdso/clocksource.h> +#include <vdso/helpers.h> #include <asm/facility.h> #include <asm/delay.h> #include <asm/div64.h> @@ -52,11 +55,7 @@ #include <asm/cio.h> #include "entry.h" -unsigned char tod_clock_base[16] __aligned(8) = { - /* Force to data section. */ - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff -}; +union tod_clock tod_clock_base __section(".data"); EXPORT_SYMBOL_GPL(tod_clock_base); u64 clock_comparator_max = -1ULL; @@ -69,10 +68,10 @@ EXPORT_SYMBOL(s390_epoch_delta_notifier); unsigned char ptff_function_mask[16]; -static unsigned long long lpar_offset; -static unsigned long long initial_leap_seconds; -static unsigned long long tod_steering_end; -static long long tod_steering_delta; +static unsigned long lpar_offset; +static unsigned long initial_leap_seconds; +static unsigned long tod_steering_end; +static long tod_steering_delta; /* * Get time offsets with PTFF @@ -81,10 +80,12 @@ void __init time_early_init(void) { struct ptff_qto qto; struct ptff_qui qui; + int cs; /* Initialize TOD steering parameters */ - tod_steering_end = *(unsigned long long *) &tod_clock_base[1]; - vdso_data->ts_end = tod_steering_end; + tod_steering_end = tod_clock_base.tod; + for (cs = 0; cs < CS_BASES; cs++) + vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; if (!test_facility(28)) return; @@ -97,10 +98,15 @@ void __init time_early_init(void) /* get initial leap seconds */ if (ptff_query(PTFF_QUI) && ptff(&qui, sizeof(qui), PTFF_QUI) == 0) - initial_leap_seconds = (unsigned long long) + initial_leap_seconds = (unsigned long) ((long) qui.old_leap * 4096000000L); } +unsigned long long noinstr sched_clock_noinstr(void) +{ + return tod_to_ns(__get_tod_clock_monotonic()); +} + /* * Scheduler clock - returns current time in nanosec units. 
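
The new sched_clock_noinstr() leans on the TOD clock format: bit 51 of a TOD value is one microsecond, so one TOD tick is 1/4096 microsecond and nanoseconds = tod * 1000/4096 = tod * 125/512. A sketch of the conversion, using the same split multiply as the kernel's tod_to_ns() helper so the intermediate product stays within 64 bits:

    #include <stdio.h>

    static unsigned long long tod_to_ns(unsigned long long todval)
    {
            /* ns = tod * 125 / 512, computed as high and low parts */
            return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9);
    }

    int main(void)
    {
            /* 4096 TOD ticks == 1 microsecond == 1000 ns */
            printf("%llu\n", tod_to_ns(4096));      /* prints 1000 */
            return 0;
    }
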
*/ @@ -110,18 +116,13 @@ unsigned long long notrace sched_clock(void) } NOKPROBE_SYMBOL(sched_clock); -static void ext_to_timespec64(unsigned char *clk, struct timespec64 *xt) +static void ext_to_timespec64(union tod_clock *clk, struct timespec64 *xt) { - unsigned long long high, low, rem, sec, nsec; + unsigned long rem, sec, nsec; - /* Split extendnd TOD clock to micro-seconds and sub-micro-seconds */ - high = (*(unsigned long long *) clk) >> 4; - low = (*(unsigned long long *)&clk[7]) << 4; - /* Calculate seconds and nano-seconds */ - sec = high; + sec = clk->us; rem = do_div(sec, 1000000); - nsec = (((low >> 32) + (rem << 32)) * 1000) >> 32; - + nsec = ((clk->sus + (rem << 12)) * 125) >> 9; xt->tv_sec = sec; xt->tv_nsec = nsec; } @@ -172,10 +173,10 @@ void init_cpu_timer(void) clockevents_register_device(cd); /* Enable clock comparator timer interrupt. */ - __ctl_set_bit(0,11); + local_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SUBMASK_BIT); /* Always allow the timing alert external interrupt. */ - __ctl_set_bit(0, 4); + local_ctl_set_bit(0, CR0_ETR_SUBMASK_BIT); } static void clock_comparator_interrupt(struct ext_code ext_code, @@ -201,30 +202,26 @@ static void stp_reset(void); void read_persistent_clock64(struct timespec64 *ts) { - unsigned char clk[STORE_CLOCK_EXT_SIZE]; - __u64 delta; + union tod_clock clk; + u64 delta; delta = initial_leap_seconds + TOD_UNIX_EPOCH; - get_tod_clock_ext(clk); - *(__u64 *) &clk[1] -= delta; - if (*(__u64 *) &clk[1] > delta) - clk[0]--; - ext_to_timespec64(clk, ts); + store_tod_clock_ext(&clk); + clk.eitod -= delta; + ext_to_timespec64(&clk, ts); } void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, struct timespec64 *boot_offset) { - unsigned char clk[STORE_CLOCK_EXT_SIZE]; struct timespec64 boot_time; - __u64 delta; + union tod_clock clk; + u64 delta; delta = initial_leap_seconds + TOD_UNIX_EPOCH; - memcpy(clk, tod_clock_base, STORE_CLOCK_EXT_SIZE); - *(__u64 *)&clk[1] -= delta; - if (*(__u64 *)&clk[1] > delta) - clk[0]--; - ext_to_timespec64(clk, &boot_time); + clk = tod_clock_base; + clk.eitod -= delta; + ext_to_timespec64(&clk, &boot_time); read_persistent_clock64(wall_time); *boot_offset = timespec64_sub(*wall_time, boot_time); @@ -232,12 +229,12 @@ void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, static u64 read_tod_clock(struct clocksource *cs) { - unsigned long long now, adj; + unsigned long now, adj; preempt_disable(); /* protect from changes to steering parameters */ now = get_tod_clock(); adj = tod_steering_end - now; - if (unlikely((s64) adj >= 0)) + if (unlikely((s64) adj > 0)) /* * manually steer by 1 cycle every 2^16 cycles. This * corresponds to shifting the tod delta by 15. 1s is @@ -253,10 +250,11 @@ static struct clocksource clocksource_tod = { .name = "tod", .rating = 400, .read = read_tod_clock, - .mask = -1ULL, + .mask = CLOCKSOURCE_MASK(64), .mult = 1000, .shift = 12, .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .vdso_clock_mode = VDSO_CLOCKMODE_TOD, }; struct clocksource * __init clocksource_default_clock(void) @@ -264,55 +262,6 @@ struct clocksource * __init clocksource_default_clock(void) return &clocksource_tod; } -void update_vsyscall(struct timekeeper *tk) -{ - u64 nsecps; - - if (tk->tkr_mono.clock != &clocksource_tod) - return; - - /* Make userspace gettimeofday spin until we're done. 
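
The rewritten ext_to_timespec64() reads the extended TOD value through union tod_clock: us is whole microseconds since the TOD epoch and sus is the sub-microsecond remainder in 1/4096-microsecond ticks. A user-space re-implementation with simplified field types (the union's real layout lives in asm/timex.h):

    #include <stdint.h>
    #include <stdio.h>

    struct ts { int64_t tv_sec; long tv_nsec; };

    static void ext_to_ts(uint64_t us, uint32_t sus, struct ts *xt)
    {
            uint64_t sec = us / 1000000;
            uint64_t rem = us % 1000000;    /* stands in for do_div() */

            xt->tv_sec = sec;
            /* rem us == rem << 12 TOD ticks; ticks * 125 / 512 == ns */
            xt->tv_nsec = (((uint64_t)sus + (rem << 12)) * 125) >> 9;
    }

    int main(void)
    {
            struct ts t;

            ext_to_ts(1500000, 2048, &t);   /* 1.5 s plus half a microsecond */
            printf("%lld.%09ld\n", (long long)t.tv_sec, t.tv_nsec); /* 1.500000500 */
            return 0;
    }
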
*/ - ++vdso_data->tb_update_count; - smp_wmb(); - vdso_data->xtime_tod_stamp = tk->tkr_mono.cycle_last; - vdso_data->xtime_clock_sec = tk->xtime_sec; - vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; - vdso_data->wtom_clock_sec = - tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = tk->tkr_mono.xtime_nsec + - + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); - nsecps = (u64) NSEC_PER_SEC << tk->tkr_mono.shift; - while (vdso_data->wtom_clock_nsec >= nsecps) { - vdso_data->wtom_clock_nsec -= nsecps; - vdso_data->wtom_clock_sec++; - } - - vdso_data->xtime_coarse_sec = tk->xtime_sec; - vdso_data->xtime_coarse_nsec = - (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); - vdso_data->wtom_coarse_sec = - vdso_data->xtime_coarse_sec + tk->wall_to_monotonic.tv_sec; - vdso_data->wtom_coarse_nsec = - vdso_data->xtime_coarse_nsec + tk->wall_to_monotonic.tv_nsec; - while (vdso_data->wtom_coarse_nsec >= NSEC_PER_SEC) { - vdso_data->wtom_coarse_nsec -= NSEC_PER_SEC; - vdso_data->wtom_coarse_sec++; - } - - vdso_data->tk_mult = tk->tkr_mono.mult; - vdso_data->tk_shift = tk->tkr_mono.shift; - smp_wmb(); - ++vdso_data->tb_update_count; -} - -extern struct timezone sys_tz; - -void update_vsyscall_tz(void) -{ - vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; - vdso_data->tz_dsttime = sys_tz.tz_dsttime; -} - /* * Initialize the TOD clock and the CPU timer of * the boot cpu. @@ -341,11 +290,12 @@ void __init time_init(void) } static DEFINE_PER_CPU(atomic_t, clock_sync_word); -static DEFINE_MUTEX(clock_sync_mutex); +static DEFINE_MUTEX(stp_mutex); static unsigned long clock_sync_flags; -#define CLOCK_SYNC_HAS_STP 0 -#define CLOCK_SYNC_STP 1 +#define CLOCK_SYNC_HAS_STP 0 +#define CLOCK_SYNC_STP 1 +#define CLOCK_SYNC_STPINFO_VALID 2 /* * The get_clock function for the physical clock. It will get the current @@ -419,18 +369,15 @@ static inline int check_sync_clock(void) * Apply clock delta to the global data structures. * This is called once on the CPU that performed the clock sync. */ -static void clock_sync_global(unsigned long long delta) +static void clock_sync_global(long delta) { unsigned long now, adj; struct ptff_qto qto; + int cs; /* Fixup the monotonic sched clock. */ - *(unsigned long long *) &tod_clock_base[1] += delta; - if (*(unsigned long long *) &tod_clock_base[1] < delta) - /* Epoch overflow */ - tod_clock_base[0]++; + tod_clock_base.eitod += delta; /* Adjust TOD steering parameters. */ - vdso_data->tb_update_count++; now = get_tod_clock(); adj = tod_steering_end - now; if (unlikely((s64) adj >= 0)) @@ -439,12 +386,14 @@ static void clock_sync_global(unsigned long long delta) -(adj >> 15) : (adj >> 15); tod_steering_delta += delta; if ((abs(tod_steering_delta) >> 48) != 0) - panic("TOD clock sync offset %lli is too large to drift\n", + panic("TOD clock sync offset %li is too large to drift\n", tod_steering_delta); tod_steering_end = now + (abs(tod_steering_delta) << 15); - vdso_data->ts_dir = (tod_steering_delta < 0) ? 0 : 1; - vdso_data->ts_end = tod_steering_end; - vdso_data->tb_update_count++; + for (cs = 0; cs < CS_BASES; cs++) { + vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; + vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta; + } + /* Update LPAR offset. */ if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) lpar_offset = qto.tod_epoch_difference; @@ -456,7 +405,7 @@ static void clock_sync_global(unsigned long long delta) * Apply clock delta to the per-CPU data structures of this CPU. 
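
The steering set up in clock_sync_global() and applied in read_tod_clock() amortizes a sync offset instead of stepping the clock: while the current TOD value lies inside the steering window, the remaining window shifted right by 15 is added or subtracted, i.e. one cycle per 2^16 cycles. A standalone sketch with hypothetical numbers:

    #include <stdint.h>
    #include <stdio.h>

    /* Same arithmetic as read_tod_clock(): the correction shrinks from
     * |steering_delta| at the start of the window down to zero. */
    static uint64_t read_steered(uint64_t now, uint64_t steering_end,
                                 int64_t steering_delta)
    {
            int64_t adj = steering_end - now;

            if (adj > 0)
                    now += (steering_delta < 0) ? (adj >> 15) : -(adj >> 15);
            return now;
    }

    int main(void)
    {
            int64_t delta = -32;                    /* clock was 32 cycles behind */
            uint64_t end = (uint64_t)32 << 15;      /* window = |delta| << 15 */

            for (uint64_t now = 0; now <= end; now += end / 4)
                    printf("now=%8llu steered=%8llu\n",
                           (unsigned long long)now,
                           (unsigned long long)read_steered(now, end, delta));
            return 0;
    }
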
* This is called for each online CPU after the call to clock_sync_global. */ -static void clock_sync_local(unsigned long long delta) +static void clock_sync_local(long delta) { /* Add the delta to the clock comparator. */ if (S390_lowcore.clock_comparator != clock_comparator_max) { @@ -480,7 +429,7 @@ static void __init time_init_wq(void) struct clock_sync_data { atomic_t cpus; int in_sync; - unsigned long long clock_delta; + long clock_delta; }; /* @@ -491,7 +440,6 @@ static struct stp_sstpi stp_info; static void *stp_page; static void stp_work_fn(struct work_struct *work); -static DEFINE_MUTEX(stp_work_mutex); static DECLARE_WORK(stp_work, stp_work_fn); static struct timer_list stp_timer; @@ -582,10 +530,26 @@ void stp_queue_work(void) queue_work(time_sync_wq, &stp_work); } +static int __store_stpinfo(void) +{ + int rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); + + if (rc) + clear_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); + else + set_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); + return rc; +} + +static int stpinfo_valid(void) +{ + return stp_online && test_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); +} + static int stp_sync_clock(void *data) { struct clock_sync_data *sync = data; - unsigned long long clock_delta; + long clock_delta, flags; static int first; int rc; @@ -595,19 +559,18 @@ static int stp_sync_clock(void *data) while (atomic_read(&sync->cpus) != 0) cpu_relax(); rc = 0; - if (stp_info.todoff[0] || stp_info.todoff[1] || - stp_info.todoff[2] || stp_info.todoff[3] || - stp_info.tmd != 2) { + if (stp_info.todoff || stp_info.tmd != 2) { + flags = vdso_update_begin(); rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, &clock_delta); if (rc == 0) { sync->clock_delta = clock_delta; clock_sync_global(clock_delta); - rc = chsc_sstpi(stp_page, &stp_info, - sizeof(struct stp_sstpi)); + rc = __store_stpinfo(); if (rc == 0 && stp_info.tmd != 2) rc = -EAGAIN; } + vdso_update_end(flags); } sync->in_sync = rc ? -EAGAIN : 1; xchg(&first, 0); @@ -627,6 +590,81 @@ static int stp_sync_clock(void *data) return 0; } +static int stp_clear_leap(void) +{ + struct __kernel_timex txc; + int ret; + + memset(&txc, 0, sizeof(txc)); + + ret = do_adjtimex(&txc); + if (ret < 0) + return ret; + + txc.modes = ADJ_STATUS; + txc.status &= ~(STA_INS|STA_DEL); + return do_adjtimex(&txc); +} + +static void stp_check_leap(void) +{ + struct stp_stzi stzi; + struct stp_lsoib *lsoib = &stzi.lsoib; + struct __kernel_timex txc; + int64_t timediff; + int leapdiff, ret; + + if (!stp_info.lu || !check_sync_clock()) { + /* + * Either a scheduled leap second was removed by the operator, + * or STP is out of sync. In both cases, clear the leap second + * kernel flags. 
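
stp_clear_leap() and stp_check_leap() drive the kernel's leap-second machinery through do_adjtimex() with ADJ_STATUS and the STA_INS/STA_DEL bits. The same flags are visible from user space via adjtimex(2); a read-only query:

    #include <stdio.h>
    #include <string.h>
    #include <sys/timex.h>

    int main(void)
    {
            struct timex txc;

            memset(&txc, 0, sizeof(txc));   /* modes == 0: query only */
            if (adjtimex(&txc) < 0) {
                    perror("adjtimex");
                    return 1;
            }
            if (txc.status & STA_INS)
                    puts("leap second insertion armed");
            else if (txc.status & STA_DEL)
                    puts("leap second deletion armed");
            else
                    puts("no leap second scheduled");
            return 0;
    }
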
+ */ + if (stp_clear_leap() < 0) + pr_err("failed to clear leap second flags\n"); + return; + } + + if (chsc_stzi(stp_page, &stzi, sizeof(stzi))) { + pr_err("stzi failed\n"); + return; + } + + timediff = tod_to_ns(lsoib->nlsout - get_tod_clock()) / NSEC_PER_SEC; + leapdiff = lsoib->nlso - lsoib->also; + + if (leapdiff != 1 && leapdiff != -1) { + pr_err("Cannot schedule %d leap seconds\n", leapdiff); + return; + } + + if (timediff < 0) { + if (stp_clear_leap() < 0) + pr_err("failed to clear leap second flags\n"); + } else if (timediff < 7200) { + memset(&txc, 0, sizeof(txc)); + ret = do_adjtimex(&txc); + if (ret < 0) + return; + + txc.modes = ADJ_STATUS; + if (leapdiff > 0) + txc.status |= STA_INS; + else + txc.status |= STA_DEL; + ret = do_adjtimex(&txc); + if (ret < 0) + pr_err("failed to set leap second flags\n"); + /* arm Timer to clear leap second flags */ + mod_timer(&stp_timer, jiffies + msecs_to_jiffies(14400 * MSEC_PER_SEC)); + } else { + /* The day the leap second is scheduled for hasn't been reached. Retry + * in one hour. + */ + mod_timer(&stp_timer, jiffies + msecs_to_jiffies(3600 * MSEC_PER_SEC)); + } +} + /* * STP work. Check for the STP state and take over the clock * synchronization if the STP clock source is usable. @@ -637,7 +675,7 @@ static void stp_work_fn(struct work_struct *work) int rc; /* prevent multiple execution. */ - mutex_lock(&stp_work_mutex); + mutex_lock(&stp_mutex); if (!stp_online) { chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL); @@ -645,33 +683,34 @@ static void stp_work_fn(struct work_struct *work) goto out_unlock; } - rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0, NULL); + rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xf0e0, NULL); if (rc) goto out_unlock; - rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); + rc = __store_stpinfo(); if (rc || stp_info.c == 0) goto out_unlock; /* Skip synchronization if the clock is already in sync. */ - if (check_sync_clock()) - goto out_unlock; - - memset(&stp_sync, 0, sizeof(stp_sync)); - cpus_read_lock(); - atomic_set(&stp_sync.cpus, num_online_cpus() - 1); - stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask); - cpus_read_unlock(); + if (!check_sync_clock()) { + memset(&stp_sync, 0, sizeof(stp_sync)); + cpus_read_lock(); + atomic_set(&stp_sync.cpus, num_online_cpus() - 1); + stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask); + cpus_read_unlock(); + } if (!check_sync_clock()) /* - * There is a usable clock but the synchonization failed. + * There is a usable clock but the synchronization failed. * Retry after a second. 
*/ - mod_timer(&stp_timer, jiffies + HZ); + mod_timer(&stp_timer, jiffies + msecs_to_jiffies(MSEC_PER_SEC)); + else if (stp_info.lu) + stp_check_leap(); out_unlock: - mutex_unlock(&stp_work_mutex); + mutex_unlock(&stp_mutex); } /* @@ -682,115 +721,178 @@ static struct bus_type stp_subsys = { .dev_name = "stp", }; -static ssize_t stp_ctn_id_show(struct device *dev, +static ssize_t ctn_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%016llx\n", - *(unsigned long long *) stp_info.ctnid); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%016lx\n", + *(unsigned long *) stp_info.ctnid); + mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL); +static DEVICE_ATTR_RO(ctn_id); -static ssize_t stp_ctn_type_show(struct device *dev, +static ssize_t ctn_type_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%i\n", stp_info.ctn); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.ctn); + mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL); +static DEVICE_ATTR_RO(ctn_type); -static ssize_t stp_dst_offset_show(struct device *dev, +static ssize_t dst_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online || !(stp_info.vbits & 0x2000)) - return -ENODATA; - return sprintf(buf, "%i\n", (int)(s16) stp_info.dsto); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x2000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto); + mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL); +static DEVICE_ATTR_RO(dst_offset); -static ssize_t stp_leap_seconds_show(struct device *dev, +static ssize_t leap_seconds_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online || !(stp_info.vbits & 0x8000)) + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x8000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps); + mutex_unlock(&stp_mutex); + return ret; +} + +static DEVICE_ATTR_RO(leap_seconds); + +static ssize_t leap_seconds_scheduled_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct stp_stzi stzi; + ssize_t ret; + + mutex_lock(&stp_mutex); + if (!stpinfo_valid() || !(stp_info.vbits & 0x8000) || !stp_info.lu) { + mutex_unlock(&stp_mutex); return -ENODATA; - return sprintf(buf, "%i\n", (int)(s16) stp_info.leaps); + } + + ret = chsc_stzi(stp_page, &stzi, sizeof(stzi)); + mutex_unlock(&stp_mutex); + if (ret < 0) + return ret; + + if (!stzi.lsoib.p) + return sprintf(buf, "0,0\n"); + + return sprintf(buf, "%lu,%d\n", + tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC, + stzi.lsoib.nlso - stzi.lsoib.also); } -static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL); +static DEVICE_ATTR_RO(leap_seconds_scheduled); -static ssize_t stp_stratum_show(struct device *dev, +static ssize_t stratum_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%i\n", (int)(s16) stp_info.stratum); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum); + 
mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL); +static DEVICE_ATTR_RO(stratum); -static ssize_t stp_time_offset_show(struct device *dev, +static ssize_t time_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online || !(stp_info.vbits & 0x0800)) - return -ENODATA; - return sprintf(buf, "%i\n", (int) stp_info.tto); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x0800)) + ret = sprintf(buf, "%i\n", (int) stp_info.tto); + mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL); +static DEVICE_ATTR_RO(time_offset); -static ssize_t stp_time_zone_offset_show(struct device *dev, +static ssize_t time_zone_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online || !(stp_info.vbits & 0x4000)) - return -ENODATA; - return sprintf(buf, "%i\n", (int)(s16) stp_info.tzo); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x4000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo); + mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(time_zone_offset, 0400, - stp_time_zone_offset_show, NULL); +static DEVICE_ATTR_RO(time_zone_offset); -static ssize_t stp_timing_mode_show(struct device *dev, +static ssize_t timing_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%i\n", stp_info.tmd); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.tmd); + mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL); +static DEVICE_ATTR_RO(timing_mode); -static ssize_t stp_timing_state_show(struct device *dev, +static ssize_t timing_state_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%i\n", stp_info.tst); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.tst); + mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL); +static DEVICE_ATTR_RO(timing_state); -static ssize_t stp_online_show(struct device *dev, +static ssize_t online_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%i\n", stp_online); } -static ssize_t stp_online_store(struct device *dev, +static ssize_t online_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -801,14 +903,14 @@ static ssize_t stp_online_store(struct device *dev, return -EINVAL; if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) return -EOPNOTSUPP; - mutex_lock(&clock_sync_mutex); + mutex_lock(&stp_mutex); stp_online = value; if (stp_online) set_bit(CLOCK_SYNC_STP, &clock_sync_flags); else clear_bit(CLOCK_SYNC_STP, &clock_sync_flags); queue_work(time_sync_wq, &stp_work); - mutex_unlock(&clock_sync_mutex); + mutex_unlock(&stp_mutex); return count; } @@ -816,46 +918,27 @@ static ssize_t stp_online_store(struct device *dev, * Can't use DEVICE_ATTR because the attribute should be named * stp/online but dev_attr_online already exists in this file .. 
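
Once stp_init_sysfs() below registers the attribute group, the files appear in sysfs. A small reader, assuming the conventional /sys/devices/system/stp location that subsys_system_register() produces for a subsystem named "stp":

    #include <stdio.h>

    static void show_stp_attr(const char *name)
    {
            char path[128], buf[64];
            FILE *f;

            snprintf(path, sizeof(path), "/sys/devices/system/stp/%s", name);
            f = fopen(path, "r");
            if (!f) {
                    perror(path);
                    return;
            }
            if (fgets(buf, sizeof(buf), f))
                    printf("%s: %s", name, buf);    /* buf keeps its newline */
            fclose(f);
    }

    int main(void)
    {
            show_stp_attr("online");
            show_stp_attr("ctn_id");
            show_stp_attr("timing_state");
            return 0;
    }
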
*/ -static struct device_attribute dev_attr_stp_online = { - .attr = { .name = "online", .mode = 0600 }, - .show = stp_online_show, - .store = stp_online_store, -}; - -static struct device_attribute *stp_attributes[] = { - &dev_attr_ctn_id, - &dev_attr_ctn_type, - &dev_attr_dst_offset, - &dev_attr_leap_seconds, - &dev_attr_stp_online, - &dev_attr_stratum, - &dev_attr_time_offset, - &dev_attr_time_zone_offset, - &dev_attr_timing_mode, - &dev_attr_timing_state, +static DEVICE_ATTR_RW(online); + +static struct attribute *stp_dev_attrs[] = { + &dev_attr_ctn_id.attr, + &dev_attr_ctn_type.attr, + &dev_attr_dst_offset.attr, + &dev_attr_leap_seconds.attr, + &dev_attr_online.attr, + &dev_attr_leap_seconds_scheduled.attr, + &dev_attr_stratum.attr, + &dev_attr_time_offset.attr, + &dev_attr_time_zone_offset.attr, + &dev_attr_timing_mode.attr, + &dev_attr_timing_state.attr, NULL }; +ATTRIBUTE_GROUPS(stp_dev); static int __init stp_init_sysfs(void) { - struct device_attribute **attr; - int rc; - - rc = subsys_system_register(&stp_subsys, NULL); - if (rc) - goto out; - for (attr = stp_attributes; *attr; attr++) { - rc = device_create_file(stp_subsys.dev_root, *attr); - if (rc) - goto out_unreg; - } - return 0; -out_unreg: - for (; attr >= stp_attributes; attr--) - device_remove_file(stp_subsys.dev_root, *attr); - bus_unregister(&stp_subsys); -out: - return rc; + return subsys_system_register(&stp_subsys, stp_dev_groups); } device_initcall(stp_init_sysfs); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 3627953007ed..89e91b8ce842 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 2007, 2011 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ #define KMSG_COMPONENT "cpu" @@ -26,7 +25,6 @@ #include <linux/nodemask.h> #include <linux/node.h> #include <asm/sysinfo.h> -#include <asm/numa.h> #define PTF_HORIZONTAL (0UL) #define PTF_VERTICAL (1UL) @@ -63,50 +61,56 @@ static struct mask_info drawer_info; struct cpu_topology_s390 cpu_topology[NR_CPUS]; EXPORT_SYMBOL_GPL(cpu_topology); -cpumask_t cpus_with_topology; - -static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) +static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int cpu) { - cpumask_t mask; + static cpumask_t mask; - cpumask_copy(&mask, cpumask_of(cpu)); + cpumask_clear(&mask); + if (!cpumask_test_cpu(cpu, &cpu_setup_mask)) + goto out; + cpumask_set_cpu(cpu, &mask); switch (topology_mode) { case TOPOLOGY_MODE_HW: while (info) { if (cpumask_test_cpu(cpu, &info->mask)) { - mask = info->mask; + cpumask_copy(&mask, &info->mask); break; } info = info->next; } - if (cpumask_empty(&mask)) - cpumask_copy(&mask, cpumask_of(cpu)); break; case TOPOLOGY_MODE_PACKAGE: cpumask_copy(&mask, cpu_present_mask); break; default: - /* fallthrough */ + fallthrough; case TOPOLOGY_MODE_SINGLE: - cpumask_copy(&mask, cpumask_of(cpu)); break; } - return mask; + cpumask_and(&mask, &mask, &cpu_setup_mask); +out: + cpumask_copy(dst, &mask); } -static cpumask_t cpu_thread_map(unsigned int cpu) +static void cpu_thread_map(cpumask_t *dst, unsigned int cpu) { - cpumask_t mask; - int i; + static cpumask_t mask; + unsigned int max_cpu; - cpumask_copy(&mask, cpumask_of(cpu)); + cpumask_clear(&mask); + if (!cpumask_test_cpu(cpu, &cpu_setup_mask)) + goto out; + cpumask_set_cpu(cpu, &mask); if (topology_mode != TOPOLOGY_MODE_HW) - return mask; + goto out; cpu -= cpu % (smp_cpu_mtid + 1); - for (i = 0; i <= 
smp_cpu_mtid; i++) - if (cpu_present(cpu + i)) - cpumask_set_cpu(cpu + i, &mask); - return mask; + max_cpu = min(cpu + smp_cpu_mtid, nr_cpu_ids - 1); + for (; cpu <= max_cpu; cpu++) { + if (cpumask_test_cpu(cpu, &cpu_setup_mask)) + cpumask_set_cpu(cpu, &mask); + } +out: + cpumask_copy(dst, &mask); } #define TOPOLOGY_CORE_BITS 64 @@ -120,26 +124,26 @@ static void add_cpus_to_mask(struct topology_core *tl_core, unsigned int core; for_each_set_bit(core, &tl_core->mask, TOPOLOGY_CORE_BITS) { - unsigned int rcore; - int lcpu, i; + unsigned int max_cpu, rcore; + int cpu; rcore = TOPOLOGY_CORE_BITS - 1 - core + tl_core->origin; - lcpu = smp_find_processor_id(rcore << smp_cpu_mt_shift); - if (lcpu < 0) + cpu = smp_find_processor_id(rcore << smp_cpu_mt_shift); + if (cpu < 0) continue; - for (i = 0; i <= smp_cpu_mtid; i++) { - topo = &cpu_topology[lcpu + i]; + max_cpu = min(cpu + smp_cpu_mtid, nr_cpu_ids - 1); + for (; cpu <= max_cpu; cpu++) { + topo = &cpu_topology[cpu]; topo->drawer_id = drawer->id; topo->book_id = book->id; topo->socket_id = socket->id; topo->core_id = rcore; - topo->thread_id = lcpu + i; + topo->thread_id = cpu; topo->dedicated = tl_core->d; - cpumask_set_cpu(lcpu + i, &drawer->mask); - cpumask_set_cpu(lcpu + i, &book->mask); - cpumask_set_cpu(lcpu + i, &socket->mask); - cpumask_set_cpu(lcpu + i, &cpus_with_topology); - smp_cpu_set_polarization(lcpu + i, tl_core->pp); + cpumask_set_cpu(cpu, &drawer->mask); + cpumask_set_cpu(cpu, &book->mask); + cpumask_set_cpu(cpu, &socket->mask); + smp_cpu_set_polarization(cpu, tl_core->pp); } } } @@ -245,17 +249,18 @@ int topology_set_cpu_management(int fc) return rc; } -static void update_cpu_masks(void) +void update_cpu_masks(void) { - struct cpu_topology_s390 *topo; - int cpu, id; + struct cpu_topology_s390 *topo, *topo_package, *topo_sibling; + int cpu, sibling, pkg_first, smt_first, id; for_each_possible_cpu(cpu) { topo = &cpu_topology[cpu]; - topo->thread_mask = cpu_thread_map(cpu); - topo->core_mask = cpu_group_map(&socket_info, cpu); - topo->book_mask = cpu_group_map(&book_info, cpu); - topo->drawer_mask = cpu_group_map(&drawer_info, cpu); + cpu_thread_map(&topo->thread_mask, cpu); + cpu_group_map(&topo->core_mask, &socket_info, cpu); + cpu_group_map(&topo->book_mask, &book_info, cpu); + cpu_group_map(&topo->drawer_mask, &drawer_info, cpu); + topo->booted_cores = 0; if (topology_mode != TOPOLOGY_MODE_HW) { id = topology_mode == TOPOLOGY_MODE_PACKAGE ? 
0 : cpu; topo->thread_id = cpu; @@ -263,11 +268,23 @@ static void update_cpu_masks(void) topo->socket_id = id; topo->book_id = id; topo->drawer_id = id; - if (cpu_present(cpu)) - cpumask_set_cpu(cpu, &cpus_with_topology); } } - numa_update_cpu_topology(); + for_each_online_cpu(cpu) { + topo = &cpu_topology[cpu]; + pkg_first = cpumask_first(&topo->core_mask); + topo_package = &cpu_topology[pkg_first]; + if (cpu == pkg_first) { + for_each_cpu(sibling, &topo->core_mask) { + topo_sibling = &cpu_topology[sibling]; + smt_first = cpumask_first(&topo_sibling->thread_mask); + if (sibling == smt_first) + topo_package->booted_cores++; + } + } else { + topo->booted_cores = topo_package->booted_cores; + } + } } void store_topology(struct sysinfo_15_1_x *info) @@ -289,7 +306,6 @@ static int __arch_update_cpu_topology(void) int rc = 0; mutex_lock(&smp_cpu_state_mutex); - cpumask_clear(&cpus_with_topology); if (MACHINE_HAS_TOPOLOGY) { rc = 1; store_topology(info); @@ -346,9 +362,9 @@ static atomic_t topology_poll = ATOMIC_INIT(0); static void set_topology_timer(void) { if (atomic_add_unless(&topology_poll, -1, 0)) - mod_timer(&topology_timer, jiffies + HZ / 10); + mod_timer(&topology_timer, jiffies + msecs_to_jiffies(100)); else - mod_timer(&topology_timer, jiffies + HZ * 60); + mod_timer(&topology_timer, jiffies + msecs_to_jiffies(60 * MSEC_PER_SEC)); } void topology_expect_change(void) @@ -391,7 +407,7 @@ static ssize_t dispatching_store(struct device *dev, if (val != 0 && val != 1) return -EINVAL; rc = 0; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&smp_cpu_state_mutex); if (cpu_management == val) goto out; @@ -402,7 +418,7 @@ static ssize_t dispatching_store(struct device *dev, topology_expect_change(); out: mutex_unlock(&smp_cpu_state_mutex); - put_online_cpus(); + cpus_read_unlock(); return rc ? 
rc : count; } static DEVICE_ATTR_RW(dispatching); @@ -506,7 +522,7 @@ static struct sched_domain_topology_level s390_topology[] = { { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, { cpu_book_mask, SD_INIT_NAME(BOOK) }, { cpu_drawer_mask, SD_INIT_NAME(DRAWER) }, - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, SD_INIT_NAME(PKG) }, { NULL, }, }; @@ -554,6 +570,7 @@ void __init topology_init_early(void) alloc_masks(info, &book_info, 2); alloc_masks(info, &drawer_info, 3); out: + cpumask_set_cpu(0, &cpu_setup_mask); __arch_update_cpu_topology(); __arch_update_dedicated_flag(NULL); } @@ -584,7 +601,7 @@ static int __init topology_setup(char *str) early_param("topology", topology_setup); static int topology_ctl_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int enabled = topology_is_enabled(); int new_mode; @@ -619,27 +636,25 @@ static struct ctl_table topology_ctl_table[] = { .mode = 0644, .proc_handler = topology_ctl_handler, }, - { }, -}; - -static struct ctl_table topology_dir_table[] = { - { - .procname = "s390", - .maxlen = 0, - .mode = 0555, - .child = topology_ctl_table, - }, - { }, }; static int __init topology_init(void) { + struct device *dev_root; + int rc = 0; + timer_setup(&topology_timer, topology_timer_fn, TIMER_DEFERRABLE); if (MACHINE_HAS_TOPOLOGY) set_topology_timer(); else topology_update_polarization_simple(); - register_sysctl_table(topology_dir_table); - return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching); + register_sysctl("s390", topology_ctl_table); + + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + rc = device_create_file(dev_root, &dev_attr_dispatching); + put_device(dev_root); + } + return rc; } device_initcall(topology_init); diff --git a/arch/s390/kernel/trace.c b/arch/s390/kernel/trace.c index 490b52e85014..11a669f3cc93 100644 --- a/arch/s390/kernel/trace.c +++ b/arch/s390/kernel/trace.c @@ -14,7 +14,7 @@ EXPORT_TRACEPOINT_SYMBOL(s390_diagnose); static DEFINE_PER_CPU(unsigned int, diagnose_trace_depth); -void trace_s390_diagnose_norecursion(int diag_nr) +void notrace trace_s390_diagnose_norecursion(int diag_nr) { unsigned long flags; unsigned int *depth; diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 164c0282b41a..46dac4540ca8 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -13,8 +13,11 @@ * 'Traps.c' handles hardware traps and faults after we have saved some * state in 'asm.s'. 
*/ +#include "asm/irqflags.h" +#include "asm/ptrace.h" #include <linux/kprobes.h> #include <linux/kdebug.h> +#include <linux/randomize_kstack.h> #include <linux/extable.h> #include <linux/ptrace.h> #include <linux/sched.h> @@ -23,7 +26,10 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/cpu.h> +#include <linux/entry-common.h> +#include <asm/asm-extable.h> #include <asm/fpu/api.h> +#include <asm/vtime.h> #include "entry.h" static inline void __user *get_trap_ip(struct pt_regs *regs) @@ -31,16 +37,18 @@ static inline void __user *get_trap_ip(struct pt_regs *regs) unsigned long address; if (regs->int_code & 0x200) - address = *(unsigned long *)(current->thread.trap_tdb + 24); + address = current->thread.trap_tdb.data[3]; else address = regs->psw.addr; return (void __user *) (address - (regs->int_code >> 16)); } +#ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long addr) { return 1; } +#endif void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) { @@ -48,18 +56,8 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) force_sig_fault(si_signo, si_code, get_trap_ip(regs)); report_user_fault(regs, si_signo, 0); } else { - const struct exception_table_entry *fixup; - fixup = s390_search_extables(regs->psw.addr); - if (fixup) - regs->psw.addr = extable_fixup(fixup); - else { - enum bug_trap_type btt; - - btt = report_bug(regs->psw.addr, regs); - if (btt == BUG_TRAP_TYPE_WARN) - return; + if (!fixup_exception(regs)) die(regs, str); - } } } @@ -83,17 +81,17 @@ void do_per_trap(struct pt_regs *regs) } NOKPROBE_SYMBOL(do_per_trap); -void default_trap_handler(struct pt_regs *regs) +static void default_trap_handler(struct pt_regs *regs) { if (user_mode(regs)) { report_user_fault(regs, SIGSEGV, 0); - do_exit(SIGSEGV); + force_exit_sig(SIGSEGV); } else die(regs, "Unknown program exception"); } #define DO_ERROR_INFO(name, signr, sicode, str) \ -void name(struct pt_regs *regs) \ +static void name(struct pt_regs *regs) \ { \ do_trap(regs, signr, sicode, str); \ } @@ -145,13 +143,13 @@ static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc) do_trap(regs, SIGFPE, si_code, "floating point exception"); } -void translation_exception(struct pt_regs *regs) +static void translation_specification_exception(struct pt_regs *regs) { /* May never happen. */ - panic("Translation exception"); + panic("Translation-Specification Exception"); } -void illegal_op(struct pt_regs *regs) +static void illegal_op(struct pt_regs *regs) { __u8 opcode[6]; __u16 __user *location; @@ -193,11 +191,11 @@ NOKPROBE_SYMBOL(illegal_op); DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN, "specification exception"); -void vector_exception(struct pt_regs *regs) +static void vector_exception(struct pt_regs *regs) { int si_code, vic; - if (!MACHINE_HAS_VX) { + if (!cpu_has_vx()) { do_trap(regs, SIGILL, ILL_ILLOPN, "illegal operation"); return; } @@ -227,7 +225,7 @@ void vector_exception(struct pt_regs *regs) do_trap(regs, SIGFPE, si_code, "vector exception"); } -void data_exception(struct pt_regs *regs) +static void data_exception(struct pt_regs *regs) { save_fpu_regs(); if (current->thread.fpu.fpc & FPC_DXC_MASK) @@ -236,7 +234,7 @@ void data_exception(struct pt_regs *regs) do_trap(regs, SIGILL, ILL_ILLOPN, "data exception"); } -void space_switch_exception(struct pt_regs *regs) +static void space_switch_exception(struct pt_regs *regs) { /* Set user psw back to home space mode. 
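
DO_ERROR_INFO above stamps out one trivial, now-static handler per program check. The same token-pasting pattern reduced to a standalone program (the si_code values here are illustrative stand-ins, not the real ILL_* constants):

    #include <signal.h>
    #include <stdio.h>

    struct pt_regs { unsigned long psw_addr; };     /* simplified stand-in */

    static void do_trap(struct pt_regs *regs, int signr, int sicode,
                        const char *str)
    {
            printf("trap at %#lx: sig=%d code=%d (%s)\n",
                   regs->psw_addr, signr, sicode, str);
    }

    /* Each use defines a complete static handler forwarding to do_trap(). */
    #define DO_ERROR_INFO(name, signr, sicode, str)         \
    static void name(struct pt_regs *regs)                  \
    {                                                       \
            do_trap(regs, signr, sicode, str);              \
    }

    DO_ERROR_INFO(specification_exception, SIGILL, 2, "specification exception")
    DO_ERROR_INFO(privileged_op, SIGILL, 1, "privileged operation")

    int main(void)
    {
            struct pt_regs regs = { .psw_addr = 0x1000 };

            specification_exception(&regs);
            privileged_op(&regs);
            return 0;
    }
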
*/ if (user_mode(regs)) @@ -245,6 +243,23 @@ void space_switch_exception(struct pt_regs *regs) do_trap(regs, SIGILL, ILL_PRVOPC, "space switch event"); } +static void monitor_event_exception(struct pt_regs *regs) +{ + if (user_mode(regs)) + return; + + switch (report_bug(regs->psw.addr - (regs->int_code >> 16), regs)) { + case BUG_TRAP_TYPE_NONE: + fixup_exception(regs); + break; + case BUG_TRAP_TYPE_WARN: + break; + case BUG_TRAP_TYPE_BUG: + die(regs, "monitor event"); + break; + } +} + void kernel_stack_overflow(struct pt_regs *regs) { bust_spinlocks(1); @@ -255,8 +270,147 @@ void kernel_stack_overflow(struct pt_regs *regs) } NOKPROBE_SYMBOL(kernel_stack_overflow); +static void __init test_monitor_call(void) +{ + int val = 1; + + if (!IS_ENABLED(CONFIG_BUG)) + return; + asm volatile( + " mc 0,0\n" + "0: xgr %0,%0\n" + "1:\n" + EX_TABLE(0b,1b) + : "+d" (val)); + if (!val) + panic("Monitor call doesn't work!\n"); +} + void __init trap_init(void) { - sort_extable(__start_dma_ex_table, __stop_dma_ex_table); + unsigned long flags; + struct ctlreg cr0; + + local_irq_save(flags); + cr0 = local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT); + psw_bits(S390_lowcore.external_new_psw).mcheck = 1; + psw_bits(S390_lowcore.program_new_psw).mcheck = 1; + psw_bits(S390_lowcore.svc_new_psw).mcheck = 1; + psw_bits(S390_lowcore.io_new_psw).mcheck = 1; + local_ctl_load(0, &cr0); + local_irq_restore(flags); local_mcck_enable(); + test_monitor_call(); } + +static void (*pgm_check_table[128])(struct pt_regs *regs); + +void noinstr __do_pgm_check(struct pt_regs *regs) +{ + unsigned int trapnr; + irqentry_state_t state; + + regs->int_code = S390_lowcore.pgm_int_code; + regs->int_parm_long = S390_lowcore.trans_exc_code; + + state = irqentry_enter(regs); + + if (user_mode(regs)) { + update_timer_sys(); + if (!static_branch_likely(&cpu_has_bear)) { + if (regs->last_break < 4096) + regs->last_break = 1; + } + current->thread.last_break = regs->last_break; + } + + if (S390_lowcore.pgm_code & 0x0200) { + /* transaction abort */ + current->thread.trap_tdb = S390_lowcore.pgm_tdb; + } + + if (S390_lowcore.pgm_code & PGM_INT_CODE_PER) { + if (user_mode(regs)) { + struct per_event *ev = ¤t->thread.per_event; + + set_thread_flag(TIF_PER_TRAP); + ev->address = S390_lowcore.per_address; + ev->cause = S390_lowcore.per_code_combined; + ev->paid = S390_lowcore.per_access_id; + } else { + /* PER event in kernel is kprobes */ + __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER); + do_per_trap(regs); + goto out; + } + } + + if (!irqs_disabled_flags(regs->psw.mask)) + trace_hardirqs_on(); + __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER); + + trapnr = regs->int_code & PGM_INT_CODE_MASK; + if (trapnr) + pgm_check_table[trapnr](regs); +out: + local_irq_disable(); + irqentry_exit(regs, state); +} + +/* + * The program check table contains exactly 128 (0x00-0x7f) entries. Each + * line defines the function to be called corresponding to the program check + * interruption code. 
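
test_monitor_call() above executes "mc 0,0" and survives a trapping instruction through an exception-table fixup. User space can probe an instruction in a similar spirit with a SIGILL handler and sigsetjmp(), as an analogy to the fixup rather than the kernel mechanism:

    #include <setjmp.h>
    #include <signal.h>
    #include <stdio.h>

    static sigjmp_buf probe_env;

    static void ill_handler(int sig)
    {
            (void)sig;
            siglongjmp(probe_env, 1);       /* plays the role of the fixup */
    }

    /* Returns 1 if the probed code executed, 0 if it raised SIGILL. */
    static int probe_insn(void (*try_fn)(void))
    {
            struct sigaction sa = { .sa_handler = ill_handler };
            int ok = 0;

            sigaction(SIGILL, &sa, NULL);
            if (sigsetjmp(probe_env, 1) == 0) {
                    try_fn();
                    ok = 1;
            }
            signal(SIGILL, SIG_DFL);
            return ok;
    }

    static void try_nop(void) { __asm__ volatile("nop"); }

    int main(void)
    {
            printf("nop executed: %d\n", probe_insn(try_nop));
            return 0;
    }
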
+ */ +static void (*pgm_check_table[128])(struct pt_regs *regs) = { + [0x00] = default_trap_handler, + [0x01] = illegal_op, + [0x02] = privileged_op, + [0x03] = execute_exception, + [0x04] = do_protection_exception, + [0x05] = addressing_exception, + [0x06] = specification_exception, + [0x07] = data_exception, + [0x08] = overflow_exception, + [0x09] = divide_exception, + [0x0a] = overflow_exception, + [0x0b] = divide_exception, + [0x0c] = hfp_overflow_exception, + [0x0d] = hfp_underflow_exception, + [0x0e] = hfp_significance_exception, + [0x0f] = hfp_divide_exception, + [0x10] = do_dat_exception, + [0x11] = do_dat_exception, + [0x12] = translation_specification_exception, + [0x13] = special_op_exception, + [0x14] = default_trap_handler, + [0x15] = operand_exception, + [0x16] = default_trap_handler, + [0x17] = default_trap_handler, + [0x18] = transaction_exception, + [0x19] = default_trap_handler, + [0x1a] = default_trap_handler, + [0x1b] = vector_exception, + [0x1c] = space_switch_exception, + [0x1d] = hfp_sqrt_exception, + [0x1e ... 0x37] = default_trap_handler, + [0x38] = do_dat_exception, + [0x39] = do_dat_exception, + [0x3a] = do_dat_exception, + [0x3b] = do_dat_exception, + [0x3c] = default_trap_handler, + [0x3d] = do_secure_storage_access, + [0x3e] = do_non_secure_storage_access, + [0x3f] = do_secure_storage_violation, + [0x40] = monitor_event_exception, + [0x41 ... 0x7f] = default_trap_handler, +}; + +#define COND_TRAP(x) asm( \ + ".weak " __stringify(x) "\n\t" \ + ".set " __stringify(x) "," \ + __stringify(default_trap_handler)) + +COND_TRAP(do_secure_storage_access); +COND_TRAP(do_non_secure_storage_access); +COND_TRAP(do_secure_storage_violation); diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c index da2d4d4c5b0e..0ece156fdd7c 100644 --- a/arch/s390/kernel/unwind_bc.c +++ b/arch/s390/kernel/unwind_bc.c @@ -36,10 +36,17 @@ static bool update_stack_info(struct unwind_state *state, unsigned long sp) return true; } -static inline bool is_task_pt_regs(struct unwind_state *state, - struct pt_regs *regs) +static inline bool is_final_pt_regs(struct unwind_state *state, + struct pt_regs *regs) { - return task_pt_regs(state->task) == regs; + /* user mode or kernel thread pt_regs at the bottom of task stack */ + if (task_pt_regs(state->task) == regs) + return true; + + /* user mode pt_regs at the bottom of irq stack */ + return state->stack_info.type == STACK_TYPE_IRQ && + state->stack_info.end - sizeof(struct pt_regs) == (unsigned long)regs && + READ_ONCE_NOCHECK(regs->psw.mask) & PSW_MASK_PSTATE; } bool unwind_next_frame(struct unwind_state *state) @@ -57,8 +64,8 @@ bool unwind_next_frame(struct unwind_state *state) ip = READ_ONCE_NOCHECK(sf->gprs[8]); reliable = false; regs = NULL; - if (!__kernel_text_address(ip)) { - /* skip bogus %r14 */ + /* skip bogus %r14 or if is the same as regs->psw.addr */ + if (!__kernel_text_address(ip) || state->ip == unwind_recover_ret_addr(state, ip)) { state->regs = NULL; return unwind_next_frame(state); } @@ -80,7 +87,7 @@ bool unwind_next_frame(struct unwind_state *state) if (!on_stack(info, sp, sizeof(struct pt_regs))) goto out_err; regs = (struct pt_regs *) sp; - if (is_task_pt_regs(state, regs)) + if (is_final_pt_regs(state, regs)) goto out_stop; ip = READ_ONCE_NOCHECK(regs->psw.addr); sp = READ_ONCE_NOCHECK(regs->gprs[15]); @@ -96,13 +103,11 @@ bool unwind_next_frame(struct unwind_state *state) if (sp & 0x7) goto out_err; - ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *) sp); - /* Update unwind state 
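
The COND_TRAP() asm at the end of traps.c above makes each optional handler a weak symbol aliased to default_trap_handler, so configurations that never build the strong definition still link. GCC's attribute syntax expresses the same trick directly in C (GNU toolchain and ELF assumed):

    #include <stdio.h>

    void fallback(void)
    {
            puts("fallback handler");
    }

    /* Weak alias: if nothing provides a strong optional_handler at link
     * time, calls resolve to fallback() -- the ".weak"/".set" trick of
     * COND_TRAP(), written with attributes instead of inline asm. */
    void optional_handler(void) __attribute__((weak, alias("fallback")));

    int main(void)
    {
            optional_handler();     /* prints "fallback handler" here */
            return 0;
    }
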
*/ state->sp = sp; - state->ip = ip; state->regs = regs; state->reliable = reliable; + state->ip = unwind_recover_ret_addr(state, ip); return true; out_err: @@ -154,12 +159,10 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, ip = READ_ONCE_NOCHECK(sf->gprs[8]); } - ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, NULL); - /* Update unwind state */ state->sp = sp; - state->ip = ip; state->reliable = true; + state->ip = unwind_recover_ret_addr(state, ip); if (!first_frame) return; diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c index 5007fac01bb5..b88345ef8bd9 100644 --- a/arch/s390/kernel/uprobes.c +++ b/arch/s390/kernel/uprobes.c @@ -32,7 +32,7 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) return -EINVAL; if (!is_compat_task() && psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) return -EINVAL; - clear_pt_regs_flag(regs, PIF_PER_TRAP); + clear_thread_flag(TIF_PER_TRAP); auprobe->saved_per = psw_bits(regs->psw).per; auprobe->saved_int_code = regs->int_code; regs->int_code = UPROBE_TRAP_NR; @@ -103,7 +103,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) /* fix per address */ current->thread.per_event.address = utask->vaddr; /* trigger per event */ - set_pt_regs_flag(regs, PIF_PER_TRAP); + set_thread_flag(TIF_PER_TRAP); } return 0; } @@ -126,6 +126,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, case DIE_SSTEP: if (uprobe_post_sstep_notifier(regs)) return NOTIFY_STOP; + break; default: break; } @@ -176,9 +177,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len) __typeof__(*(ptr)) input; \ int __rc = 0; \ \ - if (!test_facility(34)) \ - __rc = EMU_ILLEGAL_OP; \ - else if ((u64 __force)ptr & mask) \ + if ((u64 __force)ptr & mask) \ __rc = EMU_SPECIFICATION; \ else if (get_user(input, ptr)) \ __rc = EMU_ADDRESSING; \ @@ -193,9 +192,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len) __typeof__(ptr) __ptr = (ptr); \ int __rc = 0; \ \ - if (!test_facility(34)) \ - __rc = EMU_ILLEGAL_OP; \ - else if ((u64 __force)__ptr & mask) \ + if ((u64 __force)__ptr & mask) \ __rc = EMU_SPECIFICATION; \ else if (put_user(*(input), __ptr)) \ __rc = EMU_ADDRESSING; \ @@ -212,9 +209,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len) __typeof__(*(ptr)) input; \ int __rc = 0; \ \ - if (!test_facility(34)) \ - __rc = EMU_ILLEGAL_OP; \ - else if ((u64 __force)ptr & mask) \ + if ((u64 __force)ptr & mask) \ __rc = EMU_SPECIFICATION; \ else if (get_user(input, ptr)) \ __rc = EMU_ADDRESSING; \ @@ -259,7 +254,7 @@ static void sim_stor_event(struct pt_regs *regs, void *addr, int len) return; current->thread.per_event.address = regs->psw.addr; current->thread.per_event.cause = PER_EVENT_STORE >> 16; - set_pt_regs_flag(regs, PIF_PER_TRAP); + set_thread_flag(TIF_PER_TRAP); } /* @@ -326,10 +321,6 @@ static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs) break; case 0xc6: switch (insn->opc1) { - case 0x02: /* pfdrl */ - if (!test_facility(34)) - rc = EMU_ILLEGAL_OP; - break; case 0x04: /* cghrl */ rc = emu_cmp_ril(regs, (s16 __user *)uptr, &rx->s64); break; diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c new file mode 100644 index 000000000000..fc07bc39e698 --- /dev/null +++ b/arch/s390/kernel/uv.c @@ -0,0 +1,717 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Ultravisor functions and initialization + * + * Copyright IBM Corp. 
2019, 2020
+ */
+#define KMSG_COMPONENT "prot_virt"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/sizes.h>
+#include <linux/bitmap.h>
+#include <linux/memblock.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <asm/facility.h>
+#include <asm/sections.h>
+#include <asm/uv.h>
+
+/* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
+int __bootdata_preserved(prot_virt_guest);
+#endif
+
+/*
+ * uv_info contains both host and guest information, but is currently only
+ * expected to be used from modules by KVM and by PV guest modules.
+ *
+ * The kernel itself will write these values once in uv_query_info()
+ * and then make some of them readable via a sysfs interface.
+ */
+struct uv_info __bootdata_preserved(uv_info);
+EXPORT_SYMBOL(uv_info);
+
+#if IS_ENABLED(CONFIG_KVM)
+int __bootdata_preserved(prot_virt_host);
+EXPORT_SYMBOL(prot_virt_host);
+
+static int __init uv_init(phys_addr_t stor_base, unsigned long stor_len)
+{
+	struct uv_cb_init uvcb = {
+		.header.cmd = UVC_CMD_INIT_UV,
+		.header.len = sizeof(uvcb),
+		.stor_origin = stor_base,
+		.stor_len = stor_len,
+	};
+
+	if (uv_call(0, (uint64_t)&uvcb)) {
+		pr_err("Ultravisor init failed with rc: 0x%x rrc: 0x%x\n",
+		       uvcb.header.rc, uvcb.header.rrc);
+		return -1;
+	}
+	return 0;
+}
+
+void __init setup_uv(void)
+{
+	void *uv_stor_base;
+
+	if (!is_prot_virt_host())
+		return;
+
+	uv_stor_base = memblock_alloc_try_nid(
+		uv_info.uv_base_stor_len, SZ_1M, SZ_2G,
+		MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
+	if (!uv_stor_base) {
+		pr_warn("Failed to reserve %lu bytes for ultravisor base storage\n",
+			uv_info.uv_base_stor_len);
+		goto fail;
+	}
+
+	if (uv_init(__pa(uv_stor_base), uv_info.uv_base_stor_len)) {
+		memblock_free(uv_stor_base, uv_info.uv_base_stor_len);
+		goto fail;
+	}
+
+	pr_info("Reserving %luMB as ultravisor base storage\n",
+		uv_info.uv_base_stor_len >> 20);
+	return;
+fail:
+	pr_info("Disabling support for protected virtualization\n");
+	prot_virt_host = 0;
+}
+
+/*
+ * Requests the Ultravisor to pin the page in the shared state. This will
+ * cause an intercept when the guest attempts to unshare the pinned page.
+ */
+int uv_pin_shared(unsigned long paddr)
+{
+	struct uv_cb_cfs uvcb = {
+		.header.cmd = UVC_CMD_PIN_PAGE_SHARED,
+		.header.len = sizeof(uvcb),
+		.paddr = paddr,
+	};
+
+	if (uv_call(0, (u64)&uvcb))
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(uv_pin_shared);
+
+/*
+ * Requests the Ultravisor to destroy a guest page and make it
+ * accessible to the host. The destroy clears the page instead of
+ * exporting.
+ *
+ * @paddr: Absolute host address of page to be destroyed
+ */
+static int uv_destroy_page(unsigned long paddr)
+{
+	struct uv_cb_cfs uvcb = {
+		.header.cmd = UVC_CMD_DESTR_SEC_STOR,
+		.header.len = sizeof(uvcb),
+		.paddr = paddr
+	};
+
+	if (uv_call(0, (u64)&uvcb)) {
+		/*
+		 * Older firmware uses 107/d as an indication of a non-secure
+		 * page. Let us emulate the newer variant (no-op).
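All Ultravisor services in this file follow one calling convention: fill a control block whose header carries the command number and total length, issue the call, then interpret the rc/rrc pair the firmware wrote back. A compact user-space model of that flow, with an invented structure layout, a placeholder command number and a stubbed-out call, might look like this:

#include <stdint.h>
#include <stdio.h>

struct demo_uv_cb_header {
	uint16_t len;
	uint16_t cmd;
	uint16_t rc;	/* return code set by the Ultravisor */
	uint16_t rrc;	/* return reason code set by the Ultravisor */
};

/* Stand-in for uv_call(): pretend the firmware answered rc=0x107/rrc=0xd. */
static int demo_uv_call(struct demo_uv_cb_header *uvcb)
{
	uvcb->rc = 0x107;
	uvcb->rrc = 0xd;
	return 1;	/* non-zero condition code: command did not succeed */
}

static int demo_destroy_page(void)
{
	struct demo_uv_cb_header uvcb = {
		.len = sizeof(uvcb),
		.cmd = 0x0401,	/* placeholder command number for the demo */
	};

	if (demo_uv_call(&uvcb)) {
		/* Older firmware reports 107/d for a non-secure page: no-op. */
		if (uvcb.rc == 0x107 && uvcb.rrc == 0xd)
			return 0;
		return -1;
	}
	return 0;
}

int main(void)
{
	printf("demo_destroy_page() = %d\n", demo_destroy_page());
	return 0;
}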
+ */ + if (uvcb.header.rc == 0x107 && uvcb.header.rrc == 0xd) + return 0; + return -EINVAL; + } + return 0; +} + +/* + * The caller must already hold a reference to the page + */ +int uv_destroy_owned_page(unsigned long paddr) +{ + struct page *page = phys_to_page(paddr); + int rc; + + get_page(page); + rc = uv_destroy_page(paddr); + if (!rc) + clear_bit(PG_arch_1, &page->flags); + put_page(page); + return rc; +} + +/* + * Requests the Ultravisor to encrypt a guest page and make it + * accessible to the host for paging (export). + * + * @paddr: Absolute host address of page to be exported + */ +int uv_convert_from_secure(unsigned long paddr) +{ + struct uv_cb_cfs uvcb = { + .header.cmd = UVC_CMD_CONV_FROM_SEC_STOR, + .header.len = sizeof(uvcb), + .paddr = paddr + }; + + if (uv_call(0, (u64)&uvcb)) + return -EINVAL; + return 0; +} + +/* + * The caller must already hold a reference to the page + */ +int uv_convert_owned_from_secure(unsigned long paddr) +{ + struct page *page = phys_to_page(paddr); + int rc; + + get_page(page); + rc = uv_convert_from_secure(paddr); + if (!rc) + clear_bit(PG_arch_1, &page->flags); + put_page(page); + return rc; +} + +/* + * Calculate the expected ref_count for a page that would otherwise have no + * further pins. This was cribbed from similar functions in other places in + * the kernel, but with some slight modifications. We know that a secure + * page can not be a huge page for example. + */ +static int expected_page_refs(struct page *page) +{ + int res; + + res = page_mapcount(page); + if (PageSwapCache(page)) { + res++; + } else if (page_mapping(page)) { + res++; + if (page_has_private(page)) + res++; + } + return res; +} + +static int make_page_secure(struct page *page, struct uv_cb_header *uvcb) +{ + int expected, cc = 0; + + if (PageWriteback(page)) + return -EAGAIN; + expected = expected_page_refs(page); + if (!page_ref_freeze(page, expected)) + return -EBUSY; + set_bit(PG_arch_1, &page->flags); + /* + * If the UVC does not succeed or fail immediately, we don't want to + * loop for long, or we might get stall notifications. + * On the other hand, this is a complex scenario and we are holding a lot of + * locks, so we can't easily sleep and reschedule. We try only once, + * and if the UVC returned busy or partial completion, we return + * -EAGAIN and we let the callers deal with it. + */ + cc = __uv_call(0, (u64)uvcb); + page_ref_unfreeze(page, expected); + /* + * Return -ENXIO if the page was not mapped, -EINVAL for other errors. + * If busy or partially completed, return -EAGAIN. + */ + if (cc == UVC_CC_OK) + return 0; + else if (cc == UVC_CC_BUSY || cc == UVC_CC_PARTIAL) + return -EAGAIN; + return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; +} + +/** + * should_export_before_import - Determine whether an export is needed + * before an import-like operation + * @uvcb: the Ultravisor control block of the UVC to be performed + * @mm: the mm of the process + * + * Returns whether an export is needed before every import-like operation. + * This is needed for shared pages, which don't trigger a secure storage + * exception when accessed from a different guest. + * + * Although considered as one, the Unpin Page UVC is not an actual import, + * so it is not affected. + * + * No export is needed also when there is only one protected VM, because the + * page cannot belong to the wrong VM in that case (there is no "other VM" + * it can belong to). + * + * Return: true if an export is needed before every import, otherwise false. 
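The page_ref_freeze()/page_ref_unfreeze() pair in make_page_secure() above is, at its core, a compare-and-swap of the refcount against the computed expected value: the freeze succeeds only if nobody holds an unexpected reference. A minimal C11 sketch of that idea (not the kernel implementation; memory-ordering details elided):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool demo_ref_freeze(atomic_int *refcount, int expected)
{
	int old = expected;

	/* Succeeds only if nobody gained or dropped a reference meanwhile. */
	return atomic_compare_exchange_strong(refcount, &old, 0);
}

static void demo_ref_unfreeze(atomic_int *refcount, int count)
{
	atomic_store(refcount, count);
}

int main(void)
{
	atomic_int refs = 3;

	if (demo_ref_freeze(&refs, 3)) {
		puts("frozen: safe to change the page's state");
		demo_ref_unfreeze(&refs, 3);
	} else {
		puts("-EBUSY: unexpected extra reference, caller must back off");
	}
	return 0;
}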
+ */
+static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
+{
+	/*
+	 * The misc feature indicates, among other things, that importing a
+	 * shared page from a different protected VM will automatically also
+	 * transfer its ownership.
+	 */
+	if (uv_has_feature(BIT_UV_FEAT_MISC))
+		return false;
+	if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
+		return false;
+	return atomic_read(&mm->context.protected_count) > 1;
+}
+
+/*
+ * Requests the Ultravisor to make a page accessible to a guest.
+ * If it is brought in for the first time, it will be cleared. If
+ * it has been exported before, it will be decrypted and integrity
+ * checked.
+ */
+int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
+{
+	struct vm_area_struct *vma;
+	bool local_drain = false;
+	spinlock_t *ptelock;
+	unsigned long uaddr;
+	struct page *page;
+	pte_t *ptep;
+	int rc;
+
+again:
+	rc = -EFAULT;
+	mmap_read_lock(gmap->mm);
+
+	uaddr = __gmap_translate(gmap, gaddr);
+	if (IS_ERR_VALUE(uaddr))
+		goto out;
+	vma = vma_lookup(gmap->mm, uaddr);
+	if (!vma)
+		goto out;
+	/*
+	 * Secure pages cannot be huge, and userspace should not combine both.
+	 * In case userspace does it anyway, this will result in an -EFAULT for
+	 * the unpack. The guest thus never reaches secure mode. If
+	 * userspace plays dirty tricks with mapping huge pages later
+	 * on, this will result in a segmentation fault.
+	 */
+	if (is_vm_hugetlb_page(vma))
+		goto out;
+
+	rc = -ENXIO;
+	ptep = get_locked_pte(gmap->mm, uaddr, &ptelock);
+	if (!ptep)
+		goto out;
+	if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) {
+		page = pte_page(*ptep);
+		rc = -EAGAIN;
+		if (trylock_page(page)) {
+			if (should_export_before_import(uvcb, gmap->mm))
+				uv_convert_from_secure(page_to_phys(page));
+			rc = make_page_secure(page, uvcb);
+			unlock_page(page);
+		}
+	}
+	pte_unmap_unlock(ptep, ptelock);
+out:
+	mmap_read_unlock(gmap->mm);
+
+	if (rc == -EAGAIN) {
+		/*
+		 * If we are here because the UVC returned busy or partial
+		 * completion, this is just a useless check, but it is safe.
+		 */
+		wait_on_page_writeback(page);
+	} else if (rc == -EBUSY) {
+		/*
+		 * If we have tried a local drain and the page refcount
+		 * still does not match our expected safe value, try with a
+		 * system wide drain. This is needed if the pagevecs holding
+		 * the page are on a different CPU.
+		 */
+		if (local_drain) {
+			lru_add_drain_all();
+			/* We give up here, and let the caller try again */
+			return -EAGAIN;
+		}
+		/*
+		 * We are here if the page refcount does not match the
+		 * expected safe value. The main culprits are usually
+		 * pagevecs. With lru_add_drain() we drain the pagevecs
+		 * on the local CPU so that hopefully the refcount will
+		 * reach the expected safe value.
+		 */
+		lru_add_drain();
+		local_drain = true;
+		/* And now we try again immediately after draining */
+		goto again;
+	} else if (rc == -ENXIO) {
+		if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE))
+			return -EFAULT;
+		return -EAGAIN;
+	}
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_make_secure);
+
+int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
+{
+	struct uv_cb_cts uvcb = {
+		.header.cmd = UVC_CMD_CONV_TO_SEC_STOR,
+		.header.len = sizeof(uvcb),
+		.guest_handle = gmap->guest_handle,
+		.gaddr = gaddr,
+	};
+
+	return gmap_make_secure(gmap, gaddr, &uvcb);
+}
+EXPORT_SYMBOL_GPL(gmap_convert_to_secure);
+
+/**
+ * gmap_destroy_page - Destroy a guest page.
+ * @gmap: the gmap of the guest
+ * @gaddr: the guest address to destroy
+ *
+ * An attempt will be made to destroy the given guest page. If the attempt
+ * fails, an attempt is made to export the page. If both attempts fail, an
+ * appropriate error is returned.
+ */
+int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
+{
+	struct vm_area_struct *vma;
+	unsigned long uaddr;
+	struct page *page;
+	int rc;
+
+	rc = -EFAULT;
+	mmap_read_lock(gmap->mm);
+
+	uaddr = __gmap_translate(gmap, gaddr);
+	if (IS_ERR_VALUE(uaddr))
+		goto out;
+	vma = vma_lookup(gmap->mm, uaddr);
+	if (!vma)
+		goto out;
+	/*
+	 * Huge pages should not be able to become secure
+	 */
+	if (is_vm_hugetlb_page(vma))
+		goto out;
+
+	rc = 0;
+	/* we take an extra reference here */
+	page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET);
+	if (IS_ERR_OR_NULL(page))
+		goto out;
+	rc = uv_destroy_owned_page(page_to_phys(page));
+	/*
+	 * Fault handlers can race; it is possible that two CPUs will fault
+	 * on the same secure page. One CPU can destroy the page, reboot,
+	 * re-enter secure mode and import it, while the second CPU was
+	 * stuck at the beginning of the handler. At some point the second
+	 * CPU will be able to progress, and it will not be able to destroy
+	 * the page. In that case we do not want to terminate the process,
+	 * we instead try to export the page.
+	 */
+	if (rc)
+		rc = uv_convert_owned_from_secure(page_to_phys(page));
+	put_page(page);
+out:
+	mmap_read_unlock(gmap->mm);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_destroy_page);
+
+/*
+ * To be called with the page locked or with an extra reference! This will
+ * prevent gmap_make_secure from touching the page concurrently. Having two
+ * parallel calls to make_page_accessible is fine, as the UV calls will
+ * become a no-op if the page is already exported.
+ */
+int arch_make_page_accessible(struct page *page)
+{
+	int rc = 0;
+
+	/* Hugepage cannot be protected, so nothing to do */
+	if (PageHuge(page))
+		return 0;
+
+	/*
+	 * PG_arch_1 is used in 3 places:
+	 * 1. For kernel page tables during early boot
+	 * 2. For storage keys of huge pages and KVM
+	 * 3. As an indication that this page might be secure. This can
+	 *    overindicate, e.g. we set the bit before calling
+	 *    convert_to_secure.
+	 * As secure pages are never huge, all 3 variants can co-exist.
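Callers of gmap_make_secure() and gmap_convert_to_secure() are expected to treat -EAGAIN as transient and retry the same guest address, while any other error is final (the "We give up here, and let the caller try again" comment above spells this out). A hypothetical caller-side sketch of that contract, with an invented demo_* name, could look like this:

/* Illustrative only: loops on the transient -EAGAIN verdict. */
static int demo_convert_with_retry(struct gmap *gmap, unsigned long gaddr)
{
	int rc;

	do {
		rc = gmap_convert_to_secure(gmap, gaddr);
		if (rc == -EAGAIN)
			cond_resched();	/* drains, faults or writeback may need time */
	} while (rc == -EAGAIN);

	return rc;	/* 0 on success, e.g. -EFAULT on a bad mapping */
}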
+ */ + if (!test_bit(PG_arch_1, &page->flags)) + return 0; + + rc = uv_pin_shared(page_to_phys(page)); + if (!rc) { + clear_bit(PG_arch_1, &page->flags); + return 0; + } + + rc = uv_convert_from_secure(page_to_phys(page)); + if (!rc) { + clear_bit(PG_arch_1, &page->flags); + return 0; + } + + return rc; +} +EXPORT_SYMBOL_GPL(arch_make_page_accessible); + +#endif + +#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) +static ssize_t uv_query_facilities(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n%lx\n%lx\n%lx\n", + uv_info.inst_calls_list[0], + uv_info.inst_calls_list[1], + uv_info.inst_calls_list[2], + uv_info.inst_calls_list[3]); +} + +static struct kobj_attribute uv_query_facilities_attr = + __ATTR(facilities, 0444, uv_query_facilities, NULL); + +static ssize_t uv_query_supp_se_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_se_hdr_ver_attr = + __ATTR(supp_se_hdr_ver, 0444, uv_query_supp_se_hdr_ver, NULL); + +static ssize_t uv_query_supp_se_hdr_pcf(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_pcf); +} + +static struct kobj_attribute uv_query_supp_se_hdr_pcf_attr = + __ATTR(supp_se_hdr_pcf, 0444, uv_query_supp_se_hdr_pcf, NULL); + +static ssize_t uv_query_dump_cpu_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.guest_cpu_stor_len); +} + +static struct kobj_attribute uv_query_dump_cpu_len_attr = + __ATTR(uv_query_dump_cpu_len, 0444, uv_query_dump_cpu_len, NULL); + +static ssize_t uv_query_dump_storage_state_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_storage_state_len); +} + +static struct kobj_attribute uv_query_dump_storage_state_len_attr = + __ATTR(dump_storage_state_len, 0444, uv_query_dump_storage_state_len, NULL); + +static ssize_t uv_query_dump_finalize_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_finalize_len); +} + +static struct kobj_attribute uv_query_dump_finalize_len_attr = + __ATTR(dump_finalize_len, 0444, uv_query_dump_finalize_len, NULL); + +static ssize_t uv_query_feature_indications(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.uv_feature_indications); +} + +static struct kobj_attribute uv_query_feature_indications_attr = + __ATTR(feature_indications, 0444, uv_query_feature_indications, NULL); + +static ssize_t uv_query_max_guest_cpus(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_guest_cpu_id + 1); +} + +static struct kobj_attribute uv_query_max_guest_cpus_attr = + __ATTR(max_cpus, 0444, uv_query_max_guest_cpus, NULL); + +static ssize_t uv_query_max_guest_vms(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_num_sec_conf); +} + +static struct kobj_attribute uv_query_max_guest_vms_attr = + __ATTR(max_guests, 0444, uv_query_max_guest_vms, NULL); + +static ssize_t uv_query_max_guest_addr(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.max_sec_stor_addr); +} + +static struct kobj_attribute uv_query_max_guest_addr_attr = + 
__ATTR(max_address, 0444, uv_query_max_guest_addr, NULL); + +static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_att_req_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr = + __ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL); + +static ssize_t uv_query_supp_att_pflags(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_att_pflags); +} + +static struct kobj_attribute uv_query_supp_att_pflags_attr = + __ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL); + +static ssize_t uv_query_supp_add_secret_req_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_req_ver); +} + +static struct kobj_attribute uv_query_supp_add_secret_req_ver_attr = + __ATTR(supp_add_secret_req_ver, 0444, uv_query_supp_add_secret_req_ver, NULL); + +static ssize_t uv_query_supp_add_secret_pcf(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_pcf); +} + +static struct kobj_attribute uv_query_supp_add_secret_pcf_attr = + __ATTR(supp_add_secret_pcf, 0444, uv_query_supp_add_secret_pcf, NULL); + +static ssize_t uv_query_supp_secret_types(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_secret_types); +} + +static struct kobj_attribute uv_query_supp_secret_types_attr = + __ATTR(supp_secret_types, 0444, uv_query_supp_secret_types, NULL); + +static ssize_t uv_query_max_secrets(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_secrets); +} + +static struct kobj_attribute uv_query_max_secrets_attr = + __ATTR(max_secrets, 0444, uv_query_max_secrets, NULL); + +static struct attribute *uv_query_attrs[] = { + &uv_query_facilities_attr.attr, + &uv_query_feature_indications_attr.attr, + &uv_query_max_guest_cpus_attr.attr, + &uv_query_max_guest_vms_attr.attr, + &uv_query_max_guest_addr_attr.attr, + &uv_query_supp_se_hdr_ver_attr.attr, + &uv_query_supp_se_hdr_pcf_attr.attr, + &uv_query_dump_storage_state_len_attr.attr, + &uv_query_dump_finalize_len_attr.attr, + &uv_query_dump_cpu_len_attr.attr, + &uv_query_supp_att_req_hdr_ver_attr.attr, + &uv_query_supp_att_pflags_attr.attr, + &uv_query_supp_add_secret_req_ver_attr.attr, + &uv_query_supp_add_secret_pcf_attr.attr, + &uv_query_supp_secret_types_attr.attr, + &uv_query_max_secrets_attr.attr, + NULL, +}; + +static struct attribute_group uv_query_attr_group = { + .attrs = uv_query_attrs, +}; + +static ssize_t uv_is_prot_virt_guest(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int val = 0; + +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST + val = prot_virt_guest; +#endif + return sysfs_emit(buf, "%d\n", val); +} + +static ssize_t uv_is_prot_virt_host(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int val = 0; + +#if IS_ENABLED(CONFIG_KVM) + val = prot_virt_host; +#endif + + return sysfs_emit(buf, "%d\n", val); +} + +static struct kobj_attribute uv_prot_virt_guest = + __ATTR(prot_virt_guest, 0444, uv_is_prot_virt_guest, NULL); + +static struct kobj_attribute uv_prot_virt_host = + __ATTR(prot_virt_host, 0444, uv_is_prot_virt_host, NULL); + +static const struct attribute *uv_prot_virt_attrs[] = { + &uv_prot_virt_guest.attr, + &uv_prot_virt_host.attr, + NULL, 
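Every attribute above repeats one idiom: a show callback that formats a single uv_info field with sysfs_emit(), paired with a read-only kobj_attribute. A reduced kernel-style sketch of the pattern, with invented names and a registration step like the one in uv_info_init() below:

/* Sketch only: demo_value, demo_kobj and the file name are invented. */
static unsigned long demo_value;

static ssize_t demo_value_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", demo_value);
}

static struct kobj_attribute demo_value_attr =
	__ATTR(demo_value, 0444, demo_value_show, NULL);

static int __init demo_sysfs_init(void)
{
	struct kobject *demo_kobj;

	demo_kobj = kobject_create_and_add("demo", firmware_kobj);
	if (!demo_kobj)
		return -ENOMEM;
	/* creates a read-only /sys/firmware/demo/demo_value */
	return sysfs_create_file(demo_kobj, &demo_value_attr.attr);
}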
+}; + +static struct kset *uv_query_kset; +static struct kobject *uv_kobj; + +static int __init uv_info_init(void) +{ + int rc = -ENOMEM; + + if (!test_facility(158)) + return 0; + + uv_kobj = kobject_create_and_add("uv", firmware_kobj); + if (!uv_kobj) + return -ENOMEM; + + rc = sysfs_create_files(uv_kobj, uv_prot_virt_attrs); + if (rc) + goto out_kobj; + + uv_query_kset = kset_create_and_add("query", NULL, uv_kobj); + if (!uv_query_kset) { + rc = -ENOMEM; + goto out_ind_files; + } + + rc = sysfs_create_group(&uv_query_kset->kobj, &uv_query_attr_group); + if (!rc) + return 0; + + kset_unregister(uv_query_kset); +out_ind_files: + sysfs_remove_files(uv_kobj, uv_prot_virt_attrs); +out_kobj: + kobject_del(uv_kobj); + kobject_put(uv_kobj); + return rc; +} +device_initcall(uv_info_init); +#endif diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index bcc9bdb39ba2..bbaefd84f15e 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -6,269 +6,253 @@ * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) */ -#include <linux/init.h> +#include <linux/binfmts.h> +#include <linux/compat.h> +#include <linux/elf.h> #include <linux/errno.h> -#include <linux/sched.h> +#include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/smp.h> -#include <linux/stddef.h> -#include <linux/unistd.h> #include <linux/slab.h> -#include <linux/user.h> -#include <linux/elf.h> -#include <linux/security.h> -#include <linux/memblock.h> -#include <linux/compat.h> -#include <asm/asm-offsets.h> -#include <asm/pgtable.h> -#include <asm/processor.h> -#include <asm/mmu.h> -#include <asm/mmu_context.h> -#include <asm/sections.h> +#include <linux/smp.h> +#include <linux/time_namespace.h> +#include <linux/random.h> +#include <vdso/datapage.h> #include <asm/vdso.h> -#include <asm/facility.h> -extern char vdso64_start, vdso64_end; -static void *vdso64_kbase = &vdso64_start; -static unsigned int vdso64_pages; -static struct page **vdso64_pagelist; +extern char vdso64_start[], vdso64_end[]; +extern char vdso32_start[], vdso32_end[]; -/* - * Should the kernel map a VDSO page into processes and pass its - * address down to glibc upon exec()? 
- */ -unsigned int __read_mostly vdso_enabled = 1; - -static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page **vdso_pagelist; - unsigned long vdso_pages; - - vdso_pagelist = vdso64_pagelist; - vdso_pages = vdso64_pages; - - if (vmf->pgoff >= vdso_pages) - return VM_FAULT_SIGBUS; +static struct vm_special_mapping vvar_mapping; - vmf->page = vdso_pagelist[vmf->pgoff]; - get_page(vmf->page); - return 0; -} - -static int vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *vma) -{ - unsigned long vdso_pages; - - vdso_pages = vdso64_pages; - - if ((vdso_pages << PAGE_SHIFT) != vma->vm_end - vma->vm_start) - return -EINVAL; +static union { + struct vdso_data data[CS_BASES]; + u8 page[PAGE_SIZE]; +} vdso_data_store __page_aligned_data; - if (WARN_ON_ONCE(current->mm != vma->vm_mm)) - return -EFAULT; +struct vdso_data *vdso_data = vdso_data_store.data; - current->mm->context.vdso_base = vma->vm_start; - return 0; -} - -static const struct vm_special_mapping vdso_mapping = { - .name = "[vdso]", - .fault = vdso_fault, - .mremap = vdso_mremap, +enum vvar_pages { + VVAR_DATA_PAGE_OFFSET, + VVAR_TIMENS_PAGE_OFFSET, + VVAR_NR_PAGES, }; -static int __init vdso_setup(char *str) +#ifdef CONFIG_TIME_NS +struct vdso_data *arch_get_vdso_data(void *vvar_page) { - bool enabled; - - if (!kstrtobool(str, &enabled)) - vdso_enabled = enabled; - return 1; + return (struct vdso_data *)(vvar_page); } -__setup("vdso=", vdso_setup); - -/* - * The vdso data page - */ -static union { - struct vdso_data data; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = &vdso_data_store.data; /* - * Setup vdso data page. + * The VVAR page layout depends on whether a task belongs to the root or + * non-root time namespace. Whenever a task changes its namespace, the VVAR + * page tables are cleared and then they will be re-faulted with a + * corresponding layout. + * See also the comment near timens_setup_vdso_data() for details. */ -static void __init vdso_init_data(struct vdso_data *vd) +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { - vd->ectg_available = test_facility(31); -} - -/* - * Allocate/free per cpu vdso data. - */ -#define SEGMENT_ORDER 2 + struct mm_struct *mm = task->mm; + VMA_ITERATOR(vmi, mm, 0); + struct vm_area_struct *vma; -/* - * The initial vdso_data structure for the boot CPU. Eventually - * it is replaced with a properly allocated structure in vdso_init. - * This is necessary because a valid S390_lowcore.vdso_per_cpu_data - * pointer is required to be able to return from an interrupt or - * program check. See the exit paths in entry.S. - */ -struct vdso_data boot_vdso_data __initdata; + mmap_read_lock(mm); + for_each_vma(vmi, vma) { + if (!vma_is_special_mapping(vma, &vvar_mapping)) + continue; + zap_vma_pages(vma); + break; + } + mmap_read_unlock(mm); + return 0; +} +#endif -void __init vdso_alloc_boot_cpu(struct lowcore *lowcore) +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) { - lowcore->vdso_per_cpu_data = (unsigned long) &boot_vdso_data; + struct page *timens_page = find_timens_vvar_page(vma); + unsigned long addr, pfn; + vm_fault_t err; + + switch (vmf->pgoff) { + case VVAR_DATA_PAGE_OFFSET: + pfn = virt_to_pfn(vdso_data); + if (timens_page) { + /* + * Fault in VVAR page too, since it will be accessed + * to get clock data anyway. 
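The fault handler that follows dispatches on the page offset within the [vvar] area, and which physical page backs which offset depends on whether the task is in a time namespace. A condensed model of that dispatch (demo types and names, the real code returns pfns via vmf_insert_pfn()):

#include <stddef.h>

enum demo_vvar_pages { DEMO_DATA_PAGE, DEMO_TIMENS_PAGE };

static const void *demo_pick_page(int pgoff, const void *vdso_data,
				  const void *timens_page)
{
	switch (pgoff) {
	case DEMO_DATA_PAGE:
		/* root ns: real data; timens task: the namespace page */
		return timens_page ? timens_page : vdso_data;
	case DEMO_TIMENS_PAGE:
		/* only meaningful inside a time namespace */
		return timens_page ? vdso_data : NULL;	/* NULL -> SIGBUS */
	default:
		return NULL;
	}
}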
+ */ + addr = vmf->address + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE; + err = vmf_insert_pfn(vma, addr, pfn); + if (unlikely(err & VM_FAULT_ERROR)) + return err; + pfn = page_to_pfn(timens_page); + } + break; +#ifdef CONFIG_TIME_NS + case VVAR_TIMENS_PAGE_OFFSET: + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET + * offset. + * See also the comment near timens_setup_vdso_data(). + */ + if (!timens_page) + return VM_FAULT_SIGBUS; + pfn = virt_to_pfn(vdso_data); + break; +#endif /* CONFIG_TIME_NS */ + default: + return VM_FAULT_SIGBUS; + } + return vmf_insert_pfn(vma, vmf->address, pfn); } -int vdso_alloc_per_cpu(struct lowcore *lowcore) +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *vma) { - unsigned long segment_table, page_table, page_frame; - struct vdso_per_cpu_data *vd; - - segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER); - page_table = get_zeroed_page(GFP_KERNEL); - page_frame = get_zeroed_page(GFP_KERNEL); - if (!segment_table || !page_table || !page_frame) - goto out; - arch_set_page_dat(virt_to_page(segment_table), SEGMENT_ORDER); - arch_set_page_dat(virt_to_page(page_table), 0); - - /* Initialize per-cpu vdso data page */ - vd = (struct vdso_per_cpu_data *) page_frame; - vd->cpu_nr = lowcore->cpu_nr; - vd->node_id = cpu_to_node(vd->cpu_nr); - - /* Set up page table for the vdso address space */ - memset64((u64 *)segment_table, _SEGMENT_ENTRY_EMPTY, _CRST_ENTRIES); - memset64((u64 *)page_table, _PAGE_INVALID, PTRS_PER_PTE); - - *(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table; - *(unsigned long *) page_table = _PAGE_PROTECT + page_frame; - - lowcore->vdso_asce = segment_table + - _ASCE_TABLE_LENGTH + _ASCE_USER_BITS + _ASCE_TYPE_SEGMENT; - lowcore->vdso_per_cpu_data = page_frame; - + current->mm->context.vdso_base = vma->vm_start; return 0; - -out: - free_page(page_frame); - free_page(page_table); - free_pages(segment_table, SEGMENT_ORDER); - return -ENOMEM; } -void vdso_free_per_cpu(struct lowcore *lowcore) -{ - unsigned long segment_table, page_table, page_frame; +static struct vm_special_mapping vvar_mapping = { + .name = "[vvar]", + .fault = vvar_fault, +}; - segment_table = lowcore->vdso_asce & PAGE_MASK; - page_table = *(unsigned long *) segment_table; - page_frame = *(unsigned long *) page_table; +static struct vm_special_mapping vdso64_mapping = { + .name = "[vdso]", + .mremap = vdso_mremap, +}; + +static struct vm_special_mapping vdso32_mapping = { + .name = "[vdso]", + .mremap = vdso_mremap, +}; - free_page(page_frame); - free_page(page_table); - free_pages(segment_table, SEGMENT_ORDER); +int vdso_getcpu_init(void) +{ + set_tod_programmable_field(smp_processor_id()); + return 0; } +early_initcall(vdso_getcpu_init); /* Must be called before SMP init */ -/* - * This is called from binfmt_elf, we create the special vma for the - * vDSO and insert it into the mm struct tree - */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) { + unsigned long vvar_start, vdso_text_start, vdso_text_len; + struct vm_special_mapping *vdso_mapping; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long vdso_pages; - unsigned long vdso_base; int rc; - if (!vdso_enabled) - return 0; - - if (is_compat_task()) - return 0; - - vdso_pages = vdso64_pages; - /* - * vDSO has a 
problem and was disabled, just don't "enable" it for - * the process - */ - if (vdso_pages == 0) - return 0; - - /* - * pick a base address for the vDSO in process space. We try to put - * it at vdso_base which is the "natural" base for it, but we might - * fail and end up putting it elsewhere. - */ - if (down_write_killable(&mm->mmap_sem)) + BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + if (mmap_write_lock_killable(mm)) return -EINTR; - vdso_base = get_unmapped_area(NULL, 0, vdso_pages << PAGE_SHIFT, 0, 0); - if (IS_ERR_VALUE(vdso_base)) { - rc = vdso_base; - goto out_up; - } - /* - * our vma flags don't have VM_WRITE so by default, the process - * isn't allowed to write those pages. - * gdb can break that with ptrace interface, and thus trigger COW - * on those pages but it's then your responsibility to never do that - * on the "data" page of the vDSO or you'll stop getting kernel - * updates and your nice userland gettimeofday will be totally dead. - * It's fine to use that for setting breakpoints in the vDSO code - * pages though. - */ - vma = _install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, + if (is_compat_task()) { + vdso_text_len = vdso32_end - vdso32_start; + vdso_mapping = &vdso32_mapping; + } else { + vdso_text_len = vdso64_end - vdso64_start; + vdso_mapping = &vdso64_mapping; + } + vvar_start = get_unmapped_area(NULL, addr, vdso_mapping_len, 0, 0); + rc = vvar_start; + if (IS_ERR_VALUE(vvar_start)) + goto out; + vma = _install_special_mapping(mm, vvar_start, VVAR_NR_PAGES*PAGE_SIZE, + VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| + VM_PFNMAP, + &vvar_mapping); + rc = PTR_ERR(vma); + if (IS_ERR(vma)) + goto out; + vdso_text_start = vvar_start + VVAR_NR_PAGES * PAGE_SIZE; + /* VM_MAYWRITE for COW so gdb can set breakpoints */ + vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &vdso_mapping); + vdso_mapping); if (IS_ERR(vma)) { + do_munmap(mm, vvar_start, PAGE_SIZE, NULL); rc = PTR_ERR(vma); - goto out_up; + } else { + current->mm->context.vdso_base = vdso_text_start; + rc = 0; } +out: + mmap_write_unlock(mm); + return rc; +} - current->mm->context.vdso_base = vdso_base; - rc = 0; +static unsigned long vdso_addr(unsigned long start, unsigned long len) +{ + unsigned long addr, end, offset; -out_up: - up_write(&mm->mmap_sem); - return rc; + /* + * Round up the start address. It can start out unaligned as a result + * of stack start randomization. + */ + start = PAGE_ALIGN(start); + + /* Round the lowest possible end address up to a PMD boundary. 
*/ + end = (start + len + PMD_SIZE - 1) & PMD_MASK; + if (end >= VDSO_BASE) + end = VDSO_BASE; + end -= len; + + if (end > start) { + offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1); + addr = start + (offset << PAGE_SHIFT); + } else { + addr = start; + } + return addr; } -static int __init vdso_init(void) +unsigned long vdso_size(void) { - int i; + unsigned long size = VVAR_NR_PAGES * PAGE_SIZE; - vdso_init_data(vdso_data); + if (is_compat_task()) + size += vdso32_end - vdso32_start; + else + size += vdso64_end - vdso64_start; + return PAGE_ALIGN(size); +} - /* Calculate the size of the 64 bit vDSO */ - vdso64_pages = ((&vdso64_end - &vdso64_start - + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + unsigned long addr = VDSO_BASE; + unsigned long size = vdso_size(); - /* Make sure pages are in the correct state */ - vdso64_pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *), - GFP_KERNEL); - BUG_ON(vdso64_pagelist == NULL); - for (i = 0; i < vdso64_pages - 1; i++) { - struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); - get_page(pg); - vdso64_pagelist[i] = pg; - } - vdso64_pagelist[vdso64_pages - 1] = virt_to_page(vdso_data); - vdso64_pagelist[vdso64_pages] = NULL; - if (vdso_alloc_per_cpu(&S390_lowcore)) - BUG(); + if (current->flags & PF_RANDOMIZE) + addr = vdso_addr(current->mm->start_stack + PAGE_SIZE, size); + return map_vdso(addr, size); +} - get_page(virt_to_page(vdso_data)); +static struct page ** __init vdso_setup_pages(void *start, void *end) +{ + int pages = (end - start) >> PAGE_SHIFT; + struct page **pagelist; + int i; + pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); + if (!pagelist) + panic("%s: Cannot allocate page list for VDSO", __func__); + for (i = 0; i < pages; i++) + pagelist[i] = virt_to_page(start + i * PAGE_SIZE); + return pagelist; +} + +static int __init vdso_init(void) +{ + vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end); + if (IS_ENABLED(CONFIG_COMPAT)) + vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end); return 0; } -early_initcall(vdso_init); +arch_initcall(vdso_init); diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso32/.gitignore new file mode 100644 index 000000000000..5167384843b9 --- /dev/null +++ b/arch/s390/kernel/vdso32/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +vdso32.lds diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile new file mode 100644 index 000000000000..caec7db6f966 --- /dev/null +++ b/arch/s390/kernel/vdso32/Makefile @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: GPL-2.0 +# List of files in the vdso + +KCOV_INSTRUMENT := n + +# Include the generic Makefile to check the built vdso. 
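The vdso_addr() logic shown above can be modeled in user space: align the stack-derived start, round the end up to a PMD boundary, clamp it to the fixed upper bound, then pick a random page-aligned slot in between. In this sketch PMD_SIZE and VDSO_BASE are made-up demo values, and rand() stands in for get_random_u32_below():

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE	0x1000UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PMD_SIZE	0x100000UL		/* demo value */
#define PMD_MASK	(~(PMD_SIZE - 1))
#define VDSO_BASE	0x20000000000UL		/* demo upper bound */

static unsigned long demo_vdso_addr(unsigned long start, unsigned long len)
{
	unsigned long addr, end, offset;

	start = (start + PAGE_SIZE - 1) & PAGE_MASK;	/* PAGE_ALIGN() */
	end = (start + len + PMD_SIZE - 1) & PMD_MASK;
	if (end >= VDSO_BASE)
		end = VDSO_BASE;
	end -= len;

	if (end > start) {
		offset = rand() % (((end - start) >> 12) + 1);
		addr = start + (offset << 12);	/* 12 == demo PAGE_SHIFT */
	} else {
		addr = start;
	}
	return addr;
}

int main(void)
{
	printf("vdso at %#lx\n", demo_vdso_addr(0x3ffff000000UL, 5 * PAGE_SIZE));
	return 0;
}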
+include $(srctree)/lib/vdso/Makefile +obj-vdso32 = vdso_user_wrapper-32.o note-32.o + +# Build rules + +targets := $(obj-vdso32) vdso32.so vdso32.so.dbg +obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) + +KBUILD_AFLAGS += -DBUILD_VDSO +KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING + +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS_32 += -m31 -s + +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin + +LDFLAGS_vdso32.so.dbg += -fPIC -shared -soname=linux-vdso32.so.1 \ + --hash-style=both --build-id=sha1 -melf_s390 -T + +$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) +$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) + +obj-y += vdso32_wrapper.o +targets += vdso32.lds +CPPFLAGS_vdso32.lds += -P -C -U$(ARCH) + +# Disable gcov profiling, ubsan and kasan for VDSO code +GCOV_PROFILE := n +UBSAN_SANITIZE := n +KASAN_SANITIZE := n +KCSAN_SANITIZE := n + +# Force dependency (incbin is bad) +$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so + +quiet_cmd_vdso_and_check = VDSO $@ + cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) + +$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE + $(call if_changed,vdso_and_check) + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +$(obj-vdso32): %-32.o: %.S FORCE + $(call if_changed_dep,vdso32as) + +# actual build commands +quiet_cmd_vdso32as = VDSO32A $@ + cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdso32cc = VDSO32C $@ + cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $< + +# Generate VDSO offsets using helper script +gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE + $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh new file mode 100755 index 000000000000..9c4f951e227d --- /dev/null +++ b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh @@ -0,0 +1,15 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# +# Match symbols in the DSO that look like VDSO_*; produce a header file +# of constant offsets into the shared object. +# +# Doing this inside the Makefile will break the $(filter-out) function, +# causing Kbuild to rebuild the vdso-offsets header file every time. +# +# Inspired by arm64 version. +# + +LC_ALL=C +sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso32/note.S new file mode 100644 index 000000000000..db19d0680a0a --- /dev/null +++ b/arch/s390/kernel/vdso32/note.S @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. 
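For context on the VDSOSYM step above: the sed rule in gen_vdso_offsets.sh turns nm output such as "000003e4 T __kernel_compat_sigreturn" into a header line like the first define below (the address here is invented). A consumer can then add the offset to the mapped vdso base; the accessor macro shown is a hypothetical stand-in, not necessarily the kernel's actual helper.

#define vdso32_offset_sigreturn	0x3e4	/* sample generated output */

#define DEMO_VDSO32_SYMBOL(base, name) \
	((unsigned long)(base) + vdso32_offset_##name)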
+ */ + +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S new file mode 100644 index 000000000000..edf5ff1debe1 --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso32.lds.S @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This is the infamous ld script for the 64 bits vdso + * library + */ + +#include <asm/page.h> +#include <asm/vdso.h> + +OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") +OUTPUT_ARCH(s390:31-bit) +ENTRY(_start) + +SECTIONS +{ + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); +#endif + . = VDSO_LBASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + . = ALIGN(16); + .text : { + *(.text .stub .text.* .gnu.linkonce.t.*) + } :text + PROVIDE(__etext = .); + PROVIDE(_etext = .); + PROVIDE(etext = .); + + /* + * Other stuff is appended to the text segment: + */ + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + + .dynamic : { *(.dynamic) } :text :dynamic + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) } + + .rela.dyn ALIGN(8) : { *(.rela.dyn) } + .got ALIGN(8) : { *(.got .toc) } + .got.plt ALIGN(8) : { *(.got.plt) } + + _end = .; + PROVIDE(end = .); + + /* + * Stabs debugging sections are here too. + */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + + /* + * DWARF debug sections. + * Symbols in the DWARF debugging sections are relative to the + * beginning of the section so we begin them at 0. + */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3 */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } + + /DISCARD/ : { + *(.note.GNU-stack) + *(.branch_lt) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. 
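The constant defined just below for old linkers can be cross-checked against the official ELF definition with a trivial user-space program (glibc's elf.h assumed):

#include <elf.h>
#include <stdio.h>

int main(void)
{
	/* Prints 6474e550, matching the hard-coded linker-script value. */
	printf("PT_GNU_EH_FRAME = %x\n", PT_GNU_EH_FRAME);
	return 0;
}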
+ */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + VDSO_VERSION_STRING { + global: + /* + * Has to be there for the kernel to find + */ + __kernel_compat_restart_syscall; + __kernel_compat_rt_sigreturn; + __kernel_compat_sigreturn; + local: *; + }; +} diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S new file mode 100644 index 000000000000..de2fb930471a --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso32_wrapper.S @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/page.h> + + __PAGE_ALIGNED_DATA + + .globl vdso32_start, vdso32_end + .balign PAGE_SIZE +vdso32_start: + .incbin "arch/s390/kernel/vdso32/vdso32.so" + .balign PAGE_SIZE +vdso32_end: + + .previous diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S new file mode 100644 index 000000000000..2e645003fdaf --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso_user_wrapper.S @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/linkage.h> +#include <asm/unistd.h> +#include <asm/dwarf.h> + +.macro vdso_syscall func,syscall + .globl __kernel_compat_\func + .type __kernel_compat_\func,@function + __ALIGN +__kernel_compat_\func: + CFI_STARTPROC + svc \syscall + /* Make sure we notice when a syscall returns, which shouldn't happen */ + .word 0 + CFI_ENDPROC + .size __kernel_compat_\func,.-__kernel_compat_\func +.endm + +vdso_syscall restart_syscall,__NR_restart_syscall +vdso_syscall sigreturn,__NR_sigreturn +vdso_syscall rt_sigreturn,__NR_rt_sigreturn diff --git a/arch/s390/kernel/vdso64/.gitignore b/arch/s390/kernel/vdso64/.gitignore index 3fd18cf9fec2..4ec80685fecc 100644 --- a/arch/s390/kernel/vdso64/.gitignore +++ b/arch/s390/kernel/vdso64/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only vdso64.lds diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index bec19e7e6e1c..e3c9085f8fa7 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -1,44 +1,56 @@ # SPDX-License-Identifier: GPL-2.0 -# List of files in the vdso, has to be asm only for now +# List of files in the vdso KCOV_INSTRUMENT := n -obj-vdso64 = gettimeofday.o clock_getres.o clock_gettime.o note.o getcpu.o +# Include the generic Makefile to check the built vdso. 
+include $(srctree)/lib/vdso/Makefile +obj-vdso64 = vdso_user_wrapper.o note.o +obj-cvdso64 = vdso64_generic.o getcpu.o +VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) $(CC_FLAGS_CHECK_STACK) +CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE) +CFLAGS_REMOVE_vdso64_generic.o = $(VDSO_CFLAGS_REMOVE) # Build rules -targets := $(obj-vdso64) vdso64.so vdso64.so.dbg +targets := $(obj-vdso64) $(obj-cvdso64) vdso64.so vdso64.so.dbg obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) +obj-cvdso64 := $(addprefix $(obj)/, $(obj-cvdso64)) KBUILD_AFLAGS += -DBUILD_VDSO -KBUILD_CFLAGS += -DBUILD_VDSO +KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS)) -KBUILD_AFLAGS_64 += -m64 -s +KBUILD_AFLAGS_64 += -m64 KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin -KBUILD_CFLAGS_64 += -nostdlib -Wl,-soname=linux-vdso64.so.1 \ - -Wl,--hash-style=both +KBUILD_CFLAGS_64 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_64)) +KBUILD_CFLAGS_64 += -m64 -fPIC -fno-common -fno-builtin +ldflags-y := -fPIC -shared -soname=linux-vdso64.so.1 \ + --hash-style=both --build-id=sha1 -T $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64) $(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64) obj-y += vdso64_wrapper.o -extra-y += vdso64.lds +targets += vdso64.lds CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) # Disable gcov profiling, ubsan and kasan for VDSO code GCOV_PROFILE := n UBSAN_SANITIZE := n KASAN_SANITIZE := n +KCSAN_SANITIZE := n # Force dependency (incbin is bad) $(obj)/vdso64_wrapper.o : $(obj)/vdso64.so +quiet_cmd_vdso_and_check = VDSO $@ + cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) + # link rule for the .so file, .lds has to be first -$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) FORCE - $(call if_changed,vdso64ld) +$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) $(obj-cvdso64) FORCE + $(call if_changed,vdso_and_check) # strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S @@ -49,18 +61,19 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE $(obj-vdso64): %.o: %.S FORCE $(call if_changed_dep,vdso64as) +$(obj-cvdso64): %.o: %.c FORCE + $(call if_changed_dep,vdso64cc) + # actual build commands -quiet_cmd_vdso64ld = VDSO64L $@ - cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $(filter %.lds %.o,$^) -o $@ quiet_cmd_vdso64as = VDSO64A $@ cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdso64cc = VDSO64C $@ + cmd_vdso64cc = $(CC) $(c_flags) -c -o $@ $< -# install commands for the unstripped file -quiet_cmd_vdso_install = INSTALL $@ - cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ - -vdso64.so: $(obj)/vdso64.so.dbg - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) +# Generate VDSO offsets using helper script +gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ -vdso_install: vdso64.so +include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE + $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso64/clock_getres.S b/arch/s390/kernel/vdso64/clock_getres.S deleted file mode 100644 index 081435398e0a..000000000000 --- a/arch/s390/kernel/vdso64/clock_getres.S +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Userland implementation of clock_getres() for 64 bits processes in a - * s390 kernel for use in the vDSO - * - * Copyright IBM Corp. 
2008 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ -#include <asm/vdso.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> -#include <asm/dwarf.h> - - .text - .align 4 - .globl __kernel_clock_getres - .type __kernel_clock_getres,@function -__kernel_clock_getres: - CFI_STARTPROC - larl %r1,4f - cghi %r2,__CLOCK_REALTIME_COARSE - je 0f - cghi %r2,__CLOCK_MONOTONIC_COARSE - je 0f - larl %r1,3f - cghi %r2,__CLOCK_REALTIME - je 0f - cghi %r2,__CLOCK_MONOTONIC - je 0f - cghi %r2,__CLOCK_THREAD_CPUTIME_ID - je 0f - cghi %r2,-2 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */ - jne 2f - larl %r5,_vdso_data - icm %r0,15,__LC_ECTG_OK(%r5) - jz 2f -0: ltgr %r3,%r3 - jz 1f /* res == NULL */ - lg %r0,0(%r1) - xc 0(8,%r3),0(%r3) /* set tp->tv_sec to zero */ - stg %r0,8(%r3) /* store tp->tv_usec */ -1: lghi %r2,0 - br %r14 -2: lghi %r1,__NR_clock_getres /* fallback to svc */ - svc 0 - br %r14 - CFI_ENDPROC -3: .quad __CLOCK_REALTIME_RES -4: .quad __CLOCK_COARSE_RES - .size __kernel_clock_getres,.-__kernel_clock_getres diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S deleted file mode 100644 index 9d2ee79b90f2..000000000000 --- a/arch/s390/kernel/vdso64/clock_gettime.S +++ /dev/null @@ -1,163 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Userland implementation of clock_gettime() for 64 bits processes in a - * s390 kernel for use in the vDSO - * - * Copyright IBM Corp. 2008 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ -#include <asm/vdso.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> -#include <asm/dwarf.h> -#include <asm/ptrace.h> - - .text - .align 4 - .globl __kernel_clock_gettime - .type __kernel_clock_gettime,@function -__kernel_clock_gettime: - CFI_STARTPROC - aghi %r15,-16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD - larl %r5,_vdso_data - cghi %r2,__CLOCK_REALTIME_COARSE - je 4f - cghi %r2,__CLOCK_REALTIME - je 5f - cghi %r2,-3 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */ - je 9f - cghi %r2,__CLOCK_MONOTONIC_COARSE - je 3f - cghi %r2,__CLOCK_MONOTONIC - jne 12f - - /* CLOCK_MONOTONIC */ -0: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? loop */ - jnz 0b - stcke 0(%r15) /* Store TOD clock */ - lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ - lg %r0,__VDSO_WTOM_SEC(%r5) - lg %r1,1(%r15) - sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ - msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ - alg %r1,__VDSO_WTOM_NSEC(%r5) - srlg %r1,%r1,0(%r2) /* >> tk->shift */ - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 0b - larl %r5,13f -1: clg %r1,0(%r5) - jl 2f - slg %r1,0(%r5) - aghi %r0,1 - j 1b -2: stg %r0,0(%r3) /* store tp->tv_sec */ - stg %r1,8(%r3) /* store tp->tv_nsec */ - lghi %r2,0 - aghi %r15,16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD - CFI_RESTORE 15 - br %r14 - - /* CLOCK_MONOTONIC_COARSE */ - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD -3: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? loop */ - jnz 3b - lg %r0,__VDSO_WTOM_CRS_SEC(%r5) - lg %r1,__VDSO_WTOM_CRS_NSEC(%r5) - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 3b - j 2b - - /* CLOCK_REALTIME_COARSE */ -4: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? 
loop */ - jnz 4b - lg %r0,__VDSO_XTIME_CRS_SEC(%r5) - lg %r1,__VDSO_XTIME_CRS_NSEC(%r5) - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 4b - j 7f - - /* CLOCK_REALTIME */ -5: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? loop */ - jnz 5b - stcke 0(%r15) /* Store TOD clock */ - lg %r1,1(%r15) - lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */ - slgr %r0,%r1 /* now - ts_steering_end */ - ltgr %r0,%r0 /* past end of steering ? */ - jm 17f - srlg %r0,%r0,15 /* 1 per 2^16 */ - tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */ - jz 18f - lcgr %r0,%r0 /* negative TOD offset */ -18: algr %r1,%r0 /* add steering offset */ -17: lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ - sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ - msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ - alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */ - srlg %r1,%r1,0(%r2) /* >> tk->shift */ - lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */ - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 5b - larl %r5,13f -6: clg %r1,0(%r5) - jl 7f - slg %r1,0(%r5) - aghi %r0,1 - j 6b -7: stg %r0,0(%r3) /* store tp->tv_sec */ - stg %r1,8(%r3) /* store tp->tv_nsec */ - lghi %r2,0 - aghi %r15,16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD - CFI_RESTORE 15 - br %r14 - - /* CPUCLOCK_VIRT for this thread */ - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD -9: lghi %r4,0 - icm %r0,15,__VDSO_ECTG_OK(%r5) - jz 12f - sacf 256 /* Magic ectg instruction */ - .insn ssf,0xc80100000000,__VDSO_ECTG_BASE(4),__VDSO_ECTG_USER(4),4 - sacf 0 - algr %r1,%r0 /* r1 = cputime as TOD value */ - mghi %r1,1000 /* convert to nanoseconds */ - srlg %r1,%r1,12 /* r1 = cputime in nanosec */ - lgr %r4,%r1 - larl %r5,13f - srlg %r1,%r1,9 /* divide by 1000000000 */ - mlg %r0,8(%r5) - srlg %r0,%r0,11 /* r0 = tv_sec */ - stg %r0,0(%r3) - msg %r0,0(%r5) /* calculate tv_nsec */ - slgr %r4,%r0 /* r4 = tv_nsec */ - stg %r4,8(%r3) - lghi %r2,0 - aghi %r15,16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD - CFI_RESTORE 15 - br %r14 - - /* Fallback to system call */ - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD -12: lghi %r1,__NR_clock_gettime - svc 0 - aghi %r15,16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD - CFI_RESTORE 15 - br %r14 - CFI_ENDPROC - -13: .quad 1000000000 -14: .quad 19342813113834067 - .size __kernel_clock_gettime,.-__kernel_clock_gettime diff --git a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh new file mode 100755 index 000000000000..37f05cb38dad --- /dev/null +++ b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh @@ -0,0 +1,15 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# +# Match symbols in the DSO that look like VDSO_*; produce a header file +# of constant offsets into the shared object. +# +# Doing this inside the Makefile will break the $(filter-out) function, +# causing Kbuild to rebuild the vdso-offsets header file every time. +# +# Inspired by arm64 version. +# + +LC_ALL=C +sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso64/getcpu.S b/arch/s390/kernel/vdso64/getcpu.S deleted file mode 100644 index 3c04f7328500..000000000000 --- a/arch/s390/kernel/vdso64/getcpu.S +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Userland implementation of getcpu() for 64 bits processes in a - * s390 kernel for use in the vDSO - * - * Copyright IBM Corp. 
2016 - * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> - */ -#include <asm/vdso.h> -#include <asm/asm-offsets.h> -#include <asm/dwarf.h> - - .text - .align 4 - .globl __kernel_getcpu - .type __kernel_getcpu,@function -__kernel_getcpu: - CFI_STARTPROC - sacf 256 - lm %r4,%r5,__VDSO_GETCPU_VAL(%r0) - sacf 0 - ltgr %r2,%r2 - jz 2f - st %r5,0(%r2) -2: ltgr %r3,%r3 - jz 3f - st %r4,0(%r3) -3: lghi %r2,0 - br %r14 - CFI_ENDPROC - .size __kernel_getcpu,.-__kernel_getcpu diff --git a/arch/s390/kernel/vdso64/getcpu.c b/arch/s390/kernel/vdso64/getcpu.c new file mode 100644 index 000000000000..5c5d4a848b76 --- /dev/null +++ b/arch/s390/kernel/vdso64/getcpu.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright IBM Corp. 2020 */ + +#include <linux/compiler.h> +#include <linux/getcpu.h> +#include <asm/timex.h> +#include "vdso.h" + +int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +{ + union tod_clock clk; + + /* CPU number is stored in the programmable field of the TOD clock */ + store_tod_clock_ext(&clk); + if (cpu) + *cpu = clk.pf; + /* NUMA node is always zero */ + if (node) + *node = 0; + return 0; +} diff --git a/arch/s390/kernel/vdso64/gettimeofday.S b/arch/s390/kernel/vdso64/gettimeofday.S deleted file mode 100644 index aebe10dc7c99..000000000000 --- a/arch/s390/kernel/vdso64/gettimeofday.S +++ /dev/null @@ -1,71 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Userland implementation of gettimeofday() for 64 bits processes in a - * s390 kernel for use in the vDSO - * - * Copyright IBM Corp. 2008 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ -#include <asm/vdso.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> -#include <asm/dwarf.h> -#include <asm/ptrace.h> - - .text - .align 4 - .globl __kernel_gettimeofday - .type __kernel_gettimeofday,@function -__kernel_gettimeofday: - CFI_STARTPROC - aghi %r15,-16 - CFI_ADJUST_CFA_OFFSET 16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD - larl %r5,_vdso_data -0: ltgr %r3,%r3 /* check if tz is NULL */ - je 1f - mvc 0(8,%r3),__VDSO_TIMEZONE(%r5) -1: ltgr %r2,%r2 /* check if tv is NULL */ - je 4f - lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? loop */ - jnz 0b - stcke 0(%r15) /* Store TOD clock */ - lg %r1,1(%r15) - lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */ - slgr %r0,%r1 /* now - ts_steering_end */ - ltgr %r0,%r0 /* past end of steering ? */ - jm 6f - srlg %r0,%r0,15 /* 1 per 2^16 */ - tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? 
*/ - jz 7f - lcgr %r0,%r0 /* negative TOD offset */ -7: algr %r1,%r0 /* add steering offset */ -6: sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ - msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ - alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */ - lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */ - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 0b - lgf %r5,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ - srlg %r1,%r1,0(%r5) /* >> tk->shift */ - larl %r5,5f -2: clg %r1,0(%r5) - jl 3f - slg %r1,0(%r5) - aghi %r0,1 - j 2b -3: stg %r0,0(%r2) /* store tv->tv_sec */ - slgr %r0,%r0 /* tv_nsec -> tv_usec */ - ml %r0,8(%r5) - srlg %r0,%r0,6 - stg %r0,8(%r2) /* store tv->tv_usec */ -4: lghi %r2,0 - aghi %r15,16 - CFI_ADJUST_CFA_OFFSET -16 - CFI_RESTORE 15 - br %r14 - CFI_ENDPROC -5: .quad 1000000000 - .long 274877907 - .size __kernel_gettimeofday,.-__kernel_gettimeofday diff --git a/arch/s390/kernel/vdso64/vdso.h b/arch/s390/kernel/vdso64/vdso.h new file mode 100644 index 000000000000..34c7a2312f9d --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_S390_KERNEL_VDSO64_VDSO_H +#define __ARCH_S390_KERNEL_VDSO64_VDSO_H + +#include <vdso/datapage.h> + +struct getcpu_cache; + +int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused); +int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); +int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); +int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts); + +#endif /* __ARCH_S390_KERNEL_VDSO64_VDSO_H */ diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index 7ddb116b5e2e..4461ea151e49 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -13,7 +13,11 @@ ENTRY(_start) SECTIONS { - . = VDSO64_LBASE + SIZEOF_HEADERS; + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); +#endif + . = VDSO_LBASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -47,6 +51,7 @@ SECTIONS .rela.dyn ALIGN(8) : { *(.rela.dyn) } .got ALIGN(8) : { *(.got .toc) } + .got.plt ALIGN(8) : { *(.got.plt) } _end = .; PROVIDE(end = .); @@ -94,9 +99,6 @@ SECTIONS .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } - . 
= ALIGN(PAGE_SIZE); - PROVIDE(_vdso_data = .); - /DISCARD/ : { *(.note.GNU-stack) *(.branch_lt) @@ -136,7 +138,9 @@ VERSION __kernel_clock_gettime; __kernel_clock_getres; __kernel_getcpu; - + __kernel_restart_syscall; + __kernel_rt_sigreturn; + __kernel_sigreturn; local: *; }; } diff --git a/arch/s390/kernel/vdso64/vdso64_generic.c b/arch/s390/kernel/vdso64/vdso64_generic.c new file mode 100644 index 000000000000..a9aa75643c08 --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso64_generic.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../../../../lib/vdso/gettimeofday.c" +#include "vdso.h" + +int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, + struct timezone *tz) +{ + return __cvdso_gettimeofday(tv, tz); +} + +int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) +{ + return __cvdso_clock_gettime(clock, ts); +} + +int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts) +{ + return __cvdso_clock_getres(clock, ts); +} diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S new file mode 100644 index 000000000000..57f62596e53b --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/vdso.h> +#include <asm/unistd.h> +#include <asm/asm-offsets.h> +#include <asm/dwarf.h> +#include <asm/ptrace.h> + +#define WRAPPER_FRAME_SIZE (STACK_FRAME_OVERHEAD+8) + +/* + * Older glibc version called vdso without allocating a stackframe. This wrapper + * is just used to allocate a stackframe. See + * https://sourceware.org/git/?p=glibc.git;a=commit;h=478593e6374f3818da39332260dc453cb19cfa1e + * for details. + */ +.macro vdso_func func + .globl __kernel_\func + .type __kernel_\func,@function + __ALIGN +__kernel_\func: + CFI_STARTPROC + aghi %r15,-WRAPPER_FRAME_SIZE + CFI_DEF_CFA_OFFSET (STACK_FRAME_OVERHEAD + WRAPPER_FRAME_SIZE) + CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD + stg %r14,STACK_FRAME_OVERHEAD(%r15) + brasl %r14,__s390_vdso_\func + lg %r14,STACK_FRAME_OVERHEAD(%r15) + aghi %r15,WRAPPER_FRAME_SIZE + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD + CFI_RESTORE 15 + br %r14 + CFI_ENDPROC + .size __kernel_\func,.-__kernel_\func +.endm + +vdso_func gettimeofday +vdso_func clock_getres +vdso_func clock_gettime +vdso_func getcpu + +.macro vdso_syscall func,syscall + .globl __kernel_\func + .type __kernel_\func,@function + __ALIGN +__kernel_\func: + CFI_STARTPROC + svc \syscall + /* Make sure we notice when a syscall returns, which shouldn't happen */ + .word 0 + CFI_ENDPROC + .size __kernel_\func,.-__kernel_\func +.endm + +vdso_syscall restart_syscall,__NR_restart_syscall +vdso_syscall sigreturn,__NR_sigreturn +vdso_syscall rt_sigreturn,__NR_rt_sigreturn diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 37695499717d..e32ef446f451 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -5,16 +5,22 @@ #include <asm/thread_info.h> #include <asm/page.h> +#include <asm/ftrace.lds.h> /* * Put .bss..swapper_pg_dir as the first thing in .bss. This will * make sure it has 16k alignment. */ -#define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir) +#define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir) \ + *(.bss..invalid_pg_dir) + +#define RO_EXCEPTION_TABLE_ALIGN 16 /* Handle ro_after_init data on our own. 
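 * Defining RO_AFTER_INIT_DATA as empty keeps the generic RO_DATA macro in
 * asm-generic/vmlinux.lds.h from emitting .data..ro_after_init into the
 * read-only output section; it is placed at the start of the writable data
 * section below instead and only write-protected once init has completed.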
*/ #define RO_AFTER_INIT_DATA +#define RUNTIME_DISCARD_EXIT + #define EMITS_PT_NOTE #include <asm-generic/vmlinux.lds.h> @@ -40,11 +46,11 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT + FTRACE_HOTPATCH_TRAMPOLINES_TEXT *(.text.*_indirect_*) *(.fixup) *(.gnu.warning) @@ -63,13 +69,20 @@ SECTIONS *(.data..ro_after_init) JUMP_TABLE_DATA } :data - EXCEPTION_TABLE(16) . = ALIGN(PAGE_SIZE); __end_ro_after_init = .; RW_DATA(0x100, PAGE_SIZE, THREAD_SIZE) BOOT_DATA_PRESERVED + . = ALIGN(8); + .amode31.refs : { + _start_amode31_refs = .; + *(.amode31.refs) + _end_amode31_refs = .; + } + + . = ALIGN(PAGE_SIZE); _edata = .; /* End of data section */ /* will be freed after init */ @@ -122,6 +135,7 @@ SECTIONS /* * Table with the patch locations to undo expolines */ + . = ALIGN(4); .nospec_call_table : { __nospec_call_start = . ; *(.s390_indirect*) @@ -135,6 +149,32 @@ SECTIONS BOOT_DATA + /* + * .amode31 section for code, data, ex_table that need to stay + * below 2 GB, even when the kernel is relocated above 2 GB. + */ + . = ALIGN(PAGE_SIZE); + _samode31 = .; + .amode31.text : { + _stext_amode31 = .; + *(.amode31.text) + *(.amode31.text.*_indirect_*) + . = ALIGN(PAGE_SIZE); + _etext_amode31 = .; + } + . = ALIGN(16); + .amode31.ex_table : { + _start_amode31_ex_table = .; + KEEP(*(.amode31.ex_table)) + _stop_amode31_ex_table = .; + } + . = ALIGN(PAGE_SIZE); + .amode31.data : { + *(.amode31.data) + } + . = ALIGN(PAGE_SIZE); + _eamode31 = .; + /* early.c uses stsi, which requires page aligned data. */ . = ALIGN(PAGE_SIZE); INIT_DATA_SECTION(0x100) @@ -157,6 +197,7 @@ SECTIONS BSS_SECTION(PAGE_SIZE, 4 * PAGE_SIZE, PAGE_SIZE) + . = ALIGN(PAGE_SIZE); _end = . ; /* @@ -176,15 +217,28 @@ SECTIONS QUAD(__dynsym_start) /* dynsym_start */ QUAD(__rela_dyn_start) /* rela_dyn_start */ QUAD(__rela_dyn_end) /* rela_dyn_end */ + QUAD(_eamode31 - _samode31) /* amode31_size */ + QUAD(init_mm) + QUAD(swapper_pg_dir) + QUAD(invalid_pg_dir) +#ifdef CONFIG_KASAN + QUAD(kasan_early_shadow_page) + QUAD(kasan_early_shadow_pte) + QUAD(kasan_early_shadow_pmd) + QUAD(kasan_early_shadow_pud) + QUAD(kasan_early_shadow_p4d) +#endif } :NONE /* Debugging sections. 
*/ STABS_DEBUG DWARF_DEBUG + ELF_DETAILS /* Sections to be discarded */ DISCARDS /DISCARD/ : { *(.eh_frame) + *(.interp) } } diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 8df10d3c8f6c..e0a88dcaf5cb 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -7,13 +7,13 @@ */ #include <linux/kernel_stat.h> -#include <linux/sched/cputime.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/timex.h> #include <linux/types.h> #include <linux/time.h> - +#include <asm/alternative.h> +#include <asm/cputime.h> #include <asm/vtimer.h> #include <asm/vtime.h> #include <asm/cpu_mf.h> @@ -130,13 +130,10 @@ static int do_account_vtime(struct task_struct *tsk) clock = S390_lowcore.last_update_clock; asm volatile( " stpt %0\n" /* Store current cpu timer value */ -#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES " stckf %1" /* Store current tod clock value */ -#else - " stck %1" /* Store current tod clock value */ -#endif : "=Q" (S390_lowcore.last_update_timer), - "=Q" (S390_lowcore.last_update_clock)); + "=Q" (S390_lowcore.last_update_clock) + : : "cc"); clock = S390_lowcore.last_update_clock - clock; timer -= S390_lowcore.last_update_timer; @@ -216,41 +213,56 @@ void vtime_flush(struct task_struct *tsk) avg_steal = S390_lowcore.avg_steal_timer / 2; if ((s64) steal > 0) { S390_lowcore.steal_timer = 0; - account_steal_time(steal); + account_steal_time(cputime_to_nsecs(steal)); avg_steal += steal; } S390_lowcore.avg_steal_timer = avg_steal; } +static u64 vtime_delta(void) +{ + u64 timer = S390_lowcore.last_update_timer; + + S390_lowcore.last_update_timer = get_vtimer(); + + return timer - S390_lowcore.last_update_timer; +} + /* * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. */ -void vtime_account_irq_enter(struct task_struct *tsk) +void vtime_account_kernel(struct task_struct *tsk) { - u64 timer; + u64 delta = vtime_delta(); - timer = S390_lowcore.last_update_timer; - S390_lowcore.last_update_timer = get_vtimer(); - timer -= S390_lowcore.last_update_timer; - - if ((tsk->flags & PF_VCPU) && (irq_count() == 0)) - S390_lowcore.guest_timer += timer; - else if (hardirq_count()) - S390_lowcore.hardirq_timer += timer; - else if (in_serving_softirq()) - S390_lowcore.softirq_timer += timer; + if (tsk->flags & PF_VCPU) + S390_lowcore.guest_timer += delta; else - S390_lowcore.system_timer += timer; + S390_lowcore.system_timer += delta; - virt_timer_forward(timer); + virt_timer_forward(delta); } -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); - -void vtime_account_kernel(struct task_struct *tsk) -__attribute__((alias("vtime_account_irq_enter"))); EXPORT_SYMBOL_GPL(vtime_account_kernel); +void vtime_account_softirq(struct task_struct *tsk) +{ + u64 delta = vtime_delta(); + + S390_lowcore.softirq_timer += delta; + + virt_timer_forward(delta); +} + +void vtime_account_hardirq(struct task_struct *tsk) +{ + u64 delta = vtime_delta(); + + S390_lowcore.hardirq_timer += delta; + + virt_timer_forward(delta); +} + /* * Sorted add to a list. List is linear searched until first bigger * element is found. diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index d3db3d7ed077..72e9b7dcdf7d 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -7,7 +7,7 @@ source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION def_bool y prompt "KVM" - ---help--- + help Say Y here to get to see options for using your Linux host to run other operating systems inside virtual machines (guests). 
This option alone does not add any kernel code. @@ -20,20 +20,18 @@ config KVM def_tristate y prompt "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM - select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_VCPU_ASYNC_IOCTL - select HAVE_KVM_EVENTFD select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC + select KVM_COMMON select HAVE_KVM_IRQCHIP - select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_INVALID_WAKEUPS select HAVE_KVM_NO_POLL - select SRCU select KVM_VFIO - ---help--- + select MMU_NOTIFIER + help Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work on any 64bit machine. @@ -49,14 +47,10 @@ config KVM config KVM_S390_UCONTROL bool "Userspace controlled virtual machines" depends on KVM - ---help--- + help Allow CAP_SYS_ADMIN users to create KVM virtual machines that are controlled by userspace. If unsure, say N. -# OK, it's a little counter-intuitive to do this, but it puts it neatly under -# the virtualization menu. -source "drivers/vhost/Kconfig" - endif # VIRTUALIZATION diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 05ee90a5ea08..02217fb4ae10 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -3,12 +3,12 @@ # # Copyright IBM Corp. 2008 -KVM := ../../../virt/kvm -common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqchip.o $(KVM)/vfio.o +include $(srctree)/virt/kvm/Makefile.kvm ccflags-y := -Ivirt/kvm -Iarch/s390/kvm -kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o vsie.o +kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o +kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o +kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 3fb54ec2cf3e..3c65b8258ae6 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -2,7 +2,7 @@ /* * handling diagnose instructions * - * Copyright IBM Corp. 2008, 2011 + * Copyright IBM Corp. 
2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -10,7 +10,6 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> -#include <asm/pgalloc.h> #include <asm/gmap.h> #include <asm/virtio-ccw.h> #include "kvm-s390.h" @@ -25,7 +24,7 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + PAGE_SIZE; - vcpu->stat.diagnose_10++; + vcpu->stat.instruction_diagnose_10++; if (start & ~PAGE_MASK || end & ~PAGE_MASK || start >= end || start < 2 * PAGE_SIZE) @@ -75,7 +74,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 3, "diag page reference parameter block at 0x%llx", vcpu->run->s.regs.gprs[rx]); - vcpu->stat.diagnose_258++; + vcpu->stat.instruction_diagnose_258++; if (vcpu->run->s.regs.gprs[rx] & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm)); @@ -146,18 +145,32 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) static int __diag_time_slice_end(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 5, "%s", "diag time slice end"); - vcpu->stat.diagnose_44++; + vcpu->stat.instruction_diagnose_44++; kvm_vcpu_on_spin(vcpu, true); return 0; } +static int forward_cnt; +static unsigned long cur_slice; + +static int diag9c_forwarding_overrun(void) +{ + /* Reset the count on a new slice */ + if (time_after(jiffies, cur_slice)) { + cur_slice = jiffies; + forward_cnt = diag9c_forwarding_hz / HZ; + } + return forward_cnt-- <= 0 ? 1 : 0; +} + static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) { struct kvm_vcpu *tcpu; + int tcpu_cpu; int tid; tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; - vcpu->stat.diagnose_9c++; + vcpu->stat.instruction_diagnose_9c++; /* yield to self */ if (tid == vcpu->vcpu_id) @@ -168,9 +181,22 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) if (!tcpu) goto no_yield; - /* target already running */ - if (READ_ONCE(tcpu->cpu) >= 0) - goto no_yield; + /* target guest VCPU already running */ + tcpu_cpu = READ_ONCE(tcpu->cpu); + if (tcpu_cpu >= 0) { + if (!diag9c_forwarding_hz || diag9c_forwarding_overrun()) + goto no_yield; + + /* target host CPU already running */ + if (!vcpu_is_preempted(tcpu_cpu)) + goto no_yield; + smp_yield_cpu(tcpu_cpu); + VCPU_EVENT(vcpu, 5, + "diag time slice end directed to %d: yield forwarded", + tid); + vcpu->stat.diag_9c_forward++; + return 0; + } if (kvm_vcpu_yield_to(tcpu) <= 0) goto no_yield; @@ -179,7 +205,7 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) return 0; no_yield: VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d: ignored", tid); - vcpu->stat.diagnose_9c_ignored++; + vcpu->stat.diag_9c_ignored++; return 0; } @@ -189,7 +215,7 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) unsigned long subcode = vcpu->run->s.regs.gprs[reg] & 0xffff; VCPU_EVENT(vcpu, 3, "diag ipl functions, subcode %lx", subcode); - vcpu->stat.diagnose_308++; + vcpu->stat.instruction_diagnose_308++; switch (subcode) { case 3: vcpu->run->s390_reset_flags = KVM_S390_RESET_CLEAR; @@ -201,6 +227,10 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } + /* + * no need to check the return value of vcpu_stop as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) 
kvm_s390_vcpu_stop(vcpu); vcpu->run->s390_reset_flags |= KVM_S390_RESET_SUBSYSTEM; @@ -217,7 +247,7 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) { int ret; - vcpu->stat.diagnose_500++; + vcpu->stat.instruction_diagnose_500++; /* No virtio-ccw notification? Get out quickly. */ if (!vcpu->kvm->arch.css_support || (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY)) @@ -271,7 +301,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) case 0x500: return __diag_virtio_hypercall(vcpu); default: - vcpu->stat.diagnose_other++; + vcpu->stat.instruction_diagnose_other++; return -EOPNOTSUPP; } } diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 07d30ffcfa41..5bfcc50c1a68 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -9,8 +9,9 @@ #include <linux/vmalloc.h> #include <linux/mm_types.h> #include <linux/err.h> - -#include <asm/pgtable.h> +#include <linux/pgtable.h> +#include <linux/bitfield.h> +#include <asm/fault.h> #include <asm/gmap.h> #include "kvm-s390.h" #include "gaccess.h" @@ -261,77 +262,77 @@ struct aste { /* .. more fields there */ }; -int ipte_lock_held(struct kvm_vcpu *vcpu) +int ipte_lock_held(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) { + if (sclp.has_siif) { int rc; - read_lock(&vcpu->kvm->arch.sca_lock); - rc = kvm_s390_get_ipte_control(vcpu->kvm)->kh != 0; - read_unlock(&vcpu->kvm->arch.sca_lock); + read_lock(&kvm->arch.sca_lock); + rc = kvm_s390_get_ipte_control(kvm)->kh != 0; + read_unlock(&kvm->arch.sca_lock); return rc; } - return vcpu->kvm->arch.ipte_lock_count != 0; + return kvm->arch.ipte_lock_count != 0; } -static void ipte_lock_simple(struct kvm_vcpu *vcpu) +static void ipte_lock_simple(struct kvm *kvm) { union ipte_control old, new, *ic; - mutex_lock(&vcpu->kvm->arch.ipte_mutex); - vcpu->kvm->arch.ipte_lock_count++; - if (vcpu->kvm->arch.ipte_lock_count > 1) + mutex_lock(&kvm->arch.ipte_mutex); + kvm->arch.ipte_lock_count++; + if (kvm->arch.ipte_lock_count > 1) goto out; retry: - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); if (old.k) { - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } new = old; new.k = 1; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); out: - mutex_unlock(&vcpu->kvm->arch.ipte_mutex); + mutex_unlock(&kvm->arch.ipte_mutex); } -static void ipte_unlock_simple(struct kvm_vcpu *vcpu) +static void ipte_unlock_simple(struct kvm *kvm) { union ipte_control old, new, *ic; - mutex_lock(&vcpu->kvm->arch.ipte_mutex); - vcpu->kvm->arch.ipte_lock_count--; - if (vcpu->kvm->arch.ipte_lock_count) + mutex_lock(&kvm->arch.ipte_mutex); + kvm->arch.ipte_lock_count--; + if (kvm->arch.ipte_lock_count) goto out; - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); new = old; new.k = 0; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); - wake_up(&vcpu->kvm->arch.ipte_wq); + read_unlock(&kvm->arch.sca_lock); + wake_up(&kvm->arch.ipte_wq); out: - mutex_unlock(&vcpu->kvm->arch.ipte_mutex); + mutex_unlock(&kvm->arch.ipte_mutex); } -static void ipte_lock_siif(struct kvm_vcpu *vcpu) +static void ipte_lock_siif(struct kvm *kvm) { union ipte_control old, 
new, *ic; retry: - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); if (old.kg) { - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } @@ -339,15 +340,15 @@ retry: new.k = 1; new.kh++; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); } -static void ipte_unlock_siif(struct kvm_vcpu *vcpu) +static void ipte_unlock_siif(struct kvm *kvm) { union ipte_control old, new, *ic; - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); new = old; @@ -355,25 +356,25 @@ static void ipte_unlock_siif(struct kvm_vcpu *vcpu) if (!new.kh) new.k = 0; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); if (!new.kh) - wake_up(&vcpu->kvm->arch.ipte_wq); + wake_up(&kvm->arch.ipte_wq); } -void ipte_lock(struct kvm_vcpu *vcpu) +void ipte_lock(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) - ipte_lock_siif(vcpu); + if (sclp.has_siif) + ipte_lock_siif(kvm); else - ipte_lock_simple(vcpu); + ipte_lock_simple(kvm); } -void ipte_unlock(struct kvm_vcpu *vcpu) +void ipte_unlock(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) - ipte_unlock_siif(vcpu); + if (sclp.has_siif) + ipte_unlock_siif(kvm); else - ipte_unlock_simple(vcpu); + ipte_unlock_simple(kvm); } static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, @@ -465,61 +466,55 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, return 0; } -struct trans_exc_code_bits { - unsigned long addr : 52; /* Translation-exception Address */ - unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */ - unsigned long : 2; - unsigned long b56 : 1; - unsigned long : 3; - unsigned long b60 : 1; - unsigned long b61 : 1; - unsigned long as : 2; /* ASCE Identifier */ -}; - -enum { - FSI_UNKNOWN = 0, /* Unknown wether fetch or store */ - FSI_STORE = 1, /* Exception was due to store operation */ - FSI_FETCH = 2 /* Exception was due to fetch operation */ -}; - enum prot_type { PROT_TYPE_LA = 0, PROT_TYPE_KEYC = 1, PROT_TYPE_ALC = 2, PROT_TYPE_DAT = 3, PROT_TYPE_IEP = 4, + /* Dummy value for passing an initialized value when code != PGM_PROTECTION */ + PROT_NONE, }; -static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, - u8 ar, enum gacc_mode mode, enum prot_type prot) +static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, + enum gacc_mode mode, enum prot_type prot, bool terminate) { struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; - struct trans_exc_code_bits *tec; + union teid *teid; memset(pgm, 0, sizeof(*pgm)); pgm->code = code; - tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code; + teid = (union teid *)&pgm->trans_exc_code; switch (code) { case PGM_PROTECTION: switch (prot) { + case PROT_NONE: + /* We should never get here, acts like termination */ + WARN_ON_ONCE(1); + break; case PROT_TYPE_IEP: - tec->b61 = 1; - /* FALL THROUGH */ + teid->b61 = 1; + fallthrough; case PROT_TYPE_LA: - tec->b56 = 1; + teid->b56 = 1; break; case PROT_TYPE_KEYC: - tec->b60 = 1; + teid->b60 = 1; break; case PROT_TYPE_ALC: - tec->b60 = 1; - /* FALL THROUGH */ + teid->b60 = 1; + 
fallthrough; case PROT_TYPE_DAT: - tec->b61 = 1; + teid->b61 = 1; break; } - /* FALL THROUGH */ + if (terminate) { + teid->b56 = 0; + teid->b60 = 0; + teid->b61 = 0; + } + fallthrough; case PGM_ASCE_TYPE: case PGM_PAGE_TRANSLATION: case PGM_REGION_FIRST_TRANS: @@ -531,10 +526,10 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, * exc_access_id has to be set to 0 for some instructions. Both * cases have to be handled by the caller. */ - tec->addr = gva >> PAGE_SHIFT; - tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; - tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as; - /* FALL THROUGH */ + teid->addr = gva >> PAGE_SHIFT; + teid->fsi = mode == GACC_STORE ? TEID_FSI_STORE : TEID_FSI_FETCH; + teid->as = psw_bits(vcpu->arch.sie_block->gpsw).as; + fallthrough; case PGM_ALEN_TRANSLATION: case PGM_ALE_SEQUENCE: case PGM_ASTE_VALIDITY: @@ -551,6 +546,12 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, return code; } +static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, + enum gacc_mode mode, enum prot_type prot) +{ + return trans_exc_ending(vcpu, code, gva, ar, mode, prot, false); +} + static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, unsigned long ga, u8 ar, enum gacc_mode mode) { @@ -607,7 +608,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) * Returns: - zero on success; @gpa contains the resulting absolute address * - a negative value if guest access failed due to e.g. broken * guest mapping - * - a positve value if an access exception happened. In this case + * - a positive value if an access exception happened. In this case * the returned value is the program interruption code as defined * by the architecture */ @@ -677,7 +678,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rfte.p; ptr = rfte.rto * PAGE_SIZE + vaddr.rsx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_REGION2: { union region2_table_entry rste; @@ -695,7 +696,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rste.p; ptr = rste.rto * PAGE_SIZE + vaddr.rtx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_REGION3: { union region3_table_entry rtte; @@ -723,7 +724,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rtte.fc0.p; ptr = rtte.fc0.sto * PAGE_SIZE + vaddr.sx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_SEGMENT: { union segment_table_entry ste; @@ -794,48 +795,270 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, return 1; } -static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, - unsigned long *pages, unsigned long nr_pages, - const union asce asce, enum gacc_mode mode) +static int vm_check_access_key(struct kvm *kvm, u8 access_key, + enum gacc_mode mode, gpa_t gpa) +{ + u8 storage_key, access_control; + bool fetch_protected; + unsigned long hva; + int r; + + if (access_key == 0) + return 0; + + hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + + mmap_read_lock(current->mm); + r = get_guest_storage_key(current->mm, hva, &storage_key); + mmap_read_unlock(current->mm); + if (r) + return r; + access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); + if (access_control == access_key) + return 0; + fetch_protected = storage_key & _PAGE_FP_BIT; + if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected) + return 0; 
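+	/*
+	 * Key mismatch: a store is denied unconditionally here, a fetch is
+	 * denied when the page is fetch protected.
+	 */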
+ return PGM_PROTECTION; +} + +static bool fetch_prot_override_applicable(struct kvm_vcpu *vcpu, enum gacc_mode mode, + union asce asce) +{ + psw_t *psw = &vcpu->arch.sie_block->gpsw; + unsigned long override; + + if (mode == GACC_FETCH || mode == GACC_IFETCH) { + /* check if fetch protection override enabled */ + override = vcpu->arch.sie_block->gcr[0]; + override &= CR0_FETCH_PROTECTION_OVERRIDE; + /* not applicable if subject to DAT && private space */ + override = override && !(psw_bits(*psw).dat && asce.p); + return override; + } + return false; +} + +static bool fetch_prot_override_applies(unsigned long ga, unsigned int len) +{ + return ga < 2048 && ga + len <= 2048; +} + +static bool storage_prot_override_applicable(struct kvm_vcpu *vcpu) +{ + /* check if storage protection override enabled */ + return vcpu->arch.sie_block->gcr[0] & CR0_STORAGE_PROTECTION_OVERRIDE; +} + +static bool storage_prot_override_applies(u8 access_control) +{ + /* matches special storage protection override key (9) -> allow */ + return access_control == PAGE_SPO_ACC; +} + +static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key, + enum gacc_mode mode, union asce asce, gpa_t gpa, + unsigned long ga, unsigned int len) +{ + u8 storage_key, access_control; + unsigned long hva; + int r; + + /* access key 0 matches any storage key -> allow */ + if (access_key == 0) + return 0; + /* + * caller needs to ensure that gfn is accessible, so we can + * assume that this cannot fail + */ + hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa)); + mmap_read_lock(current->mm); + r = get_guest_storage_key(current->mm, hva, &storage_key); + mmap_read_unlock(current->mm); + if (r) + return r; + access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); + /* access key matches storage key -> allow */ + if (access_control == access_key) + return 0; + if (mode == GACC_FETCH || mode == GACC_IFETCH) { + /* it is a fetch and fetch protection is off -> allow */ + if (!(storage_key & _PAGE_FP_BIT)) + return 0; + if (fetch_prot_override_applicable(vcpu, mode, asce) && + fetch_prot_override_applies(ga, len)) + return 0; + } + if (storage_prot_override_applicable(vcpu) && + storage_prot_override_applies(access_control)) + return 0; + return PGM_PROTECTION; +} + +/** + * guest_range_to_gpas() - Calculate guest physical addresses of page fragments + * covering a logical range + * @vcpu: virtual cpu + * @ga: guest address, start of range + * @ar: access register + * @gpas: output argument, may be NULL + * @len: length of range in bytes + * @asce: address-space-control element to use for translation + * @mode: access mode + * @access_key: access key to match the range's storage keys against + * + * Translate a logical range to a series of guest absolute addresses, + * such that the concatenation of page fragments starting at each gpa make up + * the whole range. + * The translation is performed as if done by the cpu for the given @asce, @ar, + * @mode and state of the @vcpu. + * If the translation causes an exception, its program interruption code is + * returned and the &struct kvm_s390_pgm_info pgm member of @vcpu is modified + * such that a subsequent call to kvm_s390_inject_prog_vcpu() will inject + * a correct exception into the guest. + * The resulting gpas are stored into @gpas, unless it is NULL. + * + * Note: All fragments except the first one start at the beginning of a page. + * When deriving the boundaries of a fragment from a gpa, all but the last + * fragment end at the end of the page.
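+ *
+ * Example (illustrative, 4K pages): @ga = 0x10ff8, @len = 16 yields two
+ * fragments: 8 bytes covered by the gpa that 0x10ff8 translates to, and
+ * 8 bytes covered by the gpa that 0x11000 translates to.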
+ * + * Return: + * * 0 - success + * * <0 - translation could not be performed, for example if guest + * memory could not be accessed + * * >0 - an access exception occurred. In this case the returned value + * is the program interruption code and the contents of pgm may + * be used to inject an exception into the guest. + */ +static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + unsigned long *gpas, unsigned long len, + const union asce asce, enum gacc_mode mode, + u8 access_key) { psw_t *psw = &vcpu->arch.sie_block->gpsw; + unsigned int offset = offset_in_page(ga); + unsigned int fragment_len; int lap_enabled, rc = 0; enum prot_type prot; + unsigned long gpa; lap_enabled = low_address_protection_enabled(vcpu, asce); - while (nr_pages) { + while (min(PAGE_SIZE - offset, len) > 0) { + fragment_len = min(PAGE_SIZE - offset, len); ga = kvm_s390_logical_to_effective(vcpu, ga); if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode, PROT_TYPE_LA); - ga &= PAGE_MASK; if (psw_bits(*psw).dat) { - rc = guest_translate(vcpu, ga, pages, asce, mode, &prot); + rc = guest_translate(vcpu, ga, &gpa, asce, mode, &prot); if (rc < 0) return rc; } else { - *pages = kvm_s390_real_to_abs(vcpu, ga); - if (kvm_is_error_gpa(vcpu->kvm, *pages)) + gpa = kvm_s390_real_to_abs(vcpu, ga); + if (kvm_is_error_gpa(vcpu->kvm, gpa)) { rc = PGM_ADDRESSING; + prot = PROT_NONE; + } } if (rc) return trans_exc(vcpu, rc, ga, ar, mode, prot); - ga += PAGE_SIZE; - pages++; - nr_pages--; + rc = vcpu_check_access_key(vcpu, access_key, mode, asce, gpa, ga, + fragment_len); + if (rc) + return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_KEYC); + if (gpas) + *gpas++ = gpa; + offset = 0; + ga += fragment_len; + len -= fragment_len; + } + return 0; +} + +static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len) +{ + const unsigned int offset = offset_in_page(gpa); + const gfn_t gfn = gpa_to_gfn(gpa); + int rc; + + if (mode == GACC_STORE) + rc = kvm_write_guest_page(kvm, gfn, data, offset, len); + else + rc = kvm_read_guest_page(kvm, gfn, data, offset, len); + return rc; +} + +static int +access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len, u8 access_key) +{ + struct kvm_memory_slot *slot; + bool writable; + gfn_t gfn; + hva_t hva; + int rc; + + gfn = gpa >> PAGE_SHIFT; + slot = gfn_to_memslot(kvm, gfn); + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + /* + * Check if it's a ro memslot, even tho that can't occur (they're unsupported). + * Don't try to actually handle that case. 
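+	 * Returning -EOPNOTSUPP (negative, not a PGM code) makes the caller
+	 * hand the error up rather than inject a program exception into the
+	 * guest.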
+ */ + if (!writable && mode == GACC_STORE) + return -EOPNOTSUPP; + hva += offset_in_page(gpa); + if (mode == GACC_STORE) + rc = copy_to_user_key((void __user *)hva, data, len, access_key); + else + rc = copy_from_user_key(data, (void __user *)hva, len, access_key); + if (rc) + return PGM_PROTECTION; + if (mode == GACC_STORE) + mark_page_dirty_in_slot(kvm, slot, gfn); + return 0; +} + +int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len, enum gacc_mode mode, u8 access_key) +{ + int offset = offset_in_page(gpa); + int fragment_len; + int rc; + + while (min(PAGE_SIZE - offset, len) > 0) { + fragment_len = min(PAGE_SIZE - offset, len); + rc = access_guest_page_with_key(kvm, mode, gpa, data, fragment_len, access_key); + if (rc) + return rc; + offset = 0; + len -= fragment_len; + data += fragment_len; + gpa += fragment_len; } return 0; } -int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, - unsigned long len, enum gacc_mode mode) +int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, enum gacc_mode mode, + u8 access_key) { psw_t *psw = &vcpu->arch.sie_block->gpsw; - unsigned long _len, nr_pages, gpa, idx; - unsigned long pages_array[2]; - unsigned long *pages; + unsigned long nr_pages, idx; + unsigned long gpa_array[2]; + unsigned int fragment_len; + unsigned long *gpas; + enum prot_type prot; int need_ipte_lock; union asce asce; + bool try_storage_prot_override; + bool try_fetch_prot_override; int rc; if (!len) @@ -845,55 +1068,199 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, if (rc) return rc; nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1; - pages = pages_array; - if (nr_pages > ARRAY_SIZE(pages_array)) - pages = vmalloc(array_size(nr_pages, sizeof(unsigned long))); - if (!pages) + gpas = gpa_array; + if (nr_pages > ARRAY_SIZE(gpa_array)) + gpas = vmalloc(array_size(nr_pages, sizeof(unsigned long))); + if (!gpas) return -ENOMEM; + try_fetch_prot_override = fetch_prot_override_applicable(vcpu, mode, asce); + try_storage_prot_override = storage_prot_override_applicable(vcpu); need_ipte_lock = psw_bits(*psw).dat && !asce.r; if (need_ipte_lock) - ipte_lock(vcpu); - rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode); - for (idx = 0; idx < nr_pages && !rc; idx++) { - gpa = *(pages + idx) + (ga & ~PAGE_MASK); - _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len); - if (mode == GACC_STORE) - rc = kvm_write_guest(vcpu->kvm, gpa, data, _len); + ipte_lock(vcpu->kvm); + /* + * Since we do the access further down ultimately via a move instruction + * that does key checking and returns an error in case of a protection + * violation, we don't need to do the check during address translation. + * Skip it by passing access key 0, which matches any storage key, + * obviating the need for any further checks. As a result the check is + * handled entirely in hardware on access, we only need to take care to + * forego key protection checking if fetch protection override applies or + * retry with the special key 9 in case of storage protection override. 
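+	 *
+	 * In short: translate with access key 0, copy with the caller's key
+	 * (or keyless when fetch protection override applies), and retry a
+	 * PGM_PROTECTION fault with PAGE_SPO_ACC (key 9) when storage
+	 * protection override is in effect.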
+ */ + rc = guest_range_to_gpas(vcpu, ga, ar, gpas, len, asce, mode, 0); + if (rc) + goto out_unlock; + for (idx = 0; idx < nr_pages; idx++) { + fragment_len = min(PAGE_SIZE - offset_in_page(gpas[idx]), len); + if (try_fetch_prot_override && fetch_prot_override_applies(ga, fragment_len)) { + rc = access_guest_page(vcpu->kvm, mode, gpas[idx], + data, fragment_len); + } else { + rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], + data, fragment_len, access_key); + } + if (rc == PGM_PROTECTION && try_storage_prot_override) + rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], + data, fragment_len, PAGE_SPO_ACC); + if (rc) + break; + len -= fragment_len; + data += fragment_len; + ga = kvm_s390_logical_to_effective(vcpu, ga + fragment_len); + } + if (rc > 0) { + bool terminate = (mode == GACC_STORE) && (idx > 0); + + if (rc == PGM_PROTECTION) + prot = PROT_TYPE_KEYC; else - rc = kvm_read_guest(vcpu->kvm, gpa, data, _len); - len -= _len; - ga += _len; - data += _len; + prot = PROT_NONE; + rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate); } +out_unlock: if (need_ipte_lock) - ipte_unlock(vcpu); - if (nr_pages > ARRAY_SIZE(pages_array)) - vfree(pages); + ipte_unlock(vcpu->kvm); + if (nr_pages > ARRAY_SIZE(gpa_array)) + vfree(gpas); return rc; } int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len, enum gacc_mode mode) { - unsigned long _len, gpa; + unsigned int fragment_len; + unsigned long gpa; int rc = 0; while (len && !rc) { gpa = kvm_s390_real_to_abs(vcpu, gra); - _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len); - if (mode) - rc = write_guest_abs(vcpu, gpa, data, _len); - else - rc = read_guest_abs(vcpu, gpa, data, _len); - len -= _len; - gra += _len; - data += _len; + fragment_len = min(PAGE_SIZE - offset_in_page(gpa), len); + rc = access_guest_page(vcpu->kvm, mode, gpa, data, fragment_len); + len -= fragment_len; + gra += fragment_len; + data += fragment_len; } return rc; } /** - * guest_translate_address - translate guest logical into guest absolute address + * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address. + * @kvm: Virtual machine instance. + * @gpa: Absolute guest address of the location to be changed. + * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a + * non power of two will result in failure. + * @old_addr: Pointer to old value. If the location at @gpa contains this value, + * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() + * *@old_addr contains the value at @gpa before the attempt to + * exchange the value. + * @new: The value to place at @gpa. + * @access_key: The access key to use for the guest access. + * @success: output value indicating if an exchange occurred. + * + * Atomically exchange the value at @gpa by @new, if it contains *@old. + * Honors storage keys. 
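+ *
+ * A sketch of a compare-and-swap loop on top of this helper; compute_next()
+ * stands in for whatever the caller does to derive the new value and is not
+ * part of this interface (error handling and the initial read of old elided):
+ *
+ *	do {
+ *		new = compute_next(old);
+ *		ret = cmpxchg_guest_abs_with_key(kvm, gpa, 8, &old, new,
+ *						 access_key, &success);
+ *	} while (!ret && !success);
+ *
+ * No reread is needed between iterations, since a failed compare leaves the
+ * current guest value in *@old_addr.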
+ * + * Return: * 0: successful exchange + * * >0: a program interruption code indicating the reason cmpxchg could + * not be attempted + * * -EINVAL: address misaligned or len not power of two + * * -EAGAIN: transient failure (len 1 or 2) + * * -EOPNOTSUPP: read-only memslot (should never occur) + */ +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, + __uint128_t *old_addr, __uint128_t new, + u8 access_key, bool *success) +{ + gfn_t gfn = gpa_to_gfn(gpa); + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + bool writable; + hva_t hva; + int ret; + + if (!IS_ALIGNED(gpa, len)) + return -EINVAL; + + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + /* + * Check if it's a read-only memslot, even though that cannot occur + * since those are unsupported. + * Don't try to actually handle that case. + */ + if (!writable) + return -EOPNOTSUPP; + + hva += offset_in_page(gpa); + /* + * The cmpxchg_user_key macro depends on the type of "old", so we need + * a case for each valid length and get some code duplication as long + * as we don't introduce a new macro. + */ + switch (len) { + case 1: { + u8 old; + + ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 2: { + u16 old; + + ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 4: { + u32 old; + + ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 8: { + u64 old; + + ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 16: { + __uint128_t old; + + ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + default: + return -EINVAL; + } + if (*success) + mark_page_dirty_in_slot(kvm, slot, gfn); + /* + * Assume that the fault is caused by protection, either key protection + * or user page write protection. + */ + if (ret == -EFAULT) + ret = PGM_PROTECTION; + return ret; +} + +/** + * guest_translate_address_with_key - translate guest logical into guest absolute address + * @vcpu: virtual cpu + * @gva: Guest virtual address + * @ar: Access register + * @gpa: Guest physical address + * @mode: Translation access mode + * @access_key: access key to match the storage key with * * Parameter semantics are the same as the ones from guest_translate. * The memory contents at the guest address are not changed. @@ -901,11 +1268,10 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * Note: The IPTE lock is not taken during this function, so the caller * has to take care of this.
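 * Callers typically bracket the translation with ipte_lock(vcpu->kvm) and
 * ipte_unlock(vcpu->kvm); check_gva_range() below shows that pattern.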
*/ -int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, - unsigned long *gpa, enum gacc_mode mode) +int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long *gpa, enum gacc_mode mode, + u8 access_key) { - psw_t *psw = &vcpu->arch.sie_block->gpsw; - enum prot_type prot; union asce asce; int rc; @@ -913,49 +1279,62 @@ int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); if (rc) return rc; - if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) { - if (mode == GACC_STORE) - return trans_exc(vcpu, PGM_PROTECTION, gva, 0, - mode, PROT_TYPE_LA); - } + return guest_range_to_gpas(vcpu, gva, ar, gpa, 1, asce, mode, + access_key); +} - if (psw_bits(*psw).dat && !asce.r) { /* Use DAT? */ - rc = guest_translate(vcpu, gva, gpa, asce, mode, &prot); - if (rc > 0) - return trans_exc(vcpu, rc, gva, 0, mode, prot); - } else { - *gpa = kvm_s390_real_to_abs(vcpu, gva); - if (kvm_is_error_gpa(vcpu->kvm, *gpa)) - return trans_exc(vcpu, rc, gva, PGM_ADDRESSING, mode, 0); - } +/** + * check_gva_range - test a range of guest virtual addresses for accessibility + * @vcpu: virtual cpu + * @gva: Guest virtual address + * @ar: Access register + * @length: Length of test range + * @mode: Translation access mode + * @access_key: access key to match the storage keys with + */ +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long length, enum gacc_mode mode, u8 access_key) +{ + union asce asce; + int rc = 0; + + rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); + if (rc) + return rc; + ipte_lock(vcpu->kvm); + rc = guest_range_to_gpas(vcpu, gva, ar, NULL, length, asce, mode, + access_key); + ipte_unlock(vcpu->kvm); return rc; } /** - * check_gva_range - test a range of guest virtual addresses for accessibility + * check_gpa_range - test a range of guest physical addresses for accessibility + * @kvm: virtual machine instance + * @gpa: guest physical address + * @length: length of test range + * @mode: access mode to test, relevant for storage keys + * @access_key: access key to match the storage keys with */ -int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, - unsigned long length, enum gacc_mode mode) +int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length, + enum gacc_mode mode, u8 access_key) { - unsigned long gpa; - unsigned long currlen; + unsigned int fragment_len; int rc = 0; - ipte_lock(vcpu); - while (length > 0 && !rc) { - currlen = min(length, PAGE_SIZE - (gva % PAGE_SIZE)); - rc = guest_translate_address(vcpu, gva, ar, &gpa, mode); - gva += currlen; - length -= currlen; + while (length && !rc) { + fragment_len = min(PAGE_SIZE - offset_in_page(gpa), length); + rc = vm_check_access_key(kvm, access_key, mode, gpa); + length -= fragment_len; + gpa += fragment_len; } - ipte_unlock(vcpu); - return rc; } /** * kvm_s390_check_low_addr_prot_real - check for low-address protection + * @vcpu: virtual cpu * @gra: Guest real address * * Checks whether an address is subject to low-address protection and set @@ -976,13 +1355,17 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) * kvm_s390_shadow_tables - walk the guest page table and create shadow tables * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap - * @pgt: pointer to the page table address result + * @pgt: pointer to the beginning of the page table for the
given address if + * successful (return value 0), or to the first invalid DAT entry in + * case of exceptions (return value > 0) + * @dat_protection: referenced memory is write protected * @fake: pgt references contiguous guest memory block, not a pgtable */ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, unsigned long *pgt, int *dat_protection, int *fake) { + struct kvm *kvm; struct gmap *parent; union asce asce; union vaddress vaddr; @@ -991,6 +1374,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, *fake = 0; *dat_protection = 0; + kvm = sg->private; parent = sg->parent; vaddr.addr = saddr; asce.val = sg->orig_asce; @@ -1034,6 +1418,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, rfte.val = ptr; goto shadow_r2t; } + *pgt = ptr + vaddr.rfx * 8; rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); if (rc) return rc; @@ -1050,7 +1435,9 @@ shadow_r2t: rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); if (rc) return rc; - } /* fallthrough */ + kvm->stat.gmap_shadow_r1_entry++; + } + fallthrough; case ASCE_TYPE_REGION2: { union region2_table_entry rste; @@ -1059,6 +1446,7 @@ shadow_r2t: rste.val = ptr; goto shadow_r3t; } + *pgt = ptr + vaddr.rsx * 8; rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); if (rc) return rc; @@ -1076,7 +1464,9 @@ shadow_r3t: rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); if (rc) return rc; - } /* fallthrough */ + kvm->stat.gmap_shadow_r2_entry++; + } + fallthrough; case ASCE_TYPE_REGION3: { union region3_table_entry rtte; @@ -1085,6 +1475,7 @@ shadow_r3t: rtte.val = ptr; goto shadow_sgt; } + *pgt = ptr + vaddr.rtx * 8; rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); if (rc) return rc; @@ -1111,7 +1502,9 @@ shadow_sgt: rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); if (rc) return rc; - } /* fallthrough */ + kvm->stat.gmap_shadow_r3_entry++; + } + fallthrough; case ASCE_TYPE_SEGMENT: { union segment_table_entry ste; @@ -1120,6 +1513,7 @@ shadow_sgt: ste.val = ptr; goto shadow_pgt; } + *pgt = ptr + vaddr.sx * 8; rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); if (rc) return rc; @@ -1142,6 +1536,7 @@ shadow_pgt: rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); if (rc) return rc; + kvm->stat.gmap_shadow_sg_entry++; } } /* Return the parent address of the page table */ @@ -1154,6 +1549,8 @@ shadow_pgt: * @vcpu: virtual cpu * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap + * @datptr: will contain the address of the faulting DAT table entry, or of + * the valid leaf, plus some flags * * Returns: - 0 if the shadow fault was successfully resolved * - > 0 (pgm exception code) on exceptions while faulting @@ -1162,21 +1559,21 @@ shadow_pgt: * - -ENOMEM if out of memory */ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, - unsigned long saddr) + unsigned long saddr, unsigned long *datptr) { union vaddress vaddr; union page_table_entry pte; - unsigned long pgt; + unsigned long pgt = 0; int dat_protection, fake; int rc; - down_read(&sg->mm->mmap_sem); + mmap_read_lock(sg->mm); /* * We don't want any guest-2 tables to change - so the parent * tables/pointers we read stay valid - unshadowing is however * always possible - only guest_table_lock protects us. 
*/ - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); if (rc) @@ -1188,8 +1585,20 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, pte.val = pgt + vaddr.px * PAGE_SIZE; goto shadow_page; } - if (!rc) - rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val); + + switch (rc) { + case PGM_SEGMENT_TRANSLATION: + case PGM_REGION_THIRD_TRANS: + case PGM_REGION_SECOND_TRANS: + case PGM_REGION_FIRST_TRANS: + pgt |= PEI_NOT_PTE; + break; + case 0: + pgt += vaddr.px * 8; + rc = gmap_read_table(sg->parent, pgt, &pte.val); + } + if (datptr) + *datptr = pgt | dat_protection * PEI_DAT_PROT; if (!rc && pte.i) rc = PGM_PAGE_TRANSLATION; if (!rc && pte.z) @@ -1198,7 +1607,8 @@ shadow_page: pte.p |= dat_protection; if (!rc) rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); - ipte_unlock(vcpu); - up_read(&sg->mm->mmap_sem); + vcpu->kvm->stat.gmap_shadow_pg_entry++; + ipte_unlock(vcpu->kvm); + mmap_read_unlock(sg->mm); return rc; } diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index f4c51756c462..b320d12aa049 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -18,17 +18,14 @@ /** * kvm_s390_real_to_abs - convert guest real address to guest absolute address - * @vcpu - guest virtual cpu + * @prefix - guest prefix * @gra - guest real address * * Returns the guest absolute address that corresponds to the passed guest real - * address @gra of a virtual guest cpu by applying its prefix. + * address @gra by applying the given prefix. */ -static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, - unsigned long gra) +static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra) { - unsigned long prefix = kvm_s390_get_prefix(vcpu); - if (gra < 2 * PAGE_SIZE) gra += prefix; else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE) @@ -37,6 +34,43 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, } /** + * kvm_s390_real_to_abs - convert guest real address to guest absolute address + * @vcpu - guest virtual cpu + * @gra - guest real address + * + * Returns the guest absolute address that corresponds to the passed guest real + * address @gra of a virtual guest cpu by applying its prefix. + */ +static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, + unsigned long gra) +{ + return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra); +} + +/** + * _kvm_s390_logical_to_effective - convert guest logical to effective address + * @psw: psw of the guest + * @ga: guest logical address + * + * Convert a guest logical address to an effective address by applying the + * rules of the addressing mode defined by bits 31 and 32 of the given PSW + * (extended/basic addressing mode). + * + * Depending on the addressing mode, the upper 40 bits (24 bit addressing + * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing + * mode) of @ga will be zeroed and the remaining bits will be returned.
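+ *
+ * For example, in 24-bit mode 0x0000000012345678 becomes 0x345678 and in
+ * 31-bit mode 0xfffffffff2345678 becomes 0x72345678, while in 64-bit mode
+ * @ga is returned unchanged.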
+ */ +static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw, + unsigned long ga) +{ + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT) + return ga; + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT) + return ga & ((1UL << 31) - 1); + return ga & ((1UL << 24) - 1); +} + +/** * kvm_s390_logical_to_effective - convert guest logical to effective address * @vcpu: guest virtual cpu * @ga: guest logical address @@ -52,13 +86,7 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu, unsigned long ga) { - psw_t *psw = &vcpu->arch.sie_block->gpsw; - - if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT) - return ga; - if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT) - return ga & ((1UL << 31) - 1); - return ga & ((1UL << 24) - 1); + return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga); } /* @@ -158,24 +186,37 @@ enum gacc_mode { GACC_IFETCH, }; -int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, - u8 ar, unsigned long *gpa, enum gacc_mode mode); +int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long *gpa, enum gacc_mode mode, + u8 access_key); + int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, - unsigned long length, enum gacc_mode mode); + unsigned long length, enum gacc_mode mode, u8 access_key); + +int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length, + enum gacc_mode mode, u8 access_key); + +int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len, enum gacc_mode mode, u8 access_key); -int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, - unsigned long len, enum gacc_mode mode); +int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, enum gacc_mode mode, + u8 access_key); int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len, enum gacc_mode mode); +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old, + __uint128_t new, u8 access_key, bool *success); + /** - * write_guest - copy data from kernel space to guest space + * write_guest_with_key - copy data from kernel space to guest space * @vcpu: virtual cpu * @ga: guest address * @ar: access register * @data: source address in kernel space * @len: number of bytes to copy + * @access_key: access key the storage key needs to match * * Copy @len bytes from @data (kernel space) to @ga (guest address). * In order to copy data to guest space the PSW of the vcpu is inspected: @@ -186,8 +227,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * The addressing mode of the PSW is also inspected, so that address wrap * around is taken into account for 24-, 31- and 64-bit addressing mode, * if the to be copied data crosses page boundaries in guest address space. - * In addition also low address and DAT protection are inspected before - * copying any data (key protection is currently not implemented). + * In addition low address, DAT and key protection checks are performed before + * copying any data. * * This function modifies the 'struct kvm_s390_pgm_info pgm' member of @vcpu. * In case of an access exception (e.g. protection exception) pgm will contain @@ -215,10 +256,53 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * if data has been changed in guest space in case of an exception. 
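 *
 * An illustrative call that stores an 8 byte value at the logical address
 * in gpr 2, with storage keys matched against access key 5:
 *
 *	u64 val = 42;
 *	int rc;
 *
 *	rc = write_guest_with_key(vcpu, vcpu->run->s.regs.gprs[2], 0, &val,
 *				  sizeof(val), 5);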
*/ static inline __must_check +int write_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, u8 access_key) +{ + return access_guest_with_key(vcpu, ga, ar, data, len, GACC_STORE, + access_key); +} + +/** + * write_guest - copy data from kernel space to guest space + * @vcpu: virtual cpu + * @ga: guest address + * @ar: access register + * @data: source address in kernel space + * @len: number of bytes to copy + * + * The behaviour of write_guest is identical to write_guest_with_key, except + * that the PSW access key is used instead of an explicit argument. + */ +static inline __must_check int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len) { - return access_guest(vcpu, ga, ar, data, len, GACC_STORE); + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return write_guest_with_key(vcpu, ga, ar, data, len, access_key); +} + +/** + * read_guest_with_key - copy data from guest space to kernel space + * @vcpu: virtual cpu + * @ga: guest address + * @ar: access register + * @data: destination address in kernel space + * @len: number of bytes to copy + * @access_key: access key the storage key needs to match + * + * Copy @len bytes from @ga (guest address) to @data (kernel space). + * + * The behaviour of read_guest_with_key is identical to write_guest_with_key, + * except that data will be copied from guest space to kernel space. + */ +static inline __must_check +int read_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, u8 access_key) +{ + return access_guest_with_key(vcpu, ga, ar, data, len, GACC_FETCH, + access_key); } /** @@ -231,14 +315,16 @@ int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, * * Copy @len bytes from @ga (guest address) to @data (kernel space). * - * The behaviour of read_guest is identical to write_guest, except that - * data will be copied from guest space to kernel space. + * The behaviour of read_guest is identical to read_guest_with_key, except + * that the PSW access key is used instead of an explicit argument. 
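
All of these accessors eventually feed guest real addresses through the prefixing rule of _kvm_s390_real_to_abs() near the top of this header. That rule also runs fine in isolation; a user-space sketch with a local page-size constant, not kernel API:

#include <stdio.h>

#define EX_PAGE_SIZE 0x1000UL

/* The first two 4K pages of real storage swap places with the two pages
 * at the prefix; everything else passes through unchanged. */
static unsigned long ex_real_to_abs(unsigned long prefix, unsigned long gra)
{
	if (gra < 2 * EX_PAGE_SIZE)
		gra += prefix;
	else if (gra >= prefix && gra < prefix + 2 * EX_PAGE_SIZE)
		gra -= prefix;
	return gra;
}

int main(void)
{
	unsigned long prefix = 0x20000;

	printf("%#lx\n", ex_real_to_abs(prefix, 0x0));     /* 0x20000 */
	printf("%#lx\n", ex_real_to_abs(prefix, 0x20000)); /* 0 */
	printf("%#lx\n", ex_real_to_abs(prefix, 0x50000)); /* 0x50000 */
	return 0;
}
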
*/ static inline __must_check int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len) { - return access_guest(vcpu, ga, ar, data, len, GACC_FETCH); + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return read_guest_with_key(vcpu, ga, ar, data, len, access_key); } /** @@ -259,7 +345,10 @@ static inline __must_check int read_guest_instr(struct kvm_vcpu *vcpu, unsigned long ga, void *data, unsigned long len) { - return access_guest(vcpu, ga, 0, data, len, GACC_IFETCH); + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return access_guest_with_key(vcpu, ga, 0, data, len, GACC_IFETCH, + access_key); } /** @@ -354,12 +443,16 @@ int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, return access_guest_real(vcpu, gra, data, len, 0); } -void ipte_lock(struct kvm_vcpu *vcpu); -void ipte_unlock(struct kvm_vcpu *vcpu); -int ipte_lock_held(struct kvm_vcpu *vcpu); +void ipte_lock(struct kvm *kvm); +void ipte_unlock(struct kvm *kvm); +int ipte_lock_held(struct kvm *kvm); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); +/* MVPG PEI indication bits */ +#define PEI_DAT_PROT 2 +#define PEI_NOT_PTE 4 + int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, - unsigned long saddr); + unsigned long saddr, unsigned long *datptr); #endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 394a5f53805b..80879fc73c90 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -184,7 +184,7 @@ static int __import_wp_info(struct kvm_vcpu *vcpu, if (wp_info->len < 0 || wp_info->len > MAX_WP_SIZE) return -EINVAL; - wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL); + wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL_ACCOUNT); if (!wp_info->old_data) return -ENOMEM; /* try to backup the original value */ @@ -213,8 +213,8 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT) return -EINVAL; - bp_data = memdup_user(dbg->arch.hw_bp, - sizeof(*bp_data) * dbg->arch.nr_hw_bp); + bp_data = memdup_array_user(dbg->arch.hw_bp, dbg->arch.nr_hw_bp, + sizeof(*bp_data)); if (IS_ERR(bp_data)) return PTR_ERR(bp_data); @@ -234,7 +234,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, if (nr_wp > 0) { wp_info = kmalloc_array(nr_wp, sizeof(*wp_info), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!wp_info) { ret = -ENOMEM; goto error; @@ -243,7 +243,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, if (nr_bp > 0) { bp_info = kmalloc_array(nr_bp, sizeof(*bp_info), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!bp_info) { ret = -ENOMEM; goto error; @@ -349,7 +349,7 @@ static struct kvm_hw_wp_info_arch *any_wp_changed(struct kvm_vcpu *vcpu) if (!wp_info || !wp_info->old_data || wp_info->len <= 0) continue; - temp = kmalloc(wp_info->len, GFP_KERNEL); + temp = kmalloc(wp_info->len, GFP_KERNEL_ACCOUNT); if (!temp) continue; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index a389fa85cca2..b16352083ff9 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -2,7 +2,7 @@ /* * in-kernel handling for sie intercepts * - * Copyright IBM Corp. 2008, 2014 + * Copyright IBM Corp. 
2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -12,10 +12,10 @@ #include <linux/errno.h> #include <linux/pagemap.h> -#include <asm/kvm_host.h> #include <asm/asm-offsets.h> #include <asm/irq.h> #include <asm/sysinfo.h> +#include <asm/uv.h> #include "kvm-s390.h" #include "gaccess.h" @@ -79,6 +79,10 @@ static int handle_stop(struct kvm_vcpu *vcpu) return rc; } + /* + * no need to check the return value of vcpu_stop as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) kvm_s390_vcpu_stop(vcpu); return -EOPNOTSUPP; @@ -213,7 +217,7 @@ static int handle_itdb(struct kvm_vcpu *vcpu) return 0; if (current->thread.per_flags & PER_FLAG_NO_TE) return 0; - itdb = (struct kvm_s390_itdb *)vcpu->arch.sie_block->itdba; + itdb = phys_to_virt(vcpu->arch.sie_block->itdba); rc = write_guest_lc(vcpu, __LC_PGM_TDB, itdb, sizeof(*itdb)); if (rc) return rc; @@ -224,6 +228,21 @@ static int handle_itdb(struct kvm_vcpu *vcpu) #define per_event(vcpu) (vcpu->arch.sie_block->iprcc & PGM_PER) +static bool should_handle_per_event(const struct kvm_vcpu *vcpu) +{ + if (!guestdbg_enabled(vcpu) || !per_event(vcpu)) + return false; + if (guestdbg_sstep_enabled(vcpu) && + vcpu->arch.sie_block->iprcc != PGM_PER) { + /* + * __vcpu_run() will exit after delivering the concurrently + * indicated condition. + */ + return false; + } + return true; +} + static int handle_prog(struct kvm_vcpu *vcpu) { psw_t psw; @@ -231,7 +250,14 @@ static int handle_prog(struct kvm_vcpu *vcpu) vcpu->stat.exit_program_interruption++; - if (guestdbg_enabled(vcpu) && per_event(vcpu)) { + /* + * Intercept 8 indicates a loop of specification exceptions + * for protected guests. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return -EOPNOTSUPP; + + if (should_handle_per_event(vcpu)) { rc = kvm_s390_handle_per_event(vcpu); if (rc) return rc; @@ -258,11 +284,20 @@ static int handle_prog(struct kvm_vcpu *vcpu) /** * handle_external_interrupt - used for external interruption interceptions + * @vcpu: virtual cpu + * + * This interception occurs if: + * - the CPUSTAT_EXT_INT bit was already set when the external interrupt + * occurred. In this case, the interrupt needs to be injected manually to + * preserve interrupt priority. + * - the external new PSW has external interrupts enabled, which will cause an + * interruption loop. We drop to userspace in this case. + * + * The latter case can be detected by inspecting the external mask bit in the + * external new psw. * - * This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if - * the new PSW does not have external interrupts disabled. In the first case, - * we've got to deliver the interrupt manually, and in the second case, we - * drop to userspace to handle the situation there. + * Under PV, only the latter case can occur, since interrupt priorities are + * handled in the ultravisor. 
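
The two bullet points above reduce to a single predicate over the external interrupt code and the new PSW mask. Restated as stand-alone C, with placeholder constants instead of the kernel's EXT_IRQ_* and PSW_MASK_EXT definitions:

#include <stdbool.h>
#include <stdint.h>

#define EX_IRQ_CLK_COMP  0x1004        /* placeholder value */
#define EX_IRQ_CPU_TIMER 0x1005        /* placeholder value */
#define EX_PSW_MASK_EXT  (1ULL << 56)  /* placeholder bit position */

/* True when delivering the interrupt would immediately re-raise it,
 * i.e. the situation handle_external_interrupt() punts to userspace. */
static bool ex_would_loop(uint16_t eic, uint64_t new_psw_mask)
{
	return (eic == EX_IRQ_CLK_COMP || eic == EX_IRQ_CPU_TIMER) &&
	       (new_psw_mask & EX_PSW_MASK_EXT);
}

int main(void)
{
	return ex_would_loop(EX_IRQ_CLK_COMP, EX_PSW_MASK_EXT) ? 0 : 1;
}
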
*/ static int handle_external_interrupt(struct kvm_vcpu *vcpu) { @@ -273,10 +308,18 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu) vcpu->stat.exit_external_interrupt++; - rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); - if (rc) - return rc; - /* We can not handle clock comparator or timer interrupt with bad PSW */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + newpsw = vcpu->arch.sie_block->gpsw; + } else { + rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); + if (rc) + return rc; + } + + /* + * Clock comparator or timer interrupt with external interrupt enabled + * will cause interrupt loop. Drop to userspace. + */ if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) && (newpsw.mask & PSW_MASK_EXT)) return -EOPNOTSUPP; @@ -304,7 +347,8 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu) } /** - * Handle MOVE PAGE partial execution interception. + * handle_mvpg_pei - Handle MOVE PAGE partial execution interception. + * @vcpu: virtual cpu * * This interception can only happen for guests with DAT disabled and * addresses that are currently not mapped in the host. Thus we try to @@ -318,18 +362,18 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) kvm_s390_get_regs_rre(vcpu, &reg1, &reg2); - /* Make sure that the source is paged-in */ - rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg2], - reg2, &srcaddr, GACC_FETCH); + /* Ensure that the source is paged-in, no actual access -> no key checking */ + rc = guest_translate_address_with_key(vcpu, vcpu->run->s.regs.gprs[reg2], + reg2, &srcaddr, GACC_FETCH, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0); if (rc != 0) return rc; - /* Make sure that the destination is paged-in */ - rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg1], - reg1, &dstaddr, GACC_STORE); + /* Ensure that the destination is paged-in, no actual access -> no key checking */ + rc = guest_translate_address_with_key(vcpu, vcpu->run->s.regs.gprs[reg1], + reg1, &dstaddr, GACC_STORE, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1); @@ -360,8 +404,8 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) */ int handle_sthyi(struct kvm_vcpu *vcpu) { - int reg1, reg2, r = 0; - u64 code, addr, cc = 0, rc = 0; + int reg1, reg2, cc = 0, r = 0; + u64 code, addr, rc = 0; struct sthyi_sctns *sctns = NULL; if (!test_kvm_facility(vcpu->kvm, 74)) @@ -384,21 +428,28 @@ int handle_sthyi(struct kvm_vcpu *vcpu) goto out; } - if (addr & ~PAGE_MASK) + if (!kvm_s390_pv_cpu_is_protected(vcpu) && (addr & ~PAGE_MASK)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - sctns = (void *)get_zeroed_page(GFP_KERNEL); + sctns = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!sctns) return -ENOMEM; cc = sthyi_fill(sctns, &rc); - + if (cc < 0) { + free_page((unsigned long)sctns); + return cc; + } out: if (!cc) { - r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); - if (r) { - free_page((unsigned long)sctns); - return kvm_s390_inject_prog_cond(vcpu, r); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(sida_addr(vcpu->arch.sie_block), sctns, PAGE_SIZE); + } else { + r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); + if (r) { + free_page((unsigned long)sctns); + return kvm_s390_inject_prog_cond(vcpu, r); + } } } @@ -444,6 +495,110 @@ static int handle_operexc(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); } +static int handle_pv_spx(struct kvm_vcpu *vcpu) +{ +
u32 pref = *(u32 *)sida_addr(vcpu->arch.sie_block); + + kvm_s390_set_prefix(vcpu, pref); + trace_kvm_s390_handle_prefix(vcpu, 1, pref); + return 0; +} + +static int handle_pv_sclp(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + + spin_lock(&fi->lock); + /* + * 2 cases: + * a: an sccb answering interrupt was already pending or in flight. + * As the sccb value is not known we can simply set some value to + * trigger delivery of a saved SCCB. UV will then use its saved + * copy of the SCCB value. + * b: an error SCCB interrupt needs to be injected so we also inject + * a fake SCCB address. Firmware will use the proper one. + * This makes sure, that both errors and real sccb returns will only + * be delivered after a notification intercept (instruction has + * finished) but not after others. + */ + fi->srv_signal.ext_params |= 0x43000; + set_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); + clear_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs); + spin_unlock(&fi->lock); + return 0; +} + +static int handle_pv_uvc(struct kvm_vcpu *vcpu) +{ + struct uv_cb_share *guest_uvcb = sida_addr(vcpu->arch.sie_block); + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_UNPIN_PAGE_SHARED, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(vcpu->kvm), + .gaddr = guest_uvcb->paddr, + }; + int rc; + + if (guest_uvcb->header.cmd != UVC_CMD_REMOVE_SHARED_ACCESS) { + WARN_ONCE(1, "Unexpected notification intercept for UVC 0x%x\n", + guest_uvcb->header.cmd); + return 0; + } + rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb); + /* + * If the unpin did not succeed, the guest will exit again for the UVC + * and we will retry the unpin. + */ + if (rc == -EINVAL) + return 0; + /* + * If we got -EAGAIN here, we simply return it. It will eventually + * get propagated all the way to userspace, which should then try + * again. + */ + return rc; +} + +static int handle_pv_notification(struct kvm_vcpu *vcpu) +{ + int ret; + + if (vcpu->arch.sie_block->ipa == 0xb210) + return handle_pv_spx(vcpu); + if (vcpu->arch.sie_block->ipa == 0xb220) + return handle_pv_sclp(vcpu); + if (vcpu->arch.sie_block->ipa == 0xb9a4) + return handle_pv_uvc(vcpu); + if (vcpu->arch.sie_block->ipa >> 8 == 0xae) { + /* + * Besides external call, other SIGP orders also cause a + * 108 (pv notify) intercept. In contrast to external call, + * these orders need to be emulated and hence the appropriate + * place to handle them is in handle_instruction(). + * So first try kvm_s390_handle_sigp_pei() and if that isn't + * successful, go on with handle_instruction(). + */ + ret = kvm_s390_handle_sigp_pei(vcpu); + if (!ret) + return ret; + } + + return handle_instruction(vcpu); +} + +static bool should_handle_per_ifetch(const struct kvm_vcpu *vcpu, int rc) +{ + /* Process PER, also if the instruction is processed in user space. */ + if (!(vcpu->arch.sie_block->icptstatus & 0x02)) + return false; + if (rc != 0 && rc != -EOPNOTSUPP) + return false; + if (guestdbg_sstep_enabled(vcpu) && vcpu->arch.local_int.pending_irqs) + /* __vcpu_run() will exit after delivering the interrupt. */ + return false; + return true; +} + int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { int rc, per_rc = 0; @@ -478,15 +633,35 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) rc = handle_partial_execution(vcpu); break; case ICPT_KSS: - rc = kvm_s390_skey_check_enable(vcpu); + /* Instruction will be redriven, skip the PER check. 
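
handle_pv_notification() above is a dispatch on the intercepted instruction's IPA. A table-driven restatement, runnable stand-alone; the handlers here are stand-ins for the kernel functions, and the opcode mnemonics in the comments are the conventional ones (SET PREFIX, SERVICE CALL, the UV-call instruction, SIGP):

#include <stdio.h>

static int ex_pv_spx(void)  { return 0; }
static int ex_pv_sclp(void) { return 0; }
static int ex_pv_uvc(void)  { return 0; }

struct ex_pv_dispatch {
	unsigned short ipa;
	int (*handler)(void);
};

static const struct ex_pv_dispatch ex_table[] = {
	{ 0xb210, ex_pv_spx },   /* SET PREFIX */
	{ 0xb220, ex_pv_sclp },  /* SERVICE CALL */
	{ 0xb9a4, ex_pv_uvc },   /* UV call */
};

static int ex_dispatch(unsigned short ipa)
{
	unsigned int i;

	if ((ipa >> 8) == 0xae)  /* SIGP: try the external-call fast path first */
		return 0;
	for (i = 0; i < sizeof(ex_table) / sizeof(ex_table[0]); i++)
		if (ex_table[i].ipa == ipa)
			return ex_table[i].handler();
	return -1;               /* fall back to full instruction emulation */
}

int main(void)
{
	printf("%d\n", ex_dispatch(0xb220)); /* 0 */
	return 0;
}
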
*/ + return kvm_s390_skey_check_enable(vcpu); + case ICPT_MCHKREQ: + case ICPT_INT_ENABLE: + /* + * PSW bit 13 or a CR (0, 6, 14) changed and we might + * now be able to deliver interrupts. The pre-run code + * will take care of this. + */ + rc = 0; + break; + case ICPT_PV_INSTR: + rc = handle_instruction(vcpu); + break; + case ICPT_PV_NOTIFY: + rc = handle_pv_notification(vcpu); + break; + case ICPT_PV_PREF: + rc = 0; + gmap_convert_to_secure(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu)); + gmap_convert_to_secure(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu) + PAGE_SIZE); break; default: return -EOPNOTSUPP; } - /* process PER, also if the instrution is processed in user space */ - if (vcpu->arch.sie_block->icptstatus & 0x02 && - (!rc || rc == -EOPNOTSUPP)) + if (should_handle_per_ifetch(vcpu, rc)) per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu); return per_rc ? per_rc : rc; } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 165dea4c7f19..fc4007cc067a 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2,7 +2,7 @@ /* * handling kvm guest interrupts * - * Copyright IBM Corp. 2008, 2015 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> */ @@ -28,9 +28,11 @@ #include <asm/switch_to.h> #include <asm/nmi.h> #include <asm/airq.h> +#include <asm/tpi.h> #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" +#include "pci.h" #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 @@ -81,8 +83,9 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) struct esca_block *sca = vcpu->kvm->arch.sca; union esca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union esca_sigp_ctrl new_val = {0}, old_val = *sigp_ctrl; + union esca_sigp_ctrl new_val = {0}, old_val; + old_val = READ_ONCE(*sigp_ctrl); new_val.scn = src_id; new_val.c = 1; old_val.c = 0; @@ -93,8 +96,9 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) struct bsca_block *sca = vcpu->kvm->arch.sca; union bsca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union bsca_sigp_ctrl new_val = {0}, old_val = *sigp_ctrl; + union bsca_sigp_ctrl new_val = {0}, old_val; + old_val = READ_ONCE(*sigp_ctrl); new_val.scn = src_id; new_val.c = 1; old_val.c = 0; @@ -124,16 +128,18 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu) struct esca_block *sca = vcpu->kvm->arch.sca; union esca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union esca_sigp_ctrl old = *sigp_ctrl; + union esca_sigp_ctrl old; + old = READ_ONCE(*sigp_ctrl); expect = old.value; rc = cmpxchg(&sigp_ctrl->value, old.value, 0); } else { struct bsca_block *sca = vcpu->kvm->arch.sca; union bsca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union bsca_sigp_ctrl old = *sigp_ctrl; + union bsca_sigp_ctrl old; + old = READ_ONCE(*sigp_ctrl); expect = old.value; rc = cmpxchg(&sigp_ctrl->value, old.value, 0); } @@ -297,11 +303,6 @@ static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi) return 0; } -static inline int gisa_in_alert_list(struct kvm_s390_gisa *gisa) -{ - return READ_ONCE(gisa->next_alert) != (u32)(u64)gisa; -} - static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -312,11 +313,6 @@ static inline u8 gisa_get_ipm(struct kvm_s390_gisa *gisa) return READ_ONCE(gisa->ipm); } -static inline void gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) -{ - clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned 
long *) gisa); -} - static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -324,8 +320,11 @@ static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu) { - return vcpu->kvm->arch.float_int.pending_irqs | - vcpu->arch.local_int.pending_irqs; + unsigned long pending = vcpu->kvm->arch.float_int.pending_irqs | + vcpu->arch.local_int.pending_irqs; + + pending &= ~vcpu->kvm->arch.float_int.masked_irqs; + return pending; } static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) @@ -383,10 +382,18 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) __clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &active_mask); if (!(vcpu->arch.sie_block->gcr[0] & CR0_CPU_TIMER_SUBMASK)) __clear_bit(IRQ_PEND_EXT_CPU_TIMER, &active_mask); - if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) + if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) { __clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask); + __clear_bit(IRQ_PEND_EXT_SERVICE_EV, &active_mask); + } if (psw_mchk_disabled(vcpu)) active_mask &= ~IRQ_PEND_MCHK_MASK; + /* PV guest cpus can have a single interruption injected at a time. */ + if (kvm_s390_pv_cpu_get_handle(vcpu) && + vcpu->arch.sie_block->iictl != IICTL_CODE_NONE) + active_mask &= ~(IRQ_PEND_EXT_II_MASK | + IRQ_PEND_IO_MASK | + IRQ_PEND_MCHK_MASK); /* * Check both floating and local interrupt's cr14 because * bit IRQ_PEND_MCHK_REP could be set in both cases. @@ -408,13 +415,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); - set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) @@ -479,19 +486,23 @@ static void set_intercept_indicators(struct kvm_vcpu *vcpu) static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; vcpu->stat.deliver_cputm++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER, 0, 0); - - rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER, - (u16 *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_CPU_TIMER; + } else { + rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER, + (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); return rc ? 
-EFAULT : 0; } @@ -499,19 +510,23 @@ static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu) static int __must_check __deliver_ckc(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; vcpu->stat.deliver_ckc++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP, 0, 0); - - rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP, - (u16 __user *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_CLK_COMP; + } else { + rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP, + (u16 __user *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs); return rc ? -EFAULT : 0; } @@ -553,6 +568,20 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, union mci mci; int rc; + /* + * All other possible payload for a machine check (e.g. the register + * contents in the save area) will be handled by the ultravisor, as + * the hypervisor does not have the needed information for + * protected guests. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_MCHK; + vcpu->arch.sie_block->mcic = mchk->mcic; + vcpu->arch.sie_block->faddr = mchk->failing_storage_address; + vcpu->arch.sie_block->edc = mchk->ext_damage_code; + return 0; + } + mci.val = mchk->mcic; /* take care of lazy register loading */ save_fpu_regs(); @@ -610,7 +639,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, rc |= put_guest_lc(vcpu, mci.val, (u64 __user *) __LC_MCCK_CODE); /* Register-save areas */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs); rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, fprs, 128); } else { @@ -669,7 +698,7 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) /* * We indicate floating repressible conditions along with * other pending conditions. Channel Report Pending and Channel - * Subsystem damage are the only two and and are indicated by + * Subsystem damage are the only two and are indicated by * bits in mcic and masked in cr14.
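
The same two-path shape recurs in every delivery routine touched by this patch: for protected guests the interrupt is described in SIE injection fields for the ultravisor to deliver, everyone else gets the classic lowcore PSW swap. A schematic only, with stand-in names for the kernel fields and an illustrative (not architected) encoding:

#include <stdbool.h>

enum { EX_IICTL_EXT = 1 };  /* illustrative, not the architected encoding */

struct ex_vcpu {
	bool protected_guest;
	unsigned int sie_iictl;  /* injection request, consumed by firmware */
	unsigned int sie_eic;    /* external-interrupt code */
};

static int ex_deliver_ext(struct ex_vcpu *v, unsigned int eic)
{
	if (v->protected_guest) {
		/* only request the injection; the ultravisor swaps the PSWs */
		v->sie_iictl = EX_IICTL_EXT;
		v->sie_eic = eic;
		return 0;
	}
	/*
	 * Non-protected path: store the old PSW and load the new one from
	 * guest lowcore, which is what the put_guest_lc()/write_guest_lc()/
	 * read_guest_lc() sequences above do.
	 */
	return 0;
}

int main(void)
{
	struct ex_vcpu v = { .protected_guest = true };

	return ex_deliver_ext(&v, 0x1004 /* illustrative code */);
}
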
*/ if (test_and_clear_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) { @@ -696,17 +725,21 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) static int __must_check __deliver_restart(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; VCPU_EVENT(vcpu, 3, "%s", "deliver: cpu restart"); vcpu->stat.deliver_restart_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0); - rc = write_guest_lc(vcpu, - offsetof(struct lowcore, restart_old_psw), - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw), - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_RESTART; + } else { + rc = write_guest_lc(vcpu, + offsetof(struct lowcore, restart_old_psw), + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw), + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_RESTART, &li->pending_irqs); return rc ? -EFAULT : 0; } @@ -748,6 +781,12 @@ static int __must_check __deliver_emergency_signal(struct kvm_vcpu *vcpu) vcpu->stat.deliver_emergency_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY, cpu_addr, 0); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_EMERGENCY_SIG; + vcpu->arch.sie_block->extcpuaddr = cpu_addr; + return 0; + } rc = put_guest_lc(vcpu, EXT_IRQ_EMERGENCY_SIG, (u16 *)__LC_EXT_INT_CODE); @@ -776,6 +815,12 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL, extcall.code, 0); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_EXTERNAL_CALL; + vcpu->arch.sie_block->extcpuaddr = extcall.code; + return 0; + } rc = put_guest_lc(vcpu, EXT_IRQ_EXTERNAL_CALL, (u16 *)__LC_EXT_INT_CODE); @@ -787,6 +832,21 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu) return rc ? 
-EFAULT : 0; } +static int __deliver_prog_pv(struct kvm_vcpu *vcpu, u16 code) +{ + switch (code) { + case PGM_SPECIFICATION: + vcpu->arch.sie_block->iictl = IICTL_CODE_SPECIFICATION; + break; + case PGM_OPERAND: + vcpu->arch.sie_block->iictl = IICTL_CODE_OPERAND; + break; + default: + return -EINVAL; + } + return 0; +} + static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; @@ -807,6 +867,10 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, pgm_info.code, 0); + /* PER is handled by the ultravisor */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return __deliver_prog_pv(vcpu, pgm_info.code & ~PGM_PER); + switch (pgm_info.code & ~PGM_PER) { case PGM_AFX_TRANSLATION: case PGM_ASX_TRANSLATION: @@ -818,7 +882,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) case PGM_PRIMARY_AUTHORITY: case PGM_SECONDARY_AUTHORITY: nullifying = true; - /* fall through */ + fallthrough; case PGM_SPACE_SWITCH: rc = put_guest_lc(vcpu, pgm_info.trans_exc_code, (u64 *)__LC_TRANS_EXC_CODE); @@ -892,7 +956,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) /* bit 1+2 of the target are the ilc, so we can directly use ilen */ rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC); rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea, - (u64 *) __LC_LAST_BREAK); + (u64 *) __LC_PGM_LAST_BREAK); rc |= put_guest_lc(vcpu, pgm_info.code, (u16 *)__LC_PGM_INT_CODE); rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW, @@ -902,20 +966,49 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) return rc ? -EFAULT : 0; } +#define SCCB_MASK 0xFFFFFFF8 +#define SCCB_EVENT_PENDING 0x3 + +static int write_sclp(struct kvm_vcpu *vcpu, u32 parm) +{ + int rc; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_SERVICE_SIG; + vcpu->arch.sie_block->eiparams = parm; + return 0; + } + + rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= put_guest_lc(vcpu, parm, + (u32 *)__LC_EXT_PARAMS); + + return rc ? 
-EFAULT : 0; +} + static int __must_check __deliver_service(struct kvm_vcpu *vcpu) { struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_ext_info ext; - int rc = 0; spin_lock(&fi->lock); - if (!(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) { + if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs) || + !(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) { spin_unlock(&fi->lock); return 0; } ext = fi->srv_signal; memset(&fi->srv_signal, 0, sizeof(ext)); clear_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); + clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + if (kvm_s390_pv_cpu_is_protected(vcpu)) + set_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs); spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "deliver: sclp parameter 0x%x", @@ -924,16 +1017,31 @@ static int __must_check __deliver_service(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, ext.ext_params, 0); - rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= put_guest_lc(vcpu, ext.ext_params, - (u32 *)__LC_EXT_PARAMS); + return write_sclp(vcpu, ext.ext_params); +} - return rc ? -EFAULT : 0; +static int __must_check __deliver_service_ev(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_ext_info ext; + + spin_lock(&fi->lock); + if (!(test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs))) { + spin_unlock(&fi->lock); + return 0; + } + ext = fi->srv_signal; + /* only clear the event bit */ + fi->srv_signal.ext_params &= ~SCCB_EVENT_PENDING; + clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + spin_unlock(&fi->lock); + + VCPU_EVENT(vcpu, 4, "%s", "deliver: sclp parameter event"); + vcpu->stat.deliver_service_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, + ext.ext_params, 0); + + return write_sclp(vcpu, SCCB_EVENT_PENDING); } static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu) @@ -1028,6 +1136,15 @@ static int __do_deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_io_info *io) { int rc; + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_IO; + vcpu->arch.sie_block->subchannel_id = io->subchannel_id; + vcpu->arch.sie_block->subchannel_nr = io->subchannel_nr; + vcpu->arch.sie_block->io_int_parm = io->io_int_parm; + vcpu->arch.sie_block->io_int_word = io->io_int_word; + return 0; + } + rc = put_guest_lc(vcpu, io->subchannel_id, (u16 *)__LC_SUBCHANNEL_ID); rc |= put_guest_lc(vcpu, io->subchannel_nr, (u16 *)__LC_SUBCHANNEL_NR); rc |= put_guest_lc(vcpu, io->io_int_parm, (u32 *)__LC_IO_INT_PARM); @@ -1166,7 +1283,7 @@ static u64 __calculate_sltime(struct kvm_vcpu *vcpu) /* already expired? 
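
Together with the masked_irqs filtering added to pending_irqs_no_gisa() earlier in this patch, __deliver_service() above implements a small mask layer for protected guests: a delivered SCCB interrupt stays masked until the notification intercept (handle_pv_sclp() in intercept.c) re-arms it. The bookkeeping reduces to the following runnable sketch:

#include <stdio.h>

#define EX_IRQ_SERVICE (1UL << 0)

struct ex_float_int {
	unsigned long pending_irqs;
	unsigned long masked_irqs;
};

/* deliverable = pending & ~masked, mirroring pending_irqs_no_gisa() */
static unsigned long ex_deliverable(const struct ex_float_int *fi)
{
	return fi->pending_irqs & ~fi->masked_irqs;
}

int main(void)
{
	struct ex_float_int fi = { .pending_irqs = EX_IRQ_SERVICE };

	/* deliver once, then mask until the ultravisor notifies completion */
	fi.pending_irqs &= ~EX_IRQ_SERVICE;
	fi.masked_irqs |= EX_IRQ_SERVICE;

	/* a new injection while masked stays invisible to the delivery loop */
	fi.pending_irqs |= EX_IRQ_SERVICE;
	printf("%lu\n", ex_deliverable(&fi)); /* 0 */

	/* handle_pv_sclp() equivalent: unmask, keep it pending */
	fi.masked_irqs &= ~EX_IRQ_SERVICE;
	printf("%lu\n", ex_deliverable(&fi)); /* 1 */
	return 0;
}
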
*/ if (cputm >> 63) return 0; - return min(sltime, tod_to_ns(cputm)); + return min_t(u64, sltime, tod_to_ns(cputm)); } } else if (cpu_timer_interrupts_enabled(vcpu)) { sltime = kvm_s390_get_cpu_timer(vcpu); @@ -1213,10 +1330,11 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) hrtimer_start(&vcpu->arch.ckc_timer, sltime, HRTIMER_MODE_REL); VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime); no_timer: - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - kvm_vcpu_block(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); + kvm_vcpu_halt(vcpu); + vcpu->valid_wakeup = false; __unset_cpu_idle(vcpu); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); hrtimer_cancel(&vcpu->arch.ckc_timer); return 0; @@ -1269,6 +1387,7 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; int rc = 0; + bool delivered = false; unsigned long irq_type; unsigned long irqs; @@ -1329,6 +1448,9 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) case IRQ_PEND_EXT_SERVICE: rc = __deliver_service(vcpu); break; + case IRQ_PEND_EXT_SERVICE_EV: + rc = __deliver_service_ev(vcpu); + break; case IRQ_PEND_PFAULT_DONE: rc = __deliver_pfault_done(vcpu); break; @@ -1339,6 +1461,19 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) WARN_ONCE(1, "Unknown pending irq type %ld", irq_type); clear_bit(irq_type, &li->pending_irqs); } + delivered |= !rc; + } + + /* + * We delivered at least one interrupt and modified the PC. Force a + * singlestep event now. + */ + if (delivered && guestdbg_sstep_enabled(vcpu)) { + struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch; + + debug_exit->addr = vcpu->arch.sie_block->gpsw.addr; + debug_exit->type = KVM_SINGLESTEP; + vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; } set_intercept_indicators(vcpu); @@ -1421,7 +1556,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL) return -EINVAL; - if (sclp.has_sigpif) + if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu)) return sca_inject_ext_call(vcpu, src_id); if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) @@ -1668,7 +1803,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, goto out; } gisa_out: - tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL); + tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); if (tmp_inti) { tmp_inti->type = KVM_S390_INT_IO(1, 0, 0, 0); tmp_inti->io.io_int_word = isc_to_int_word(isc); @@ -1681,9 +1816,6 @@ out: return inti; } -#define SCCB_MASK 0xFFFFFFF8 -#define SCCB_EVENT_PENDING 0x3 - static int __inject_service(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) { @@ -1692,6 +1824,11 @@ static int __inject_service(struct kvm *kvm, kvm->stat.inject_service_signal++; spin_lock(&fi->lock); fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_EVENT_PENDING; + + /* We always allow events, track them separately from the sccb ints */ + if (fi->srv_signal.ext_params & SCCB_EVENT_PENDING) + set_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + /* * Early versions of the QEMU s390 bios will inject several * service interrupts after another without handling a @@ -1773,6 +1910,12 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) kvm->stat.inject_io++; isc = int_word_to_isc(inti->io.io_int_word); + /* + * We do not use the lock checking variant as this is just a + * performance optimization and we do 
not hold the lock here. + * This is ok as the code will pick interrupts from both "lists" + * for delivery. + */ if (gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) { VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc); gisa_set_ipm_gisc(gi->origin, isc); @@ -1834,7 +1977,8 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: if (!(type & KVM_S390_INT_IO_AI_MASK && - kvm->arch.gisa_int.origin)) + kvm->arch.gisa_int.origin) || + kvm_s390_pv_cpu_get_handle(dst_vcpu)) kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT); break; default: @@ -1881,7 +2025,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti; int rc; - inti = kzalloc(sizeof(*inti), GFP_KERNEL); + inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); if (!inti) return -ENOMEM; @@ -1981,6 +2125,13 @@ int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu) return test_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs); } +int kvm_s390_is_restart_irq_pending(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + return test_bit(IRQ_PEND_RESTART, &li->pending_irqs); +} + void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; @@ -2080,6 +2231,10 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm) struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; int i; + mutex_lock(&kvm->lock); + if (!kvm_s390_pv_is_protected(kvm)) + fi->masked_irqs = 0; + mutex_unlock(&kvm->lock); spin_lock(&fi->lock); fi->pending_irqs = 0; memset(&fi->srv_signal, 0, sizeof(fi->srv_signal)); @@ -2146,7 +2301,8 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) n++; } } - if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs)) { + if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs) || + test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs)) { if (n == max_irqs) { /* signal userspace to try again */ ret = -ENOMEM; @@ -2190,7 +2346,7 @@ static int flic_ais_mode_get_all(struct kvm *kvm, struct kvm_device_attr *attr) return -EINVAL; if (!test_kvm_facility(kvm, 72)) - return -ENOTSUPP; + return -EOPNOTSUPP; mutex_lock(&fi->ais_lock); ais.simm = fi->simm; @@ -2275,7 +2431,7 @@ static int enqueue_floating_irq(struct kvm_device *dev, return -EINVAL; while (len >= sizeof(struct kvm_s390_irq)) { - inti = kzalloc(sizeof(*inti), GFP_KERNEL); + inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); if (!inti) return -ENOMEM; @@ -2323,13 +2479,10 @@ static int register_io_adapter(struct kvm_device *dev, if (dev->kvm->arch.adapters[adapter_info.id] != NULL) return -EINVAL; - adapter = kzalloc(sizeof(*adapter), GFP_KERNEL); + adapter = kzalloc(sizeof(*adapter), GFP_KERNEL_ACCOUNT); if (!adapter) return -ENOMEM; - INIT_LIST_HEAD(&adapter->maps); - init_rwsem(&adapter->maps_lock); - atomic_set(&adapter->nr_maps, 0); adapter->id = adapter_info.id; adapter->isc = adapter_info.isc; adapter->maskable = adapter_info.maskable; @@ -2354,87 +2507,12 @@ int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked) return ret; } -static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr) -{ - struct s390_io_adapter *adapter = get_io_adapter(kvm, id); - struct s390_map_info *map; - int ret; - - if (!adapter || !addr) - return -EINVAL; - - map = kzalloc(sizeof(*map), GFP_KERNEL); - if (!map) { - ret = -ENOMEM; - goto out; - } - INIT_LIST_HEAD(&map->list); - map->guest_addr = addr; - map->addr = gmap_translate(kvm->arch.gmap, addr); - if (map->addr 
== -EFAULT) { - ret = -EFAULT; - goto out; - } - ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page); - if (ret < 0) - goto out; - BUG_ON(ret != 1); - down_write(&adapter->maps_lock); - if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) { - list_add_tail(&map->list, &adapter->maps); - ret = 0; - } else { - put_page(map->page); - ret = -EINVAL; - } - up_write(&adapter->maps_lock); -out: - if (ret) - kfree(map); - return ret; -} - -static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr) -{ - struct s390_io_adapter *adapter = get_io_adapter(kvm, id); - struct s390_map_info *map, *tmp; - int found = 0; - - if (!adapter || !addr) - return -EINVAL; - - down_write(&adapter->maps_lock); - list_for_each_entry_safe(map, tmp, &adapter->maps, list) { - if (map->guest_addr == addr) { - found = 1; - atomic_dec(&adapter->nr_maps); - list_del(&map->list); - put_page(map->page); - kfree(map); - break; - } - } - up_write(&adapter->maps_lock); - - return found ? 0 : -EINVAL; -} - void kvm_s390_destroy_adapters(struct kvm *kvm) { int i; - struct s390_map_info *map, *tmp; - for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) { - if (!kvm->arch.adapters[i]) - continue; - list_for_each_entry_safe(map, tmp, - &kvm->arch.adapters[i]->maps, list) { - list_del(&map->list); - put_page(map->page); - kfree(map); - } + for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) kfree(kvm->arch.adapters[i]); - } } static int modify_io_adapter(struct kvm_device *dev, @@ -2456,11 +2534,14 @@ static int modify_io_adapter(struct kvm_device *dev, if (ret > 0) ret = 0; break; + /* + * The following operations are no longer needed and therefore no-ops. + * The gpa to hva translation is done when an IRQ route is set up. The + * set_irq code uses get_user_pages_remote() to do the actual write. + */ case KVM_S390_IO_ADAPTER_MAP: - ret = kvm_s390_adapter_map(dev->kvm, req.id, req.addr); - break; case KVM_S390_IO_ADAPTER_UNMAP: - ret = kvm_s390_adapter_unmap(dev->kvm, req.id, req.addr); + ret = 0; break; default: ret = -EINVAL; @@ -2499,7 +2580,7 @@ static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr) int ret = 0; if (!test_kvm_facility(kvm, 72)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req))) return -EFAULT; @@ -2579,7 +2660,7 @@ static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr) struct kvm_s390_ais_all ais; if (!test_kvm_facility(kvm, 72)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (copy_from_user(&ais, (void __user *)attr->addr, sizeof(ais))) return -EFAULT; @@ -2595,7 +2676,7 @@ static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr) static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = 0; - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; switch (attr->group) { @@ -2699,19 +2780,15 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap) return swap ? 
(bit ^ (BITS_PER_LONG - 1)) : bit; } -static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter, - u64 addr) +static struct page *get_map_page(struct kvm *kvm, u64 uaddr) { - struct s390_map_info *map; + struct page *page = NULL; - if (!adapter) - return NULL; - - list_for_each_entry(map, &adapter->maps, list) { - if (map->guest_addr == addr) - return map; - } - return NULL; + mmap_read_lock(kvm->mm); + get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, + &page, NULL); + mmap_read_unlock(kvm->mm); + return page; } static int adapter_indicators_set(struct kvm *kvm, @@ -2720,30 +2797,35 @@ static int adapter_indicators_set(struct kvm *kvm, { unsigned long bit; int summary_set, idx; - struct s390_map_info *info; + struct page *ind_page, *summary_page; void *map; - info = get_map_info(adapter, adapter_int->ind_addr); - if (!info) + ind_page = get_map_page(kvm, adapter_int->ind_addr); + if (!ind_page) return -1; - map = page_address(info->page); - bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap); - set_bit(bit, map); - idx = srcu_read_lock(&kvm->srcu); - mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT); - set_page_dirty_lock(info->page); - info = get_map_info(adapter, adapter_int->summary_addr); - if (!info) { - srcu_read_unlock(&kvm->srcu, idx); + summary_page = get_map_page(kvm, adapter_int->summary_addr); + if (!summary_page) { + put_page(ind_page); return -1; } - map = page_address(info->page); - bit = get_ind_bit(info->addr, adapter_int->summary_offset, - adapter->swap); + + idx = srcu_read_lock(&kvm->srcu); + map = page_address(ind_page); + bit = get_ind_bit(adapter_int->ind_addr, + adapter_int->ind_offset, adapter->swap); + set_bit(bit, map); + mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT); + set_page_dirty_lock(ind_page); + map = page_address(summary_page); + bit = get_ind_bit(adapter_int->summary_addr, + adapter_int->summary_offset, adapter->swap); summary_set = test_and_set_bit(bit, map); - mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT); - set_page_dirty_lock(info->page); + mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT); + set_page_dirty_lock(summary_page); srcu_read_unlock(&kvm->srcu, idx); + + put_page(ind_page); + put_page(summary_page); return summary_set ? 
0 : 1; } @@ -2765,9 +2847,7 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, adapter = get_io_adapter(kvm, e->adapter.adapter_id); if (!adapter) return -1; - down_read(&adapter->maps_lock); ret = adapter_indicators_set(kvm, adapter, &e->adapter); - up_read(&adapter->maps_lock); if ((ret > 0) && !adapter->masked) { ret = kvm_s390_inject_airq(kvm, adapter); if (ret == 0) @@ -2818,23 +2898,27 @@ int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - int ret; + u64 uaddr; switch (ue->type) { + /* we store the userspace addresses instead of the guest addresses */ case KVM_IRQ_ROUTING_S390_ADAPTER: e->set = set_adapter_int; - e->adapter.summary_addr = ue->u.adapter.summary_addr; - e->adapter.ind_addr = ue->u.adapter.ind_addr; + uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr); + if (uaddr == -EFAULT) + return -EFAULT; + e->adapter.summary_addr = uaddr; + uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr); + if (uaddr == -EFAULT) + return -EFAULT; + e->adapter.ind_addr = uaddr; e->adapter.summary_offset = ue->u.adapter.summary_offset; e->adapter.ind_offset = ue->u.adapter.ind_offset; e->adapter.adapter_id = ue->u.adapter.adapter_id; - ret = 0; - break; + return 0; default: - ret = -EINVAL; + return -EINVAL; } - - return ret; } int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, @@ -2983,18 +3067,19 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) { - int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus); + int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus); struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_vcpu *vcpu; + u8 vcpu_isc_mask; - for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { - vcpu = kvm_get_vcpu(kvm, vcpu_id); + for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) { + vcpu = kvm_get_vcpu(kvm, vcpu_idx); if (psw_ioint_disabled(vcpu)) continue; - deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); - if (deliverable_mask) { + vcpu_isc_mask = (u8)(vcpu->arch.sie_block->gcr[6] >> 24); + if (deliverable_mask & vcpu_isc_mask) { /* lately kicked but not yet running */ - if (test_and_set_bit(vcpu_id, gi->kicked_mask)) + if (test_and_set_bit(vcpu_idx, gi->kicked_mask)) return; kvm_s390_vcpu_wakeup(vcpu); return; @@ -3015,7 +3100,7 @@ static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer) __airqs_kick_single_vcpu(kvm, pending_mask); hrtimer_forward_now(timer, ns_to_ktime(gi->expires)); return HRTIMER_RESTART; - }; + } return HRTIMER_NORESTART; } @@ -3027,9 +3112,9 @@ static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer) static void process_gib_alert_list(void) { struct kvm_s390_gisa_interrupt *gi; + u32 final, gisa_phys, origin = 0UL; struct kvm_s390_gisa *gisa; struct kvm *kvm; - u32 final, origin = 0UL; do { /* @@ -3055,9 +3140,10 @@ static void process_gib_alert_list(void) * interruptions asap. 
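
The alert-list handling described above relies on a convention worth spelling out: a GISA whose next_alert link points at itself is not on the list; the walk advances past each node and then self-links it again so it can be alerted anew. A pointer-based model of that invariant (the real structure links 32-bit physical addresses, and NULL here models the list terminator):

#include <stdio.h>

struct ex_gisa {
	struct ex_gisa *next_alert;  /* self-pointer means "not alerted" */
	int id;
};

static void ex_process_alert_list(struct ex_gisa *origin)
{
	while (origin) {
		struct ex_gisa *gisa = origin;

		origin = gisa->next_alert;  /* advance before unlinking */
		gisa->next_alert = gisa;    /* self-link: off the list again */
		printf("kick vcpus of gisa %d\n", gisa->id);
	}
}

int main(void)
{
	struct ex_gisa a = { .id = 1 }, b = { .id = 2 };

	a.next_alert = &b;   /* a -> b -> end */
	b.next_alert = NULL;
	ex_process_alert_list(&a);
	printf("%d %d\n", a.next_alert == &a, b.next_alert == &b); /* 1 1 */
	return 0;
}
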
*/ while (origin & GISA_ADDR_MASK) { - gisa = (struct kvm_s390_gisa *)(u64)origin; + gisa_phys = origin; + gisa = phys_to_virt(gisa_phys); origin = gisa->next_alert; - gisa->next_alert = (u32)(u64)gisa; + gisa->next_alert = gisa_phys; kvm = container_of(gisa, struct sie_page2, gisa)->kvm; gi = &kvm->arch.gisa_int; if (hrtimer_active(&gi->timer)) @@ -3091,23 +3177,67 @@ void kvm_s390_gisa_init(struct kvm *kvm) hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); gi->timer.function = gisa_vcpu_kicker; memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); - gi->origin->next_alert = (u32)(u64)gi->origin; + gi->origin->next_alert = (u32)virt_to_phys(gi->origin); VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); } +void kvm_s390_gisa_enable(struct kvm *kvm) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_vcpu *vcpu; + unsigned long i; + u32 gisa_desc; + + if (gi->origin) + return; + kvm_s390_gisa_init(kvm); + gisa_desc = kvm_s390_get_gisa_desc(kvm); + if (!gisa_desc) + return; + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + vcpu->arch.sie_block->gd = gisa_desc; + vcpu->arch.sie_block->eca |= ECA_AIV; + VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", + vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id); + mutex_unlock(&vcpu->mutex); + } +} + void kvm_s390_gisa_destroy(struct kvm *kvm) { struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_s390_gisa *gisa = gi->origin; if (!gi->origin) return; - if (gi->alert.mask) - KVM_EVENT(3, "vm 0x%pK has unexpected iam 0x%02x", - kvm, gi->alert.mask); - while (gisa_in_alert_list(gi->origin)) - cpu_relax(); + WARN(gi->alert.mask != 0x00, + "unexpected non zero alert.mask 0x%02x", + gi->alert.mask); + gi->alert.mask = 0x00; + if (gisa_set_iam(gi->origin, gi->alert.mask)) + process_gib_alert_list(); hrtimer_cancel(&gi->timer); gi->origin = NULL; + VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa); +} + +void kvm_s390_gisa_disable(struct kvm *kvm) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_vcpu *vcpu; + unsigned long i; + + if (!gi->origin) + return; + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + vcpu->arch.sie_block->eca &= ~ECA_AIV; + vcpu->arch.sie_block->gd = 0U; + mutex_unlock(&vcpu->mutex); + VCPU_EVENT(vcpu, 3, "AIV disabled for cpu %03u", vcpu->vcpu_id); + } + kvm_s390_gisa_destroy(kvm); } /** @@ -3193,29 +3323,111 @@ out: } EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); -static void gib_alert_irq_handler(struct airq_struct *airq, bool floating) +static void aen_host_forward(unsigned long si) +{ + struct kvm_s390_gisa_interrupt *gi; + struct zpci_gaite *gaite; + struct kvm *kvm; + + gaite = (struct zpci_gaite *)aift->gait + + (si * sizeof(struct zpci_gaite)); + if (gaite->count == 0) + return; + if (gaite->aisb != 0) + set_bit_inv(gaite->aisbo, phys_to_virt(gaite->aisb)); + + kvm = kvm_s390_pci_si_to_kvm(aift, si); + if (!kvm) + return; + gi = &kvm->arch.gisa_int; + + if (!(gi->origin->g1.simm & AIS_MODE_MASK(gaite->gisc)) || + !(gi->origin->g1.nimm & AIS_MODE_MASK(gaite->gisc))) { + gisa_set_ipm_gisc(gi->origin, gaite->gisc); + if (hrtimer_active(&gi->timer)) + hrtimer_cancel(&gi->timer); + hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL); + kvm->stat.aen_forward++; + } +} + +static void aen_process_gait(u8 isc) { + bool found = false, first = true; + union zpci_sic_iib iib = {{0}}; + unsigned long si, flags; + + spin_lock_irqsave(&aift->gait_lock, flags); + + if (!aift->gait) { + spin_unlock_irqrestore(&aift->gait_lock, 
flags); + return; + } + + for (si = 0;;) { + /* Scan adapter summary indicator bit vector */ + si = airq_iv_scan(aift->sbv, si, airq_iv_end(aift->sbv)); + if (si == -1UL) { + if (first || found) { + /* Re-enable interrupts. */ + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, isc, + &iib); + first = found = false; + } else { + /* Interrupts on and all bits processed */ + break; + } + found = false; + si = 0; + /* Scan again after re-enabling interrupts */ + continue; + } + found = true; + aen_host_forward(si); + } + + spin_unlock_irqrestore(&aift->gait_lock, flags); +} + +static void gib_alert_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) +{ + struct tpi_adapter_info *info = (struct tpi_adapter_info *)tpi_info; + inc_irq_stat(IRQIO_GAL); - process_gib_alert_list(); + + if ((info->forward || info->error) && + IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + aen_process_gait(info->isc); + if (info->aism != 0) + process_gib_alert_list(); + } else { + process_gib_alert_list(); + } } static struct airq_struct gib_alert_irq = { .handler = gib_alert_irq_handler, - .lsi_ptr = &gib_alert_irq.lsi_mask, }; void kvm_s390_gib_destroy(void) { if (!gib) return; + if (kvm_s390_pci_interp_allowed() && aift) { + mutex_lock(&aift->aift_lock); + kvm_s390_pci_aen_exit(); + mutex_unlock(&aift->aift_lock); + } chsc_sgib(0); unregister_adapter_interrupt(&gib_alert_irq); free_page((unsigned long)gib); gib = NULL; } -int kvm_s390_gib_init(u8 nisc) +int __init kvm_s390_gib_init(u8 nisc) { + u32 gib_origin; int rc = 0; if (!css_general_characteristics.aiv) { @@ -3223,7 +3435,7 @@ int kvm_s390_gib_init(u8 nisc) goto out; } - gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL | GFP_DMA); + gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL_ACCOUNT | GFP_DMA); if (!gib) { rc = -ENOMEM; goto out; @@ -3235,9 +3447,12 @@ int kvm_s390_gib_init(u8 nisc) rc = -EIO; goto out_free_gib; } + /* adapter interrupts used for AP (applicable here) don't use the LSI */ + *gib_alert_irq.lsi_ptr = 0xff; gib->nisc = nisc; - if (chsc_sgib((u32)(u64)gib)) { + gib_origin = virt_to_phys(gib); + if (chsc_sgib(gib_origin)) { pr_err("Associating the GIB with the AIV facility failed\n"); free_page((unsigned long)gib); gib = NULL; @@ -3245,6 +3460,14 @@ int kvm_s390_gib_init(u8 nisc) goto out_unreg_gal; } + if (kvm_s390_pci_interp_allowed()) { + if (kvm_s390_pci_aen_init(nisc)) { + pr_err("Initializing AEN for PCI failed\n"); + rc = -EIO; + goto out_unreg_gal; + } + } + KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); goto out; diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h deleted file mode 100644 index 484608c71dd0..000000000000 --- a/arch/s390/kvm/irq.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * s390 irqchip routines - * - * Copyright IBM Corp. 2014 - * - * Author(s): Cornelia Huck <cornelia.huck@de.ibm.com> - */ -#ifndef __KVM_IRQ_H -#define __KVM_IRQ_H - -#include <linux/kvm_host.h> - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return 1; -} - -#endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d9e6bf3d54f0..ea63ac769889 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2,11 +2,10 @@ /* * hosting IBM Z kernel virtual machines (s390x) * - * Copyright IBM Corp. 2008, 2018 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> - * Heiko Carstens <heiko.carstens@de.ibm.com> * Christian Ehrhardt <ehrhardt@de.ibm.com> * Jason J. 
Herne <jjherne@us.ibm.com> */ @@ -31,11 +30,12 @@ #include <linux/bitmap.h> #include <linux/sched/signal.h> #include <linux/string.h> +#include <linux/pgtable.h> +#include <linux/mmu_notifier.h> #include <asm/asm-offsets.h> #include <asm/lowcore.h> #include <asm/stp.h> -#include <asm/pgtable.h> #include <asm/gmap.h> #include <asm/nmi.h> #include <asm/switch_to.h> @@ -44,8 +44,11 @@ #include <asm/cpacf.h> #include <asm/timex.h> #include <asm/ap.h> +#include <asm/uv.h> +#include <asm/fpu/api.h> #include "kvm-s390.h" #include "gaccess.h" +#include "pci.h" #define CREATE_TRACE_POINTS #include "trace.h" @@ -56,118 +59,137 @@ #define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \ (KVM_MAX_VCPUS + LOCAL_IRQS)) -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM - -struct kvm_stats_debugfs_item debugfs_entries[] = { - { "userspace_handled", VCPU_STAT(exit_userspace) }, - { "exit_null", VCPU_STAT(exit_null) }, - { "exit_validity", VCPU_STAT(exit_validity) }, - { "exit_stop_request", VCPU_STAT(exit_stop_request) }, - { "exit_external_request", VCPU_STAT(exit_external_request) }, - { "exit_io_request", VCPU_STAT(exit_io_request) }, - { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) }, - { "exit_instruction", VCPU_STAT(exit_instruction) }, - { "exit_pei", VCPU_STAT(exit_pei) }, - { "exit_program_interruption", VCPU_STAT(exit_program_interruption) }, - { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, - { "exit_operation_exception", VCPU_STAT(exit_operation_exception) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_no_poll_steal", VCPU_STAT(halt_no_poll_steal) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, - { "instruction_lctl", VCPU_STAT(instruction_lctl) }, - { "instruction_stctl", VCPU_STAT(instruction_stctl) }, - { "instruction_stctg", VCPU_STAT(instruction_stctg) }, - { "deliver_ckc", VCPU_STAT(deliver_ckc) }, - { "deliver_cputm", VCPU_STAT(deliver_cputm) }, - { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) }, - { "deliver_external_call", VCPU_STAT(deliver_external_call) }, - { "deliver_service_signal", VCPU_STAT(deliver_service_signal) }, - { "deliver_virtio", VCPU_STAT(deliver_virtio) }, - { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) }, - { "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) }, - { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) }, - { "deliver_program", VCPU_STAT(deliver_program) }, - { "deliver_io", VCPU_STAT(deliver_io) }, - { "deliver_machine_check", VCPU_STAT(deliver_machine_check) }, - { "exit_wait_state", VCPU_STAT(exit_wait_state) }, - { "inject_ckc", VCPU_STAT(inject_ckc) }, - { "inject_cputm", VCPU_STAT(inject_cputm) }, - { "inject_external_call", VCPU_STAT(inject_external_call) }, - { "inject_float_mchk", VM_STAT(inject_float_mchk) }, - { "inject_emergency_signal", VCPU_STAT(inject_emergency_signal) }, - { "inject_io", VM_STAT(inject_io) }, - { "inject_mchk", VCPU_STAT(inject_mchk) }, - { "inject_pfault_done", VM_STAT(inject_pfault_done) }, - { "inject_program", VCPU_STAT(inject_program) }, - { "inject_restart", VCPU_STAT(inject_restart) }, - { "inject_service_signal", VM_STAT(inject_service_signal) }, - { "inject_set_prefix", VCPU_STAT(inject_set_prefix) }, - { "inject_stop_signal", 
VCPU_STAT(inject_stop_signal) }, - { "inject_pfault_init", VCPU_STAT(inject_pfault_init) }, - { "inject_virtio", VM_STAT(inject_virtio) }, - { "instruction_epsw", VCPU_STAT(instruction_epsw) }, - { "instruction_gs", VCPU_STAT(instruction_gs) }, - { "instruction_io_other", VCPU_STAT(instruction_io_other) }, - { "instruction_lpsw", VCPU_STAT(instruction_lpsw) }, - { "instruction_lpswe", VCPU_STAT(instruction_lpswe) }, - { "instruction_pfmf", VCPU_STAT(instruction_pfmf) }, - { "instruction_ptff", VCPU_STAT(instruction_ptff) }, - { "instruction_stidp", VCPU_STAT(instruction_stidp) }, - { "instruction_sck", VCPU_STAT(instruction_sck) }, - { "instruction_sckpf", VCPU_STAT(instruction_sckpf) }, - { "instruction_spx", VCPU_STAT(instruction_spx) }, - { "instruction_stpx", VCPU_STAT(instruction_stpx) }, - { "instruction_stap", VCPU_STAT(instruction_stap) }, - { "instruction_iske", VCPU_STAT(instruction_iske) }, - { "instruction_ri", VCPU_STAT(instruction_ri) }, - { "instruction_rrbe", VCPU_STAT(instruction_rrbe) }, - { "instruction_sske", VCPU_STAT(instruction_sske) }, - { "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) }, - { "instruction_essa", VCPU_STAT(instruction_essa) }, - { "instruction_stsi", VCPU_STAT(instruction_stsi) }, - { "instruction_stfl", VCPU_STAT(instruction_stfl) }, - { "instruction_tb", VCPU_STAT(instruction_tb) }, - { "instruction_tpi", VCPU_STAT(instruction_tpi) }, - { "instruction_tprot", VCPU_STAT(instruction_tprot) }, - { "instruction_tsch", VCPU_STAT(instruction_tsch) }, - { "instruction_sthyi", VCPU_STAT(instruction_sthyi) }, - { "instruction_sie", VCPU_STAT(instruction_sie) }, - { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, - { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) }, - { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) }, - { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) }, - { "instruction_sigp_cond_emergency", VCPU_STAT(instruction_sigp_cond_emergency) }, - { "instruction_sigp_start", VCPU_STAT(instruction_sigp_start) }, - { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) }, - { "instruction_sigp_stop_store_status", VCPU_STAT(instruction_sigp_stop_store_status) }, - { "instruction_sigp_store_status", VCPU_STAT(instruction_sigp_store_status) }, - { "instruction_sigp_store_adtl_status", VCPU_STAT(instruction_sigp_store_adtl_status) }, - { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) }, - { "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) }, - { "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) }, - { "instruction_sigp_cpu_reset", VCPU_STAT(instruction_sigp_cpu_reset) }, - { "instruction_sigp_init_cpu_reset", VCPU_STAT(instruction_sigp_init_cpu_reset) }, - { "instruction_sigp_unknown", VCPU_STAT(instruction_sigp_unknown) }, - { "instruction_diag_10", VCPU_STAT(diagnose_10) }, - { "instruction_diag_44", VCPU_STAT(diagnose_44) }, - { "instruction_diag_9c", VCPU_STAT(diagnose_9c) }, - { "diag_9c_ignored", VCPU_STAT(diagnose_9c_ignored) }, - { "instruction_diag_258", VCPU_STAT(diagnose_258) }, - { "instruction_diag_308", VCPU_STAT(diagnose_308) }, - { "instruction_diag_500", VCPU_STAT(diagnose_500) }, - { "instruction_diag_other", VCPU_STAT(diagnose_other) }, - { NULL } +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS(), + STATS_DESC_COUNTER(VM, inject_io), + STATS_DESC_COUNTER(VM, inject_float_mchk), + STATS_DESC_COUNTER(VM, inject_pfault_done), + 
STATS_DESC_COUNTER(VM, inject_service_signal), + STATS_DESC_COUNTER(VM, inject_virtio), + STATS_DESC_COUNTER(VM, aen_forward), + STATS_DESC_COUNTER(VM, gmap_shadow_reuse), + STATS_DESC_COUNTER(VM, gmap_shadow_create), + STATS_DESC_COUNTER(VM, gmap_shadow_r1_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_r2_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_r3_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_sg_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_pg_entry), }; -struct kvm_s390_tod_clock_ext { - __u8 epoch_idx; - __u64 tod; - __u8 reserved[7]; -} __packed; +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, exit_userspace), + STATS_DESC_COUNTER(VCPU, exit_null), + STATS_DESC_COUNTER(VCPU, exit_external_request), + STATS_DESC_COUNTER(VCPU, exit_io_request), + STATS_DESC_COUNTER(VCPU, exit_external_interrupt), + STATS_DESC_COUNTER(VCPU, exit_stop_request), + STATS_DESC_COUNTER(VCPU, exit_validity), + STATS_DESC_COUNTER(VCPU, exit_instruction), + STATS_DESC_COUNTER(VCPU, exit_pei), + STATS_DESC_COUNTER(VCPU, halt_no_poll_steal), + STATS_DESC_COUNTER(VCPU, instruction_lctl), + STATS_DESC_COUNTER(VCPU, instruction_lctlg), + STATS_DESC_COUNTER(VCPU, instruction_stctl), + STATS_DESC_COUNTER(VCPU, instruction_stctg), + STATS_DESC_COUNTER(VCPU, exit_program_interruption), + STATS_DESC_COUNTER(VCPU, exit_instr_and_program), + STATS_DESC_COUNTER(VCPU, exit_operation_exception), + STATS_DESC_COUNTER(VCPU, deliver_ckc), + STATS_DESC_COUNTER(VCPU, deliver_cputm), + STATS_DESC_COUNTER(VCPU, deliver_external_call), + STATS_DESC_COUNTER(VCPU, deliver_emergency_signal), + STATS_DESC_COUNTER(VCPU, deliver_service_signal), + STATS_DESC_COUNTER(VCPU, deliver_virtio), + STATS_DESC_COUNTER(VCPU, deliver_stop_signal), + STATS_DESC_COUNTER(VCPU, deliver_prefix_signal), + STATS_DESC_COUNTER(VCPU, deliver_restart_signal), + STATS_DESC_COUNTER(VCPU, deliver_program), + STATS_DESC_COUNTER(VCPU, deliver_io), + STATS_DESC_COUNTER(VCPU, deliver_machine_check), + STATS_DESC_COUNTER(VCPU, exit_wait_state), + STATS_DESC_COUNTER(VCPU, inject_ckc), + STATS_DESC_COUNTER(VCPU, inject_cputm), + STATS_DESC_COUNTER(VCPU, inject_external_call), + STATS_DESC_COUNTER(VCPU, inject_emergency_signal), + STATS_DESC_COUNTER(VCPU, inject_mchk), + STATS_DESC_COUNTER(VCPU, inject_pfault_init), + STATS_DESC_COUNTER(VCPU, inject_program), + STATS_DESC_COUNTER(VCPU, inject_restart), + STATS_DESC_COUNTER(VCPU, inject_set_prefix), + STATS_DESC_COUNTER(VCPU, inject_stop_signal), + STATS_DESC_COUNTER(VCPU, instruction_epsw), + STATS_DESC_COUNTER(VCPU, instruction_gs), + STATS_DESC_COUNTER(VCPU, instruction_io_other), + STATS_DESC_COUNTER(VCPU, instruction_lpsw), + STATS_DESC_COUNTER(VCPU, instruction_lpswe), + STATS_DESC_COUNTER(VCPU, instruction_pfmf), + STATS_DESC_COUNTER(VCPU, instruction_ptff), + STATS_DESC_COUNTER(VCPU, instruction_sck), + STATS_DESC_COUNTER(VCPU, instruction_sckpf), + STATS_DESC_COUNTER(VCPU, instruction_stidp), + STATS_DESC_COUNTER(VCPU, instruction_spx), + STATS_DESC_COUNTER(VCPU, instruction_stpx), + STATS_DESC_COUNTER(VCPU, instruction_stap), + STATS_DESC_COUNTER(VCPU, instruction_iske), + 
STATS_DESC_COUNTER(VCPU, instruction_ri), + STATS_DESC_COUNTER(VCPU, instruction_rrbe), + STATS_DESC_COUNTER(VCPU, instruction_sske), + STATS_DESC_COUNTER(VCPU, instruction_ipte_interlock), + STATS_DESC_COUNTER(VCPU, instruction_stsi), + STATS_DESC_COUNTER(VCPU, instruction_stfl), + STATS_DESC_COUNTER(VCPU, instruction_tb), + STATS_DESC_COUNTER(VCPU, instruction_tpi), + STATS_DESC_COUNTER(VCPU, instruction_tprot), + STATS_DESC_COUNTER(VCPU, instruction_tsch), + STATS_DESC_COUNTER(VCPU, instruction_sie), + STATS_DESC_COUNTER(VCPU, instruction_essa), + STATS_DESC_COUNTER(VCPU, instruction_sthyi), + STATS_DESC_COUNTER(VCPU, instruction_sigp_sense), + STATS_DESC_COUNTER(VCPU, instruction_sigp_sense_running), + STATS_DESC_COUNTER(VCPU, instruction_sigp_external_call), + STATS_DESC_COUNTER(VCPU, instruction_sigp_emergency), + STATS_DESC_COUNTER(VCPU, instruction_sigp_cond_emergency), + STATS_DESC_COUNTER(VCPU, instruction_sigp_start), + STATS_DESC_COUNTER(VCPU, instruction_sigp_stop), + STATS_DESC_COUNTER(VCPU, instruction_sigp_stop_store_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_store_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_store_adtl_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_arch), + STATS_DESC_COUNTER(VCPU, instruction_sigp_prefix), + STATS_DESC_COUNTER(VCPU, instruction_sigp_restart), + STATS_DESC_COUNTER(VCPU, instruction_sigp_init_cpu_reset), + STATS_DESC_COUNTER(VCPU, instruction_sigp_cpu_reset), + STATS_DESC_COUNTER(VCPU, instruction_sigp_unknown), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_10), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_44), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_9c), + STATS_DESC_COUNTER(VCPU, diag_9c_ignored), + STATS_DESC_COUNTER(VCPU, diag_9c_forward), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_258), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_308), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_500), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_other), + STATS_DESC_COUNTER(VCPU, pfault_sync) +}; + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), +}; /* allow nested virtualization in KVM (if enabled by user space) */ static int nested; @@ -184,6 +206,24 @@ static u8 halt_poll_max_steal = 10; module_param(halt_poll_max_steal, byte, 0644); MODULE_PARM_DESC(halt_poll_max_steal, "Maximum percentage of steal time to allow polling"); +/* if set to true, the GISA will be initialized and used if available */ +static bool use_gisa = true; +module_param(use_gisa, bool, 0644); +MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it."); + +/* maximum diag9c forwarding per second */ +unsigned int diag9c_forwarding_hz; +module_param(diag9c_forwarding_hz, uint, 0644); +MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off"); + +/* + * allow asynchronous deinit for protected guests; enable by default since + * the feature is opt-in anyway + */ +static int async_destroy = 1; +module_param(async_destroy, int, 0444); +MODULE_PARM_DESC(async_destroy, "Asynchronous destroy for protected guests"); + /* * For now we handle at most 16 double words as this is what the s390 base * kernel handles and stores in the prefix page. 
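The facility-list sizing comment here, and the facility tests throughout this file, hinge on s390's facility bitmap convention: bits are numbered from the most-significant bit of the first doubleword, which is why the kernel pairs stfle_fac_list with "inverted" bit helpers such as set_bit_inv(). A minimal standalone sketch of that numbering, with hypothetical names (the real test_facility() lives in asm/facility.h):

```c
#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: s390 facility bit N lives in doubleword N/64,
 * counted from the most-significant bit, hence the 63 - (N % 64)
 * inversion. SIZE_INTERNAL in the patch caps the stored list at 16
 * doublewords, i.e. facilities 0..1023.
 */
static int test_fac_bit(const uint64_t *fac_list, unsigned int nr,
			unsigned int doublewords)
{
	unsigned int word = nr / 64;
	unsigned int bit = 63 - (nr % 64);	/* MSB-first numbering */

	if (word >= doublewords)
		return 0;	/* beyond the stored part of the list */
	return (fac_list[word] >> bit) & 1;
}

int main(void)
{
	uint64_t fac_list[16] = { 0 };

	/* pretend the machine reported facility 129 (vector support) */
	fac_list[129 / 64] |= 1ULL << (63 - (129 % 64));
	printf("facility 129: %d\n", test_fac_bit(fac_list, 129, 16));
	return 0;
}
```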
If we ever need to go beyond @@ -207,7 +247,7 @@ static unsigned long kvm_s390_fac_size(void) BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_MASK_SIZE_U64); BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_LIST_SIZE_U64); BUILD_BUG_ON(SIZE_INTERNAL * sizeof(unsigned long) > - sizeof(S390_lowcore.stfle_fac_list)); + sizeof(stfle_fac_list)); return SIZE_INTERNAL; } @@ -220,21 +260,13 @@ static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; static struct gmap_notifier gmap_notifier; static struct gmap_notifier vsie_gmap_notifier; debug_info_t *kvm_s390_dbf; +debug_info_t *kvm_s390_dbf_uv; /* Section: not file related */ -int kvm_arch_hardware_enable(void) -{ - /* every s390 is virtualization enabled ;-) */ - return 0; -} - -int kvm_arch_check_processor_compat(void) -{ - return 0; -} - +/* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); +static int sca_switch_to_extended(struct kvm *kvm); static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) { @@ -269,7 +301,7 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val, { struct kvm *kvm; struct kvm_vcpu *vcpu; - int i; + unsigned long i; unsigned long long *delta = v; list_for_each_entry(kvm, &vm_list, vm_list) { @@ -293,25 +325,6 @@ static struct notifier_block kvm_clock_notifier = { .notifier_call = kvm_clock_sync, }; -int kvm_arch_hardware_setup(void) -{ - gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_pte_notifier(&gmap_notifier); - vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; - gmap_register_pte_notifier(&vsie_gmap_notifier); - atomic_notifier_chain_register(&s390_epoch_delta_notifier, - &kvm_clock_notifier); - return 0; -} - -void kvm_arch_hardware_unsetup(void) -{ - gmap_unregister_pte_notifier(&gmap_notifier); - gmap_unregister_pte_notifier(&vsie_gmap_notifier); - atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, - &kvm_clock_notifier); -} - static void allow_cpu_feat(unsigned long nr) { set_bit_inv(nr, kvm_s390_available_cpu_feat); @@ -319,37 +332,37 @@ static void allow_cpu_feat(unsigned long nr) static inline int plo_test_bit(unsigned char nr) { - register unsigned long r0 asm("0") = (unsigned long) nr | 0x100; + unsigned long function = (unsigned long)nr | 0x100; int cc; asm volatile( + " lgr 0,%[function]\n" /* Parameter registers are ignored for "test bit" */ " plo 0,0,0,0(0)\n" " ipm %0\n" " srl %0,28\n" : "=d" (cc) - : "d" (r0) - : "cc"); + : [function] "d" (function) + : "cc", "0"); return cc == 0; } static __always_inline void __insn32_query(unsigned int opcode, u8 *query) { - register unsigned long r0 asm("0") = 0; /* query function */ - register unsigned long r1 asm("1") = (unsigned long) query; - asm volatile( - /* Parameter regs are ignored */ + " lghi 0,0\n" + " lgr 1,%[query]\n" + /* Parameter registers are ignored */ " .insn rrf,%[opc] << 16,2,4,6,0\n" : - : "d" (r0), "a" (r1), [opc] "i" (opcode) - : "cc", "memory"); + : [query] "d" ((unsigned long)query), [opc] "i" (opcode) + : "cc", "memory", "0", "1"); } #define INSN_SORTL 0xb938 #define INSN_DFLTCC 0xb939 -static void kvm_s390_cpu_feat_init(void) +static void __init kvm_s390_cpu_feat_init(void) { int i; @@ -452,7 +465,7 @@ static void kvm_s390_cpu_feat_init(void) */ } -int kvm_arch_init(void *opaque) +static int __init __kvm_s390_init(void) { int rc = -ENOMEM; @@ -460,8 +473,13 @@ int kvm_arch_init(void *opaque) if (!kvm_s390_dbf) return -ENOMEM; - if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view)) - 
goto out; + kvm_s390_dbf_uv = debug_register("kvm-uv", 32, 1, 7 * sizeof(long)); + if (!kvm_s390_dbf_uv) + goto err_kvm_uv; + + if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view) || + debug_register_view(kvm_s390_dbf_uv, &debug_sprintf_view)) + goto err_debug_view; kvm_s390_cpu_feat_init(); @@ -469,24 +487,54 @@ int kvm_arch_init(void *opaque) rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); if (rc) { pr_err("A FLIC registration call failed with rc=%d\n", rc); - goto out; + goto err_flic; + } + + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + rc = kvm_s390_pci_init(); + if (rc) { + pr_err("Unable to allocate AIFT for PCI\n"); + goto err_pci; + } } rc = kvm_s390_gib_init(GAL_ISC); if (rc) - goto out; + goto err_gib; + + gmap_notifier.notifier_call = kvm_gmap_notifier; + gmap_register_pte_notifier(&gmap_notifier); + vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; + gmap_register_pte_notifier(&vsie_gmap_notifier); + atomic_notifier_chain_register(&s390_epoch_delta_notifier, + &kvm_clock_notifier); return 0; -out: - kvm_arch_exit(); +err_gib: + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_exit(); +err_pci: +err_flic: +err_debug_view: + debug_unregister(kvm_s390_dbf_uv); +err_kvm_uv: + debug_unregister(kvm_s390_dbf); return rc; } -void kvm_arch_exit(void) +static void __kvm_s390_exit(void) { + gmap_unregister_pte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&vsie_gmap_notifier); + atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, + &kvm_clock_notifier); + kvm_s390_gib_destroy(); + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_exit(); debug_unregister(kvm_s390_dbf); + debug_unregister(kvm_s390_dbf_uv); } /* Section: device related */ @@ -515,7 +563,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP: case KVM_CAP_S390_CSS_SUPPORT: case KVM_CAP_IOEVENTFD: - case KVM_CAP_DEVICE_CTRL: case KVM_CAP_S390_IRQCHIP: case KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: @@ -529,8 +576,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_CMMA_MIGRATION: case KVM_CAP_S390_AIS: case KVM_CAP_S390_AIS_MIGRATION: + case KVM_CAP_S390_VCPU_RESETS: + case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_S390_DIAG318: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; + case KVM_CAP_SET_GUEST_DEBUG2: + r = KVM_GUESTDBG_VALID_MASK; + break; case KVM_CAP_S390_HPAGE_1M: r = 0; if (hpage && !kvm_is_ucontrol(kvm)) @@ -539,6 +593,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_MEM_OP: r = MEM_OP_MAX_SIZE; break; + case KVM_CAP_S390_MEM_OP_EXTENSION: + /* + * Flag bits indicating which extensions are supported. + * If r > 0, the base extension must also be supported/indicated, + * in order to maintain backwards compatibility. 
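The mask assembled just below is meant to be consumed bitwise by userspace through KVM_CHECK_EXTENSION, with the base bit guaranteed to be set whenever any further extension bit is reported. A sketch of the caller side, assuming an s390 host and a linux/kvm.h recent enough to define these constants:

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int mask;

	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}
	/* return value is a bit mask, not a boolean */
	mask = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_S390_MEM_OP_EXTENSION);
	if (mask > 0 && (mask & KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG))
		printf("absolute cmpxchg memop supported\n");
	close(kvm);
	return 0;
}
```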
+ */ + r = KVM_S390_MEMOP_EXTENSION_CAP_BASE | + KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG; + break; case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPU_ID: @@ -547,12 +610,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_VCPUS; else if (sclp.has_esca && sclp.has_64bscao) r = KVM_S390_ESCA_CPU_SLOTS; + if (ext == KVM_CAP_NR_VCPUS) + r = min_t(unsigned int, num_online_cpus(), r); break; case KVM_CAP_S390_COW: r = MACHINE_HAS_ESOP; break; case KVM_CAP_S390_VECTOR_REGISTERS: - r = MACHINE_HAS_VX; + r = test_facility(129); break; case KVM_CAP_S390_RI: r = test_facility(64); @@ -563,14 +628,45 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_BPB: r = test_facility(82); break; + case KVM_CAP_S390_PROTECTED_ASYNC_DISABLE: + r = async_destroy && is_prot_virt_host(); + break; + case KVM_CAP_S390_PROTECTED: + r = is_prot_virt_host(); + break; + case KVM_CAP_S390_PROTECTED_DUMP: { + u64 pv_cmds_dump[] = { + BIT_UVC_CMD_DUMP_INIT, + BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE, + BIT_UVC_CMD_DUMP_CPU, + BIT_UVC_CMD_DUMP_COMPLETE, + }; + int i; + + r = is_prot_virt_host(); + + for (i = 0; i < ARRAY_SIZE(pv_cmds_dump); i++) { + if (!test_bit_inv(pv_cmds_dump[i], + (unsigned long *)&uv_info.inst_calls_list)) { + r = 0; + break; + } + } + break; + } + case KVM_CAP_S390_ZPCI_OP: + r = kvm_s390_pci_interp_allowed(); + break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = test_facility(11); + break; default: r = 0; } return r; } -static void kvm_s390_sync_dirty_log(struct kvm *kvm, - struct kvm_memory_slot *memslot) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { int i; gfn_t cur_gfn, last_gfn; @@ -611,9 +707,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, { int r; unsigned long n; - struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - int is_dirty = 0; + int is_dirty; if (kvm_is_ucontrol(kvm)) return -EINVAL; @@ -624,14 +719,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, if (log->slot >= KVM_USER_MEM_SLOTS) goto out; - slots = kvm_memslots(kvm); - memslot = id_to_memslot(slots, log->slot); - r = -ENOENT; - if (!memslot->dirty_bitmap) - goto out; - - kvm_s390_sync_dirty_log(kvm, memslot); - r = kvm_get_dirty_log(kvm, log, &is_dirty); + r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot); if (r) goto out; @@ -648,7 +736,7 @@ out: static void icpt_operexc_on_all_vcpus(struct kvm *kvm) { - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -678,7 +766,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) mutex_lock(&kvm->lock); if (kvm->created_vcpus) { r = -EBUSY; - } else if (MACHINE_HAS_VX) { + } else if (cpu_has_vx()) { set_kvm_facility(kvm->arch.model.fac_mask, 129); set_kvm_facility(kvm->arch.model.fac_list, 129); if (test_facility(134)) { @@ -697,6 +785,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) set_kvm_facility(kvm->arch.model.fac_mask, 152); set_kvm_facility(kvm->arch.model.fac_list, 152); } + if (test_facility(192)) { + set_kvm_facility(kvm->arch.model.fac_mask, 192); + set_kvm_facility(kvm->arch.model.fac_list, 192); + } r = 0; } else r = -EINVAL; @@ -753,9 +845,9 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) r = -EINVAL; else { r = 0; - down_write(&kvm->mm->mmap_sem); + mmap_write_lock(kvm->mm); kvm->mm->context.allow_gmap_hpage_1m = 1; - up_write(&kvm->mm->mmap_sem); + mmap_write_unlock(kvm->mm); /* * We might have to create fake 4k page * tables. 
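The locking conversion visible just above (down_write(&kvm->mm->mmap_sem) becoming mmap_write_lock(kvm->mm), and likewise for the read side throughout this file) is mechanical. Roughly how the wrapper API maps onto the renamed rwsem, simplified from include/linux/mmap_lock.h; the real helpers also add tracepoints and assertions:

```c
/* Simplified sketch; the field itself was renamed from mmap_sem to
 * mmap_lock as part of the same tree-wide change.
 */
static inline void mmap_read_lock(struct mm_struct *mm)
{
	down_read(&mm->mmap_lock);
}

static inline void mmap_read_unlock(struct mm_struct *mm)
{
	up_read(&mm->mmap_lock);
}

static inline void mmap_write_lock(struct mm_struct *mm)
{
	down_write(&mm->mmap_lock);
}

static inline void mmap_write_unlock(struct mm_struct *mm)
{
	up_write(&mm->mmap_lock);
}
```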
To avoid that the hardware works on @@ -779,6 +871,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) icpt_operexc_on_all_vcpus(kvm); r = 0; break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else if (test_facility(11)) { + set_kvm_facility(kvm->arch.model.fac_mask, 11); + set_kvm_facility(kvm->arch.model.fac_list, 11); + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s", + r ? "(not available)" : "(success)"); + break; default: r = -EINVAL; break; @@ -898,7 +1004,7 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_s390_vcpu_block_all(kvm); @@ -981,9 +1087,45 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) return 0; } +static void kvm_s390_vcpu_pci_setup(struct kvm_vcpu *vcpu) +{ + /* Only set the ECB bits after guest requests zPCI interpretation */ + if (!vcpu->kvm->arch.use_zpci_interp) + return; + + vcpu->arch.sie_block->ecb2 |= ECB2_ZPCI_LSI; + vcpu->arch.sie_block->ecb3 |= ECB3_AISII + ECB3_AISI; +} + +void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held(&kvm->lock); + + if (!kvm_s390_pci_interp_allowed()) + return; + + /* + * If host is configured for PCI and the necessary facilities are + * available, turn on interpretation for the life of this guest + */ + kvm->arch.use_zpci_interp = 1; + + kvm_s390_vcpu_block_all(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_vcpu_pci_setup(vcpu); + kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu); + } + + kvm_s390_vcpu_unblock_all(kvm); +} + static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) { - int cx; + unsigned long cx; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(cx, vcpu, kvm) @@ -999,13 +1141,13 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) struct kvm_memory_slot *ms; struct kvm_memslots *slots; unsigned long ram_pages = 0; - int slotnr; + int bkt; /* migration mode already enabled */ if (kvm->arch.migration_mode) return 0; slots = kvm_memslots(kvm); - if (!slots || !slots->used_slots) + if (!slots || kvm_memslots_empty(slots)) return -EINVAL; if (!kvm->arch.use_cmma) { @@ -1013,8 +1155,7 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) return 0; } /* mark all the pages in active slots as dirty */ - for (slotnr = 0; slotnr < slots->used_slots; slotnr++) { - ms = slots->memslots + slotnr; + kvm_for_each_memslot(ms, bkt, slots) { if (!ms->dirty_bitmap) return -EINVAL; /* @@ -1081,6 +1222,8 @@ static int kvm_s390_vm_get_migration(struct kvm *kvm, return 0; } +static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); + static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) { struct kvm_s390_vm_tod_clock gtod; @@ -1090,7 +1233,7 @@ static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) if (!test_kvm_facility(kvm, 139) && gtod.epoch_idx) return -EINVAL; - kvm_s390_set_tod_clock(kvm, >od); + __kvm_s390_set_tod_clock(kvm, >od); VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx", gtod.epoch_idx, gtod.tod); @@ -1121,7 +1264,7 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) sizeof(gtod.tod))) return -EFAULT; - kvm_s390_set_tod_clock(kvm, >od); + __kvm_s390_set_tod_clock(kvm, >od); VM_EVENT(kvm, 3, 
"SET: TOD base: 0x%llx", gtod.tod); return 0; } @@ -1133,6 +1276,16 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr) if (attr->flags) return -EINVAL; + mutex_lock(&kvm->lock); + /* + * For protected guests, the TOD is managed by the ultravisor, so trying + * to change it will never bring the expected results. + */ + if (kvm_s390_pv_is_protected(kvm)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + switch (attr->attr) { case KVM_S390_VM_TOD_EXT: ret = kvm_s390_set_tod_ext(kvm, attr); @@ -1147,23 +1300,26 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr) ret = -ENXIO; break; } + +out_unlock: + mutex_unlock(&kvm->lock); return ret; } static void kvm_s390_get_tod_clock(struct kvm *kvm, struct kvm_s390_vm_tod_clock *gtod) { - struct kvm_s390_tod_clock_ext htod; + union tod_clock clk; preempt_disable(); - get_tod_clock_ext((char *)&htod); + store_tod_clock_ext(&clk); - gtod->tod = htod.tod + kvm->arch.epoch; + gtod->tod = clk.tod + kvm->arch.epoch; gtod->epoch_idx = 0; if (test_kvm_facility(kvm, 139)) { - gtod->epoch_idx = htod.epoch_idx + kvm->arch.epdx; - if (gtod->tod < htod.tod) + gtod->epoch_idx = clk.ei + kvm->arch.epdx; + if (gtod->tod < clk.tod) gtod->epoch_idx += 1; } @@ -1243,7 +1399,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) ret = -EBUSY; goto out; } - proc = kzalloc(sizeof(*proc), GFP_KERNEL); + proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT); if (!proc) { ret = -ENOMEM; goto out; @@ -1295,8 +1451,7 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm, mutex_unlock(&kvm->lock); return -EBUSY; } - bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat, - KVM_S390_VM_CPU_FEAT_NR_BITS); + bitmap_from_arr64(kvm->arch.cpu_feat, data.feat, KVM_S390_VM_CPU_FEAT_NR_BITS); mutex_unlock(&kvm->lock); VM_EVENT(kvm, 3, "SET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", data.feat[0], @@ -1382,6 +1537,39 @@ static int kvm_s390_set_processor_subfunc(struct kvm *kvm, return 0; } +#define KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK \ +( \ + ((struct kvm_s390_vm_cpu_uv_feat){ \ + .ap = 1, \ + .ap_intr = 1, \ + }) \ + .feat \ +) + +static int kvm_s390_set_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *ptr = (void __user *)attr->addr; + unsigned long data, filter; + + filter = uv_info.uv_feature_indications & KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK; + if (get_user(data, &ptr->feat)) + return -EFAULT; + if (!bitmap_subset(&data, &filter, KVM_S390_VM_CPU_UV_FEAT_NR_BITS)) + return -EINVAL; + + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + mutex_unlock(&kvm->lock); + return -EBUSY; + } + kvm->arch.model.uv_feat_guest.feat = data; + mutex_unlock(&kvm->lock); + + VM_EVENT(kvm, 3, "SET: guest UV-feat: 0x%16.16lx", data); + + return 0; +} + static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -1396,6 +1584,9 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: ret = kvm_s390_set_processor_subfunc(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: + ret = kvm_s390_set_uv_feat(kvm, attr); + break; } return ret; } @@ -1405,7 +1596,7 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr) struct kvm_s390_vm_cpu_processor *proc; int ret = 0; - proc = kzalloc(sizeof(*proc), GFP_KERNEL); + proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT); if (!proc) { ret = -ENOMEM; goto out; @@ -1433,7 +1624,7 
@@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) struct kvm_s390_vm_cpu_machine *mach; int ret = 0; - mach = kzalloc(sizeof(*mach), GFP_KERNEL); + mach = kzalloc(sizeof(*mach), GFP_KERNEL_ACCOUNT); if (!mach) { ret = -ENOMEM; goto out; @@ -1442,8 +1633,8 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) mach->ibc = sclp.ibc; memcpy(&mach->fac_mask, kvm->arch.model.fac_mask, S390_ARCH_FAC_LIST_SIZE_BYTE); - memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list, - sizeof(S390_lowcore.stfle_fac_list)); + memcpy((unsigned long *)&mach->fac_list, stfle_fac_list, + sizeof(stfle_fac_list)); VM_EVENT(kvm, 3, "GET: host ibc: 0x%4.4x, host cpuid: 0x%16.16llx", kvm->arch.model.ibc, kvm->arch.model.cpuid); @@ -1467,8 +1658,7 @@ static int kvm_s390_get_processor_feat(struct kvm *kvm, { struct kvm_s390_vm_cpu_feat data; - bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat, - KVM_S390_VM_CPU_FEAT_NR_BITS); + bitmap_to_arr64(data.feat, kvm->arch.cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) return -EFAULT; VM_EVENT(kvm, 3, "GET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", @@ -1483,9 +1673,7 @@ static int kvm_s390_get_machine_feat(struct kvm *kvm, { struct kvm_s390_vm_cpu_feat data; - bitmap_copy((unsigned long *) data.feat, - kvm_s390_available_cpu_feat, - KVM_S390_VM_CPU_FEAT_NR_BITS); + bitmap_to_arr64(data.feat, kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) return -EFAULT; VM_EVENT(kvm, 3, "GET: host feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", @@ -1631,6 +1819,33 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm, return 0; } +static int kvm_s390_get_processor_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *dst = (void __user *)attr->addr; + unsigned long feat = kvm->arch.model.uv_feat_guest.feat; + + if (put_user(feat, &dst->feat)) + return -EFAULT; + VM_EVENT(kvm, 3, "GET: guest UV-feat: 0x%16.16lx", feat); + + return 0; +} + +static int kvm_s390_get_machine_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *dst = (void __user *)attr->addr; + unsigned long feat; + + BUILD_BUG_ON(sizeof(*dst) != sizeof(uv_info.uv_feature_indications)); + + feat = uv_info.uv_feature_indications & KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK; + if (put_user(feat, &dst->feat)) + return -EFAULT; + VM_EVENT(kvm, 3, "GET: guest UV-feat: 0x%16.16lx", feat); + + return 0; +} + static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -1654,10 +1869,67 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MACHINE_SUBFUNC: ret = kvm_s390_get_machine_subfunc(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: + ret = kvm_s390_get_processor_uv_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST: + ret = kvm_s390_get_machine_uv_feat(kvm, attr); + break; } return ret; } +/** + * kvm_s390_update_topology_change_report - update CPU topology change report + * @kvm: guest KVM description + * @val: set or clear the MTCR bit + * + * Updates the Multiprocessor Topology-Change-Report bit to signal + * the guest with a topology change. + * This is only relevant if the topology facility is present. + * + * The SCA version, bsca or esca, doesn't matter as offset is the same. 
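The helper documented here flips a single bit in the SCA, a structure shared with running vCPUs, so it uses the classic lock-free cmpxchg retry loop instead of taking a write lock. The same shape in standalone C11, with a hypothetical layout (the real sca_utility is a big-endian __u16 with mtcr as its most-significant bit):

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

union utility {
	uint16_t val;
	struct {
		uint16_t mtcr : 1;	/* layout is illustrative only */
		uint16_t reserved : 15;
	};
};

static void set_mtcr(_Atomic uint16_t *shared, bool on)
{
	union utility old, new;

	old.val = atomic_load(shared);
	do {
		/* modify a private copy, then try to publish it */
		new = old;
		new.mtcr = on;
		/* on failure, old.val is refreshed with the current value */
	} while (!atomic_compare_exchange_weak(shared, &old.val, new.val));
}
```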
+ */ +static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) +{ + union sca_utility new, old; + struct bsca_block *sca; + + read_lock(&kvm->arch.sca_lock); + sca = kvm->arch.sca; + do { + old = READ_ONCE(sca->utility); + new = old; + new.mtcr = val; + } while (cmpxchg(&sca->utility.val, old.val, new.val) != old.val); + read_unlock(&kvm->arch.sca_lock); +} + +static int kvm_s390_set_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + kvm_s390_update_topology_change_report(kvm, !!attr->attr); + return 0; +} + +static int kvm_s390_get_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + u8 topo; + + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + read_lock(&kvm->arch.sca_lock); + topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr; + read_unlock(&kvm->arch.sca_lock); + + return put_user(topo, (u8 __user *)attr->addr); +} + static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) { int ret; @@ -1678,6 +1950,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_set_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_set_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1703,6 +1978,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_get_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_get_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1749,6 +2027,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MACHINE_FEAT: case KVM_S390_VM_CPU_MACHINE_SUBFUNC: case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: + case KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST: + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: ret = 0; break; default: @@ -1776,6 +2056,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = 0; break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = test_kvm_facility(kvm, 11) ? 
0 : -ENXIO; + break; default: ret = -ENXIO; break; @@ -1784,7 +2067,7 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } -static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) +static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; @@ -1801,11 +2084,11 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL); + keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); if (!keys) return -ENOMEM; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); srcu_idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); @@ -1819,7 +2102,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) break; } srcu_read_unlock(&kvm->srcu, srcu_idx); - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (!r) { r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, @@ -1832,7 +2115,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) return r; } -static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) +static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; @@ -1846,7 +2129,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL); + keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); if (!keys) return -ENOMEM; @@ -1863,7 +2146,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) goto out; i = 0; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); srcu_idx = srcu_read_lock(&kvm->srcu); while (i < args->count) { unlocked = false; @@ -1881,7 +2164,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) r = set_guest_storage_key(current->mm, hva, keys[i], 0); if (r) { - r = fixup_user_fault(current, current->mm, hva, + r = fixup_user_fault(current->mm, hva, FAULT_FLAG_WRITE, &unlocked); if (r) break; @@ -1890,7 +2173,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) i++; } srcu_read_unlock(&kvm->srcu, srcu_idx); - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); out: kvfree(keys); return r; @@ -1905,38 +2188,6 @@ out: /* for consistency */ #define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) -/* - * Similar to gfn_to_memslot, but returns the index of a memslot also when the - * address falls in a hole. In that case the index of one of the memslots - * bordering the hole is returned. 
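For reference, the approximate lookup being deleted here assumed the old flat memslot array, which KVM kept sorted by descending base_gfn; the replacement gfn_to_memslot_approx() simply defers to ____gfn_to_memslot(slots, gfn, true) on the new gfn rb-tree. The deleted search, restated as standalone C:

```c
#include <stddef.h>

struct slot {
	unsigned long base_gfn;
	unsigned long npages;
};

/*
 * Find the slot containing gfn, or, if gfn falls in a hole, one of
 * the slots bordering that hole (exactly the deleted helper's
 * contract, minus the lru_slot fast path).
 */
static size_t approx_slot(const struct slot *slots, size_t used,
			  unsigned long gfn)
{
	size_t start = 0, end = used, mid;

	while (start < end) {
		mid = start + (end - start) / 2;
		if (gfn >= slots[mid].base_gfn)
			end = mid;	/* array sorted by descending base_gfn */
		else
			start = mid + 1;
	}
	return start;	/* exact slot, or a neighbour of the hole */
}
```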
- */ -static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) -{ - int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); - struct kvm_memory_slot *memslots = slots->memslots; - - if (gfn >= memslots[slot].base_gfn && - gfn < memslots[slot].base_gfn + memslots[slot].npages) - return slot; - - while (start < end) { - slot = start + (end - start) / 2; - - if (gfn >= memslots[slot].base_gfn) - end = slot; - else - start = slot + 1; - } - - if (gfn >= memslots[start].base_gfn && - gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); - } - - return start; -} - static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, u8 *res, unsigned long bufsize) { @@ -1960,27 +2211,36 @@ static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, return 0; } +static struct kvm_memory_slot *gfn_to_memslot_approx(struct kvm_memslots *slots, + gfn_t gfn) +{ + return ____gfn_to_memslot(slots, gfn, true); +} + static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, unsigned long cur_gfn) { - int slotidx = gfn_to_memslot_approx(slots, cur_gfn); - struct kvm_memory_slot *ms = slots->memslots + slotidx; + struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn); unsigned long ofs = cur_gfn - ms->base_gfn; + struct rb_node *mnode = &ms->gfn_node[slots->node_idx]; if (ms->base_gfn + ms->npages <= cur_gfn) { - slotidx--; + mnode = rb_next(mnode); /* If we are above the highest slot, wrap around */ - if (slotidx < 0) - slotidx = slots->used_slots - 1; + if (!mnode) + mnode = rb_first(&slots->gfn_tree); - ms = slots->memslots + slotidx; + ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); ofs = 0; } + + if (cur_gfn < ms->base_gfn) + ofs = 0; + ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs); - while ((slotidx > 0) && (ofs >= ms->npages)) { - slotidx--; - ms = slots->memslots + slotidx; - ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0); + while (ofs >= ms->npages && (mnode = rb_next(mnode))) { + ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); + ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages); } return ms->base_gfn + ofs; } @@ -1992,6 +2252,9 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *ms; + if (unlikely(kvm_memslots_empty(slots))) + return 0; + cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn); ms = gfn_to_memslot(kvm, cur_gfn); args->count = 0; @@ -1999,7 +2262,7 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, if (!ms) return 0; next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); - mem_end = slots->memslots[0].base_gfn + slots->memslots[0].npages; + mem_end = kvm_s390_get_gfn_end(slots); while (args->count < bufsize) { hva = gfn_to_hva(kvm, cur_gfn); @@ -2073,14 +2336,14 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, if (!values) return -ENOMEM; - down_read(&kvm->mm->mmap_sem); + mmap_read_lock(kvm->mm); srcu_idx = srcu_read_lock(&kvm->srcu); if (peek) ret = kvm_s390_peek_cmma(kvm, args, values, bufsize); else ret = kvm_s390_get_cmma(kvm, args, values, bufsize); srcu_read_unlock(&kvm->srcu, srcu_idx); - up_read(&kvm->mm->mmap_sem); + mmap_read_unlock(kvm->mm); if (kvm->arch.migration_mode) args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages); @@ -2130,7 +2393,7 @@ static int 
kvm_s390_set_cmma_bits(struct kvm *kvm, goto out; } - down_read(&kvm->mm->mmap_sem); + mmap_read_lock(kvm->mm); srcu_idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); @@ -2145,20 +2408,579 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, set_pgste_bits(kvm->mm, hva, mask, pgstev); } srcu_read_unlock(&kvm->srcu, srcu_idx); - up_read(&kvm->mm->mmap_sem); + mmap_read_unlock(kvm->mm); if (!kvm->mm->context.uses_cmm) { - down_write(&kvm->mm->mmap_sem); + mmap_write_lock(kvm->mm); kvm->mm->context.uses_cmm = 1; - up_write(&kvm->mm->mmap_sem); + mmap_write_unlock(kvm->mm); } out: vfree(bits); return r; } -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +/** + * kvm_s390_cpus_from_pv - Convert all protected vCPUs in a protected VM to + * non protected. + * @kvm: the VM whose protected vCPUs are to be converted + * @rc: return value for the RC field of the UVC (in case of error) + * @rrc: return value for the RRC field of the UVC (in case of error) + * + * Does not stop in case of error, tries to convert as many + * CPUs as possible. In case of error, the RC and RRC of the last error are + * returned. + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + u16 _rc, _rrc; + int ret = 0; + + /* + * We ignore failures and try to destroy as many CPUs as possible. + * At the same time we must not free the assigned resources when + * this fails, as the ultravisor has still access to that memory. + * So kvm_s390_pv_destroy_cpu can leave a "wanted" memory leak + * behind. + * We want to return the first failure rc and rrc, though. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + if (kvm_s390_pv_destroy_cpu(vcpu, &_rc, &_rrc) && !ret) { + *rc = _rc; + *rrc = _rrc; + ret = -EIO; + } + mutex_unlock(&vcpu->mutex); + } + /* Ensure that we re-enable gisa if the non-PV guest used it but the PV guest did not. */ + if (use_gisa) + kvm_s390_gisa_enable(kvm); + return ret; +} + +/** + * kvm_s390_cpus_to_pv - Convert all non-protected vCPUs in a protected VM + * to protected. + * @kvm: the VM whose protected vCPUs are to be converted + * @rc: return value for the RC field of the UVC (in case of error) + * @rrc: return value for the RRC field of the UVC (in case of error) + * + * Tries to undo the conversion in case of error. + * + * Return: 0 in case of success, otherwise -EIO + */ +static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + unsigned long i; + int r = 0; + u16 dummy; + + struct kvm_vcpu *vcpu; + + /* Disable the GISA if the ultravisor does not support AIV. */ + if (!uv_has_feature(BIT_UV_FEAT_AIV)) + kvm_s390_gisa_disable(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + r = kvm_s390_pv_create_cpu(vcpu, rc, rrc); + mutex_unlock(&vcpu->mutex); + if (r) + break; + } + if (r) + kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + return r; +} + +/* + * Here we provide user space with a direct interface to query UV + * related data like UV maxima and available features as well as + * feature specific data. + * + * To facilitate future extension of the data structures we'll try to + * write data up to the maximum requested length. 
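The comment ending here describes a forward-compatible ABI: userspace states the largest struct it can accept (len_max), the handler below returns how many bytes are actually valid, and the ioctl path stores that in len_written. A hypothetical consumer, with illustrative struct names rather than the exact uapi layout:

```c
#include <stddef.h>
#include <stdio.h>

struct pv_info_header {
	unsigned int id;
	unsigned int len_max;		/* set by userspace before the call */
	unsigned int len_written;	/* set by the kernel on return */
};

struct pv_info_vm {
	struct pv_info_header header;
	unsigned long long max_cpus;
	unsigned long long feature_indication;	/* a "newer" field */
};

static void consume(const struct pv_info_vm *info)
{
	printf("max_cpus: %llu\n", info->max_cpus);
	/* only trust fields the kernel says it actually filled in */
	if (info->header.len_written >=
	    offsetof(struct pv_info_vm, feature_indication) +
	    sizeof(info->feature_indication))
		printf("uv features: %#llx\n", info->feature_indication);
}
```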
+ */ +static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info) +{ + ssize_t len_min; + + switch (info->header.id) { + case KVM_PV_INFO_VM: { + len_min = sizeof(info->header) + sizeof(info->vm); + + if (info->header.len_max < len_min) + return -EINVAL; + + memcpy(info->vm.inst_calls_list, + uv_info.inst_calls_list, + sizeof(uv_info.inst_calls_list)); + + /* It's max cpuid not max cpus, so it's off by one */ + info->vm.max_cpus = uv_info.max_guest_cpu_id + 1; + info->vm.max_guests = uv_info.max_num_sec_conf; + info->vm.max_guest_addr = uv_info.max_sec_stor_addr; + info->vm.feature_indication = uv_info.uv_feature_indications; + + return len_min; + } + case KVM_PV_INFO_DUMP: { + len_min = sizeof(info->header) + sizeof(info->dump); + + if (info->header.len_max < len_min) + return -EINVAL; + + info->dump.dump_cpu_buffer_len = uv_info.guest_cpu_stor_len; + info->dump.dump_config_mem_buffer_per_1m = uv_info.conf_dump_storage_state_len; + info->dump.dump_config_finalize_len = uv_info.conf_dump_finalize_len; + return len_min; + } + default: + return -EINVAL; + } +} + +static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd, + struct kvm_s390_pv_dmp dmp) +{ + int r = -EINVAL; + void __user *result_buff = (void __user *)dmp.buff_addr; + + switch (dmp.subcmd) { + case KVM_PV_DUMP_INIT: { + if (kvm->arch.pv.dumping) + break; + + /* + * Block SIE entry as concurrent dump UVCs could lead + * to validities. + */ + kvm_s390_vcpu_block_all(kvm); + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DUMP_INIT, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP INIT: rc %x rrc %x", + cmd->rc, cmd->rrc); + if (!r) { + kvm->arch.pv.dumping = true; + } else { + kvm_s390_vcpu_unblock_all(kvm); + r = -EINVAL; + } + break; + } + case KVM_PV_DUMP_CONFIG_STOR_STATE: { + if (!kvm->arch.pv.dumping) + break; + + /* + * gaddr is an output parameter since we might stop + * early. As dmp will be copied back in our caller, we + * don't need to do it ourselves. + */ + r = kvm_s390_pv_dump_stor_state(kvm, result_buff, &dmp.gaddr, dmp.buff_len, + &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_DUMP_COMPLETE: { + if (!kvm->arch.pv.dumping) + break; + + r = -EINVAL; + if (dmp.buff_len < uv_info.conf_dump_finalize_len) + break; + + r = kvm_s390_pv_dump_complete(kvm, result_buff, + &cmd->rc, &cmd->rrc); + break; + } + default: + r = -ENOTTY; + break; + } + + return r; +} + +static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) +{ + const bool need_lock = (cmd->cmd != KVM_PV_ASYNC_CLEANUP_PERFORM); + void __user *argp = (void __user *)cmd->data; + int r = 0; + u16 dummy; + + if (need_lock) + mutex_lock(&kvm->lock); + + switch (cmd->cmd) { + case KVM_PV_ENABLE: { + r = -EINVAL; + if (kvm_s390_pv_is_protected(kvm)) + break; + + /* + * FMT 4 SIE needs esca. 
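The KVM_PV_DUMP subcommands handled above form a small state machine keyed off kvm->arch.pv.dumping: INIT must run first (and blocks SIE entry to avoid validity intercepts), while STOR_STATE and COMPLETE are only valid mid-dump. A standalone condensation of that ordering contract, with hypothetical names:

```c
#include <errno.h>
#include <stdbool.h>

enum dump_cmd { DUMP_INIT, DUMP_STOR_STATE, DUMP_COMPLETE };

static bool dumping;	/* stands in for kvm->arch.pv.dumping */

static int dump_step(enum dump_cmd cmd)
{
	switch (cmd) {
	case DUMP_INIT:
		if (dumping)
			return -EINVAL;	/* already mid-dump */
		dumping = true;		/* the real code also blocks vcpus */
		return 0;
	case DUMP_STOR_STATE:
		return dumping ? 0 : -EINVAL;
	case DUMP_COMPLETE:
		if (!dumping)
			return -EINVAL;
		dumping = false;	/* simplification of the real teardown */
		return 0;
	}
	return -EINVAL;
}

int main(void)
{
	return (dump_step(DUMP_INIT) == 0 &&
		dump_step(DUMP_STOR_STATE) == 0 &&
		dump_step(DUMP_COMPLETE) == 0) ? 0 : 1;
}
```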
As we never switch back to bsca from + * esca, we need no cleanup in the error cases below + */ + r = sca_switch_to_extended(kvm); + if (r) + break; + + mmap_write_lock(current->mm); + r = gmap_mark_unmergeable(); + mmap_write_unlock(current->mm); + if (r) + break; + + r = kvm_s390_pv_init_vm(kvm, &cmd->rc, &cmd->rrc); + if (r) + break; + + r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc); + if (r) + kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); + + /* we need to block service interrupts from now on */ + set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + } + case KVM_PV_ASYNC_CLEANUP_PREPARE: + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm) || !async_destroy) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. + */ + if (r) + break; + r = kvm_s390_pv_set_aside(kvm, &cmd->rc, &cmd->rrc); + + /* no need to block service interrupts any more */ + clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + case KVM_PV_ASYNC_CLEANUP_PERFORM: + r = -EINVAL; + if (!async_destroy) + break; + /* kvm->lock must not be held; this is asserted inside the function. */ + r = kvm_s390_pv_deinit_aside_vm(kvm, &cmd->rc, &cmd->rrc); + break; + case KVM_PV_DISABLE: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. + */ + if (r) + break; + r = kvm_s390_pv_deinit_cleanup_all(kvm, &cmd->rc, &cmd->rrc); + + /* no need to block service interrupts any more */ + clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + } + case KVM_PV_SET_SEC_PARMS: { + struct kvm_s390_pv_sec_parm parms = {}; + void *hdr; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&parms, argp, sizeof(parms))) + break; + + /* Currently restricted to 8KB */ + r = -EINVAL; + if (parms.length > PAGE_SIZE * 2) + break; + + r = -ENOMEM; + hdr = vmalloc(parms.length); + if (!hdr) + break; + + r = -EFAULT; + if (!copy_from_user(hdr, (void __user *)parms.origin, + parms.length)) + r = kvm_s390_pv_set_sec_parms(kvm, hdr, parms.length, + &cmd->rc, &cmd->rrc); + + vfree(hdr); + break; + } + case KVM_PV_UNPACK: { + struct kvm_s390_pv_unp unp = {}; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm) || !mm_is_protected(kvm->mm)) + break; + + r = -EFAULT; + if (copy_from_user(&unp, argp, sizeof(unp))) + break; + + r = kvm_s390_pv_unpack(kvm, unp.addr, unp.size, unp.tweak, + &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_VERIFY: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_VERIFY_IMG, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT VERIFY: rc %x rrc %x", cmd->rc, + cmd->rrc); + break; + } + case KVM_PV_PREP_RESET: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_PREPARE_RESET, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT PREP RESET: rc %x rrc %x", + cmd->rc, cmd->rrc); + break; + } + case KVM_PV_UNSHARE_ALL: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = 
uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_SET_UNSHARE_ALL, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT UNSHARE: rc %x rrc %x", + cmd->rc, cmd->rrc); + break; + } + case KVM_PV_INFO: { + struct kvm_s390_pv_info info = {}; + ssize_t data_len; + + /* + * No need to check the VM protection here. + * + * Maybe user space wants to query some of the data + * when the VM is still unprotected. If we see the + * need to fence a new data command we can still + * return an error in the info handler. + */ + + r = -EFAULT; + if (copy_from_user(&info, argp, sizeof(info.header))) + break; + + r = -EINVAL; + if (info.header.len_max < sizeof(info.header)) + break; + + data_len = kvm_s390_handle_pv_info(&info); + if (data_len < 0) { + r = data_len; + break; + } + /* + * If a data command struct is extended (multiple + * times) this can be used to determine how much of it + * is valid. + */ + info.header.len_written = data_len; + + r = -EFAULT; + if (copy_to_user(argp, &info, data_len)) + break; + + r = 0; + break; + } + case KVM_PV_DUMP: { + struct kvm_s390_pv_dmp dmp; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&dmp, argp, sizeof(dmp))) + break; + + r = kvm_s390_pv_dmp(kvm, cmd, dmp); + if (r) + break; + + if (copy_to_user(argp, &dmp, sizeof(dmp))) { + r = -EFAULT; + break; + } + + break; + } + default: + r = -ENOTTY; + } + if (need_lock) + mutex_unlock(&kvm->lock); + + return r; +} + +static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_flags) +{ + if (mop->flags & ~supported_flags || !mop->size) + return -EINVAL; + if (mop->size > MEM_OP_MAX_SIZE) + return -E2BIG; + if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { + if (mop->key > 0xf) + return -EINVAL; + } else { + mop->key = 0; + } + return 0; +} + +static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + enum gacc_mode acc_mode; + void *tmpbuf = NULL; + int r, srcu_idx; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION | + KVM_S390_MEMOP_F_CHECK_ONLY); + if (r) + return r; + + if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { + tmpbuf = vmalloc(mop->size); + if (!tmpbuf) + return -ENOMEM; + } + + srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_is_error_gpa(kvm, mop->gaddr)) { + r = PGM_ADDRESSING; + goto out_unlock; + } + + acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? 
GACC_FETCH : GACC_STORE; + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); + goto out_unlock; + } + if (acc_mode == GACC_FETCH) { + r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, + mop->size, GACC_FETCH, mop->key); + if (r) + goto out_unlock; + if (copy_to_user(uaddr, tmpbuf, mop->size)) + r = -EFAULT; + } else { + if (copy_from_user(tmpbuf, uaddr, mop->size)) { + r = -EFAULT; + goto out_unlock; + } + r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, + mop->size, GACC_STORE, mop->key); + } + +out_unlock: + srcu_read_unlock(&kvm->srcu, srcu_idx); + + vfree(tmpbuf); + return r; +} + +static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + void __user *old_addr = (void __user *)mop->old_addr; + union { + __uint128_t quad; + char raw[sizeof(__uint128_t)]; + } old = { .quad = 0}, new = { .quad = 0 }; + unsigned int off_in_quad = sizeof(new) - mop->size; + int r, srcu_idx; + bool success; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION); + if (r) + return r; + /* + * This validates off_in_quad. Checking that size is a power + * of two is not necessary, as cmpxchg_guest_abs_with_key + * takes care of that + */ + if (mop->size > sizeof(new)) + return -EINVAL; + if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size)) + return -EFAULT; + if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size)) + return -EFAULT; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_is_error_gpa(kvm, mop->gaddr)) { + r = PGM_ADDRESSING; + goto out_unlock; + } + + r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad, + new.quad, mop->key, &success); + if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size)) + r = -EFAULT; + +out_unlock: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return r; +} + +static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + /* + * This is technically a heuristic only, if the kvm->lock is not + * taken, it is not guaranteed that the vm is/remains non-protected. + * This is ok from a kernel perspective, wrongdoing is detected + * on the access, -EFAULT is returned and the vm may crash the + * next time it accesses the memory in question. + * There is no sane usecase to do switching and a memop on two + * different CPUs at the same time. 
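kvm_s390_vm_mem_op_cmpxchg() above right-aligns operands of up to 16 bytes inside a __uint128_t: on big-endian s390, copying into &raw[sizeof(quad) - size] places the operand in the numeric low-order bytes of the quad, which is what cmpxchg_guest_abs_with_key() expects. The offset arithmetic in isolation:

```c
#include <stddef.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char quad[16] = { 0 };	/* stands in for the raw[] view */
	unsigned char val[4] = { 0xde, 0xad, 0xbe, 0xef };
	size_t size = sizeof(val);
	size_t off_in_quad = sizeof(quad) - size;	/* 12 for a 4-byte op */

	/* right-align the operand, as the memop handler does */
	memcpy(&quad[off_in_quad], val, size);
	printf("operand occupies bytes %zu..%zu of the quad\n",
	       off_in_quad, sizeof(quad) - 1);
	return 0;
}
```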
+ */ + if (kvm_s390_pv_get_handle(kvm)) + return -EINVAL; + + switch (mop->op) { + case KVM_S390_MEMOP_ABSOLUTE_READ: + case KVM_S390_MEMOP_ABSOLUTE_WRITE: + return kvm_s390_vm_mem_op_abs(kvm, mop); + case KVM_S390_MEMOP_ABSOLUTE_CMPXCHG: + return kvm_s390_vm_mem_op_cmpxchg(kvm, mop); + default: + return -EINVAL; + } +} + +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; @@ -2254,6 +3076,54 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(&kvm->slots_lock); break; } + case KVM_S390_PV_COMMAND: { + struct kvm_pv_cmd args; + + /* protvirt means user cpu state */ + kvm_s390_set_user_cpu_state_ctrl(kvm); + r = 0; + if (!is_prot_virt_host()) { + r = -EINVAL; + break; + } + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + if (args.flags) { + r = -EINVAL; + break; + } + /* must be called without kvm->lock */ + r = kvm_s390_handle_pv(kvm, &args); + if (copy_to_user(argp, &args, sizeof(args))) { + r = -EFAULT; + break; + } + break; + } + case KVM_S390_MEM_OP: { + struct kvm_s390_mem_op mem_op; + + if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) + r = kvm_s390_vm_mem_op(kvm, &mem_op); + else + r = -EFAULT; + break; + } + case KVM_S390_ZPCI_OP: { + struct kvm_s390_zpci_op args; + + r = -EINVAL; + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + break; + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + r = kvm_s390_pci_zpci_op(kvm, &args); + break; + } default: r = -ENOTTY; } @@ -2298,12 +3168,26 @@ static void kvm_s390_set_crycb_format(struct kvm *kvm) kvm->arch.crypto.crycbd |= CRYCB_FORMAT1; } +/* + * kvm_arch_crypto_set_masks + * + * @kvm: pointer to the target guest's KVM struct containing the crypto masks + * to be set. + * @apm: the mask identifying the accessible AP adapters + * @aqm: the mask identifying the accessible AP domains + * @adm: the mask identifying the accessible AP control domains + * + * Set the masks that identify the adapters, domains and control domains to + * which the KVM guest is granted access. + * + * Note: The kvm->lock mutex must be locked by the caller before invoking this + * function. + */ void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, unsigned long *aqm, unsigned long *adm) { struct kvm_s390_crypto_cb *crycb = kvm->arch.crypto.crycb; - mutex_lock(&kvm->lock); kvm_s390_vcpu_block_all(kvm); switch (kvm->arch.crypto.crycbd & CRYCB_FORMAT_MASK) { @@ -2334,13 +3218,23 @@ void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, /* recreate the shadow crycb for each vcpu */ kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); kvm_s390_vcpu_unblock_all(kvm); - mutex_unlock(&kvm->lock); } EXPORT_SYMBOL_GPL(kvm_arch_crypto_set_masks); +/* + * kvm_arch_crypto_clear_masks + * + * @kvm: pointer to the target guest's KVM struct containing the crypto masks + * to be cleared. + * + * Clear the masks that identify the adapters, domains and control domains to + * which the KVM guest is granted access. + * + * Note: The kvm->lock mutex must be locked by the caller before invoking this + * function. 
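The crypto-mask helpers here drop their own mutex_lock/mutex_unlock pairs and instead document that the caller must hold kvm->lock. kvm_s390_vcpu_pci_enable_interp() earlier in this diff enforces the same kind of contract with lockdep_assert_held(); a portable analogue of making such a contract checkable, assuming pthreads (pthread mutexes cannot be queried for ownership, so the sketch tracks the owner by hand):

```c
#include <assert.h>
#include <pthread.h>

struct guarded {
	pthread_mutex_t lock;
	pthread_t owner;
	int masks_set;
};

static void set_masks_locked(struct guarded *g)
{
	/* contract: g->lock is held by the calling thread */
	assert(pthread_equal(g->owner, pthread_self()));
	g->masks_set = 1;
}

int main(void)
{
	struct guarded g = { .lock = PTHREAD_MUTEX_INITIALIZER };

	pthread_mutex_lock(&g.lock);
	g.owner = pthread_self();
	set_masks_locked(&g);	/* legal: lock is held */
	pthread_mutex_unlock(&g.lock);
	return 0;
}
```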
+ */ void kvm_arch_crypto_clear_masks(struct kvm *kvm) { - mutex_lock(&kvm->lock); kvm_s390_vcpu_block_all(kvm); memset(&kvm->arch.crypto.crycb->apcb0, 0, @@ -2352,7 +3246,6 @@ void kvm_arch_crypto_clear_masks(struct kvm *kvm) /* recreate the shadow crycb for each vcpu */ kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); kvm_s390_vcpu_unblock_all(kvm); - mutex_unlock(&kvm->lock); } EXPORT_SYMBOL_GPL(kvm_arch_crypto_clear_masks); @@ -2369,6 +3262,7 @@ static void kvm_s390_crypto_init(struct kvm *kvm) { kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb; kvm_s390_set_crycb_format(kvm); + init_rwsem(&kvm->arch.crypto.pqap_hook_rwsem); if (!test_kvm_facility(kvm, 76)) return; @@ -2391,9 +3285,17 @@ static void sca_dispose(struct kvm *kvm) kvm->arch.sca = NULL; } +void kvm_arch_free_vm(struct kvm *kvm) +{ + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_clear_list(kvm); + + __kvm_arch_free_vm(kvm); +} + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - gfp_t alloc_flags = GFP_KERNEL; + gfp_t alloc_flags = GFP_KERNEL_ACCOUNT; int i, rc; char debug_name[16]; static unsigned long sca_offset; @@ -2438,7 +3340,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) BUILD_BUG_ON(sizeof(struct sie_page2) != 4096); kvm->arch.sie_page2 = - (struct sie_page2 *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + (struct sie_page2 *) get_zeroed_page(GFP_KERNEL_ACCOUNT | GFP_DMA); if (!kvm->arch.sie_page2) goto out_err; @@ -2446,10 +3348,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list; for (i = 0; i < kvm_s390_fac_size(); i++) { - kvm->arch.model.fac_mask[i] = S390_lowcore.stfle_fac_list[i] & + kvm->arch.model.fac_mask[i] = stfle_fac_list[i] & (kvm_s390_fac_base[i] | kvm_s390_fac_ext[i]); - kvm->arch.model.fac_list[i] = S390_lowcore.stfle_fac_list[i] & + kvm->arch.model.fac_list[i] = stfle_fac_list[i] & kvm_s390_fac_base[i]; } kvm->arch.model.subfuncs = kvm_s390_available_subfunc; @@ -2471,8 +3373,17 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid(); kvm->arch.model.ibc = sclp.ibc & 0x0fff; + kvm->arch.model.uv_feat_guest.feat = 0; + kvm_s390_crypto_init(kvm); + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + mutex_lock(&kvm->lock); + kvm_s390_pci_init_list(kvm); + kvm_s390_vcpu_pci_enable_interp(kvm); + mutex_unlock(&kvm->lock); + } + mutex_init(&kvm->arch.float_int.ais_lock); spin_lock_init(&kvm->arch.float_int.lock); for (i = 0; i < FIRQ_LIST_COUNT; i++) @@ -2503,7 +3414,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.use_skf = sclp.has_skey; spin_lock_init(&kvm->arch.start_stop_lock); kvm_s390_vsie_init(kvm); - kvm_s390_gisa_init(kvm); + if (use_gisa) + kvm_s390_gisa_init(kvm); + INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); return 0; @@ -2517,46 +3431,50 @@ out_err: void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + u16 rc, rrc; + VCPU_EVENT(vcpu, 3, "%s", "free cpu"); trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id); kvm_s390_clear_local_irqs(vcpu); kvm_clear_async_pf_completion_queue(vcpu); if (!kvm_is_ucontrol(vcpu->kvm)) sca_del_vcpu(vcpu); + kvm_s390_update_topology_change_report(vcpu->kvm, 1); if (kvm_is_ucontrol(vcpu->kvm)) gmap_remove(vcpu->arch.gmap); if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); + /* We can not hold the vcpu mutex here, we are already dying */ + if 
(kvm_s390_pv_cpu_get_handle(vcpu)) + kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc); free_page((unsigned long)(vcpu->arch.sie_block)); - - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); -} - -static void kvm_free_vcpus(struct kvm *kvm) -{ - unsigned int i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_destroy(vcpu); - - mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; - - atomic_set(&kvm->online_vcpus, 0); - mutex_unlock(&kvm->lock); } void kvm_arch_destroy_vm(struct kvm *kvm) { - kvm_free_vcpus(kvm); + u16 rc, rrc; + + kvm_destroy_vcpus(kvm); sca_dispose(kvm); - debug_unregister(kvm->arch.dbf); kvm_s390_gisa_destroy(kvm); + /* + * We are already at the end of life and kvm->lock is not taken. + * This is ok as the file descriptor is closed by now and nobody + * can mess with the pv state. + */ + kvm_s390_pv_deinit_cleanup_all(kvm, &rc, &rrc); + /* + * Remove the mmu notifier only when the whole KVM VM is torn down, + * and only if one was registered to begin with. If the VM is + * currently not protected, but has been previously been protected, + * then it's possible that the notifier is still registered. + */ + if (kvm->arch.pv.mmu_notifier.ops) + mmu_notifier_unregister(&kvm->arch.pv.mmu_notifier, kvm->mm); + + debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) gmap_remove(kvm->arch.gmap); @@ -2599,28 +3517,30 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu) static void sca_add_vcpu(struct kvm_vcpu *vcpu) { if (!kvm_s390_use_sca_entries()) { - struct bsca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca); /* we still need the basic sca for the ipte control */ - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys; return; } read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block; - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca & ~0x3fU; + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); } else { struct bsca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block; - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys; set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); } read_unlock(&vcpu->kvm->arch.sca_lock); @@ -2649,15 +3569,20 @@ static int sca_switch_to_extended(struct kvm *kvm) struct bsca_block *old_sca = kvm->arch.sca; struct esca_block *new_sca; struct kvm_vcpu *vcpu; - unsigned int vcpu_idx; + unsigned long vcpu_idx; u32 scaol, scaoh; + phys_addr_t new_sca_phys; - new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL|__GFP_ZERO); + if (kvm->arch.use_esca) + return 0; + + new_sca = 
alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!new_sca) return -ENOMEM; - scaoh = (u32)((u64)(new_sca) >> 32); - scaol = (u32)(u64)(new_sca) & ~0x3fU; + new_sca_phys = virt_to_phys(new_sca); + scaoh = new_sca_phys >> 32; + scaol = new_sca_phys & ESCA_SCAOL_MASK; kvm_s390_vcpu_block_all(kvm); write_lock(&kvm->arch.sca_lock); @@ -2696,46 +3621,11 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) if (!sclp.has_esca || !sclp.has_64bscao) return false; - mutex_lock(&kvm->lock); rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm); - mutex_unlock(&kvm->lock); return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS; } -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; - kvm_clear_async_pf_completion_queue(vcpu); - vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | - KVM_SYNC_GPRS | - KVM_SYNC_ACRS | - KVM_SYNC_CRS | - KVM_SYNC_ARCH0 | - KVM_SYNC_PFAULT; - kvm_s390_set_prefix(vcpu, 0); - if (test_kvm_facility(vcpu->kvm, 64)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; - if (test_kvm_facility(vcpu->kvm, 82)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; - if (test_kvm_facility(vcpu->kvm, 133)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; - if (test_kvm_facility(vcpu->kvm, 156)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; - /* fprs can be synchronized via vrs, even if the guest has no vx. With - * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format. - */ - if (MACHINE_HAS_VX) - vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; - else - vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; - - if (kvm_is_ucontrol(vcpu->kvm)) - return __kvm_ucontrol_vcpu_init(vcpu); - - return 0; -} - /* needs disabled preemption to protect from TOD sync and vcpu_load/put */ static void __start_cpu_timer_accounting(struct kvm_vcpu *vcpu) { @@ -2844,35 +3734,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) } -static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) -{ - /* this equals initial cpu reset in pop, but we don't switch to ESA */ - vcpu->arch.sie_block->gpsw.mask = 0UL; - vcpu->arch.sie_block->gpsw.addr = 0UL; - kvm_s390_set_prefix(vcpu, 0); - kvm_s390_set_cpu_timer(vcpu, 0); - vcpu->arch.sie_block->ckc = 0UL; - vcpu->arch.sie_block->todpr = 0; - memset(vcpu->arch.sie_block->gcr, 0, 16 * sizeof(__u64)); - vcpu->arch.sie_block->gcr[0] = CR0_UNUSED_56 | - CR0_INTERRUPT_KEY_SUBMASK | - CR0_MEASUREMENT_ALERT_SUBMASK; - vcpu->arch.sie_block->gcr[14] = CR14_UNUSED_32 | - CR14_UNUSED_33 | - CR14_EXTERNAL_DAMAGE_SUBMASK; - /* make sure the new fpc will be lazily loaded */ - save_fpu_regs(); - current->thread.fpu.fpc = 0; - vcpu->arch.sie_block->gbea = 1; - vcpu->arch.sie_block->pp = 0; - vcpu->arch.sie_block->fpf &= ~FPF_BPBC; - vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; - kvm_clear_async_pf_completion_queue(vcpu); - if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) - kvm_s390_vcpu_stop(vcpu); - kvm_s390_clear_local_irqs(vcpu); -} - void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { mutex_lock(&vcpu->kvm->lock); @@ -2941,15 +3802,18 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu) { - free_page(vcpu->arch.sie_block->cbrlo); + free_page((unsigned long)phys_to_virt(vcpu->arch.sie_block->cbrlo)); vcpu->arch.sie_block->cbrlo = 0; } int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu) { - vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL); - if (!vcpu->arch.sie_block->cbrlo) + void *cbrlo_page = (void 
*)get_zeroed_page(GFP_KERNEL_ACCOUNT); + + if (!cbrlo_page) return -ENOMEM; + + vcpu->arch.sie_block->cbrlo = virt_to_phys(cbrlo_page); return 0; } @@ -2959,12 +3823,13 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ibc = model->ibc; if (test_kvm_facility(vcpu->kvm, 7)) - vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list; + vcpu->arch.sie_block->fac = virt_to_phys(model->fac_list); } -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) { int rc = 0; + u16 uvrc, uvrrc; atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | CPUSTAT_SM | @@ -2982,8 +3847,12 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT; if (test_kvm_facility(vcpu->kvm, 9)) vcpu->arch.sie_block->ecb |= ECB_SRSI; + if (test_kvm_facility(vcpu->kvm, 11)) + vcpu->arch.sie_block->ecb |= ECB_PTF; if (test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= ECB_TE; + if (!kvm_is_ucontrol(vcpu->kvm)) + vcpu->arch.sie_block->ecb |= ECB_SPECI; if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi) vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI; @@ -3011,9 +3880,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id); } - vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx) - | SDNXC; - vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; + vcpu->arch.sie_block->sdnxo = virt_to_phys(&vcpu->run->s.regs.sdnx) | SDNXC; + vcpu->arch.sie_block->riccbd = virt_to_phys(&vcpu->run->s.regs.riccb); if (sclp.has_kss) kvm_s390_set_cpuflags(vcpu, CPUSTAT_KSS); @@ -3032,62 +3900,102 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvm_s390_vcpu_crypto_setup(vcpu); + kvm_s390_vcpu_pci_setup(vcpu); + + mutex_lock(&vcpu->kvm->lock); + if (kvm_s390_pv_is_protected(vcpu->kvm)) { + rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc); + if (rc) + kvm_s390_vcpu_unsetup_cmma(vcpu); + } + mutex_unlock(&vcpu->kvm->lock); + return rc; } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, - unsigned int id) +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - struct kvm_vcpu *vcpu; - struct sie_page *sie_page; - int rc = -EINVAL; - if (!kvm_is_ucontrol(kvm) && !sca_can_add_vcpu(kvm, id)) - goto out; - - rc = -ENOMEM; + return -EINVAL; + return 0; +} - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) - goto out; +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) +{ + struct sie_page *sie_page; + int rc; BUILD_BUG_ON(sizeof(struct sie_page) != 4096); - sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL); + sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!sie_page) - goto out_free_cpu; + return -ENOMEM; vcpu->arch.sie_block = &sie_page->sie_block; - vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; + vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb); /* the real guest size will always be smaller than msl */ vcpu->arch.sie_block->mso = 0; vcpu->arch.sie_block->msl = sclp.hamax; - vcpu->arch.sie_block->icpua = id; + vcpu->arch.sie_block->icpua = vcpu->vcpu_id; spin_lock_init(&vcpu->arch.local_int.lock); - vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa_int.origin; - if (vcpu->arch.sie_block->gd && sclp.has_gisaf) - vcpu->arch.sie_block->gd |= GISA_FORMAT1; + vcpu->arch.sie_block->gd = kvm_s390_get_gisa_desc(vcpu->kvm); seqcount_init(&vcpu->arch.cputm_seqcount); - rc = 
kvm_vcpu_init(vcpu, kvm, id); + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + kvm_clear_async_pf_completion_queue(vcpu); + vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | + KVM_SYNC_GPRS | + KVM_SYNC_ACRS | + KVM_SYNC_CRS | + KVM_SYNC_ARCH0 | + KVM_SYNC_PFAULT | + KVM_SYNC_DIAG318; + kvm_s390_set_prefix(vcpu, 0); + if (test_kvm_facility(vcpu->kvm, 64)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; + if (test_kvm_facility(vcpu->kvm, 82)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; + if (test_kvm_facility(vcpu->kvm, 133)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; + if (test_kvm_facility(vcpu->kvm, 156)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; + /* fprs can be synchronized via vrs, even if the guest has no vx. With + * cpu_has_vx(), (load|store)_fpu_regs() will work with vrs format. + */ + if (cpu_has_vx()) + vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; + else + vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; + + if (kvm_is_ucontrol(vcpu->kvm)) { + rc = __kvm_ucontrol_vcpu_init(vcpu); + if (rc) + goto out_free_sie_block; + } + + VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", + vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); + trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); + + rc = kvm_s390_vcpu_setup(vcpu); if (rc) - goto out_free_sie_block; - VM_EVENT(kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", id, vcpu, - vcpu->arch.sie_block); - trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block); + goto out_ucontrol_uninit; - return vcpu; + kvm_s390_update_topology_change_report(vcpu->kvm, 1); + return 0; + +out_ucontrol_uninit: + if (kvm_is_ucontrol(vcpu->kvm)) + gmap_remove(vcpu->arch.gmap); out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); -out_free_cpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); -out: - return ERR_PTR(rc); + return rc; } int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask); return kvm_s390_vcpu_has_irq(vcpu, 0); } @@ -3139,7 +4047,7 @@ void exit_sie(struct kvm_vcpu *vcpu) /* Kick a guest cpu out of SIE to process a request synchronously */ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) { - kvm_make_request(req, vcpu); + __kvm_make_request(req, vcpu); kvm_s390_vcpu_request(vcpu); } @@ -3149,7 +4057,9 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, struct kvm *kvm = gmap->private; struct kvm_vcpu *vcpu; unsigned long prefix; - int i; + unsigned long i; + + trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap)); if (gmap_is_shadow(gmap)) return; @@ -3162,7 +4072,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) { VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx", start, end); - kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu); + kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); } } } @@ -3171,7 +4081,7 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { /* do not poll with more than halt_poll_max_steal percent of steal time */ if (S390_lowcore.avg_steal_timer * 100 / (TICK_USEC << 12) >= - halt_poll_max_steal) { + READ_ONCE(halt_poll_max_steal)) { vcpu->stat.halt_no_poll_steal++; return true; } @@ -3287,10 +4197,76 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, return r; } -static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) +static void kvm_arch_vcpu_ioctl_normal_reset(struct kvm_vcpu *vcpu) { - kvm_s390_vcpu_initial_reset(vcpu); - return 0; + 
vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_RI; + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + memset(vcpu->run->s.regs.riccb, 0, sizeof(vcpu->run->s.regs.riccb)); + + kvm_clear_async_pf_completion_queue(vcpu); + if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) + kvm_s390_vcpu_stop(vcpu); + kvm_s390_clear_local_irqs(vcpu); +} + +static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) +{ + /* Initial reset is a superset of the normal reset */ + kvm_arch_vcpu_ioctl_normal_reset(vcpu); + + /* + * This equals initial cpu reset in pop, but we don't switch to ESA. + * We do not only reset the internal data, but also ... + */ + vcpu->arch.sie_block->gpsw.mask = 0; + vcpu->arch.sie_block->gpsw.addr = 0; + kvm_s390_set_prefix(vcpu, 0); + kvm_s390_set_cpu_timer(vcpu, 0); + vcpu->arch.sie_block->ckc = 0; + memset(vcpu->arch.sie_block->gcr, 0, sizeof(vcpu->arch.sie_block->gcr)); + vcpu->arch.sie_block->gcr[0] = CR0_INITIAL_MASK; + vcpu->arch.sie_block->gcr[14] = CR14_INITIAL_MASK; + + /* ... the data in sync regs */ + memset(vcpu->run->s.regs.crs, 0, sizeof(vcpu->run->s.regs.crs)); + vcpu->run->s.regs.ckc = 0; + vcpu->run->s.regs.crs[0] = CR0_INITIAL_MASK; + vcpu->run->s.regs.crs[14] = CR14_INITIAL_MASK; + vcpu->run->psw_addr = 0; + vcpu->run->psw_mask = 0; + vcpu->run->s.regs.todpr = 0; + vcpu->run->s.regs.cputm = 0; + vcpu->run->s.regs.ckc = 0; + vcpu->run->s.regs.pp = 0; + vcpu->run->s.regs.gbea = 1; + vcpu->run->s.regs.fpc = 0; + /* + * Do not reset these registers in the protected case, as some of + * them are overlaid and they are not accessible in this case + * anyway. + */ + if (!kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->gbea = 1; + vcpu->arch.sie_block->pp = 0; + vcpu->arch.sie_block->fpf &= ~FPF_BPBC; + vcpu->arch.sie_block->todpr = 0; + } +} + +static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Clear reset is a superset of the initial reset */ + kvm_arch_vcpu_ioctl_initial_reset(vcpu); + + memset(®s->gprs, 0, sizeof(regs->gprs)); + memset(®s->vrs, 0, sizeof(regs->vrs)); + memset(®s->acrs, 0, sizeof(regs->acrs)); + memset(®s->gscb, 0, sizeof(regs->gscb)); + + regs->etoken = 0; + regs->etoken_extension = 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) @@ -3339,18 +4315,13 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) vcpu_load(vcpu); - if (test_fp_ctl(fpu->fpc)) { - ret = -EINVAL; - goto out; - } vcpu->run->s.regs.fpc = fpu->fpc; - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs, (freg_t *) fpu->fprs); else memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs)); -out: vcpu_put(vcpu); return ret; } @@ -3359,9 +4330,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { vcpu_load(vcpu); - /* make sure we have the latest values */ - save_fpu_regs(); - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_vx_to_fp((freg_t *) fpu->fprs, (__vector128 *) vcpu->run->s.regs.vrs); else @@ -3460,18 +4429,24 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, vcpu_load(vcpu); /* user space knows about this interface - let it control the state */ - vcpu->kvm->arch.user_cpu_state_ctrl = 1; + kvm_s390_set_user_cpu_state_ctrl(vcpu->kvm); switch (mp_state->mp_state) { case KVM_MP_STATE_STOPPED: - kvm_s390_vcpu_stop(vcpu); + rc = kvm_s390_vcpu_stop(vcpu); break; case KVM_MP_STATE_OPERATING: - kvm_s390_vcpu_start(vcpu); + rc = 
kvm_s390_vcpu_start(vcpu); break; case KVM_MP_STATE_LOAD: + if (!kvm_s390_pv_cpu_is_protected(vcpu)) { + rc = -ENXIO; + break; + } + rc = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR_LOAD); + break; case KVM_MP_STATE_CHECK_STOP: - /* fall through - CHECK_STOP and LOAD are not supported yet */ + fallthrough; /* CHECK_STOP and LOAD are not supported yet */ default: rc = -ENXIO; } @@ -3492,19 +4467,19 @@ retry: if (!kvm_request_pending(vcpu)) return 0; /* - * We use MMU_RELOAD just to re-arm the ipte notifier for the + * If the guest prefix changed, re-arm the ipte notifier for the * guest prefix page. gmap_mprotect_notify will wait on the ptl lock. * This ensures that the ipte instruction for this request has * already finished. We might race against a second unmapper that * wants to set the blocking bit. Lets just retry the request loop. */ - if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) { + if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) { int rc; rc = gmap_mprotect_notify(vcpu->arch.gmap, kvm_s390_get_prefix(vcpu), PAGE_SIZE * 2, PROT_WRITE); if (rc) { - kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); return rc; } goto retry; @@ -3557,30 +4532,26 @@ retry: goto retry; } - /* nothing to do, just clear the request */ - kvm_clear_request(KVM_REQ_UNHALT, vcpu); /* we left the vsie handler, nothing to do, just clear the request */ kvm_clear_request(KVM_REQ_VSIE_RESTART, vcpu); return 0; } -void kvm_s390_set_tod_clock(struct kvm *kvm, - const struct kvm_s390_vm_tod_clock *gtod) +static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod) { struct kvm_vcpu *vcpu; - struct kvm_s390_tod_clock_ext htod; - int i; + union tod_clock clk; + unsigned long i; - mutex_lock(&kvm->lock); preempt_disable(); - get_tod_clock_ext((char *)&htod); + store_tod_clock_ext(&clk); - kvm->arch.epoch = gtod->tod - htod.tod; + kvm->arch.epoch = gtod->tod - clk.tod; kvm->arch.epdx = 0; if (test_kvm_facility(kvm, 139)) { - kvm->arch.epdx = gtod->epoch_idx - htod.epoch_idx; + kvm->arch.epdx = gtod->epoch_idx - clk.ei; if (kvm->arch.epoch > gtod->tod) kvm->arch.epdx -= 1; } @@ -3593,7 +4564,15 @@ void kvm_s390_set_tod_clock(struct kvm *kvm, kvm_s390_vcpu_unblock_all(kvm); preempt_enable(); +} + +int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod) +{ + if (!mutex_trylock(&kvm->lock)) + return 0; + __kvm_s390_set_tod_clock(kvm, gtod); mutex_unlock(&kvm->lock); + return 1; } /** @@ -3629,11 +4608,13 @@ static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token, } } -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, +bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { trace_kvm_s390_pfault_init(vcpu, work->arch.pfault_token); __kvm_inject_pfault_token(vcpu, true, work->arch.pfault_token); + + return true; } void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, @@ -3649,7 +4630,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, /* s390 will always inject the page directly */ } -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) { /* * s390 will always inject the page directly, @@ -3658,33 +4639,31 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) return true; } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) { hva_t hva; struct 
kvm_arch_async_pf arch; - int rc; if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) - return 0; + return false; if ((vcpu->arch.sie_block->gpsw.mask & vcpu->arch.pfault_select) != vcpu->arch.pfault_compare) - return 0; + return false; if (psw_extint_disabled(vcpu)) - return 0; + return false; if (kvm_s390_vcpu_has_irq(vcpu, 0)) - return 0; + return false; if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) - return 0; + return false; if (!vcpu->arch.gmap->pfault_enabled) - return 0; + return false; hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr)); hva += current->thread.gmap_addr & ~PAGE_MASK; if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8)) - return 0; + return false; - rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); - return rc; + return kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); } static int vcpu_pre_run(struct kvm_vcpu *vcpu) @@ -3704,12 +4683,9 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) if (need_resched()) schedule(); - if (test_cpu_flag(CIF_MCCK_PENDING)) - s390_handle_mcck(); - if (!kvm_is_ucontrol(vcpu->kvm)) { rc = kvm_s390_deliver_pending_interrupts(vcpu); - if (rc) + if (rc || guestdbg_exit_pending(vcpu)) return rc; } @@ -3722,7 +4698,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) kvm_s390_patch_guest_per_regs(vcpu); } - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask); vcpu->arch.sie_block->icptcode = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); @@ -3816,27 +4792,30 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) current->thread.gmap_pfault = 0; if (kvm_arch_setup_async_pf(vcpu)) return 0; + vcpu->stat.pfault_sync++; return kvm_arch_fault_in_page(vcpu, current->thread.gmap_addr, 1); } return vcpu_post_run_fault_in_sie(vcpu); } +#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK) static int __vcpu_run(struct kvm_vcpu *vcpu) { int rc, exit_reason; + struct sie_page *sie_page = (struct sie_page *)vcpu->arch.sie_block; /* * We try to hold kvm->srcu during most of vcpu_run (except when run- * ning the guest), so that memslots (and other stuff) are protected */ - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); do { rc = vcpu_pre_run(vcpu); - if (rc) + if (rc || guestdbg_exit_pending(vcpu)) break; - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); /* * As PF_VCPU will be used in fault handler, between * guest_enter and guest_exit should be no uaccess. @@ -3845,23 +4824,46 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) guest_enter_irqoff(); __disable_cpu_timer_accounting(vcpu); local_irq_enable(); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(sie_page->pv_grregs, + vcpu->run->s.regs.gprs, + sizeof(sie_page->pv_grregs)); + } + if (test_cpu_flag(CIF_FPU)) + load_fpu_regs(); exit_reason = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(vcpu->run->s.regs.gprs, + sie_page->pv_grregs, + sizeof(sie_page->pv_grregs)); + /* + * We're not allowed to inject interrupts on intercepts + * that leave the guest state in an "in-between" state + * where the next SIE entry will do a continuation. + * Fence interrupts in our "internal" PSW. 
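For reference, the three interrupt classes fenced here correspond to individual system-mask bits of the guest PSW; the values below follow the architecture (and asm/ptrace.h) but are reproduced purely for illustration — PSW_INT_MASK introduced above is simply their union:

#define PSW_MASK_IO	0x0200000000000000UL	/* PSW bit 6:  I/O interruptions */
#define PSW_MASK_EXT	0x0100000000000000UL	/* PSW bit 7:  external interruptions */
#define PSW_MASK_MCHECK	0x0004000000000000UL	/* PSW bit 13: machine checks */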
+ */ + if (vcpu->arch.sie_block->icptcode == ICPT_PV_INSTR || + vcpu->arch.sie_block->icptcode == ICPT_PV_PREF) { + vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; + } + } local_irq_disable(); __enable_cpu_timer_accounting(vcpu); guest_exit_irqoff(); local_irq_enable(); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); rc = vcpu_post_run(vcpu, exit_reason); } while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); return rc; } -static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void sync_regs_fmt2(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; struct runtime_instr_cb *riccb; struct gs_cb *gscb; @@ -3869,16 +4871,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) gscb = (struct gs_cb *) &kvm_run->s.regs.gscb; vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; - if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) - kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); - if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { - memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128); - /* some control register changes require a tlb flush */ - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) { - kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm); - vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc; vcpu->arch.sie_block->todpr = kvm_run->s.regs.todpr; vcpu->arch.sie_block->pp = kvm_run->s.regs.pp; vcpu->arch.sie_block->gbea = kvm_run->s.regs.gbea; @@ -3890,6 +4883,11 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) kvm_clear_async_pf_completion_queue(vcpu); } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_DIAG318) { + vcpu->arch.diag318_info.val = kvm_run->s.regs.diag318; + vcpu->arch.sie_block->cpnc = vcpu->arch.diag318_info.cpnc; + VCPU_EVENT(vcpu, 3, "setting cpnc to %d", vcpu->arch.diag318_info.cpnc); + } /* * If userspace sets the riccb (e.g. after migration) to a valid state, * we should enable RI here instead of doing the lazy enablement. @@ -3919,23 +4917,9 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.sie_block->fpf &= ~FPF_BPBC; vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? 
FPF_BPBC : 0; } - save_access_regs(vcpu->arch.host_acrs); - restore_access_regs(vcpu->run->s.regs.acrs); - /* save host (userspace) fprs/vrs */ - save_fpu_regs(); - vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc; - vcpu->arch.host_fpregs.regs = current->thread.fpu.regs; - if (MACHINE_HAS_VX) - current->thread.fpu.regs = vcpu->run->s.regs.vrs; - else - current->thread.fpu.regs = vcpu->run->s.regs.fprs; - current->thread.fpu.fpc = vcpu->run->s.regs.fpc; - if (test_fp_ctl(current->thread.fpu.fpc)) - /* User space provided an invalid FPC, let's clear it */ - current->thread.fpu.fpc = 0; if (MACHINE_HAS_GS) { preempt_disable(); - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); if (current->thread.gs_cb) { vcpu->arch.host_gscb = current->thread.gs_cb; save_gs_cb(vcpu->arch.host_gscb); @@ -3948,25 +4932,93 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) preempt_enable(); } /* SIE will load etoken directly from SDNX and therefore kvm_run */ +} + +static void sync_regs(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + + if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) + kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); + if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { + memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128); + /* some control register changes require a tlb flush */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) { + kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm); + vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc; + } + save_access_regs(vcpu->arch.host_acrs); + restore_access_regs(vcpu->run->s.regs.acrs); + /* save host (userspace) fprs/vrs */ + save_fpu_regs(); + vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc; + vcpu->arch.host_fpregs.regs = current->thread.fpu.regs; + if (cpu_has_vx()) + current->thread.fpu.regs = vcpu->run->s.regs.vrs; + else + current->thread.fpu.regs = vcpu->run->s.regs.fprs; + current->thread.fpu.fpc = vcpu->run->s.regs.fpc; + + /* Sync fmt2 only data */ + if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) { + sync_regs_fmt2(vcpu); + } else { + /* + * In several places we have to modify our internal view to + * not do things that are disallowed by the ultravisor. For + * example we must not inject interrupts after specific exits + * (e.g. 112 prefix page not secure). We do this by turning + * off the machine check, external and I/O interrupt bits + * of our PSW copy. To avoid getting validity intercepts, we + * do only accept the condition code from userspace. 
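On the userspace side this synchronization is driven through the shared kvm_run page: registers are written into run->s.regs and flagged in kvm_dirty_regs before KVM_RUN, and sync_regs()/store_regs() move them into and out of the SIE block. A rough fragment (kvm_fd, vcpu_fd and the values are placeholders; needs the usual <linux/kvm.h>, <sys/mman.h> includes):

struct kvm_run *run = mmap(NULL, ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0),
			   PROT_READ | PROT_WRITE, MAP_SHARED, vcpu_fd, 0);

/* hand a modified prefix and a guest register back to the kernel */
run->s.regs.prefix = new_prefix;
run->s.regs.gprs[2] = 0x2000;
run->kvm_dirty_regs |= KVM_SYNC_PREFIX | KVM_SYNC_GPRS;

ioctl(vcpu_fd, KVM_RUN, 0);
/* kvm_dirty_regs is cleared by sync_regs() once consumed */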
+ */ + vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_CC; + vcpu->arch.sie_block->gpsw.mask |= kvm_run->psw_mask & + PSW_MASK_CC; + } kvm_run->kvm_dirty_regs = 0; } -static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void store_regs_fmt2(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; + + kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr; + kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; + kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; + kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; + kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val; + if (MACHINE_HAS_GS) { + preempt_disable(); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); + if (vcpu->arch.gs_enabled) + save_gs_cb(current->thread.gs_cb); + current->thread.gs_cb = vcpu->arch.host_gscb; + restore_gs_cb(vcpu->arch.host_gscb); + if (!vcpu->arch.host_gscb) + local_ctl_clear_bit(2, CR2_GUARDED_STORAGE_BIT); + vcpu->arch.host_gscb = NULL; + preempt_enable(); + } + /* SIE will save etoken directly into SDNX and therefore kvm_run */ +} + +static void store_regs(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu); memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128); kvm_run->s.regs.cputm = kvm_s390_get_cpu_timer(vcpu); kvm_run->s.regs.ckc = vcpu->arch.sie_block->ckc; - kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr; - kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; - kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; kvm_run->s.regs.pft = vcpu->arch.pfault_token; kvm_run->s.regs.pfs = vcpu->arch.pfault_select; kvm_run->s.regs.pfc = vcpu->arch.pfault_compare; - kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; save_access_regs(vcpu->run->s.regs.acrs); restore_access_regs(vcpu->arch.host_acrs); /* Save guest register state */ @@ -3975,25 +5027,24 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* Restore will be done lazily at return */ current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; - if (MACHINE_HAS_GS) { - __ctl_set_bit(2, 4); - if (vcpu->arch.gs_enabled) - save_gs_cb(current->thread.gs_cb); - preempt_disable(); - current->thread.gs_cb = vcpu->arch.host_gscb; - restore_gs_cb(vcpu->arch.host_gscb); - preempt_enable(); - if (!vcpu->arch.host_gscb) - __ctl_clear_bit(2, 4); - vcpu->arch.host_gscb = NULL; - } - /* SIE will save etoken directly into SDNX and therefore kvm_run */ + if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) + store_regs_fmt2(vcpu); } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; int rc; + /* + * Running a VM while dumping always has the potential to + * produce inconsistent dump data. But for PV vcpus a SIE + * entry while dumping could also lead to a fatal validity + * intercept which we absolutely want to avoid. 
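This also shapes the userspace run loop: both a pending signal and the immediate_exit fast path surface as EINTR, while a VM in dump state is rejected with EINVAL outright, per the check below. A trimmed sketch of the usual loop (error handling reduced to err() from <err.h>):

for (;;) {
	if (ioctl(vcpu_fd, KVM_RUN, 0) == -1) {
		if (errno == EINTR)
			continue;	/* interrupted by a signal; retry */
		err(1, "KVM_RUN");	/* includes EINVAL while dumping */
	}

	switch (run->exit_reason) {
	case KVM_EXIT_S390_SIEIC:
		/* intercept needs userspace handling */
		break;
	default:
		break;
	}
}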
+ */ + if (vcpu->kvm->arch.pv.dumping) + return -EINVAL; + if (kvm_run->immediate_exit) return -EINTR; @@ -4011,6 +5062,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_sigset_activate(vcpu); + /* + * no need to check the return value of vcpu_start as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) { kvm_s390_vcpu_start(vcpu); } else if (is_vcpu_stopped(vcpu)) { @@ -4020,7 +5075,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } - sync_regs(vcpu, kvm_run); + sync_regs(vcpu); enable_cpu_timer_accounting(vcpu); might_fault(); @@ -4042,7 +5097,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } disable_cpu_timer_accounting(vcpu); - store_regs(vcpu, kvm_run); + store_regs(vcpu); kvm_sigset_deactivate(vcpu); @@ -4079,7 +5134,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa) gpa -= __LC_FPREGS_SAVE_AREA; /* manually convert vector registers if necessary */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs); rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA, fprs, 128); @@ -4132,7 +5187,7 @@ static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu) static void __disable_ibs_on_all_vcpus(struct kvm *kvm) { - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -4148,20 +5203,29 @@ static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu) kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu); } -void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) +int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) { - int i, online_vcpus, started_vcpus = 0; + int i, online_vcpus, r = 0, started_vcpus = 0; if (!is_vcpu_stopped(vcpu)) - return; + return 0; trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1); /* Only one cpu at a time may enter/leave the STOPPED state. */ spin_lock(&vcpu->kvm->arch.start_stop_lock); online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); + /* Let's tell the UV that we want to change into the operating state */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR); + if (r) { + spin_unlock(&vcpu->kvm->arch.start_stop_lock); + return r; + } + } + for (i = 0; i < online_vcpus; i++) { - if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) + if (!is_vcpu_stopped(kvm_get_vcpu(vcpu->kvm, i))) started_vcpus++; } @@ -4172,44 +5236,67 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) /* * As we are starting a second VCPU, we have to disable * the IBS facility on all VCPUs to remove potentially - * oustanding ENABLE requests. + * outstanding ENABLE requests. */ __disable_ibs_on_all_vcpus(vcpu->kvm); } kvm_s390_clear_cpuflags(vcpu, CPUSTAT_STOPPED); /* + * The real PSW might have changed due to a RESTART interpreted by the + * ultravisor. We block all interrupts and let the next sie exit + * refresh our view. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; + /* * Another VCPU might have used IBS while we were offline. * Let's play safe and flush the VCPU at startup. 
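Userspace reaches these start/stop primitives through the KVM_SET_MP_STATE vcpu ioctl, which is also where the error codes introduced above become visible. A minimal sketch:

struct kvm_mp_state state = { .mp_state = KVM_MP_STATE_OPERATING };

/* for a protected vcpu this can now fail if the Ultravisor refuses
 * the transition, see kvm_s390_pv_set_cpu_state() above */
if (ioctl(vcpu_fd, KVM_SET_MP_STATE, &state) == -1)
	err(1, "KVM_SET_MP_STATE");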
*/ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); spin_unlock(&vcpu->kvm->arch.start_stop_lock); - return; + return 0; } -void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) +int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) { - int i, online_vcpus, started_vcpus = 0; + int i, online_vcpus, r = 0, started_vcpus = 0; struct kvm_vcpu *started_vcpu = NULL; if (is_vcpu_stopped(vcpu)) - return; + return 0; trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 0); /* Only one cpu at a time may enter/leave the STOPPED state. */ spin_lock(&vcpu->kvm->arch.start_stop_lock); online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); - /* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */ - kvm_s390_clear_stop_irq(vcpu); + /* Let's tell the UV that we want to change into the stopped state */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_STP); + if (r) { + spin_unlock(&vcpu->kvm->arch.start_stop_lock); + return r; + } + } + /* + * Set the VCPU to STOPPED and THEN clear the interrupt flag, + * now that the SIGP STOP and SIGP STOP AND STORE STATUS orders + * have been fully processed. This will ensure that the VCPU + * is kept BUSY if another VCPU is inquiring with SIGP SENSE. + */ kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOPPED); + kvm_s390_clear_stop_irq(vcpu); + __disable_ibs_on_vcpu(vcpu); for (i = 0; i < online_vcpus; i++) { - if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) { + struct kvm_vcpu *tmp = kvm_get_vcpu(vcpu->kvm, i); + + if (!is_vcpu_stopped(tmp)) { started_vcpus++; - started_vcpu = vcpu->kvm->vcpus[i]; + started_vcpu = tmp; } } @@ -4222,7 +5309,7 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) } spin_unlock(&vcpu->kvm->arch.start_stop_lock); - return; + return 0; } static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, @@ -4249,64 +5336,116 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } -static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, +static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; - void *tmpbuf = NULL; - int r, srcu_idx; - const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION - | KVM_S390_MEMOP_F_CHECK_ONLY; + void *sida_addr; + int r = 0; - if (mop->flags & ~supported_flags || mop->ar >= NUM_ACRS || !mop->size) + if (mop->flags || !mop->size) return -EINVAL; - - if (mop->size > MEM_OP_MAX_SIZE) + if (mop->size + mop->sida_offset < mop->size) + return -EINVAL; + if (mop->size + mop->sida_offset > sida_size(vcpu->arch.sie_block)) return -E2BIG; + if (!kvm_s390_pv_cpu_is_protected(vcpu)) + return -EINVAL; + + sida_addr = (char *)sida_addr(vcpu->arch.sie_block) + mop->sida_offset; + switch (mop->op) { + case KVM_S390_MEMOP_SIDA_READ: + if (copy_to_user(uaddr, sida_addr, mop->size)) + r = -EFAULT; + + break; + case KVM_S390_MEMOP_SIDA_WRITE: + if (copy_from_user(sida_addr, uaddr, mop->size)) + r = -EFAULT; + break; + } + return r; +} + +static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + enum gacc_mode acc_mode; + void *tmpbuf = NULL; + int r; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION | + KVM_S390_MEMOP_F_CHECK_ONLY | + KVM_S390_MEMOP_F_SKEY_PROTECTION); + if (r) + return r; + if (mop->ar >= NUM_ACRS) + return -EINVAL; + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return -EINVAL; if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { tmpbuf = vmalloc(mop->size); if (!tmpbuf) return -ENOMEM; } + acc_mode = mop->op 
== KVM_S390_MEMOP_LOGICAL_READ ? GACC_FETCH : GACC_STORE; + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, + acc_mode, mop->key); + goto out_inject; + } + if (acc_mode == GACC_FETCH) { + r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, + mop->size, mop->key); + if (r) + goto out_inject; + if (copy_to_user(uaddr, tmpbuf, mop->size)) { + r = -EFAULT; + goto out_free; + } + } else { + if (copy_from_user(tmpbuf, uaddr, mop->size)) { + r = -EFAULT; + goto out_free; + } + r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, + mop->size, mop->key); + } + +out_inject: + if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) + kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); + +out_free: + vfree(tmpbuf); + return r; +} + +static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + int r, srcu_idx; + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); switch (mop->op) { case KVM_S390_MEMOP_LOGICAL_READ: - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gva_range(vcpu, mop->gaddr, mop->ar, - mop->size, GACC_FETCH); - break; - } - r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); - if (r == 0) { - if (copy_to_user(uaddr, tmpbuf, mop->size)) - r = -EFAULT; - } - break; case KVM_S390_MEMOP_LOGICAL_WRITE: - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gva_range(vcpu, mop->gaddr, mop->ar, - mop->size, GACC_STORE); - break; - } - if (copy_from_user(tmpbuf, uaddr, mop->size)) { - r = -EFAULT; - break; - } - r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); + r = kvm_s390_vcpu_mem_op(vcpu, mop); + break; + case KVM_S390_MEMOP_SIDA_READ: + case KVM_S390_MEMOP_SIDA_WRITE: + /* we are locked against sida going away by the vcpu->mutex */ + r = kvm_s390_vcpu_sida_op(vcpu, mop); break; default: r = -EINVAL; } srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - - if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) - kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); - - vfree(tmpbuf); return r; } @@ -4315,6 +5454,7 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp, { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; + int rc; switch (ioctl) { case KVM_S390_IRQ: { @@ -4322,7 +5462,8 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp, if (copy_from_user(&s390irq, argp, sizeof(s390irq))) return -EFAULT; - return kvm_s390_inject_vcpu(vcpu, &s390irq); + rc = kvm_s390_inject_vcpu(vcpu, &s390irq); + break; } case KVM_S390_INTERRUPT: { struct kvm_s390_interrupt s390int; @@ -4332,10 +5473,67 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp, return -EFAULT; if (s390int_to_s390irq(&s390int, &s390irq)) return -EINVAL; - return kvm_s390_inject_vcpu(vcpu, &s390irq); + rc = kvm_s390_inject_vcpu(vcpu, &s390irq); + break; } + default: + rc = -ENOIOCTLCMD; + break; } - return -ENOIOCTLCMD; + + /* + * To simplify single stepping of userspace-emulated instructions, + * KVM_EXIT_S390_SIEIC exit sets KVM_GUESTDBG_EXIT_PENDING (see + * should_handle_per_ifetch()). However, if userspace emulation injects + * an interrupt, it needs to be cleared, so that KVM_EXIT_DEBUG happens + * after (and not before) the interrupt delivery. 
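The injection this refers to arrives through the KVM_S390_IRQ vcpu ioctl handled above; a minimal userspace sketch, with the restart order chosen arbitrarily for illustration:

struct kvm_s390_irq irq = {
	.type = KVM_S390_RESTART,
};

/* asynchronous: may be issued while another thread sits in KVM_RUN */
if (ioctl(vcpu_fd, KVM_S390_IRQ, &irq) == -1)
	err(1, "KVM_S390_IRQ");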
+ */ + if (!rc) + vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING; + + return rc; +} + +static int kvm_s390_handle_pv_vcpu_dump(struct kvm_vcpu *vcpu, + struct kvm_pv_cmd *cmd) +{ + struct kvm_s390_pv_dmp dmp; + void *data; + int ret; + + /* Dump initialization is a prerequisite */ + if (!vcpu->kvm->arch.pv.dumping) + return -EINVAL; + + if (copy_from_user(&dmp, (__u8 __user *)cmd->data, sizeof(dmp))) + return -EFAULT; + + /* We only handle this subcmd right now */ + if (dmp.subcmd != KVM_PV_DUMP_CPU) + return -EINVAL; + + /* CPU dump length is the same as create cpu storage donation. */ + if (dmp.buff_len != uv_info.guest_cpu_stor_len) + return -EINVAL; + + data = kvzalloc(uv_info.guest_cpu_stor_len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = kvm_s390_pv_dump_cpu(vcpu, data, &cmd->rc, &cmd->rrc); + + VCPU_EVENT(vcpu, 3, "PROTVIRT DUMP CPU %d rc %x rrc %x", + vcpu->vcpu_id, cmd->rc, cmd->rrc); + + if (ret) + ret = -EINVAL; + + /* On success copy over the dump data */ + if (!ret && copy_to_user((__u8 __user *)dmp.buff_addr, data, uv_info.guest_cpu_stor_len)) + ret = -EFAULT; + + kvfree(data); + return ret; } long kvm_arch_vcpu_ioctl(struct file *filp, @@ -4345,13 +5543,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int idx; long r; + u16 rc, rrc; vcpu_load(vcpu); switch (ioctl) { case KVM_S390_STORE_STATUS: idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_s390_vcpu_store_status(vcpu, arg); + r = kvm_s390_store_status_unloaded(vcpu, arg); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; case KVM_S390_SET_INITIAL_PSW: { @@ -4363,12 +5562,43 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw); break; } + case KVM_S390_CLEAR_RESET: + r = 0; + kvm_arch_vcpu_ioctl_clear_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET_CLEAR, &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET CLEAR VCPU: rc %x rrc %x", + rc, rrc); + } + break; case KVM_S390_INITIAL_RESET: - r = kvm_arch_vcpu_ioctl_initial_reset(vcpu); + r = 0; + kvm_arch_vcpu_ioctl_initial_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET_INITIAL, + &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET INITIAL VCPU: rc %x rrc %x", + rc, rrc); + } + break; + case KVM_S390_NORMAL_RESET: + r = 0; + kvm_arch_vcpu_ioctl_normal_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET, &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET NORMAL VCPU: rc %x rrc %x", + rc, rrc); + } break; case KVM_SET_ONE_REG: case KVM_GET_ONE_REG: { struct kvm_one_reg reg; + r = -EINVAL; + if (kvm_s390_pv_cpu_is_protected(vcpu)) + break; r = -EFAULT; if (copy_from_user(®, argp, sizeof(reg))) break; @@ -4431,7 +5661,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_s390_mem_op mem_op; if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) - r = kvm_s390_guest_mem_op(vcpu, &mem_op); + r = kvm_s390_vcpu_memsida_op(vcpu, &mem_op); else r = -EFAULT; break; @@ -4470,6 +5700,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp, irq_state.len); break; } + case KVM_S390_PV_CPU_COMMAND: { + struct kvm_pv_cmd cmd; + + r = -EINVAL; + if (!is_prot_virt_host()) + break; + + r = -EFAULT; + if (copy_from_user(&cmd, argp, sizeof(cmd))) + break; + + r = -EINVAL; + if (cmd.flags) + break; + + /* We only handle this cmd right now */ + if (cmd.cmd != 
KVM_PV_DUMP) + break; + + r = kvm_s390_handle_pv_vcpu_dump(vcpu, &cmd); + + /* Always copy over UV rc / rrc data */ + if (copy_to_user((__u8 __user *)argp, &cmd.rc, + sizeof(cmd.rc) + sizeof(cmd.rrc))) + r = -EFAULT; + break; + } default: r = -ENOTTY; } @@ -4491,38 +5748,63 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) { - return 0; + return true; } /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, enum kvm_mr_change change) { - /* A few sanity checks. We can have memory slots which have to be - located/ended at a segment boundary (1MB). The memory in userland is - ok to be fragmented into various different vmas. It is okay to mmap() - and munmap() stuff in this slot after doing this call at any time */ + gpa_t size; - if (mem->userspace_addr & 0xffffful) + /* When we are protected, we should not change the memory slots */ + if (kvm_s390_pv_get_handle(kvm)) return -EINVAL; - if (mem->memory_size & 0xffffful) - return -EINVAL; + if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) { + /* + * A few sanity checks. We can have memory slots which have to be + * located/ended at a segment boundary (1MB). The memory in userland is + * ok to be fragmented into various different vmas. It is okay to mmap() + * and munmap() stuff in this slot after doing this call at any time + */ - if (mem->guest_phys_addr + mem->memory_size > kvm->arch.mem_limit) - return -EINVAL; + if (new->userspace_addr & 0xffffful) + return -EINVAL; + + size = new->npages * PAGE_SIZE; + if (size & 0xffffful) + return -EINVAL; + + if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit) + return -EINVAL; + } + + if (!kvm->arch.migration_mode) + return 0; + + /* + * Turn off migration mode when: + * - userspace creates a new memslot with dirty logging off, + * - userspace modifies an existing memslot (MOVE or FLAGS_ONLY) and + * dirty logging is turned off. + * Migration mode expects dirty page logging being enabled to store + * its dirty bitmap. 
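Both constraints meet in the userspace slot setup: the size and the userspace address must be aligned to the 1 MB segment size (the 0xfffff masks above), and keeping migration mode alive requires the slot to have dirty logging enabled. A sketch, assuming mem points to a buffer that is itself segment aligned:

#define SEG_SIZE (1UL << 20)	/* 1 MB segment */

struct kvm_userspace_memory_region region = {
	.slot = 0,
	.flags = KVM_MEM_LOG_DIRTY_PAGES,	/* keeps migration mode on */
	.guest_phys_addr = 0,
	.memory_size = 256 * SEG_SIZE,
	.userspace_addr = (uint64_t)(uintptr_t)mem,	/* segment aligned */
};

if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) == -1)
	err(1, "KVM_SET_USER_MEMORY_REGION");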
+ */ + if (change != KVM_MR_DELETE && + !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + WARN(kvm_s390_vm_stop_migration(kvm), + "Failed to stop migration mode"); return 0; } void kvm_arch_commit_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, + struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { @@ -4538,10 +5820,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, old->npages * PAGE_SIZE); if (rc) break; - /* FALLTHROUGH */ + fallthrough; case KVM_MR_CREATE: - rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr, - mem->guest_phys_addr, mem->memory_size); + rc = gmap_map_segment(kvm->arch.gmap, new->userspace_addr, + new->base_gfn * PAGE_SIZE, + new->npages * PAGE_SIZE); break; case KVM_MR_FLAGS_ONLY: break; @@ -4560,14 +5843,9 @@ static inline unsigned long nonhyp_mask(int i) return 0x0000ffffffffffffUL >> (nonhyp_fai << 4); } -void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) -{ - vcpu->valid_wakeup = false; -} - static int __init kvm_s390_init(void) { - int i; + int i, r; if (!sclp.has_sief2) { pr_info("SIE is not available\n"); @@ -4581,14 +5859,25 @@ static int __init kvm_s390_init(void) for (i = 0; i < 16; i++) kvm_s390_fac_base[i] |= - S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i); + stfle_fac_list[i] & nonhyp_mask(i); + + r = __kvm_s390_init(); + if (r) + return r; - return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (r) { + __kvm_s390_exit(); + return r; + } + return 0; } static void __exit kvm_s390_exit(void) { kvm_exit(); + + __kvm_s390_exit(); } module_init(kvm_s390_init); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 6d9448dbd052..a7ea80cfa445 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -2,7 +2,7 @@ /* * definition for kvm on s390 * - * Copyright IBM Corp. 2008, 2009 + * Copyright IBM Corp. 
2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -15,6 +15,7 @@ #include <linux/hrtimer.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/lockdep.h> #include <asm/facility.h> #include <asm/processor.h> #include <asm/sclp.h> @@ -22,9 +23,21 @@ /* Transactional Memory Execution related macros */ #define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE)) #define TDB_FORMAT1 1 -#define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1)) +#define IS_ITDB_VALID(vcpu) \ + ((*(char *)phys_to_virt((vcpu)->arch.sie_block->itdba) == TDB_FORMAT1)) extern debug_info_t *kvm_s390_dbf; +extern debug_info_t *kvm_s390_dbf_uv; + +#define KVM_UV_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ +do { \ + debug_sprintf_event((d_kvm)->arch.dbf, d_loglevel, d_string "\n", \ + d_args); \ + debug_sprintf_event(kvm_s390_dbf_uv, d_loglevel, \ + "%d: " d_string "\n", (d_kvm)->userspace_pid, \ + d_args); \ +} while (0) + #define KVM_EVENT(d_loglevel, d_string, d_args...)\ do { \ debug_sprintf_event(kvm_s390_dbf, d_loglevel, d_string "\n", \ @@ -67,7 +80,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + return test_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) @@ -93,7 +106,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) prefix); vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); } static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar) @@ -196,6 +209,67 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) return kvm->arch.user_cpu_state_ctrl != 0; } +static inline void kvm_s390_set_user_cpu_state_ctrl(struct kvm *kvm) +{ + if (kvm->arch.user_cpu_state_ctrl) + return; + + VM_EVENT(kvm, 3, "%s", "ENABLE: Userspace CPU state control"); + kvm->arch.user_cpu_state_ctrl = 1; +} + +/* get the end gfn of the last (highest gfn) memslot */ +static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots) +{ + struct rb_node *node; + struct kvm_memory_slot *ms; + + if (WARN_ON(kvm_memslots_empty(slots))) + return 0; + + node = rb_last(&slots->gfn_tree); + ms = container_of(node, struct kvm_memory_slot, gfn_node[slots->node_idx]); + return ms->base_gfn + ms->npages; +} + +static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) +{ + u32 gd = virt_to_phys(kvm->arch.gisa_int.origin); + + if (gd && sclp.has_gisaf) + gd |= GISA_FORMAT1; + return gd; +} + +/* implemented in pv.c */ +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, + u16 *rrc); +int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, + unsigned long tweak, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 
state); +int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc); +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc); +int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, + u16 *rc, u16 *rrc); + +static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm) +{ + return kvm->arch.pv.handle; +} + +static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pv.handle; +} + /* implemented in interrupt.c */ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); @@ -281,13 +355,12 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); /* implemented in kvm-s390.c */ -void kvm_s390_set_tod_clock(struct kvm *kvm, - const struct kvm_s390_vm_tod_clock *gtod); +int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); -void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); -void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); +int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); +int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu); bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu); @@ -297,13 +370,14 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); +int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); static inline void kvm_s390_vcpu_block_all(struct kvm *kvm) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; WARN_ON(!mutex_is_locked(&kvm->lock)); @@ -313,7 +387,7 @@ static inline void kvm_s390_vcpu_block_all(struct kvm *kvm) static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) @@ -373,6 +447,7 @@ void kvm_s390_destroy_adapters(struct kvm *kvm); int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu); extern struct kvm_device_ops kvm_flic_ops; int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu); +int kvm_s390_is_restart_irq_pending(struct kvm_vcpu *vcpu); void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu); int kvm_s390_set_irq_state(struct kvm_vcpu *vcpu, void __user *buf, int len); @@ -381,7 +456,9 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, void kvm_s390_gisa_init(struct kvm *kvm); void kvm_s390_gisa_clear(struct kvm *kvm); void kvm_s390_gisa_destroy(struct kvm *kvm); -int kvm_s390_gib_init(u8 nisc); +void kvm_s390_gisa_disable(struct kvm *kvm); +void kvm_s390_gisa_enable(struct kvm *kvm); +int __init kvm_s390_gib_init(u8 nisc); void kvm_s390_gib_destroy(void); /* implemented in guestdbg.c */ @@ -426,4 +503,22 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, * @kvm: the KVM guest */ void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm); + +/** + * kvm_s390_vcpu_pci_enable_interp + * + * Set the associated PCI attributes for each vcpu to allow for zPCI Load/Store + * interpretation as well as adapter interruption forwarding. 
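The two hunks above also switch the vCPU iteration index from int to unsigned long, matching the kvm_for_each_vcpu() iterator. A minimal caller sketch following the same pattern (illustrative only; the function name is invented, kvm_s390_vcpu_wakeup() is declared earlier in this header):

/* Illustrative only: iterate vCPUs with an unsigned long index. */
static void example_wake_all_vcpus(struct kvm *kvm)
{
	unsigned long i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_s390_vcpu_wakeup(vcpu);
}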
+ * + * @kvm: the KVM guest + */ +void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm); + +/** + * diag9c_forwarding_hz + * + * Set the maximum number of diag9c forwarding operations per second + */ +extern unsigned int diag9c_forwarding_hz; + #endif diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c new file mode 100644 index 000000000000..ffa7739c7a28 --- /dev/null +++ b/arch/s390/kvm/pci.c @@ -0,0 +1,704 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * s390 kvm PCI passthrough support + * + * Copyright IBM Corp. 2022 + * + * Author(s): Matthew Rosato <mjrosato@linux.ibm.com> + */ + +#include <linux/kvm_host.h> +#include <linux/pci.h> +#include <asm/pci.h> +#include <asm/pci_insn.h> +#include <asm/pci_io.h> +#include <asm/sclp.h> +#include "pci.h" +#include "kvm-s390.h" + +struct zpci_aift *aift; + +static inline int __set_irq_noiib(u16 ctl, u8 isc) +{ + union zpci_sic_iib iib = {{0}}; + + return zpci_set_irq_ctrl(ctl, isc, &iib); +} + +void kvm_s390_pci_aen_exit(void) +{ + unsigned long flags; + struct kvm_zdev **gait_kzdev; + + lockdep_assert_held(&aift->aift_lock); + + /* + * Contents of the aipb remain registered for the life of the host + * kernel; the information is preserved in zpci_aipb and zpci_aif_sbv + * in case the KVM module is inserted again later. Clear the AIFT + * information and free anything not registered with underlying + * firmware. + */ + spin_lock_irqsave(&aift->gait_lock, flags); + gait_kzdev = aift->kzdev; + aift->gait = NULL; + aift->sbv = NULL; + aift->kzdev = NULL; + spin_unlock_irqrestore(&aift->gait_lock, flags); + + kfree(gait_kzdev); +} + +static int zpci_setup_aipb(u8 nisc) +{ + struct page *page; + int size, rc; + + zpci_aipb = kzalloc(sizeof(union zpci_sic_iib), GFP_KERNEL); + if (!zpci_aipb) + return -ENOMEM; + + aift->sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL); + if (!aift->sbv) { + rc = -ENOMEM; + goto free_aipb; + } + zpci_aif_sbv = aift->sbv; + size = get_order(PAGE_ALIGN(ZPCI_NR_DEVICES * + sizeof(struct zpci_gaite))); + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, size); + if (!page) { + rc = -ENOMEM; + goto free_sbv; + } + aift->gait = (struct zpci_gaite *)page_to_virt(page); + + zpci_aipb->aipb.faisb = virt_to_phys(aift->sbv->vector); + zpci_aipb->aipb.gait = virt_to_phys(aift->gait); + zpci_aipb->aipb.afi = nisc; + zpci_aipb->aipb.faal = ZPCI_NR_DEVICES; + + /* Setup Adapter Event Notification Interpretation */ + if (zpci_set_irq_ctrl(SIC_SET_AENI_CONTROLS, 0, zpci_aipb)) { + rc = -EIO; + goto free_gait; + } + + return 0; + +free_gait: + free_pages((unsigned long)aift->gait, size); +free_sbv: + airq_iv_release(aift->sbv); + zpci_aif_sbv = NULL; +free_aipb: + kfree(zpci_aipb); + zpci_aipb = NULL; + + return rc; +} + +static int zpci_reset_aipb(u8 nisc) +{ + /* + * AEN registration can only happen once per system boot. If + * an aipb already exists, then AEN was already registered and + * we can re-use the aipb contents. This can only happen if + * the KVM module was removed and re-inserted. However, we must + * ensure that the same forwarding ISC is used, as this is assigned + * during KVM module load. 
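To make the setup/teardown pairing concrete, here is a hypothetical module-lifecycle sketch (not part of the patch) built on kvm_s390_pci_aen_init(), defined just below, and kvm_s390_pci_aen_exit() above; note the exit path expects aift_lock to be held, per the lockdep assertion:

/* Hypothetical lifecycle sketch; function name invented. */
static int example_aen_lifecycle(u8 nisc)
{
	int rc;

	rc = kvm_s390_pci_aen_init(nisc);	/* first load sets up the aipb */
	if (rc)
		return rc;

	/* ... forward adapter events while the module is loaded ... */

	mutex_lock(&aift->aift_lock);
	kvm_s390_pci_aen_exit();	/* aipb stays registered for a reload */
	mutex_unlock(&aift->aift_lock);
	return 0;
}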
+ */ + if (zpci_aipb->aipb.afi != nisc) + return -EINVAL; + + aift->sbv = zpci_aif_sbv; + aift->gait = phys_to_virt(zpci_aipb->aipb.gait); + + return 0; +} + +int kvm_s390_pci_aen_init(u8 nisc) +{ + int rc = 0; + + /* If already enabled for AEN, bail out now */ + if (aift->gait || aift->sbv) + return -EPERM; + + mutex_lock(&aift->aift_lock); + aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev *), + GFP_KERNEL); + if (!aift->kzdev) { + rc = -ENOMEM; + goto unlock; + } + + if (!zpci_aipb) + rc = zpci_setup_aipb(nisc); + else + rc = zpci_reset_aipb(nisc); + if (rc) + goto free_zdev; + + /* Enable floating IRQs */ + if (__set_irq_noiib(SIC_IRQ_MODE_SINGLE, nisc)) { + rc = -EIO; + kvm_s390_pci_aen_exit(); + } + + goto unlock; + +free_zdev: + kfree(aift->kzdev); +unlock: + mutex_unlock(&aift->aift_lock); + return rc; +} + +/* Modify PCI: Register floating adapter interruption forwarding */ +static int kvm_zpci_set_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); + struct zpci_fib fib = {}; + u8 status; + + fib.fmt0.isc = zdev->kzdev->fib.fmt0.isc; + fib.fmt0.sum = 1; /* enable summary notifications */ + fib.fmt0.noi = airq_iv_end(zdev->aibv); + fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector); + fib.fmt0.aibvo = 0; + fib.fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib.fmt0.aisbo = zdev->aisb & 63; + fib.gd = zdev->gisa; + + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister floating adapter interruption forwarding */ +static int kvm_zpci_clear_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); + struct zpci_fib fib = {}; + u8 cc, status; + + fib.gd = zdev->gisa; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? 
-EIO : 0; +} + +static inline void unaccount_mem(unsigned long nr_pages) +{ + struct user_struct *user = get_uid(current_user()); + + if (user) + atomic_long_sub(nr_pages, &user->locked_vm); + if (current->mm) + atomic64_sub(nr_pages, ¤t->mm->pinned_vm); +} + +static inline int account_mem(unsigned long nr_pages) +{ + struct user_struct *user = get_uid(current_user()); + unsigned long page_limit, cur_pages, new_pages; + + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + do { + cur_pages = atomic_long_read(&user->locked_vm); + new_pages = cur_pages + nr_pages; + if (new_pages > page_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, + new_pages) != cur_pages); + + atomic64_add(nr_pages, ¤t->mm->pinned_vm); + + return 0; +} + +static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib, + bool assist) +{ + struct page *pages[1], *aibv_page, *aisb_page = NULL; + unsigned int msi_vecs, idx; + struct zpci_gaite *gaite; + unsigned long hva, bit; + struct kvm *kvm; + phys_addr_t gaddr; + int rc = 0, gisc, npages, pcount = 0; + + /* + * Interrupt forwarding is only applicable if the device is already + * enabled for interpretation + */ + if (zdev->gisa == 0) + return -EINVAL; + + kvm = zdev->kzdev->kvm; + msi_vecs = min_t(unsigned int, fib->fmt0.noi, zdev->max_msi); + + /* Get the associated forwarding ISC - if invalid, return the error */ + gisc = kvm_s390_gisc_register(kvm, fib->fmt0.isc); + if (gisc < 0) + return gisc; + + /* Replace AIBV address */ + idx = srcu_read_lock(&kvm->srcu); + hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aibv)); + npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, pages); + srcu_read_unlock(&kvm->srcu, idx); + if (npages < 1) { + rc = -EIO; + goto out; + } + aibv_page = pages[0]; + pcount++; + gaddr = page_to_phys(aibv_page) + (fib->fmt0.aibv & ~PAGE_MASK); + fib->fmt0.aibv = gaddr; + + /* Pin the guest AISB if one was specified */ + if (fib->fmt0.sum == 1) { + idx = srcu_read_lock(&kvm->srcu); + hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aisb)); + npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, + pages); + srcu_read_unlock(&kvm->srcu, idx); + if (npages < 1) { + rc = -EIO; + goto unpin1; + } + aisb_page = pages[0]; + pcount++; + } + + /* Account for pinned pages, roll back on failure */ + if (account_mem(pcount)) + goto unpin2; + + /* AISB must be allocated before we can fill in GAITE */ + mutex_lock(&aift->aift_lock); + bit = airq_iv_alloc_bit(aift->sbv); + if (bit == -1UL) + goto unlock; + zdev->aisb = bit; /* store the summary bit number */ + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | + AIRQ_IV_BITLOCK | + AIRQ_IV_GUESTVEC, + phys_to_virt(fib->fmt0.aibv)); + + spin_lock_irq(&aift->gait_lock); + gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * + sizeof(struct zpci_gaite)); + + /* If assist not requested, host will get all alerts */ + if (assist) + gaite->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); + else + gaite->gisa = 0; + + gaite->gisc = fib->fmt0.isc; + gaite->count++; + gaite->aisbo = fib->fmt0.aisbo; + gaite->aisb = virt_to_phys(page_address(aisb_page) + (fib->fmt0.aisb & + ~PAGE_MASK)); + aift->kzdev[zdev->aisb] = zdev->kzdev; + spin_unlock_irq(&aift->gait_lock); + + /* Update guest FIB for re-issue */ + fib->fmt0.aisbo = zdev->aisb & 63; + fib->fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib->fmt0.isc = gisc; + + /* Save some guest fib values in the host for later use */ + zdev->kzdev->fib.fmt0.isc = 
fib->fmt0.isc; + zdev->kzdev->fib.fmt0.aibv = fib->fmt0.aibv; + mutex_unlock(&aift->aift_lock); + + /* Issue the clp to setup the irq now */ + rc = kvm_zpci_set_airq(zdev); + return rc; + +unlock: + mutex_unlock(&aift->aift_lock); +unpin2: + if (fib->fmt0.sum == 1) + unpin_user_page(aisb_page); +unpin1: + unpin_user_page(aibv_page); +out: + return rc; +} + +static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force) +{ + struct kvm_zdev *kzdev = zdev->kzdev; + struct zpci_gaite *gaite; + struct page *vpage = NULL, *spage = NULL; + int rc, pcount = 0; + u8 isc; + + if (zdev->gisa == 0) + return -EINVAL; + + mutex_lock(&aift->aift_lock); + + /* + * If the clear fails due to an error, leave now unless we know this + * device is about to go away (force) -- In that case clear the GAITE + * regardless. + */ + rc = kvm_zpci_clear_airq(zdev); + if (rc && !force) + goto out; + + if (zdev->kzdev->fib.fmt0.aibv == 0) + goto out; + spin_lock_irq(&aift->gait_lock); + gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * + sizeof(struct zpci_gaite)); + isc = gaite->gisc; + gaite->count--; + if (gaite->count == 0) { + /* Release guest AIBV and AISB */ + vpage = phys_to_page(kzdev->fib.fmt0.aibv); + if (gaite->aisb != 0) + spage = phys_to_page(gaite->aisb); + /* Clear the GAIT entry */ + gaite->aisb = 0; + gaite->gisc = 0; + gaite->aisbo = 0; + gaite->gisa = 0; + aift->kzdev[zdev->aisb] = NULL; + /* Clear zdev info */ + airq_iv_free_bit(aift->sbv, zdev->aisb); + airq_iv_release(zdev->aibv); + zdev->aisb = 0; + zdev->aibv = NULL; + } + spin_unlock_irq(&aift->gait_lock); + kvm_s390_gisc_unregister(kzdev->kvm, isc); + kzdev->fib.fmt0.isc = 0; + kzdev->fib.fmt0.aibv = 0; + + if (vpage) { + unpin_user_page(vpage); + pcount++; + } + if (spage) { + unpin_user_page(spage); + pcount++; + } + if (pcount > 0) + unaccount_mem(pcount); +out: + mutex_unlock(&aift->aift_lock); + + return rc; +} + +static int kvm_s390_pci_dev_open(struct zpci_dev *zdev) +{ + struct kvm_zdev *kzdev; + + kzdev = kzalloc(sizeof(struct kvm_zdev), GFP_KERNEL); + if (!kzdev) + return -ENOMEM; + + kzdev->zdev = zdev; + zdev->kzdev = kzdev; + + return 0; +} + +static void kvm_s390_pci_dev_release(struct zpci_dev *zdev) +{ + struct kvm_zdev *kzdev; + + kzdev = zdev->kzdev; + WARN_ON(kzdev->zdev != zdev); + zdev->kzdev = NULL; + kfree(kzdev); +} + + +/* + * Register device with the specified KVM. If interpretation facilities are + * available, enable them and let userspace indicate whether or not they will + * be used (specify SHM bit to disable). + */ +static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm) +{ + struct zpci_dev *zdev = opaque; + u8 status; + int rc; + + if (!zdev) + return -EINVAL; + + mutex_lock(&zdev->kzdev_lock); + + if (zdev->kzdev || zdev->gisa != 0 || !kvm) { + mutex_unlock(&zdev->kzdev_lock); + return -EINVAL; + } + + kvm_get_kvm(kvm); + + mutex_lock(&kvm->lock); + + rc = kvm_s390_pci_dev_open(zdev); + if (rc) + goto err; + + /* + * If interpretation facilities aren't available, add the device to + * the kzdev list but don't enable for interpretation. 
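As an aside, the RLIMIT_MEMLOCK accounting loop in account_mem() further up can be modelled with plain C11 atomics; a self-contained toy analogue (illustrative user-space code, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>

/* Toy analogue of account_mem(): charge pages against a limit, lock-free. */
static bool example_charge_pages(atomic_long *locked, long limit, long nr)
{
	long cur = atomic_load(locked);

	do {
		if (cur + nr > limit)
			return false;	/* would exceed the memlock limit */
	} while (!atomic_compare_exchange_weak(locked, &cur, cur + nr));

	return true;
}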
+ */ + if (!kvm_s390_pci_interp_allowed()) + goto out; + + /* + * If this is the first request to use an interpreted device, make the + * necessary vcpu changes + */ + if (!kvm->arch.use_zpci_interp) + kvm_s390_vcpu_pci_enable_interp(kvm); + + if (zdev_enabled(zdev)) { + rc = zpci_disable_device(zdev); + if (rc) + goto err; + } + + /* + * Store information about the identity of the kvm guest allowed to + * access this device via interpretation to be used by host CLP + */ + zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); + + rc = zpci_enable_device(zdev); + if (rc) + goto clear_gisa; + + /* Re-register the IOMMU that was already created */ + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status); + if (rc) + goto clear_gisa; + +out: + zdev->kzdev->kvm = kvm; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_add_tail(&zdev->kzdev->entry, &kvm->arch.kzdev_list); + spin_unlock(&kvm->arch.kzdev_list_lock); + + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return 0; + +clear_gisa: + zdev->gisa = 0; +err: + if (zdev->kzdev) + kvm_s390_pci_dev_release(zdev); + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + kvm_put_kvm(kvm); + return rc; +} + +static void kvm_s390_pci_unregister_kvm(void *opaque) +{ + struct zpci_dev *zdev = opaque; + struct kvm *kvm; + u8 status; + + if (!zdev) + return; + + mutex_lock(&zdev->kzdev_lock); + + if (WARN_ON(!zdev->kzdev)) { + mutex_unlock(&zdev->kzdev_lock); + return; + } + + kvm = zdev->kzdev->kvm; + mutex_lock(&kvm->lock); + + /* + * A 0 gisa means interpretation was never enabled, just remove the + * device from the list. + */ + if (zdev->gisa == 0) + goto out; + + /* Forwarding must be turned off before interpretation */ + if (zdev->kzdev->fib.fmt0.aibv != 0) + kvm_s390_pci_aif_disable(zdev, true); + + /* Remove the host CLP guest designation */ + zdev->gisa = 0; + + if (zdev_enabled(zdev)) { + if (zpci_disable_device(zdev)) + goto out; + } + + if (zpci_enable_device(zdev)) + goto out; + + /* Re-register the IOMMU that was already created */ + zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status); + +out: + spin_lock(&kvm->arch.kzdev_list_lock); + list_del(&zdev->kzdev->entry); + spin_unlock(&kvm->arch.kzdev_list_lock); + kvm_s390_pci_dev_release(zdev); + + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + + kvm_put_kvm(kvm); +} + +void kvm_s390_pci_init_list(struct kvm *kvm) +{ + spin_lock_init(&kvm->arch.kzdev_list_lock); + INIT_LIST_HEAD(&kvm->arch.kzdev_list); +} + +void kvm_s390_pci_clear_list(struct kvm *kvm) +{ + /* + * This list should already be empty, either via vfio device closures + * or kvm fd cleanup. 
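For orientation, a hypothetical caller-side sketch of the hook pair that kvm_s390_pci_init() installs below (the vfio-pci zdev driver is the intended user; the function names here are invented, and the hook members are assumed to mirror the signatures of kvm_s390_pci_register_kvm()/kvm_s390_pci_unregister_kvm() above):

/* Hypothetical caller: attach/detach a zPCI device via the KVM hook. */
static int example_attach_zdev(struct zpci_dev *zdev, struct kvm *kvm)
{
	if (!zpci_kvm_hook.kvm_register)
		return -ENOENT;		/* KVM module not loaded */
	return zpci_kvm_hook.kvm_register(zdev, kvm);
}

static void example_detach_zdev(struct zpci_dev *zdev)
{
	if (zpci_kvm_hook.kvm_unregister)
		zpci_kvm_hook.kvm_unregister(zdev);
}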
+ */ + spin_lock(&kvm->arch.kzdev_list_lock); + WARN_ON_ONCE(!list_empty(&kvm->arch.kzdev_list)); + spin_unlock(&kvm->arch.kzdev_list_lock); +} + +static struct zpci_dev *get_zdev_from_kvm_by_fh(struct kvm *kvm, u32 fh) +{ + struct zpci_dev *zdev = NULL; + struct kvm_zdev *kzdev; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_for_each_entry(kzdev, &kvm->arch.kzdev_list, entry) { + if (kzdev->zdev->fh == fh) { + zdev = kzdev->zdev; + break; + } + } + spin_unlock(&kvm->arch.kzdev_list_lock); + + return zdev; +} + +static int kvm_s390_pci_zpci_reg_aen(struct zpci_dev *zdev, + struct kvm_s390_zpci_op *args) +{ + struct zpci_fib fib = {}; + bool hostflag; + + fib.fmt0.aibv = args->u.reg_aen.ibv; + fib.fmt0.isc = args->u.reg_aen.isc; + fib.fmt0.noi = args->u.reg_aen.noi; + if (args->u.reg_aen.sb != 0) { + fib.fmt0.aisb = args->u.reg_aen.sb; + fib.fmt0.aisbo = args->u.reg_aen.sbo; + fib.fmt0.sum = 1; + } else { + fib.fmt0.aisb = 0; + fib.fmt0.aisbo = 0; + fib.fmt0.sum = 0; + } + + hostflag = !(args->u.reg_aen.flags & KVM_S390_ZPCIOP_REGAEN_HOST); + return kvm_s390_pci_aif_enable(zdev, &fib, hostflag); +} + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args) +{ + struct kvm_zdev *kzdev; + struct zpci_dev *zdev; + int r; + + zdev = get_zdev_from_kvm_by_fh(kvm, args->fh); + if (!zdev) + return -ENODEV; + + mutex_lock(&zdev->kzdev_lock); + mutex_lock(&kvm->lock); + + kzdev = zdev->kzdev; + if (!kzdev) { + r = -ENODEV; + goto out; + } + if (kzdev->kvm != kvm) { + r = -EPERM; + goto out; + } + + switch (args->op) { + case KVM_S390_ZPCIOP_REG_AEN: + /* Fail on unknown flags */ + if (args->u.reg_aen.flags & ~KVM_S390_ZPCIOP_REGAEN_HOST) { + r = -EINVAL; + break; + } + r = kvm_s390_pci_zpci_reg_aen(zdev, args); + break; + case KVM_S390_ZPCIOP_DEREG_AEN: + r = kvm_s390_pci_aif_disable(zdev, false); + break; + default: + r = -EINVAL; + } + +out: + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return r; +} + +int __init kvm_s390_pci_init(void) +{ + zpci_kvm_hook.kvm_register = kvm_s390_pci_register_kvm; + zpci_kvm_hook.kvm_unregister = kvm_s390_pci_unregister_kvm; + + if (!kvm_s390_pci_interp_allowed()) + return 0; + + aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL); + if (!aift) + return -ENOMEM; + + spin_lock_init(&aift->gait_lock); + mutex_init(&aift->aift_lock); + + return 0; +} + +void kvm_s390_pci_exit(void) +{ + zpci_kvm_hook.kvm_register = NULL; + zpci_kvm_hook.kvm_unregister = NULL; + + if (!kvm_s390_pci_interp_allowed()) + return; + + mutex_destroy(&aift->aift_lock); + + kfree(aift); +} diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h new file mode 100644 index 000000000000..ff0972dd5e71 --- /dev/null +++ b/arch/s390/kvm/pci.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * s390 kvm PCI passthrough support + * + * Copyright IBM Corp. 
2022 + * + * Author(s): Matthew Rosato <mjrosato@linux.ibm.com> + */ + +#ifndef __KVM_S390_PCI_H +#define __KVM_S390_PCI_H + +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <asm/airq.h> +#include <asm/cpu.h> + +struct kvm_zdev { + struct zpci_dev *zdev; + struct kvm *kvm; + struct zpci_fib fib; + struct list_head entry; +}; + +struct zpci_gaite { + u32 gisa; + u8 gisc; + u8 count; + u8 reserved; + u8 aisbo; + u64 aisb; +}; + +struct zpci_aift { + struct zpci_gaite *gait; + struct airq_iv *sbv; + struct kvm_zdev **kzdev; + spinlock_t gait_lock; /* Protects the gait, used during AEN forward */ + struct mutex aift_lock; /* Protects the other structures in aift */ +}; + +extern struct zpci_aift *aift; + +static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift, + unsigned long si) +{ + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) || !aift->kzdev || + !aift->kzdev[si]) + return NULL; + return aift->kzdev[si]->kvm; +}; + +int kvm_s390_pci_aen_init(u8 nisc); +void kvm_s390_pci_aen_exit(void); + +void kvm_s390_pci_init_list(struct kvm *kvm); +void kvm_s390_pci_clear_list(struct kvm *kvm); + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args); + +int __init kvm_s390_pci_init(void); +void kvm_s390_pci_exit(void); + +static inline bool kvm_s390_pci_interp_allowed(void) +{ + struct cpuid cpu_id; + + get_cpu_id(&cpu_id); + switch (cpu_id.machine) { + case 0x2817: + case 0x2818: + case 0x2827: + case 0x2828: + case 0x2964: + case 0x2965: + /* No SHM on certain machines */ + return false; + default: + return (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) && + sclp.has_zpci_lsi && sclp.has_aeni && sclp.has_aisi && + sclp.has_aisii); + } +} + +#endif /* __KVM_S390_PCI_H */ diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index ed52ffa8d5d4..621a17fd1a1b 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -2,7 +2,7 @@ /* * handling privileged instructions * - * Copyright IBM Corp. 2008, 2018 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -11,20 +11,17 @@ #include <linux/kvm.h> #include <linux/gfp.h> #include <linux/errno.h> -#include <linux/compat.h> #include <linux/mm_types.h> - +#include <linux/pgtable.h> +#include <linux/io.h> #include <asm/asm-offsets.h> #include <asm/facility.h> #include <asm/current.h> #include <asm/debug.h> #include <asm/ebcdic.h> #include <asm/sysinfo.h> -#include <asm/pgtable.h> #include <asm/page-states.h> -#include <asm/pgalloc.h> #include <asm/gmap.h> -#include <asm/io.h> #include <asm/ptrace.h> #include <asm/sclp.h> #include <asm/ap.h> @@ -60,7 +57,7 @@ static int handle_gs(struct kvm_vcpu *vcpu) if (test_kvm_facility(vcpu->kvm, 133)) { VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (lazy)"); preempt_disable(); - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); current->thread.gs_cb = (struct gs_cb *)&vcpu->run->s.regs.gscb; restore_gs_cb(current->thread.gs_cb); preempt_enable(); @@ -103,7 +100,20 @@ static int handle_set_clock(struct kvm_vcpu *vcpu) return kvm_s390_inject_prog_cond(vcpu, rc); VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", gtod.tod); - kvm_s390_set_tod_clock(vcpu->kvm, >od); + /* + * To set the TOD clock the kvm lock must be taken, but the vcpu lock + * is already held in handle_set_clock. The usual lock order is the + * opposite. 
As SCK is deprecated and should not be used in several + * cases, for example when the multiple epoch facility or TOD clock + * steering facility is installed (see Principles of Operation), a + * slow path can be used. If the lock can not be taken via try_lock, + * the instruction will be retried via -EAGAIN at a later point in + * time. + */ + if (!kvm_s390_try_set_tod_clock(vcpu->kvm, >od)) { + kvm_s390_retry_instr(vcpu); + return -EAGAIN; + } kvm_s390_set_psw_cc(vcpu, 0); return 0; @@ -270,18 +280,18 @@ static int handle_iske(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); retry: unlocked = false; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); rc = get_guest_storage_key(current->mm, vmaddr, &key); if (rc) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); if (!rc) { - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); goto retry; } } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); if (rc < 0) @@ -317,17 +327,17 @@ static int handle_rrbe(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); retry: unlocked = false; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); rc = reset_guest_reference_bit(current->mm, vmaddr); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); if (!rc) { - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); goto retry; } } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); if (rc < 0) @@ -385,19 +395,21 @@ static int handle_sske(struct kvm_vcpu *vcpu) if (kvm_is_error_hva(vmaddr)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey, m3 & SSKE_NQ, m3 & SSKE_MR, m3 & SSKE_MC); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); rc = !rc ? -EAGAIN : rc; } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (rc == -EAGAIN) + continue; if (rc < 0) return rc; start += PAGE_SIZE; @@ -429,7 +441,7 @@ static int handle_ipte_interlock(struct kvm_vcpu *vcpu) vcpu->stat.instruction_ipte_interlock++; if (psw_bits(vcpu->arch.sie_block->gpsw).pstate) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu)); + wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu->kvm)); kvm_s390_retry_instr(vcpu); VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation"); return 0; @@ -611,6 +623,7 @@ static int handle_io_inst(struct kvm_vcpu *vcpu) static int handle_pqap(struct kvm_vcpu *vcpu) { struct ap_queue_status status = {}; + crypto_hook pqap_hook; unsigned long reg0; int ret; uint8_t fc; @@ -626,10 +639,12 @@ static int handle_pqap(struct kvm_vcpu *vcpu) * available for the guest are AQIC and TAPQ with the t bit set * since we do not set IC.3 (FIII) we currently will only intercept * the AQIC function code. + * Note: running nested under z/VM can result in intercepts for other + * function codes, e.g. PQAP(QCI). We do not support this and bail out. 
*/ reg0 = vcpu->run->s.regs.gprs[0]; fc = (reg0 >> 24) & 0xff; - if (WARN_ON_ONCE(fc != 0x03)) + if (fc != 0x03) return -EOPNOTSUPP; /* PQAP instruction is allowed for guest kernel only */ @@ -653,18 +668,20 @@ static int handle_pqap(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); /* - * Verify that the hook callback is registered, lock the owner - * and call the hook. + * If the hook callback is registered, there will be a pointer to the + * hook function pointer in the kvm_s390_crypto structure. Lock the + * owner, retrieve the hook function pointer and call the hook. */ + down_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); if (vcpu->kvm->arch.crypto.pqap_hook) { - if (!try_module_get(vcpu->kvm->arch.crypto.pqap_hook->owner)) - return -EOPNOTSUPP; - ret = vcpu->kvm->arch.crypto.pqap_hook->hook(vcpu); - module_put(vcpu->kvm->arch.crypto.pqap_hook->owner); + pqap_hook = *vcpu->kvm->arch.crypto.pqap_hook; + ret = pqap_hook(vcpu); if (!ret && vcpu->run->s.regs.gprs[1] & 0x00ff0000) kvm_s390_set_psw_cc(vcpu, 3); + up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); return ret; } + up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); /* * A vfio_driver must register a hook. * No hook means no driver to enable the SIE CRYCB and no queues. @@ -855,10 +872,18 @@ static int handle_stsi(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - if (fc > 3) { - kvm_s390_set_psw_cc(vcpu, 3); - return 0; - } + /* Bailout forbidden function codes */ + if (fc > 3 && fc != 15) + goto out_no_data; + + /* + * fc 15 is provided only with + * - PTF/CPU topology support through facility 15 + * - KVM_CAP_S390_USER_STSI + */ + if (fc == 15 && (!test_kvm_facility(vcpu->kvm, 11) || + !vcpu->kvm->arch.user_stsi)) + goto out_no_data; if (vcpu->run->s.regs.gprs[0] & 0x0fffff00 || vcpu->run->s.regs.gprs[1] & 0xffff0000) @@ -872,13 +897,13 @@ static int handle_stsi(struct kvm_vcpu *vcpu) operand2 = kvm_s390_get_base_disp_s(vcpu, &ar); - if (operand2 & 0xfff) + if (!kvm_s390_pv_cpu_is_protected(vcpu) && (operand2 & 0xfff)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); switch (fc) { case 1: /* same handling for 1 and 2 */ case 2: - mem = get_zeroed_page(GFP_KERNEL); + mem = get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!mem) goto out_no_data; if (stsi((void *) mem, fc, sel1, sel2)) @@ -887,14 +912,22 @@ static int handle_stsi(struct kvm_vcpu *vcpu) case 3: if (sel1 != 2 || sel2 != 2) goto out_no_data; - mem = get_zeroed_page(GFP_KERNEL); + mem = get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!mem) goto out_no_data; handle_stsi_3_2_2(vcpu, (void *) mem); break; + case 15: /* fc 15 is fully handled in userspace */ + insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2); + trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); + return -EREMOTE; + } + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(sida_addr(vcpu->arch.sie_block), (void *)mem, PAGE_SIZE); + rc = 0; + } else { + rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE); } - - rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE); if (rc) { rc = kvm_s390_inject_prog_cond(vcpu, rc); goto out; @@ -1084,15 +1117,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (rc) return rc; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); rc = cond_set_guest_storage_key(current->mm, vmaddr, key, NULL, nq, mr, mc); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, 
vmaddr, FAULT_FLAG_WRITE, &unlocked); rc = !rc ? -EAGAIN : rc; } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); if (rc == -EAGAIN) @@ -1115,7 +1148,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) } /* - * Must be called with relevant read locks held (kvm->mm->mmap_sem, kvm->srcu) + * Must be called with relevant read locks held (kvm->mm->mmap_lock, kvm->srcu) */ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) { @@ -1213,9 +1246,9 @@ static int handle_essa(struct kvm_vcpu *vcpu) * already correct, we do nothing and avoid the lock. */ if (vcpu->kvm->mm->context.uses_cmm == 0) { - down_write(&vcpu->kvm->mm->mmap_sem); + mmap_write_lock(vcpu->kvm->mm); vcpu->kvm->mm->context.uses_cmm = 1; - up_write(&vcpu->kvm->mm->mmap_sem); + mmap_write_unlock(vcpu->kvm->mm); } /* * If we are here, we are supposed to have CMMA enabled in @@ -1232,11 +1265,11 @@ static int handle_essa(struct kvm_vcpu *vcpu) } else { int srcu_idx; - down_read(&vcpu->kvm->mm->mmap_sem); + mmap_read_lock(vcpu->kvm->mm); srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); i = __do_essa(vcpu, orc); srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - up_read(&vcpu->kvm->mm->mmap_sem); + mmap_read_unlock(vcpu->kvm->mm); if (i < 0) return i; /* Account for the possible extra cbrl entry */ @@ -1244,10 +1277,10 @@ static int handle_essa(struct kvm_vcpu *vcpu) } vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); for (i = 0; i < entries; ++i) __gmap_zap(gmap, cbrlo[i]); - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); return 0; } @@ -1432,10 +1465,11 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu) static int handle_tprot(struct kvm_vcpu *vcpu) { - u64 address1, address2; - unsigned long hva, gpa; - int ret = 0, cc = 0; + u64 address, operand2; + unsigned long gpa; + u8 access_key; bool writable; + int ret, cc; u8 ar; vcpu->stat.instruction_tprot++; @@ -1443,45 +1477,48 @@ static int handle_tprot(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - kvm_s390_get_base_disp_sse(vcpu, &address1, &address2, &ar, NULL); + kvm_s390_get_base_disp_sse(vcpu, &address, &operand2, &ar, NULL); + access_key = (operand2 & 0xf0) >> 4; - /* we only handle the Linux memory detection case: - * access key == 0 - * everything else goes to userspace. */ - if (address2 & 0xf0) - return -EOPNOTSUPP; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) - ipte_lock(vcpu); - ret = guest_translate_address(vcpu, address1, ar, &gpa, GACC_STORE); - if (ret == PGM_PROTECTION) { + ipte_lock(vcpu->kvm); + + ret = guest_translate_address_with_key(vcpu, address, ar, &gpa, + GACC_STORE, access_key); + if (ret == 0) { + gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable); + } else if (ret == PGM_PROTECTION) { + writable = false; /* Write protected? Try again with read-only... 
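The condition-code mapping the rewritten TPROT handler establishes just below can be summarized in one helper; this is an illustrative restatement of the hunk, not patch code, and assumes ret >= 0 as in the handler:

/* CC summary: 0=fetch+store, 1=fetch only, 2=no access, 3=unavailable. */
static int example_tprot_cc(int ret, bool writable)
{
	if (ret == 0)
		return writable ? 0 : 1;
	if (ret == PGM_PROTECTION)
		return 2;
	if (ret != PGM_ADDRESSING && ret != PGM_TRANSLATION_SPEC)
		return 3;
	return -1;	/* inject a program interrupt instead */
}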
*/ - cc = 1; - ret = guest_translate_address(vcpu, address1, ar, &gpa, - GACC_FETCH); + ret = guest_translate_address_with_key(vcpu, address, ar, &gpa, + GACC_FETCH, access_key); } - if (ret) { - if (ret == PGM_ADDRESSING || ret == PGM_TRANSLATION_SPEC) { - ret = kvm_s390_inject_program_int(vcpu, ret); - } else if (ret > 0) { - /* Translation not available */ - kvm_s390_set_psw_cc(vcpu, 3); + if (ret >= 0) { + cc = -1; + + /* Fetching permitted; storing permitted */ + if (ret == 0 && writable) + cc = 0; + /* Fetching permitted; storing not permitted */ + else if (ret == 0 && !writable) + cc = 1; + /* Fetching not permitted; storing not permitted */ + else if (ret == PGM_PROTECTION) + cc = 2; + /* Translation not available */ + else if (ret != PGM_ADDRESSING && ret != PGM_TRANSLATION_SPEC) + cc = 3; + + if (cc != -1) { + kvm_s390_set_psw_cc(vcpu, cc); ret = 0; + } else { + ret = kvm_s390_inject_program_int(vcpu, ret); } - goto out_unlock; } - hva = gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable); - if (kvm_is_error_hva(hva)) { - ret = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - } else { - if (!writable) - cc = 1; /* Write not permitted ==> read-only */ - kvm_s390_set_psw_cc(vcpu, cc); - /* Note: CC2 only occurs for storage keys (not supported yet) */ - } -out_unlock: if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) - ipte_unlock(vcpu); + ipte_unlock(vcpu->kvm); return ret; } diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c new file mode 100644 index 000000000000..75e81ba26d04 --- /dev/null +++ b/arch/s390/kvm/pv.c @@ -0,0 +1,894 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hosting Protected Virtual Machines + * + * Copyright IBM Corp. 2019, 2020 + * Author(s): Janosch Frank <frankja@linux.ibm.com> + */ +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/minmax.h> +#include <linux/pagemap.h> +#include <linux/sched/signal.h> +#include <asm/gmap.h> +#include <asm/uv.h> +#include <asm/mman.h> +#include <linux/pagewalk.h> +#include <linux/sched/mm.h> +#include <linux/mmu_notifier.h> +#include "kvm-s390.h" + +bool kvm_s390_pv_is_protected(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->lock); + return !!kvm_s390_pv_get_handle(kvm); +} +EXPORT_SYMBOL_GPL(kvm_s390_pv_is_protected); + +bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) +{ + lockdep_assert_held(&vcpu->mutex); + return !!kvm_s390_pv_cpu_get_handle(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected); + +/** + * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to + * be destroyed + * + * @list: list head for the list of leftover VMs + * @old_gmap_table: the gmap table of the leftover protected VM + * @handle: the handle of the leftover protected VM + * @stor_var: pointer to the variable storage of the leftover protected VM + * @stor_base: address of the base storage of the leftover protected VM + * + * Represents a protected VM that is still registered with the Ultravisor, + * but which does not correspond any longer to an active KVM VM. It should + * be destroyed at some point later, either asynchronously or when the + * process terminates. 
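A hypothetical user of the set-aside machinery described above, combining the two entry points under their documented locking rules (sketch only; in practice the second step runs from a separate userspace-driven thread):

/* Sketch: two-phase teardown; kvm->lock only for the set-aside step. */
static int example_async_teardown(struct kvm *kvm)
{
	u16 rc, rrc;
	int r;

	mutex_lock(&kvm->lock);
	r = kvm_s390_pv_set_aside(kvm, &rc, &rrc);
	mutex_unlock(&kvm->lock);
	if (r)
		return r;

	/* ideally from a separate thread, without kvm->lock held */
	return kvm_s390_pv_deinit_aside_vm(kvm, &rc, &rrc);
}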
+ */ +struct pv_vm_to_be_destroyed { + struct list_head list; + unsigned long old_gmap_table; + u64 handle; + void *stor_var; + unsigned long stor_base; +}; + +static void kvm_s390_clear_pv_state(struct kvm *kvm) +{ + kvm->arch.pv.handle = 0; + kvm->arch.pv.guest_len = 0; + kvm->arch.pv.stor_base = 0; + kvm->arch.pv.stor_var = NULL; +} + +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) +{ + int cc; + + if (!kvm_s390_pv_cpu_get_handle(vcpu)) + return 0; + + cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), UVC_CMD_DESTROY_SEC_CPU, rc, rrc); + + KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT DESTROY VCPU %d: rc %x rrc %x", + vcpu->vcpu_id, *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x", *rc, *rrc); + + /* Intended memory leak for something that should never happen. */ + if (!cc) + free_pages(vcpu->arch.pv.stor_base, + get_order(uv_info.guest_cpu_stor_len)); + + free_page((unsigned long)sida_addr(vcpu->arch.sie_block)); + vcpu->arch.sie_block->pv_handle_cpu = 0; + vcpu->arch.sie_block->pv_handle_config = 0; + memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv)); + vcpu->arch.sie_block->sdf = 0; + /* + * The sidad field (for sdf == 2) is now the gbea field (for sdf == 0). + * Use the reset value of gbea to avoid leaking the kernel pointer of + * the just freed sida. + */ + vcpu->arch.sie_block->gbea = 1; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + + return cc ? EIO : 0; +} + +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) +{ + struct uv_cb_csc uvcb = { + .header.cmd = UVC_CMD_CREATE_SEC_CPU, + .header.len = sizeof(uvcb), + }; + void *sida_addr; + int cc; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) + return -EINVAL; + + vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT, + get_order(uv_info.guest_cpu_stor_len)); + if (!vcpu->arch.pv.stor_base) + return -ENOMEM; + + /* Input */ + uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm); + uvcb.num = vcpu->arch.sie_block->icpua; + uvcb.state_origin = virt_to_phys(vcpu->arch.sie_block); + uvcb.stor_origin = virt_to_phys((void *)vcpu->arch.pv.stor_base); + + /* Alloc Secure Instruction Data Area Designation */ + sida_addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!sida_addr) { + free_pages(vcpu->arch.pv.stor_base, + get_order(uv_info.guest_cpu_stor_len)); + return -ENOMEM; + } + vcpu->arch.sie_block->sidad = virt_to_phys(sida_addr); + + cc = uv_call(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(vcpu->kvm, 3, + "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x", + vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc, + uvcb.header.rrc); + + if (cc) { + u16 dummy; + + kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy); + return -EIO; + } + + /* Output */ + vcpu->arch.pv.handle = uvcb.cpu_handle; + vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle; + vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm); + vcpu->arch.sie_block->sdf = 2; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + return 0; +} + +/* only free resources when the destroy was successful */ +static void kvm_s390_pv_dealloc_vm(struct kvm *kvm) +{ + vfree(kvm->arch.pv.stor_var); + free_pages(kvm->arch.pv.stor_base, + get_order(uv_info.guest_base_stor_len)); + kvm_s390_clear_pv_state(kvm); +} + +static int kvm_s390_pv_alloc_vm(struct kvm *kvm) +{ + unsigned long base = uv_info.guest_base_stor_len; + unsigned long virt = uv_info.guest_virt_var_stor_len; + unsigned long npages = 0, vlen = 0; + + kvm->arch.pv.stor_var = NULL; + kvm->arch.pv.stor_base 
= __get_free_pages(GFP_KERNEL_ACCOUNT, get_order(base)); + if (!kvm->arch.pv.stor_base) + return -ENOMEM; + + /* + * Calculate current guest storage for allocation of the + * variable storage, which is based on the length in MB. + * + * Slots are sorted by GFN + */ + mutex_lock(&kvm->slots_lock); + npages = kvm_s390_get_gfn_end(kvm_memslots(kvm)); + mutex_unlock(&kvm->slots_lock); + + kvm->arch.pv.guest_len = npages * PAGE_SIZE; + + /* Allocate variable storage */ + vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE); + vlen += uv_info.guest_virt_base_stor_len; + kvm->arch.pv.stor_var = vzalloc(vlen); + if (!kvm->arch.pv.stor_var) + goto out_err; + return 0; + +out_err: + kvm_s390_pv_dealloc_vm(kvm); + return -ENOMEM; +} + +/** + * kvm_s390_pv_dispose_one_leftover - Clean up one leftover protected VM. + * @kvm: the KVM that was associated with this leftover protected VM + * @leftover: details about the leftover protected VM that needs a clean up + * @rc: the RC code of the Destroy Secure Configuration UVC + * @rrc: the RRC code of the Destroy Secure Configuration UVC + * + * Destroy one leftover protected VM. + * On success, kvm->mm->context.protected_count will be decremented atomically + * and all other resources used by the VM will be freed. + * + * Return: 0 in case of success, otherwise 1 + */ +static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm, + struct pv_vm_to_be_destroyed *leftover, + u16 *rc, u16 *rrc) +{ + int cc; + + /* It used the destroy-fast UVC, nothing left to do here */ + if (!leftover->handle) + goto done_fast; + cc = uv_cmd_nodata(leftover->handle, UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY LEFTOVER VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy leftover vm failed rc %x rrc %x", *rc, *rrc); + if (cc) + return cc; + /* + * Intentionally leak unusable memory. If the UVC fails, the memory + * used for the VM and its metadata is permanently unusable. + * This can only happen in case of a serious KVM or hardware bug; it + * is not expected to happen in normal operation. + */ + free_pages(leftover->stor_base, get_order(uv_info.guest_base_stor_len)); + free_pages(leftover->old_gmap_table, CRST_ALLOC_ORDER); + vfree(leftover->stor_var); +done_fast: + atomic_dec(&kvm->mm->context.protected_count); + return 0; +} + +/** + * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory. + * @kvm: the VM whose memory is to be cleared. + * + * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot. + * The CPUs of the protected VM need to be destroyed beforehand. 
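Stepping back to the variable-storage sizing in kvm_s390_pv_alloc_vm() above, a worked restatement (illustrative helper; assumes HPAGE_SIZE is the 1 MB granule the Ultravisor works on, so a 4 GiB guest yields 4096 blocks):

/* Illustrative: variable storage scales with the number of 1 MB blocks. */
static unsigned long example_pv_var_stor_len(unsigned long npages)
{
	unsigned long blocks_1m = (npages * PAGE_SIZE) / HPAGE_SIZE;

	return ALIGN(uv_info.guest_virt_var_stor_len * blocks_1m, PAGE_SIZE) +
	       uv_info.guest_virt_base_stor_len;
}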
+ */ +static void kvm_s390_destroy_lower_2g(struct kvm *kvm) +{ + const unsigned long pages_2g = SZ_2G / PAGE_SIZE; + struct kvm_memory_slot *slot; + unsigned long len; + int srcu_idx; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + /* Take the memslot containing guest absolute address 0 */ + slot = gfn_to_memslot(kvm, 0); + /* Clear all slots or parts thereof that are below 2GB */ + while (slot && slot->base_gfn < pages_2g) { + len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE; + s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len); + /* Take the next memslot */ + slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages); + } + + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct uv_cb_destroy_fast uvcb = { + .header.cmd = UVC_CMD_DESTROY_SEC_CONF_FAST, + .header.len = sizeof(uvcb), + .handle = kvm_s390_pv_get_handle(kvm), + }; + int cc; + + cc = uv_call_sched(0, (u64)&uvcb); + if (rc) + *rc = uvcb.header.rc; + if (rrc) + *rrc = uvcb.header.rrc; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x", + uvcb.header.rc, uvcb.header.rrc); + WARN_ONCE(cc && uvcb.header.rc != 0x104, + "protvirt destroy vm fast failed handle %llx rc %x rrc %x", + kvm_s390_pv_get_handle(kvm), uvcb.header.rc, uvcb.header.rrc); + /* Intended memory leak on "impossible" error */ + if (!cc) + kvm_s390_pv_dealloc_vm(kvm); + return cc ? -EIO : 0; +} + +static inline bool is_destroy_fast_available(void) +{ + return test_bit_inv(BIT_UVC_CMD_DESTROY_SEC_CONF_FAST, uv_info.inst_calls_list); +} + +/** + * kvm_s390_pv_set_aside - Set aside a protected VM for later teardown. + * @kvm: the VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Set aside the protected VM for a subsequent teardown. The VM will be able + * to continue immediately as a non-secure VM, and the information needed to + * properly tear down the protected VM is set aside. If another protected VM + * was already set aside without starting its teardown, this function will + * fail. + * The CPUs of the protected VM need to be destroyed beforehand. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, -EINVAL if another protected VM was already set + * aside, -ENOMEM if the system ran out of memory. + */ +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *priv; + int res = 0; + + lockdep_assert_held(&kvm->lock); + /* + * If another protected VM was already prepared for teardown, refuse. + * A normal deinitialization has to be performed instead. 
+ */ + if (kvm->arch.pv.set_aside) + return -EINVAL; + + /* Guest with segment type ASCE, refuse to destroy asynchronously */ + if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + if (is_destroy_fast_available()) { + res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc); + } else { + priv->stor_var = kvm->arch.pv.stor_var; + priv->stor_base = kvm->arch.pv.stor_base; + priv->handle = kvm_s390_pv_get_handle(kvm); + priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + if (s390_replace_asce(kvm->arch.gmap)) + res = -ENOMEM; + } + + if (res) { + kfree(priv); + return res; + } + + kvm_s390_destroy_lower_2g(kvm); + kvm_s390_clear_pv_state(kvm); + kvm->arch.pv.set_aside = priv; + + *rc = UVC_RC_EXECUTED; + *rrc = 42; + return 0; +} + +/** + * kvm_s390_pv_deinit_vm - Deinitialize the current protected VM + * @kvm: the KVM whose protected VM needs to be deinitialized + * @rc: the RC code of the UVC + * @rrc: the RRC code of the UVC + * + * Deinitialize the current protected VM. This function will destroy and + * cleanup the current protected VM, but it will not cleanup the guest + * memory. This function should only be called when the protected VM has + * just been created and therefore does not have any guest memory, or when + * the caller cleans up the guest memory separately. + * + * This function should not fail, but if it does, the donated memory must + * not be freed. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + int cc; + + cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + if (!cc) { + atomic_dec(&kvm->mm->context.protected_count); + kvm_s390_pv_dealloc_vm(kvm); + } else { + /* Intended memory leak on "impossible" error */ + s390_replace_asce(kvm->arch.gmap); + } + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc); + + return cc ? -EIO : 0; +} + +/** + * kvm_s390_pv_deinit_cleanup_all - Clean up all protected VMs associated + * with a specific KVM. + * @kvm: the KVM to be cleaned up + * @rc: the RC code of the first failing UVC + * @rrc: the RRC code of the first failing UVC + * + * This function will clean up all protected VMs associated with a KVM. + * This includes the active one, the one prepared for deinitialization with + * kvm_s390_pv_set_aside, and any still pending in the need_cleanup list. + * + * Context: kvm->lock needs to be held unless being called from + * kvm_arch_destroy_vm. + * + * Return: 0 if all VMs are successfully cleaned up, otherwise -EIO + */ +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *cur; + bool need_zap = false; + u16 _rc, _rrc; + int cc = 0; + + /* + * Nothing to do if the counter was already 0. Otherwise make sure + * the counter does not reach 0 before calling s390_uv_destroy_range. 
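The guard above relies on atomic_inc_not_zero(); a stand-alone C11 model of that primitive (toy user-space code, not from the patch):

#include <stdatomic.h>
#include <stdbool.h>

/* Toy model of atomic_inc_not_zero(): pin only while the count is nonzero. */
static bool example_inc_not_zero(atomic_int *v)
{
	int cur = atomic_load(v);

	while (cur != 0) {
		if (atomic_compare_exchange_weak(v, &cur, cur + 1))
			return true;
	}
	return false;	/* already zero: do not resurrect */
}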
+ */ +	if (!atomic_inc_not_zero(&kvm->mm->context.protected_count)) + return 0; + + *rc = 1; + /* If the current VM is protected, destroy it */ + if (kvm_s390_pv_get_handle(kvm)) { + cc = kvm_s390_pv_deinit_vm(kvm, rc, rrc); + need_zap = true; + } + + /* If a previous protected VM was set aside, put it in the need_cleanup list */ + if (kvm->arch.pv.set_aside) { + list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; + } + + /* Clean up all protected VMs in the need_cleanup list */ + while (!list_empty(&kvm->arch.pv.need_cleanup)) { + cur = list_first_entry(&kvm->arch.pv.need_cleanup, typeof(*cur), list); + need_zap = true; + if (kvm_s390_pv_dispose_one_leftover(kvm, cur, &_rc, &_rrc)) { + cc = 1; + /* + * Only return the first error rc and rrc, so make + * sure it is not overwritten. All destroys will + * additionally be reported via KVM_UV_EVENT(). + */ + if (*rc == UVC_RC_EXECUTED) { + *rc = _rc; + *rrc = _rrc; + } + } + list_del(&cur->list); + kfree(cur); + } + + /* + * If the mm still has a mapping, try to mark all its pages as + * accessible. The counter should not reach zero before this + * cleanup has been performed. + */ + if (need_zap && mmget_not_zero(kvm->mm)) { + s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); + mmput(kvm->mm); + } + + /* Now the counter can safely reach 0 */ + atomic_dec(&kvm->mm->context.protected_count); + return cc ? -EIO : 0; +} + +/** + * kvm_s390_pv_deinit_aside_vm - Tear down a previously set-aside protected VM. + * @kvm: the VM previously associated with the protected VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Tear down the protected VM that had been previously prepared for teardown + * using kvm_s390_pv_set_aside(). Ideally this should be called by + * userspace asynchronously from a separate thread. + * + * Context: kvm->lock must not be held. + * + * Return: 0 in case of success, -EINVAL if no protected VM had been + * prepared for asynchronous teardown, -EIO in case of other errors. + */ +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *p; + int ret = 0; + + lockdep_assert_not_held(&kvm->lock); + mutex_lock(&kvm->lock); + p = kvm->arch.pv.set_aside; + kvm->arch.pv.set_aside = NULL; + mutex_unlock(&kvm->lock); + if (!p) + return -EINVAL; + + /* When a fatal signal is received, stop immediately */ + if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) + goto done; + if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc)) + ret = -EIO; + kfree(p); + p = NULL; +done: + /* + * p is not NULL if we aborted because of a fatal signal, in which + * case queue the leftover for later cleanup. + */ + if (p) { + mutex_lock(&kvm->lock); + list_add(&p->list, &kvm->arch.pv.need_cleanup); + mutex_unlock(&kvm->lock); + /* Did not finish, but pretend things went well */ + *rc = UVC_RC_EXECUTED; + *rrc = 42; + } + return ret; +} + +static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, + struct mm_struct *mm) +{ + struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier); + u16 dummy; + int r; + + /* + * No locking is needed since this is the last thread of the last user of this + * struct mm. + * When the struct kvm gets deinitialized, this notifier is also + * unregistered. This means that if this notifier runs, then the + * struct kvm is still valid. 
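The release callback above recovers its struct kvm with container_of(); a minimal stand-alone illustration of that recovery step (toy types, invented names):

struct example_owner {
	int id;
	struct mmu_notifier mn;	/* embedded, like arch.pv.mmu_notifier */
};

/* Recover the enclosing object from a pointer to the embedded member. */
static struct example_owner *example_owner_of(struct mmu_notifier *mn)
{
	return container_of(mn, struct example_owner, mn);
}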
+ */ + r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm)) + kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy); +} + +static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = { + .release = kvm_s390_pv_mmu_notifier_release, +}; + +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct uv_cb_cgc uvcb = { + .header.cmd = UVC_CMD_CREATE_SEC_CONF, + .header.len = sizeof(uvcb) + }; + int cc, ret; + u16 dummy; + + ret = kvm_s390_pv_alloc_vm(kvm); + if (ret) + return ret; + + /* Inputs */ + uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ + uvcb.guest_stor_len = kvm->arch.pv.guest_len; + uvcb.guest_asce = kvm->arch.gmap->asce; + uvcb.guest_sca = virt_to_phys(kvm->arch.sca); + uvcb.conf_base_stor_origin = + virt_to_phys((void *)kvm->arch.pv.stor_base); + uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var; + uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap; + uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr; + + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x flags %04x", + uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc, uvcb.flags.raw); + + /* Outputs */ + kvm->arch.pv.handle = uvcb.guest_handle; + + atomic_inc(&kvm->mm->context.protected_count); + if (cc) { + if (uvcb.header.rc & UVC_RC_NEED_DESTROY) { + kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); + } else { + atomic_dec(&kvm->mm->context.protected_count); + kvm_s390_pv_dealloc_vm(kvm); + } + return -EIO; + } + kvm->arch.gmap->guest_handle = uvcb.guest_handle; + /* Add the notifier only once. No races because we hold kvm->lock */ + if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { + kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; + mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); + } + return 0; +} + +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, + u16 *rrc) +{ + struct uv_cb_ssc uvcb = { + .header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS, + .header.len = sizeof(uvcb), + .sec_header_origin = (u64)hdr, + .sec_header_len = length, + .guest_handle = kvm_s390_pv_get_handle(kvm), + }; + int cc = uv_call(0, (u64)&uvcb); + + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x", + *rc, *rrc); + return cc ? 
-EINVAL : 0; +} + +static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, + u64 offset, u16 *rc, u16 *rrc) +{ + struct uv_cb_unp uvcb = { + .header.cmd = UVC_CMD_UNPACK_IMG, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(kvm), + .gaddr = addr, + .tweak[0] = tweak, + .tweak[1] = offset, + }; + int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb); + + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + + if (ret && ret != -EAGAIN) + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x", + uvcb.gaddr, *rc, *rrc); + return ret; +} + +int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, + unsigned long tweak, u16 *rc, u16 *rrc) +{ + u64 offset = 0; + int ret = 0; + + if (addr & ~PAGE_MASK || !size || size & ~PAGE_MASK) + return -EINVAL; + + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx", + addr, size); + + while (offset < size) { + ret = unpack_one(kvm, addr, tweak, offset, rc, rrc); + if (ret == -EAGAIN) { + cond_resched(); + if (fatal_signal_pending(current)) + break; + continue; + } + if (ret) + break; + addr += PAGE_SIZE; + offset += PAGE_SIZE; + } + if (!ret) + KVM_UV_EVENT(kvm, 3, "%s", "PROTVIRT VM UNPACK: successful"); + return ret; +} + +int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state) +{ + struct uv_cb_cpu_set_state uvcb = { + .header.cmd = UVC_CMD_CPU_SET_STATE, + .header.len = sizeof(uvcb), + .cpu_handle = kvm_s390_pv_cpu_get_handle(vcpu), + .state = state, + }; + int cc; + + cc = uv_call(0, (u64)&uvcb); + KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT SET CPU %d STATE %d rc %x rrc %x", + vcpu->vcpu_id, state, uvcb.header.rc, uvcb.header.rrc); + if (cc) + return -EINVAL; + return 0; +} + +int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_cpu uvcb = { + .header.cmd = UVC_CMD_DUMP_CPU, + .header.len = sizeof(uvcb), + .cpu_handle = vcpu->arch.pv.handle, + .dump_area_origin = (u64)buff, + }; + int cc; + + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + return cc; +} + +/* Size of the cache for the storage state dump data. 1MB for now */ +#define DUMP_BUFF_LEN HPAGE_SIZE + +/** + * kvm_s390_pv_dump_stor_state + * + * @kvm: pointer to the guest's KVM struct + * @buff_user: Userspace pointer where we will write the results to + * @gaddr: Starting absolute guest address for which the storage state + * is requested. + * @buff_user_len: Length of the buff_user buffer + * @rc: Pointer to where the uvcb return code is stored + * @rrc: Pointer to where the uvcb return reason code is stored + * + * Stores buff_user_len bytes of tweak component values to buff_user + * starting with the 1MB block specified by the absolute guest address + * (gaddr). The gaddr pointer will be updated with the last address + * for which data was written when returning to userspace. buff_user + * might be written to even if an error rc is returned, for instance + * if we encounter a fault after writing the first page of data. 
+ * + * Context: kvm->lock needs to be held + * + * Return: + * 0 on success + * -ENOMEM if allocating the cache fails + * -EINVAL if gaddr is not aligned to 1MB + * -EINVAL if buff_user_len is not aligned to uv_info.conf_dump_storage_state_len + * -EINVAL if the UV call fails, rc and rrc will be set in this case + * -EFAULT if copying the result to buff_user failed + */ +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_stor_state uvcb = { + .header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE, + .header.len = sizeof(uvcb), + .config_handle = kvm->arch.pv.handle, + .gaddr = *gaddr, + .dump_area_origin = 0, + }; + const u64 increment_len = uv_info.conf_dump_storage_state_len; + size_t buff_kvm_size; + size_t size_done = 0; + u8 *buff_kvm = NULL; + int cc, ret; + + ret = -EINVAL; + /* UV call processes 1MB guest storage chunks at a time */ + if (!IS_ALIGNED(*gaddr, HPAGE_SIZE)) + goto out; + + /* + * We provide the storage state for 1MB chunks of guest + * storage. The buffer will need to be aligned to + * conf_dump_storage_state_len so we don't end on a partial + * chunk. + */ + if (!buff_user_len || + !IS_ALIGNED(buff_user_len, increment_len)) + goto out; + + /* + * Allocate a buffer from which we will later copy to the user + * process. We don't want userspace to dictate our buffer size + * so we limit it to DUMP_BUFF_LEN. + */ + ret = -ENOMEM; + buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN); + buff_kvm = vzalloc(buff_kvm_size); + if (!buff_kvm) + goto out; + + ret = 0; + uvcb.dump_area_origin = (u64)buff_kvm; + /* We will loop until the user buffer is filled or an error occurs */ + do { + /* Get 1MB worth of guest storage state data */ + cc = uv_call_sched(0, (u64)&uvcb); + + /* All or nothing */ + if (cc) { + ret = -EINVAL; + break; + } + + size_done += increment_len; + uvcb.dump_area_origin += increment_len; + buff_user_len -= increment_len; + uvcb.gaddr += HPAGE_SIZE; + + /* KVM Buffer full, time to copy to the process */ + if (!buff_user_len || size_done == DUMP_BUFF_LEN) { + if (copy_to_user(buff_user, buff_kvm, size_done)) { + ret = -EFAULT; + break; + } + + buff_user += size_done; + size_done = 0; + uvcb.dump_area_origin = (u64)buff_kvm; + } + } while (buff_user_len); + + /* Report back where we ended dumping */ + *gaddr = uvcb.gaddr; + + /* Let's only log errors, we don't want to spam */ +out: + if (ret) + KVM_UV_EVENT(kvm, 3, + "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x", + uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + vfree(buff_kvm); + + return ret; +} + +/** + * kvm_s390_pv_dump_complete + * + * @kvm: pointer to the guest's KVM struct + * @buff_user: Userspace pointer where we will write the results to + * @rc: Pointer to where the uvcb return code is stored + * @rrc: Pointer to where the uvcb return reason code is stored + * + * Completes the dumping operation and writes the completion data to + * user space.
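+ *
+ * A hedged usage sketch (illustrative only, not part of the patch):
+ * the caller, holding kvm->lock, supplies a userspace buffer of at
+ * least uv_info.conf_dump_finalize_len bytes:
+ *
+ *	u16 rc, rrc;
+ *	int ret;
+ *
+ *	ret = kvm_s390_pv_dump_complete(kvm, buff_user, &rc, &rrc);
+ *	if (ret)
+ *		pr_warn("finalize: ret %d rc %x rrc %x\n", ret, rc, rrc);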
+ * + * Context: kvm->lock needs to be held + * + * Return: + * 0 on success + * -ENOMEM if allocating the completion buffer fails + * -EINVAL if the UV call fails, rc and rrc will be set in this case + * -EFAULT if copying the result to buff_user failed + */ +int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, + u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_complete complete = { + .header.len = sizeof(complete), + .header.cmd = UVC_CMD_DUMP_COMPLETE, + .config_handle = kvm_s390_pv_get_handle(kvm), + }; + u64 *compl_data; + int ret; + + /* Allocate dump area */ + compl_data = vzalloc(uv_info.conf_dump_finalize_len); + if (!compl_data) + return -ENOMEM; + complete.dump_area_origin = (u64)compl_data; + + ret = uv_call_sched(0, (u64)&complete); + *rc = complete.header.rc; + *rrc = complete.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x", + complete.header.rc, complete.header.rrc); + + if (!ret) { + /* + * kvm_s390_pv_dealloc_vm() will also (mem)set + * this to false on a reboot or other destroy + * operation for this vm. + */ + kvm->arch.pv.dumping = false; + kvm_s390_vcpu_unblock_all(kvm); + ret = copy_to_user(buff_user, compl_data, uv_info.conf_dump_finalize_len); + if (ret) + ret = -EFAULT; + } + vfree(compl_data); + /* If the UVC returned an error, translate it to -EINVAL */ + if (ret > 0) + ret = -EINVAL; + return ret; +} diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 683036c1c92a..d9696b530064 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -151,22 +151,10 @@ static int __sigp_stop_and_store_status(struct kvm_vcpu *vcpu, static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter, u64 *status_reg) { - unsigned int i; - struct kvm_vcpu *v; - bool all_stopped = true; - - kvm_for_each_vcpu(i, v, vcpu->kvm) { - if (v == vcpu) - continue; - if (!is_vcpu_stopped(v)) - all_stopped = false; - } - *status_reg &= 0xffffffff00000000UL; /* Reject set arch order, with czam we're always in z/Arch mode. */ - *status_reg |= (all_stopped ? SIGP_STATUS_INVALID_PARAMETER : - SIGP_STATUS_INCORRECT_STATE); + *status_reg |= SIGP_STATUS_INVALID_PARAMETER; return SIGP_CC_STATUS_STORED; } @@ -288,6 +276,34 @@ static int handle_sigp_dst(struct kvm_vcpu *vcpu, u8 order_code, if (!dst_vcpu) return SIGP_CC_NOT_OPERATIONAL; + /* + * SIGP RESTART, SIGP STOP, and SIGP STOP AND STORE STATUS orders + * are processed asynchronously. Until the affected VCPU finishes + * its work and calls back into KVM to clear the (RESTART or STOP) + * interrupt, we need to return any new non-reset orders "busy". + * + * This is important because a single VCPU could issue: + * 1) SIGP STOP $DESTINATION + * 2) SIGP SENSE $DESTINATION + * + * If the SIGP SENSE would not be rejected as "busy", it could + * return an incorrect answer as to whether the VCPU is STOPPED + * or OPERATING. + */ + if (order_code != SIGP_INITIAL_CPU_RESET && + order_code != SIGP_CPU_RESET) { + /* + * Lockless check. Both SIGP STOP and SIGP (RE)START + * properly synchronize everything while processing + * their orders, while the guest cannot observe a + * difference when issuing other orders from two + * different VCPUs. 
+ */ + if (kvm_s390_is_stop_irq_pending(dst_vcpu) || + kvm_s390_is_restart_irq_pending(dst_vcpu)) + return SIGP_CC_BUSY; + } + switch (order_code) { case SIGP_SENSE: vcpu->stat.instruction_sigp_sense++; @@ -453,7 +469,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) * * This interception will occur at the source cpu when a source cpu sends an * external call to a target cpu and the target cpu has the WAIT bit set in - * its cpuflags. Interception will occurr after the interrupt indicator bits at + * its cpuflags. Interception will occur after the interrupt indicator bits at * the target cpu have been set. All error cases will lead to instruction * interception, therefore nothing is to be checked or prepared. */ @@ -464,9 +480,9 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu) struct kvm_vcpu *dest_vcpu; u8 order_code = kvm_s390_get_base_disp_rs(vcpu, NULL); - trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); - if (order_code == SIGP_EXTERNAL_CALL) { + trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); + dest_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, cpu_addr); BUG_ON(dest_vcpu == NULL); diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 6f0209d45164..9ac92dbf680d 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -333,6 +333,29 @@ TRACE_EVENT(kvm_s390_airq_suppressed, __entry->id, __entry->isc) ); +/* + * Trace point for gmap notifier calls. + */ +TRACE_EVENT(kvm_s390_gmap_notifier, + TP_PROTO(unsigned long start, unsigned long end, unsigned int shadow), + TP_ARGS(start, end, shadow), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, shadow) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->shadow = shadow; + ), + + TP_printk("gmap notified (start:0x%lx end:0x%lx shadow:%d)", + __entry->start, __entry->end, __entry->shadow) + ); + #endif /* _TRACE_KVMS390_H */ diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 076090f9e666..fef42e2a80a2 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -18,6 +18,8 @@ #include <asm/sclp.h> #include <asm/nmi.h> #include <asm/dis.h> +#include <asm/fpu/api.h> +#include <asm/facility.h> #include "kvm-s390.h" #include "gaccess.h" @@ -137,11 +139,15 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* Copy to APCB FORMAT1 from APCB FORMAT0 */ static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s, - unsigned long apcb_o, struct kvm_s390_apcb1 *apcb_h) + unsigned long crycb_gpa, struct kvm_s390_apcb1 *apcb_h) { struct kvm_s390_apcb0 tmp; + unsigned long apcb_gpa; - if (read_guest_real(vcpu, apcb_o, &tmp, sizeof(struct kvm_s390_apcb0))) + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0); + + if (read_guest_real(vcpu, apcb_gpa, &tmp, + sizeof(struct kvm_s390_apcb0))) return -EFAULT; apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0]; @@ -156,19 +162,24 @@ static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s, * setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0 * @vcpu: pointer to the virtual CPU * @apcb_s: pointer to start of apcb in the shadow crycb - * @apcb_o: pointer to start of original apcb in the guest2 + * @crycb_gpa: guest physical address to start of original guest crycb * @apcb_h: pointer to start of apcb in the guest1 * * Returns 0 and -EFAULT on error reading guest apcb */ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s, - unsigned long apcb_o, 
unsigned long *apcb_h) + unsigned long crycb_gpa, unsigned long *apcb_h) { - if (read_guest_real(vcpu, apcb_o, apcb_s, + unsigned long apcb_gpa; + + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0); + + if (read_guest_real(vcpu, apcb_gpa, apcb_s, sizeof(struct kvm_s390_apcb0))) return -EFAULT; - bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb0)); + bitmap_and(apcb_s, apcb_s, apcb_h, + BITS_PER_BYTE * sizeof(struct kvm_s390_apcb0)); return 0; } @@ -177,20 +188,25 @@ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB * @vcpu: pointer to the virtual CPU * @apcb_s: pointer to start of apcb in the shadow crycb - * @apcb_o: pointer to start of original guest apcb + * @crycb_gpa: guest physical address to start of original guest crycb * @apcb_h: pointer to start of apcb in the host * * Returns 0 and -EFAULT on error reading guest apcb */ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, - unsigned long apcb_o, + unsigned long crycb_gpa, unsigned long *apcb_h) { - if (read_guest_real(vcpu, apcb_o, apcb_s, + unsigned long apcb_gpa; + + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb1); + + if (read_guest_real(vcpu, apcb_gpa, apcb_s, sizeof(struct kvm_s390_apcb1))) return -EFAULT; - bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb1)); + bitmap_and(apcb_s, apcb_s, apcb_h, + BITS_PER_BYTE * sizeof(struct kvm_s390_apcb1)); return 0; } @@ -199,7 +215,7 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * setup_apcb - Create a shadow copy of the apcb. * @vcpu: pointer to the virtual CPU * @crycb_s: pointer to shadow crycb - * @crycb_o: pointer to original guest crycb + * @crycb_gpa: guest physical address of original guest crycb * @crycb_h: pointer to the host crycb * @fmt_o: format of the original guest crycb. * @fmt_h: format of the host crycb. @@ -210,50 +226,46 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * Return 0 or an error number if the guest and host crycb are incompatible. 
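 *
 * Shadow-copy routine selected per (guest fmt_o, host fmt_h) pair,
 * matching the switch below:
 *
 *	FORMAT2 guest, FORMAT2 host		-> setup_apcb11
 *	FORMAT1 guest, FORMAT2 host		-> setup_apcb10
 *	FORMAT1 guest, FORMAT1 host		-> setup_apcb00
 *	FORMAT0 guest, FORMAT2 host		-> setup_apcb10
 *	FORMAT0 guest, FORMAT1/FORMAT0 host	-> setup_apcb00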
*/ static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s, - const u32 crycb_o, + const u32 crycb_gpa, struct kvm_s390_crypto_cb *crycb_h, int fmt_o, int fmt_h) { - struct kvm_s390_crypto_cb *crycb; - - crycb = (struct kvm_s390_crypto_cb *) (unsigned long)crycb_o; - switch (fmt_o) { case CRYCB_FORMAT2: - if ((crycb_o & PAGE_MASK) != ((crycb_o + 256) & PAGE_MASK)) + if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 256) & PAGE_MASK)) return -EACCES; if (fmt_h != CRYCB_FORMAT2) return -EINVAL; return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1, - (unsigned long) &crycb->apcb1, + crycb_gpa, (unsigned long *)&crycb_h->apcb1); case CRYCB_FORMAT1: switch (fmt_h) { case CRYCB_FORMAT2: return setup_apcb10(vcpu, &crycb_s->apcb1, - (unsigned long) &crycb->apcb0, + crycb_gpa, &crycb_h->apcb1); case CRYCB_FORMAT1: return setup_apcb00(vcpu, (unsigned long *) &crycb_s->apcb0, - (unsigned long) &crycb->apcb0, + crycb_gpa, (unsigned long *) &crycb_h->apcb0); } break; case CRYCB_FORMAT0: - if ((crycb_o & PAGE_MASK) != ((crycb_o + 32) & PAGE_MASK)) + if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 32) & PAGE_MASK)) return -EACCES; switch (fmt_h) { case CRYCB_FORMAT2: return setup_apcb10(vcpu, &crycb_s->apcb1, - (unsigned long) &crycb->apcb0, + crycb_gpa, &crycb_h->apcb1); case CRYCB_FORMAT1: case CRYCB_FORMAT0: return setup_apcb00(vcpu, (unsigned long *) &crycb_s->apcb0, - (unsigned long) &crycb->apcb0, + crycb_gpa, (unsigned long *) &crycb_h->apcb0); } } @@ -416,11 +428,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) memcpy((void *)((u64)scb_o + 0xc0), (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0); break; - case ICPT_PARTEXEC: - /* MVPG only */ - memcpy((void *)((u64)scb_o + 0xc0), - (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0); - break; } if (scb_s->ihcpu != 0xffffU) @@ -498,7 +505,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->mso = new_mso; scb_s->prefix = new_prefix; - /* We have to definetly flush the tlb if this scb never ran */ + /* We have to definitely flush the tlb if this scb never ran */ if (scb_s->ihcpu != 0xffffU) scb_s->ihcpu = scb_o->ihcpu; @@ -507,6 +514,14 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* Host-protection-interruption introduced with ESOP */ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT; + /* + * CPU Topology + * This facility only uses the utility field of the SCA and none of + * the cpu entries that are problematic with the other interpretation + * facilities so we can pass it through + */ + if (test_kvm_facility(vcpu->kvm, 11)) + scb_s->ecb |= scb_o->ecb & ECB_PTF; /* transactional execution */ if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) { /* remap the prefix is tx is toggled on */ @@ -514,6 +529,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) prefix_unmapped(vsie_page); scb_s->ecb |= ECB_TE; } + /* specification exception interpretation */ + scb_s->ecb |= scb_o->ecb & ECB_SPECI; /* branch prediction */ if (test_kvm_facility(vcpu->kvm, 82)) scb_s->fpf |= scb_o->fpf & FPF_BPBC; @@ -540,14 +557,17 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI)) scb_s->eca |= scb_o->eca & ECA_CEI; /* Epoch Extension */ - if (test_kvm_facility(vcpu->kvm, 139)) + if (test_kvm_facility(vcpu->kvm, 139)) { scb_s->ecd |= scb_o->ecd & ECD_MEF; + scb_s->epdx = scb_o->epdx; + } /* etoken */ if 
(test_kvm_facility(vcpu->kvm, 156)) scb_s->ecd |= scb_o->ecd & ECD_ETOKENF; scb_s->hpid = HPID_VSIE; + scb_s->cpnc = scb_o->cpnc; prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); @@ -568,10 +588,6 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, if (!gmap_is_shadow(gmap)) return; - if (start >= 1UL << 31) - /* We are only interested in prefix pages */ - return; - /* * Only new shadow blocks are added to the list during runtime, * therefore we can safely reference them all the time. @@ -618,10 +634,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* with mso/msl, the prefix lies at offset *mso* */ prefix += scb_s->mso; - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix); + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL); if (!rc && (scb_s->ecb & ECB_TE)) rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - prefix + PAGE_SIZE); + prefix + PAGE_SIZE, NULL); /* * We don't have to mprotect, we will be called for all unshadows. * SIE will detect if protection applies and trigger a validity. @@ -647,7 +663,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) page = gfn_to_page(kvm, gpa_to_gfn(gpa)); if (is_error_page(page)) return -EINVAL; - *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); + *hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK); return 0; } @@ -862,7 +878,7 @@ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, WARN_ON_ONCE(rc); return 1; } - vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; + vsie_page->scb_o = phys_to_virt(hpa); return 0; } @@ -882,7 +898,7 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, (vaddr & 0xfffffffffffff000UL) | /* 52-53: store / fetch */ (((unsigned int) !write_flag) + 1) << 10, - /* 62-63: asce id (alway primary == 0) */ + /* 62-63: asce id (always primary == 0) */ .exc_access_id = 0, /* always primary */ .op_access_id = 0, /* not MVPG */ }; @@ -912,7 +928,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) current->thread.gmap_addr, 1); rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - current->thread.gmap_addr); + current->thread.gmap_addr, NULL); if (rc > 0) { rc = inject_fault(vcpu, rc, current->thread.gmap_addr, @@ -934,7 +950,7 @@ static void handle_last_fault(struct kvm_vcpu *vcpu, { if (vsie_page->fault_addr) kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - vsie_page->fault_addr); + vsie_page->fault_addr, NULL); vsie_page->fault_addr = 0; } @@ -969,12 +985,26 @@ static void retry_vsie_icpt(struct vsie_page *vsie_page) static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; - __u32 fac = READ_ONCE(vsie_page->scb_o->fac) & 0x7ffffff8U; + __u32 fac = READ_ONCE(vsie_page->scb_o->fac); + /* + * Alternate-STFLE-Interpretive-Execution facilities are not supported + * -> format-0 flcb + */ if (fac && test_kvm_facility(vcpu->kvm, 7)) { retry_vsie_icpt(vsie_page); + /* + * The facility list origin (FLO) is in bits 1 - 28 of the FLD + * so we need to mask here before reading. + */ + fac = fac & 0x7ffffff8U; + /* + * format-0 -> size of nested guest's facility list == guest's size + * guest's size == host's size, since STFLE is interpretatively executed + * using a format-0 for the guest, too. 
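+ *
+ * Worked example (illustrative): if stfle_size() reports 4
+ * doublewords, the read below copies 4 * 8 = 32 bytes of facility
+ * bits from the guest's facility list origin into vsie_page->fac.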
+ */ if (read_guest_real(vcpu, fac, &vsie_page->fac, - sizeof(vsie_page->fac))) + stfle_size() * sizeof(u64))) return set_validity_icpt(scb_s, 0x1090U); scb_s->fac = (__u32)(__u64) &vsie_page->fac; } @@ -982,6 +1012,98 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* + * Get a register for a nested guest. + * @vcpu the vcpu of the guest + * @vsie_page the vsie_page for the nested guest + * @reg the register number, the upper 4 bits are ignored. + * returns: the value of the register. + */ +static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg) +{ + /* no need to validate the parameter and/or perform error handling */ + reg &= 0xf; + switch (reg) { + case 15: + return vsie_page->scb_s.gg15; + case 14: + return vsie_page->scb_s.gg14; + default: + return vcpu->run->s.regs.gprs[reg]; + } +} + +static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + unsigned long pei_dest, pei_src, src, dest, mask, prefix; + u64 *pei_block = &vsie_page->scb_o->mcic; + int edat, rc_dest, rc_src; + union ctlreg0 cr0; + + cr0.val = vcpu->arch.sie_block->gcr[0]; + edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); + mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK); + prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; + + dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask; + dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso; + src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask; + src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso; + + rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest); + rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src); + /* + * Either everything went well, or something non-critical went wrong + * e.g. because of a race. In either case, simply retry. + */ + if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) { + retry_vsie_icpt(vsie_page); + return -EAGAIN; + } + /* Something more serious went wrong, propagate the error */ + if (rc_dest < 0) + return rc_dest; + if (rc_src < 0) + return rc_src; + + /* The only possible suppressing exception: just deliver it */ + if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) { + clear_vsie_icpt(vsie_page); + rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC); + WARN_ON_ONCE(rc_dest); + return 1; + } + + /* + * Forward the PEI intercept to the guest if it was a page fault, or + * also for segment and region table faults if EDAT applies. + */ + if (edat) { + rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0; + rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0; + } else { + rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0; + rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0; + } + if (!rc_dest && !rc_src) { + pei_block[0] = pei_dest; + pei_block[1] = pei_src; + return 1; + } + + retry_vsie_icpt(vsie_page); + + /* + * The host has edat, and the guest does not, or it was an ASCE type + * exception. The host needs to inject the appropriate DAT interrupts + * into the guest. + */ + if (rc_dest) + return inject_fault(vcpu, rc_dest, dest, 1); + return inject_fault(vcpu, rc_src, src, 0); +} + +/* * Run the vsie on a shadow scb and a shadow gmap, without any further * sanity checks, handling SIE faults. 
* @@ -1000,12 +1122,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) handle_last_fault(vcpu, vsie_page); - if (need_resched()) - schedule(); - if (test_cpu_flag(CIF_MCCK_PENDING)) - s390_handle_mcck(); - - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); /* save current guest state of bp isolation override */ guest_bp_isolation = test_thread_flag(TIF_ISOLATE_BP_GUEST); @@ -1032,6 +1149,8 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) */ vcpu->arch.sie_block->prog0c |= PROG_IN_SIE; barrier(); + if (test_cpu_flag(CIF_FPU)) + load_fpu_regs(); if (!kvm_s390_vcpu_sie_inhibited(vcpu)) rc = sie64a(scb_s, vcpu->run->s.regs.gprs); barrier(); @@ -1045,7 +1164,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (!guest_bp_isolation) clear_thread_flag(TIF_ISOLATE_BP_GUEST); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); if (rc == -EINTR) { VCPU_EVENT(vcpu, 3, "%s", "machine check"); @@ -1072,6 +1191,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if ((scb_s->ipa & 0xf000) != 0xf000) scb_s->ipa += 0x1000; break; + case ICPT_PARTEXEC: + if (scb_s->ipa == 0xb254) + rc = vsie_handle_mvpg(vcpu, vsie_page); + break; } return rc; } @@ -1102,8 +1225,10 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, * we're holding has been unshadowed. If the gmap is still valid, * we can safely reuse it. */ - if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) + if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) { + vcpu->kvm->stat.gmap_shadow_reuse++; return 0; + } /* release the old shadow - if any, and mark the prefix as unmapped */ release_gmap_shadow(vsie_page); @@ -1111,6 +1236,7 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, if (IS_ERR(gmap)) return PTR_ERR(gmap); gmap->private = vcpu->kvm; + vcpu->kvm->stat.gmap_shadow_create++; WRITE_ONCE(vsie_page->gmap, gmap); return 0; } @@ -1185,6 +1311,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) kvm_s390_vcpu_has_irq(vcpu, 0) || kvm_s390_vcpu_sie_inhibited(vcpu)) break; + cond_resched(); } if (rc == -EFAULT) { @@ -1202,6 +1329,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->iprcc = PGM_ADDRESSING; scb_s->pgmilc = 4; scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4); + rc = 1; } return rc; } @@ -1236,7 +1364,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) mutex_lock(&kvm->arch.vsie.mutex); if (kvm->arch.vsie.page_count < nr_vcpus) { - page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA); + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA); if (!page) { mutex_unlock(&kvm->arch.vsie.mutex); return ERR_PTR(-ENOMEM); @@ -1338,7 +1466,7 @@ out_put: void kvm_s390_vsie_init(struct kvm *kvm) { mutex_init(&kvm->arch.vsie.mutex); - INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL); + INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL_ACCOUNT); } /* Destroy the vsie data structures. To be called when a vm is destroyed. */ diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 28fd66d558ff..7c50eca85ca4 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -3,10 +3,12 @@ # Makefile for s390-specific library files.. 
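#
# (kbuild note, for orientation: lib-y objects are archived into lib.a
#  and linked only when referenced, while obj-y objects are always
#  built into the kernel)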
# -lib-y += delay.o string.o uaccess.o find.o spinlock.o +lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o obj-y += mem.o xor.o lib-$(CONFIG_KPROBES) += probes.o lib-$(CONFIG_UPROBES) += probes.o +obj-$(CONFIG_S390_KPROBES_SANITY_TEST) += test_kprobes_s390.o +test_kprobes_s390-objs += test_kprobes_asm.o test_kprobes.o # Instrumenting memory accesses to __user data (in different address space) # produce false positives @@ -14,3 +16,10 @@ KASAN_SANITIZE_uaccess.o := n obj-$(CONFIG_S390_UNWIND_SELFTEST) += test_unwind.o CFLAGS_test_unwind.o += -fno-optimize-sibling-calls + +obj-$(CONFIG_S390_MODULES_SANITY_TEST) += test_modules.o +obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o + +lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o + +obj-$(CONFIG_EXPOLINE_EXTERN) += expoline/ diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c index d4aa10795605..be14c58cb989 100644 --- a/arch/s390/lib/delay.c +++ b/arch/s390/lib/delay.c @@ -4,126 +4,42 @@ * * Copyright IBM Corp. 1999, 2008 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, */ -#include <linux/sched.h> +#include <linux/processor.h> #include <linux/delay.h> -#include <linux/timex.h> -#include <linux/export.h> -#include <linux/irqflags.h> -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <asm/vtimer.h> #include <asm/div64.h> -#include <asm/idle.h> +#include <asm/timex.h> void __delay(unsigned long loops) { - /* - * To end the bloody studid and useless discussion about the - * BogoMips number I took the liberty to define the __delay - * function in a way that that resulting BogoMips number will - * yield the megahertz number of the cpu. The important function - * is udelay and that is done using the tod clock. -- martin. - */ + /* + * Loop 'loops' times. Callers must not assume a specific + * amount of time passes before this function returns. + */ asm volatile("0: brct %0,0b" : : "d" ((loops/2) + 1)); } EXPORT_SYMBOL(__delay); -static void __udelay_disabled(unsigned long long usecs) +static void delay_loop(unsigned long delta) { - unsigned long cr0, cr0_new, psw_mask; - struct s390_idle_data idle; - u64 end; + unsigned long end; - end = get_tod_clock() + (usecs << 12); - __ctl_store(cr0, 0, 0); - cr0_new = cr0 & ~CR0_IRQ_SUBCLASS_MASK; - cr0_new |= (1UL << (63 - 52)); /* enable clock comparator irq */ - __ctl_load(cr0_new, 0, 0); - psw_mask = __extract_psw() | PSW_MASK_EXT | PSW_MASK_WAIT; - set_clock_comparator(end); - set_cpu_flag(CIF_IGNORE_IRQ); - psw_idle(&idle, psw_mask); - clear_cpu_flag(CIF_IGNORE_IRQ); - set_clock_comparator(S390_lowcore.clock_comparator); - __ctl_load(cr0, 0, 0); -} - -static void __udelay_enabled(unsigned long long usecs) -{ - u64 clock_saved, end; - - end = get_tod_clock_fast() + (usecs << 12); - do { - clock_saved = 0; - if (tod_after(S390_lowcore.clock_comparator, end)) { - clock_saved = local_tick_disable(); - set_clock_comparator(end); - } - enabled_wait(); - if (clock_saved) - local_tick_enable(clock_saved); - } while (get_tod_clock_fast() < end); + end = get_tod_clock_monotonic() + delta; + while (!tod_after(get_tod_clock_monotonic(), end)) + cpu_relax(); } -/* - * Waits for 'usecs' microseconds using the TOD clock comparator. 
- */ -void __udelay(unsigned long long usecs) +void __udelay(unsigned long usecs) { - unsigned long flags; - - preempt_disable(); - local_irq_save(flags); - if (in_irq()) { - __udelay_disabled(usecs); - goto out; - } - if (in_softirq()) { - if (raw_irqs_disabled_flags(flags)) - __udelay_disabled(usecs); - else - __udelay_enabled(usecs); - goto out; - } - if (raw_irqs_disabled_flags(flags)) { - local_bh_disable(); - __udelay_disabled(usecs); - _local_bh_enable(); - goto out; - } - __udelay_enabled(usecs); -out: - local_irq_restore(flags); - preempt_enable(); + delay_loop(usecs << 12); } EXPORT_SYMBOL(__udelay); -/* - * Simple udelay variant. To be used on startup and reboot - * when the interrupt handler isn't working. - */ -void udelay_simple(unsigned long long usecs) -{ - u64 end; - - end = get_tod_clock_fast() + (usecs << 12); - while (get_tod_clock_fast() < end) - cpu_relax(); -} - -void __ndelay(unsigned long long nsecs) +void __ndelay(unsigned long nsecs) { - u64 end; - nsecs <<= 9; do_div(nsecs, 125); - end = get_tod_clock_fast() + nsecs; - if (nsecs & ~0xfffUL) - __udelay(nsecs >> 12); - while (get_tod_clock_fast() < end) - barrier(); + delay_loop(nsecs); } EXPORT_SYMBOL(__ndelay); diff --git a/arch/s390/lib/error-inject.c b/arch/s390/lib/error-inject.c new file mode 100644 index 000000000000..8c9d4da87eef --- /dev/null +++ b/arch/s390/lib/error-inject.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+ +#include <asm/ptrace.h> +#include <linux/error-injection.h> +#include <linux/kprobes.h> + +void override_function_with_return(struct pt_regs *regs) +{ + /* + * Emulate 'br 14'. 'regs' is captured by kprobes on entry to some + * kernel function. + */ + regs->psw.addr = regs->gprs[14]; +} +NOKPROBE_SYMBOL(override_function_with_return); diff --git a/arch/s390/lib/expoline/Makefile b/arch/s390/lib/expoline/Makefile new file mode 100644 index 000000000000..854631d9cb03 --- /dev/null +++ b/arch/s390/lib/expoline/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += expoline.o diff --git a/arch/s390/lib/expoline/expoline.S b/arch/s390/lib/expoline/expoline.S new file mode 100644 index 000000000000..92ed8409a7a4 --- /dev/null +++ b/arch/s390/lib/expoline/expoline.S @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <asm/nospec-insn.h> +#include <linux/linkage.h> + +.macro GEN_ALL_BR_THUNK_EXTERN + .irp r1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + GEN_BR_THUNK_EXTERN %r\r1 + .endr +.endm + +GEN_ALL_BR_THUNK_EXTERN diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S index dc0874f2e203..08f60a42b9a6 100644 --- a/arch/s390/lib/mem.S +++ b/arch/s390/lib/mem.S @@ -5,8 +5,8 @@ * Copyright IBM Corp. 
2012 */ +#include <linux/export.h> #include <linux/linkage.h> -#include <asm/export.h> #include <asm/nospec-insn.h> GEN_BR_THUNK %r14 @@ -14,8 +14,7 @@ /* * void *memmove(void *dest, const void *src, size_t n) */ -WEAK(memmove) -ENTRY(__memmove) +SYM_FUNC_START(__memmove) ltgr %r4,%r4 lgr %r1,%r2 jz .Lmemmove_exit @@ -48,7 +47,10 @@ ENTRY(__memmove) BR_EX %r14 .Lmemmove_mvc: mvc 0(1,%r1),0(%r3) -ENDPROC(__memmove) +SYM_FUNC_END(__memmove) +EXPORT_SYMBOL(__memmove) + +SYM_FUNC_ALIAS(memmove, __memmove) EXPORT_SYMBOL(memmove) /* @@ -66,8 +68,7 @@ EXPORT_SYMBOL(memmove) * return __builtin_memset(s, c, n); * } */ -WEAK(memset) -ENTRY(__memset) +SYM_FUNC_START(__memset) ltgr %r4,%r4 jz .Lmemset_exit ltgr %r3,%r3 @@ -111,7 +112,10 @@ ENTRY(__memset) xc 0(1,%r1),0(%r1) .Lmemset_mvc: mvc 1(1,%r1),0(%r1) -ENDPROC(__memset) +SYM_FUNC_END(__memset) +EXPORT_SYMBOL(__memset) + +SYM_FUNC_ALIAS(memset, __memset) EXPORT_SYMBOL(memset) /* @@ -119,8 +123,7 @@ EXPORT_SYMBOL(memset) * * void *memcpy(void *dest, const void *src, size_t n) */ -WEAK(memcpy) -ENTRY(__memcpy) +SYM_FUNC_START(__memcpy) ltgr %r4,%r4 jz .Lmemcpy_exit aghi %r4,-1 @@ -141,7 +144,10 @@ ENTRY(__memcpy) j .Lmemcpy_remainder .Lmemcpy_mvc: mvc 0(1,%r1),0(%r3) -ENDPROC(__memcpy) +SYM_FUNC_END(__memcpy) +EXPORT_SYMBOL(__memcpy) + +SYM_FUNC_ALIAS(memcpy, __memcpy) EXPORT_SYMBOL(memcpy) /* @@ -152,7 +158,7 @@ EXPORT_SYMBOL(memcpy) * void *__memset64(uint64_t *s, uint64_t v, size_t count) */ .macro __MEMSET bits,bytes,insn -ENTRY(__memset\bits) +SYM_FUNC_START(__memset\bits) ltgr %r4,%r4 jz .L__memset_exit\bits cghi %r4,\bytes @@ -178,7 +184,7 @@ ENTRY(__memset\bits) BR_EX %r14 .L__memset_mvc\bits: mvc \bytes(1,%r1),0(%r1) -ENDPROC(__memset\bits) +SYM_FUNC_END(__memset\bits) .endm __MEMSET 16,2,sth diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index ce1e4bbe53aa..81c53440b3e6 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -13,8 +13,8 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/percpu.h> +#include <linux/io.h> #include <asm/alternative.h> -#include <asm/io.h> int spin_retry = -1; @@ -26,7 +26,7 @@ static int __init spin_retry_init(void) } early_initcall(spin_retry_init); -/** +/* * spin_retry= parameter */ static int __init spin_retry_setup(char *str) @@ -75,7 +75,7 @@ static inline int arch_load_niai4(int *lock) int owner; asm_inline volatile( - ALTERNATIVE("", ".long 0xb2fa0040", 49) /* NIAI 4 */ + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", 49) /* NIAI 4 */ " l %0,%1\n" : "=d" (owner) : "Q" (*lock) : "memory"); return owner; @@ -86,7 +86,7 @@ static inline int arch_cmpxchg_niai8(int *lock, int old, int new) int expected = old; asm_inline volatile( - ALTERNATIVE("", ".long 0xb2fa0080", 49) /* NIAI 8 */ + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", 49) /* NIAI 8 */ " cs %0,%3,%1\n" : "=d" (old), "=Q" (*lock) : "0" (old), "d" (new), "Q" (*lock) @@ -242,7 +242,6 @@ static inline void arch_spin_lock_classic(arch_spinlock_t *lp) void arch_spin_lock_wait(arch_spinlock_t *lp) { - /* Use classic spinlocks + niai if the steal time is >= 10% */ if (test_cpu_flag(CIF_DEDICATED_CPU)) arch_spin_lock_queued(lp); else diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index 0e30e6e43b0c..7d8741818239 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -8,6 +8,9 @@ */ #define IN_ARCH_STRING_C 1 +#ifndef __NO_FORTIFY +# define __NO_FORTIFY +#endif #include <linux/types.h> #include <linux/string.h> @@ -18,23 +21,30 @@ */ static inline char *__strend(const char *s) { 
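+	/*
+	 * A descriptive note on the idiom below: SRST searches the
+	 * string for the character in general register 0 (loaded with
+	 * 0 here, i.e. the terminating NUL). CC 3 means the scan was
+	 * interrupted, hence the "jo" retry loop; on completion %[e]
+	 * holds the address of the NUL byte.
+	 */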
- register unsigned long r0 asm("0") = 0; - - asm volatile ("0: srst %0,%1\n" - " jo 0b" - : "+d" (r0), "+a" (s) : : "cc", "memory"); - return (char *) r0; + unsigned long e = 0; + + asm volatile( + " lghi 0,0\n" + "0: srst %[e],%[s]\n" + " jo 0b\n" + : [e] "+&a" (e), [s] "+&a" (s) + : + : "cc", "memory", "0"); + return (char *)e; } static inline char *__strnend(const char *s, size_t n) { - register unsigned long r0 asm("0") = 0; const char *p = s + n; - asm volatile ("0: srst %0,%1\n" - " jo 0b" - : "+d" (p), "+a" (s) : "d" (r0) : "cc", "memory"); - return (char *) p; + asm volatile( + " lghi 0,0\n" + "0: srst %[p],%[s]\n" + " jo 0b\n" + : [p] "+&d" (p), [s] "+&a" (s) + : + : "cc", "memory", "0"); + return (char *)p; } /** @@ -76,45 +86,21 @@ EXPORT_SYMBOL(strnlen); #ifdef __HAVE_ARCH_STRCPY char *strcpy(char *dest, const char *src) { - register int r0 asm("0") = 0; char *ret = dest; - asm volatile ("0: mvst %0,%1\n" - " jo 0b" - : "+&a" (dest), "+&a" (src) : "d" (r0) - : "cc", "memory" ); + asm volatile( + " lghi 0,0\n" + "0: mvst %[dest],%[src]\n" + " jo 0b\n" + : [dest] "+&a" (dest), [src] "+&a" (src) + : + : "cc", "memory", "0"); return ret; } EXPORT_SYMBOL(strcpy); #endif /** - * strlcpy - Copy a %NUL terminated string into a sized buffer - * @dest: Where to copy the string to - * @src: Where to copy the string from - * @size: size of destination buffer - * - * Compatible with *BSD: the result is always a valid - * NUL-terminated string that fits in the buffer (unless, - * of course, the buffer size is zero). It does not pad - * out the result like strncpy() does. - */ -#ifdef __HAVE_ARCH_STRLCPY -size_t strlcpy(char *dest, const char *src, size_t size) -{ - size_t ret = __strend(src) - src; - - if (size) { - size_t len = (ret >= size) ? size-1 : ret; - dest[len] = '\0'; - memcpy(dest, src, len); - } - return ret; -} -EXPORT_SYMBOL(strlcpy); -#endif - -/** * strncpy - Copy a length-limited, %NUL-terminated string * @dest: Where to copy the string to * @src: Where to copy the string from @@ -144,16 +130,18 @@ EXPORT_SYMBOL(strncpy); #ifdef __HAVE_ARCH_STRCAT char *strcat(char *dest, const char *src) { - register int r0 asm("0") = 0; - unsigned long dummy; + unsigned long dummy = 0; char *ret = dest; - asm volatile ("0: srst %0,%1\n" - " jo 0b\n" - "1: mvst %0,%2\n" - " jo 1b" - : "=&a" (dummy), "+a" (dest), "+a" (src) - : "d" (r0), "0" (0UL) : "cc", "memory" ); + asm volatile( + " lghi 0,0\n" + "0: srst %[dummy],%[dest]\n" + " jo 0b\n" + "1: mvst %[dummy],%[src]\n" + " jo 1b\n" + : [dummy] "+&a" (dummy), [dest] "+&a" (dest), [src] "+&a" (src) + : + : "cc", "memory", "0"); return ret; } EXPORT_SYMBOL(strcat); @@ -221,58 +209,40 @@ EXPORT_SYMBOL(strncat); #ifdef __HAVE_ARCH_STRCMP int strcmp(const char *s1, const char *s2) { - register int r0 asm("0") = 0; int ret = 0; - asm volatile ("0: clst %2,%3\n" - " jo 0b\n" - " je 1f\n" - " ic %0,0(%2)\n" - " ic %1,0(%3)\n" - " sr %0,%1\n" - "1:" - : "+d" (ret), "+d" (r0), "+a" (s1), "+a" (s2) - : : "cc", "memory"); + asm volatile( + " lghi 0,0\n" + "0: clst %[s1],%[s2]\n" + " jo 0b\n" + " je 1f\n" + " ic %[ret],0(%[s1])\n" + " ic 0,0(%[s2])\n" + " sr %[ret],0\n" + "1:" + : [ret] "+&d" (ret), [s1] "+&a" (s1), [s2] "+&a" (s2) + : + : "cc", "memory", "0"); return ret; } EXPORT_SYMBOL(strcmp); #endif -/** - * strrchr - Find the last occurrence of a character in a string - * @s: The string to be searched - * @c: The character to search for - */ -#ifdef __HAVE_ARCH_STRRCHR -char *strrchr(const char *s, int c) -{ - size_t len = __strend(s) - s; - - if 
(len) - do { - if (s[len] == (char) c) - return (char *) s + len; - } while (--len > 0); - return NULL; -} -EXPORT_SYMBOL(strrchr); -#endif - static inline int clcle(const char *s1, unsigned long l1, const char *s2, unsigned long l2) { - register unsigned long r2 asm("2") = (unsigned long) s1; - register unsigned long r3 asm("3") = (unsigned long) l1; - register unsigned long r4 asm("4") = (unsigned long) s2; - register unsigned long r5 asm("5") = (unsigned long) l2; + union register_pair r1 = { .even = (unsigned long)s1, .odd = l1, }; + union register_pair r3 = { .even = (unsigned long)s2, .odd = l2, }; int cc; - asm volatile ("0: clcle %1,%3,0\n" - " jo 0b\n" - " ipm %0\n" - " srl %0,28" - : "=&d" (cc), "+a" (r2), "+a" (r3), - "+a" (r4), "+a" (r5) : : "cc", "memory"); + asm volatile( + "0: clcle %[r1],%[r3],0\n" + " jo 0b\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=&d" (cc), [r1] "+&d" (r1.pair), [r3] "+&d" (r3.pair) + : + : "cc", "memory"); return cc; } @@ -315,15 +285,18 @@ EXPORT_SYMBOL(strstr); #ifdef __HAVE_ARCH_MEMCHR void *memchr(const void *s, int c, size_t n) { - register int r0 asm("0") = (char) c; const void *ret = s + n; - asm volatile ("0: srst %0,%1\n" - " jo 0b\n" - " jl 1f\n" - " la %0,0\n" - "1:" - : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory"); + asm volatile( + " lgr 0,%[c]\n" + "0: srst %[ret],%[s]\n" + " jo 0b\n" + " jl 1f\n" + " la %[ret],0\n" + "1:" + : [ret] "+&a" (ret), [s] "+&a" (s) + : [c] "d" (c) + : "cc", "memory", "0"); return (void *) ret; } EXPORT_SYMBOL(memchr); @@ -333,7 +306,7 @@ EXPORT_SYMBOL(memchr); * memcmp - Compare two areas of memory * @s1: One area of memory * @s2: Another area of memory - * @count: The size of the area. + * @n: The size of the area. */ #ifdef __HAVE_ARCH_MEMCMP int memcmp(const void *s1, const void *s2, size_t n) @@ -360,13 +333,16 @@ EXPORT_SYMBOL(memcmp); #ifdef __HAVE_ARCH_MEMSCAN void *memscan(void *s, int c, size_t n) { - register int r0 asm("0") = (char) c; const void *ret = s + n; - asm volatile ("0: srst %0,%1\n" - " jo 0b\n" - : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory"); - return (void *) ret; + asm volatile( + " lgr 0,%[c]\n" + "0: srst %[ret],%[s]\n" + " jo 0b\n" + : [ret] "+&a" (ret), [s] "+&a" (s) + : [c] "d" (c) + : "cc", "memory", "0"); + return (void *)ret; } EXPORT_SYMBOL(memscan); #endif diff --git a/arch/s390/lib/test_kprobes.c b/arch/s390/lib/test_kprobes.c new file mode 100644 index 000000000000..9e62d62812e5 --- /dev/null +++ b/arch/s390/lib/test_kprobes.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include <linux/kernel.h> +#include <linux/kprobes.h> +#include <linux/random.h> +#include <kunit/test.h> +#include "test_kprobes.h" + +static struct kprobe kp; + +static void setup_kprobe(struct kunit *test, struct kprobe *kp, + const char *symbol, int offset) +{ + kp->offset = offset; + kp->addr = NULL; + kp->symbol_name = symbol; +} + +static void test_kprobe_offset(struct kunit *test, struct kprobe *kp, + const char *target, int offset) +{ + int ret; + + setup_kprobe(test, kp, target, 0); + ret = register_kprobe(kp); + if (!ret) + unregister_kprobe(kp); + KUNIT_EXPECT_EQ(test, 0, ret); + setup_kprobe(test, kp, target, offset); + ret = register_kprobe(kp); + KUNIT_EXPECT_EQ(test, -EINVAL, ret); + if (!ret) + unregister_kprobe(kp); +} + +static void test_kprobe_odd(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_odd", + kprobes_target_odd_offs); +} + +static void test_kprobe_in_insn4(struct kunit *test) +{ + test_kprobe_offset(test, &kp, 
"kprobes_target_in_insn4", + kprobes_target_in_insn4_offs); +} + +static void test_kprobe_in_insn6_lo(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_lo", + kprobes_target_in_insn6_lo_offs); +} + +static void test_kprobe_in_insn6_hi(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_hi", + kprobes_target_in_insn6_hi_offs); +} + +static struct kunit_case kprobes_testcases[] = { + KUNIT_CASE(test_kprobe_odd), + KUNIT_CASE(test_kprobe_in_insn4), + KUNIT_CASE(test_kprobe_in_insn6_lo), + KUNIT_CASE(test_kprobe_in_insn6_hi), + {} +}; + +static struct kunit_suite kprobes_test_suite = { + .name = "kprobes_test_s390", + .test_cases = kprobes_testcases, +}; + +kunit_test_suites(&kprobes_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/test_kprobes.h b/arch/s390/lib/test_kprobes.h new file mode 100644 index 000000000000..2b4c9bc337f1 --- /dev/null +++ b/arch/s390/lib/test_kprobes.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef TEST_KPROBES_H +#define TEST_KPROBES_H + +extern unsigned long kprobes_target_odd_offs; +extern unsigned long kprobes_target_in_insn4_offs; +extern unsigned long kprobes_target_in_insn6_lo_offs; +extern unsigned long kprobes_target_in_insn6_hi_offs; + +#endif diff --git a/arch/s390/lib/test_kprobes_asm.S b/arch/s390/lib/test_kprobes_asm.S new file mode 100644 index 000000000000..ade7a3042334 --- /dev/null +++ b/arch/s390/lib/test_kprobes_asm.S @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#include <linux/linkage.h> +#include <asm/ftrace.h> + +#define KPROBES_TARGET_START(name) \ + SYM_FUNC_START(name); \ + FTRACE_GEN_NOP_ASM(name) + +#define KPROBES_TARGET_END(name) \ + SYM_FUNC_END(name); \ + SYM_DATA(name##_offs, .quad 1b - name) + +KPROBES_TARGET_START(kprobes_target_in_insn4) + .word 0x4700 // bc 0,0 +1: .word 0x0000 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn4) + +KPROBES_TARGET_START(kprobes_target_in_insn6_lo) + .word 0xe310 // ly 1,0 +1: .word 0x0000 + .word 0x0058 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn6_lo) + +KPROBES_TARGET_START(kprobes_target_in_insn6_hi) + .word 0xe310 // ly 1,0 + .word 0x0000 +1: .word 0x0058 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn6_hi) + +KPROBES_TARGET_START(kprobes_target_bp) + nop + .word 0x0000 + nop +1: br %r14 +KPROBES_TARGET_END(kprobes_target_bp) + +KPROBES_TARGET_START(kprobes_target_odd) + .byte 0x07 +1: .byte 0x07 + br %r14 +KPROBES_TARGET_END(kprobes_target_odd) diff --git a/arch/s390/lib/test_modules.c b/arch/s390/lib/test_modules.c new file mode 100644 index 000000000000..9894009fc1f2 --- /dev/null +++ b/arch/s390/lib/test_modules.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include <kunit/test.h> +#include <linux/module.h> + +#include "test_modules.h" + +/* + * Test that modules with many relocations are loaded properly. 
+ */ +static void test_modules_many_vmlinux_relocs(struct kunit *test) +{ + int result = 0; + +#define CALL_RETURN(i) result += test_modules_return_ ## i() + REPEAT_10000(CALL_RETURN); + KUNIT_ASSERT_EQ(test, result, 49995000); +} + +static struct kunit_case modules_testcases[] = { + KUNIT_CASE(test_modules_many_vmlinux_relocs), + {} +}; + +static struct kunit_suite modules_test_suite = { + .name = "modules_test_s390", + .test_cases = modules_testcases, +}; + +kunit_test_suites(&modules_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/test_modules.h b/arch/s390/lib/test_modules.h new file mode 100644 index 000000000000..6371fcf17684 --- /dev/null +++ b/arch/s390/lib/test_modules.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef TEST_MODULES_H +#define TEST_MODULES_H + +#define __REPEAT_10000_3(f, x) \ + f(x ## 0); \ + f(x ## 1); \ + f(x ## 2); \ + f(x ## 3); \ + f(x ## 4); \ + f(x ## 5); \ + f(x ## 6); \ + f(x ## 7); \ + f(x ## 8); \ + f(x ## 9) +#define __REPEAT_10000_2(f, x) \ + __REPEAT_10000_3(f, x ## 0); \ + __REPEAT_10000_3(f, x ## 1); \ + __REPEAT_10000_3(f, x ## 2); \ + __REPEAT_10000_3(f, x ## 3); \ + __REPEAT_10000_3(f, x ## 4); \ + __REPEAT_10000_3(f, x ## 5); \ + __REPEAT_10000_3(f, x ## 6); \ + __REPEAT_10000_3(f, x ## 7); \ + __REPEAT_10000_3(f, x ## 8); \ + __REPEAT_10000_3(f, x ## 9) +#define __REPEAT_10000_1(f, x) \ + __REPEAT_10000_2(f, x ## 0); \ + __REPEAT_10000_2(f, x ## 1); \ + __REPEAT_10000_2(f, x ## 2); \ + __REPEAT_10000_2(f, x ## 3); \ + __REPEAT_10000_2(f, x ## 4); \ + __REPEAT_10000_2(f, x ## 5); \ + __REPEAT_10000_2(f, x ## 6); \ + __REPEAT_10000_2(f, x ## 7); \ + __REPEAT_10000_2(f, x ## 8); \ + __REPEAT_10000_2(f, x ## 9) +#define REPEAT_10000(f) \ + __REPEAT_10000_1(f, 0); \ + __REPEAT_10000_1(f, 1); \ + __REPEAT_10000_1(f, 2); \ + __REPEAT_10000_1(f, 3); \ + __REPEAT_10000_1(f, 4); \ + __REPEAT_10000_1(f, 5); \ + __REPEAT_10000_1(f, 6); \ + __REPEAT_10000_1(f, 7); \ + __REPEAT_10000_1(f, 8); \ + __REPEAT_10000_1(f, 9) + +#define DECLARE_RETURN(i) int test_modules_return_ ## i(void) +REPEAT_10000(DECLARE_RETURN); + +#endif diff --git a/arch/s390/lib/test_modules_helpers.c b/arch/s390/lib/test_modules_helpers.c new file mode 100644 index 000000000000..1670349a03eb --- /dev/null +++ b/arch/s390/lib/test_modules_helpers.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include <linux/export.h> + +#include "test_modules.h" + +#define DEFINE_RETURN(i) \ + int test_modules_return_ ## i(void) \ + { \ + return 1 ## i - 10000; \ + } \ + EXPORT_SYMBOL_GPL(test_modules_return_ ## i) +REPEAT_10000(DEFINE_RETURN); diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c index bda7ac0ddd29..2848e3fb2ff5 100644 --- a/arch/s390/lib/test_unwind.c +++ b/arch/s390/lib/test_unwind.c @@ -3,20 +3,28 @@ * Test module for unwind_for_each_frame */ -#define pr_fmt(fmt) "test_unwind: " fmt +#include <kunit/test.h> #include <asm/unwind.h> #include <linux/completion.h> #include <linux/kallsyms.h> #include <linux/kthread.h> +#include <linux/ftrace.h> #include <linux/module.h> +#include <linux/timer.h> +#include <linux/slab.h> #include <linux/string.h> #include <linux/kprobes.h> #include <linux/wait.h> #include <asm/irq.h> -#include <asm/delay.h> + +static struct kunit *current_test; #define BT_BUF_SIZE (PAGE_SIZE * 4) +static bool force_bt; +module_param_named(backtrace, force_bt, bool, 0444); +MODULE_PARM_DESC(backtrace, "print backtraces for all tests"); + /* * To avoid printk line limit split backtrace by lines */ @@ -28,7 
+36,7 @@ static void print_backtrace(char *bt) p = strsep(&bt, "\n"); if (!p) break; - pr_err("%s\n", p); + kunit_err(current_test, "%s\n", p); } } @@ -39,7 +47,7 @@ static void print_backtrace(char *bt) static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, unsigned long sp) { - int frame_count, prev_is_func2, seen_func2_func1; + int frame_count, prev_is_func2, seen_func2_func1, seen_arch_rethook_trampoline; const int max_frames = 128; struct unwind_state state; size_t bt_pos = 0; @@ -48,13 +56,14 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, bt = kmalloc(BT_BUF_SIZE, GFP_ATOMIC); if (!bt) { - pr_err("failed to allocate backtrace buffer\n"); + kunit_err(current_test, "failed to allocate backtrace buffer\n"); return -ENOMEM; } /* Unwind. */ frame_count = 0; prev_is_func2 = 0; seen_func2_func1 = 0; + seen_arch_rethook_trampoline = 0; unwind_for_each_frame(&state, task, regs, sp) { unsigned long addr = unwind_get_return_address(&state); char sym[KSYM_SYMBOL_LEN]; @@ -62,8 +71,9 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, if (frame_count++ == max_frames) break; if (state.reliable && !addr) { - pr_err("unwind state reliable but addr is 0\n"); - return -EINVAL; + kunit_err(current_test, "unwind state reliable but addr is 0\n"); + ret = -EINVAL; + break; } sprint_symbol(sym, addr); if (bt_pos < BT_BUF_SIZE) { @@ -73,28 +83,34 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, stack_type_name(state.stack_info.type), (void *)state.sp, (void *)state.ip); if (bt_pos >= BT_BUF_SIZE) - pr_err("backtrace buffer is too small\n"); + kunit_err(current_test, "backtrace buffer is too small\n"); } frame_count += 1; if (prev_is_func2 && str_has_prefix(sym, "unwindme_func1")) seen_func2_func1 = 1; prev_is_func2 = str_has_prefix(sym, "unwindme_func2"); + if (str_has_prefix(sym, "arch_rethook_trampoline+0x0/")) + seen_arch_rethook_trampoline = 1; } /* Check the results. */ if (unwind_error(&state)) { - pr_err("unwind error\n"); + kunit_err(current_test, "unwind error\n"); ret = -EINVAL; } if (!seen_func2_func1) { - pr_err("unwindme_func2 and unwindme_func1 not found\n"); + kunit_err(current_test, "unwindme_func2 and unwindme_func1 not found\n"); ret = -EINVAL; } if (frame_count == max_frames) { - pr_err("Maximum number of frames exceeded\n"); + kunit_err(current_test, "Maximum number of frames exceeded\n"); ret = -EINVAL; } - if (ret) + if (seen_arch_rethook_trampoline) { + kunit_err(current_test, "arch_rethook_trampoline+0x0 in unwinding results\n"); + ret = -EINVAL; + } + if (ret || force_bt) print_backtrace(bt); kfree(bt); return ret; @@ -118,31 +134,187 @@ static struct unwindme *unwindme; #define UWM_REGS 0x2 /* Pass regs to test_unwind(). */ #define UWM_SP 0x4 /* Pass sp to test_unwind(). */ #define UWM_CALLER 0x8 /* Unwind starting from caller. */ -#define UWM_SWITCH_STACK 0x10 /* Use CALL_ON_STACK. */ +#define UWM_SWITCH_STACK 0x10 /* Use call_on_stack. */ #define UWM_IRQ 0x20 /* Unwind from irq context. */ -#define UWM_PGM 0x40 /* Unwind from program check handler. */ +#define UWM_PGM 0x40 /* Unwind from program check handler */ +#define UWM_KPROBE_ON_FTRACE 0x80 /* Unwind from kprobe handler called via ftrace. */ +#define UWM_FTRACE 0x100 /* Unwind from ftrace handler. */ +#define UWM_KRETPROBE 0x200 /* Unwind through kretprobed function. */ +#define UWM_KRETPROBE_HANDLER 0x400 /* Unwind from kretprobe handler. 
*/ -static __always_inline unsigned long get_psw_addr(void) +static __always_inline struct pt_regs fake_pt_regs(void) { - unsigned long psw_addr; + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + regs.gprs[15] = current_stack_pointer; asm volatile( "basr %[psw_addr],0\n" - : [psw_addr] "=d" (psw_addr)); - return psw_addr; + : [psw_addr] "=d" (regs.psw.addr)); + return regs; } -#ifdef CONFIG_KPROBES -static int pgm_pre_handler(struct kprobe *p, struct pt_regs *regs) +static int kretprobe_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { struct unwindme *u = unwindme; + if (!(u->flags & UWM_KRETPROBE_HANDLER)) + return 0; + u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? regs : NULL, (u->flags & UWM_SP) ? u->sp : 0); + return 0; } + +static noinline notrace int test_unwind_kretprobed_func(struct unwindme *u) +{ + struct pt_regs regs; + + if (!(u->flags & UWM_KRETPROBE)) + return 0; + + regs = fake_pt_regs(); + return test_unwind(NULL, (u->flags & UWM_REGS) ? ®s : NULL, + (u->flags & UWM_SP) ? u->sp : 0); +} + +static noinline int test_unwind_kretprobed_func_caller(struct unwindme *u) +{ + return test_unwind_kretprobed_func(u); +} + +static int test_unwind_kretprobe(struct unwindme *u) +{ + int ret; + struct kretprobe my_kretprobe; + + if (!IS_ENABLED(CONFIG_KPROBES)) + kunit_skip(current_test, "requires CONFIG_KPROBES"); + + u->ret = -1; /* make sure kprobe is called */ + unwindme = u; + + memset(&my_kretprobe, 0, sizeof(my_kretprobe)); + my_kretprobe.handler = kretprobe_ret_handler; + my_kretprobe.maxactive = 1; + my_kretprobe.kp.addr = (kprobe_opcode_t *)test_unwind_kretprobed_func; + + ret = register_kretprobe(&my_kretprobe); + + if (ret < 0) { + kunit_err(current_test, "register_kretprobe failed %d\n", ret); + return -EINVAL; + } + + ret = test_unwind_kretprobed_func_caller(u); + unregister_kretprobe(&my_kretprobe); + unwindme = NULL; + if (u->flags & UWM_KRETPROBE_HANDLER) + ret = u->ret; + return ret; +} + +static int kprobe_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct unwindme *u = unwindme; + + u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? regs : NULL, + (u->flags & UWM_SP) ? u->sp : 0); + return 0; +} + +extern const char test_unwind_kprobed_insn[]; + +static noinline void test_unwind_kprobed_func(void) +{ + asm volatile( + " nopr %%r7\n" + "test_unwind_kprobed_insn:\n" + " nopr %%r7\n" + :); +} + +static int test_unwind_kprobe(struct unwindme *u) +{ + struct kprobe kp; + int ret; + + if (!IS_ENABLED(CONFIG_KPROBES)) + kunit_skip(current_test, "requires CONFIG_KPROBES"); + if (!IS_ENABLED(CONFIG_KPROBES_ON_FTRACE) && u->flags & UWM_KPROBE_ON_FTRACE) + kunit_skip(current_test, "requires CONFIG_KPROBES_ON_FTRACE"); + + u->ret = -1; /* make sure kprobe is called */ + unwindme = u; + memset(&kp, 0, sizeof(kp)); + kp.pre_handler = kprobe_pre_handler; + kp.addr = u->flags & UWM_KPROBE_ON_FTRACE ? + (kprobe_opcode_t *)test_unwind_kprobed_func : + (kprobe_opcode_t *)test_unwind_kprobed_insn; + ret = register_kprobe(&kp); + if (ret < 0) { + kunit_err(current_test, "register_kprobe failed %d\n", ret); + return -EINVAL; + } + + test_unwind_kprobed_func(); + unregister_kprobe(&kp); + unwindme = NULL; + return u->ret; +} + +static void notrace __used test_unwind_ftrace_handler(unsigned long ip, + unsigned long parent_ip, + struct ftrace_ops *fops, + struct ftrace_regs *fregs) +{ + struct unwindme *u = (struct unwindme *)fregs->regs.gprs[2]; + + u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? &fregs->regs : NULL, + (u->flags & UWM_SP) ? 
u->sp : 0); +} + +static noinline int test_unwind_ftraced_func(struct unwindme *u) +{ + return READ_ONCE(u)->ret; +} + +static int test_unwind_ftrace(struct unwindme *u) +{ + int ret; +#ifdef CONFIG_DYNAMIC_FTRACE + struct ftrace_ops *fops; + + fops = kunit_kzalloc(current_test, sizeof(*fops), GFP_KERNEL); + fops->func = test_unwind_ftrace_handler; + fops->flags = FTRACE_OPS_FL_DYNAMIC | + FTRACE_OPS_FL_RECURSION | + FTRACE_OPS_FL_SAVE_REGS | + FTRACE_OPS_FL_PERMANENT; +#else + kunit_skip(current_test, "requires CONFIG_DYNAMIC_FTRACE"); #endif + ret = ftrace_set_filter_ip(fops, (unsigned long)test_unwind_ftraced_func, 0, 0); + if (ret) { + kunit_err(current_test, "failed to set ftrace filter (%d)\n", ret); + return -1; + } + + ret = register_ftrace_function(fops); + if (!ret) { + ret = test_unwind_ftraced_func(u); + unregister_ftrace_function(fops); + } else { + kunit_err(current_test, "failed to register ftrace handler (%d)\n", ret); + } + + ftrace_set_filter_ip(fops, (unsigned long)test_unwind_ftraced_func, 1, 0); + return ret; +} + /* This function may or may not appear in the backtrace. */ static noinline int unwindme_func4(struct unwindme *u) { @@ -153,40 +325,15 @@ static noinline int unwindme_func4(struct unwindme *u) wait_event(u->task_wq, kthread_should_park()); kthread_parkme(); return 0; -#ifdef CONFIG_KPROBES - } else if (u->flags & UWM_PGM) { - struct kprobe kp; - int ret; - - unwindme = u; - memset(&kp, 0, sizeof(kp)); - kp.symbol_name = "do_report_trap"; - kp.pre_handler = pgm_pre_handler; - ret = register_kprobe(&kp); - if (ret < 0) { - pr_err("register_kprobe failed %d\n", ret); - return -EINVAL; - } - - /* - * trigger specification exception - */ - asm volatile( - " mvcl %%r1,%%r1\n" - "0: nopr %%r7\n" - EX_TABLE(0b, 0b) - :); - - unregister_kprobe(&kp); - unwindme = NULL; - return u->ret; -#endif + } else if (u->flags & (UWM_PGM | UWM_KPROBE_ON_FTRACE)) { + return test_unwind_kprobe(u); + } else if (u->flags & (UWM_KRETPROBE | UWM_KRETPROBE_HANDLER)) { + return test_unwind_kretprobe(u); + } else if (u->flags & UWM_FTRACE) { + return test_unwind_ftrace(u); } else { - struct pt_regs regs; + struct pt_regs regs = fake_pt_regs(); - memset(®s, 0, sizeof(regs)); - regs.psw.addr = get_psw_addr(); - regs.gprs[15] = current_stack_pointer(); return test_unwind(NULL, (u->flags & UWM_REGS) ? ®s : NULL, (u->flags & UWM_SP) ? u->sp : 0); @@ -203,12 +350,16 @@ static noinline int unwindme_func3(struct unwindme *u) /* This function must appear in the backtrace. 
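 * (test_unwind() verifies this via its seen_func2_func1 check:
 * unwindme_func2 must appear directly followed by its caller
 * unwindme_func1 in the unwound frames)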
*/ static noinline int unwindme_func2(struct unwindme *u) { + unsigned long flags, mflags; int rc; if (u->flags & UWM_SWITCH_STACK) { - preempt_disable(); - rc = CALL_ON_STACK(unwindme_func3, S390_lowcore.nodat_stack, 1, u); - preempt_enable(); + local_irq_save(flags); + local_mcck_save(mflags); + rc = call_on_stack(1, S390_lowcore.nodat_stack, + int, unwindme_func3, struct unwindme *, u); + local_mcck_restore(mflags); + local_irq_restore(flags); return rc; } else { return unwindme_func3(u); @@ -221,31 +372,27 @@ static noinline int unwindme_func1(void *u) return unwindme_func2((struct unwindme *)u); } -static void unwindme_irq_handler(struct ext_code ext_code, - unsigned int param32, - unsigned long param64) +static void unwindme_timer_fn(struct timer_list *unused) { struct unwindme *u = READ_ONCE(unwindme); - if (u && u->task == current) { + if (u) { unwindme = NULL; u->task = NULL; u->ret = unwindme_func1(u); + complete(&u->task_ready); } } +static struct timer_list unwind_timer; + static int test_unwind_irq(struct unwindme *u) { - preempt_disable(); - if (register_external_irq(EXT_IRQ_CLK_COMP, unwindme_irq_handler)) { - pr_info("Couldn't reqister external interrupt handler"); - return -1; - } - u->task = current; unwindme = u; - udelay(1); - unregister_external_irq(EXT_IRQ_CLK_COMP, unwindme_irq_handler); - preempt_enable(); + init_completion(&u->task_ready); + timer_setup(&unwind_timer, unwindme_timer_fn, 0); + mod_timer(&unwind_timer, jiffies + 1); + wait_for_completion(&u->task_ready); return u->ret; } @@ -265,7 +412,7 @@ static int test_unwind_task(struct unwindme *u) */ task = kthread_run(unwindme_func1, u, "%s", __func__); if (IS_ERR(task)) { - pr_err("kthread_run() failed\n"); + kunit_err(current_test, "kthread_run() failed\n"); return PTR_ERR(task); } /* @@ -280,68 +427,96 @@ static int test_unwind_task(struct unwindme *u) return ret; } -static int test_unwind_flags(int flags) +struct test_params { + int flags; + char *name; +}; + +/* + * Create required parameter list for tests + */ +#define TEST_WITH_FLAGS(f) { .flags = f, .name = #f } +static const struct test_params param_list[] = { + TEST_WITH_FLAGS(UWM_DEFAULT), + TEST_WITH_FLAGS(UWM_SP), + TEST_WITH_FLAGS(UWM_REGS), + TEST_WITH_FLAGS(UWM_SWITCH_STACK), + TEST_WITH_FLAGS(UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_CALLER | UWM_SP), + TEST_WITH_FLAGS(UWM_CALLER | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK), + TEST_WITH_FLAGS(UWM_THREAD), + TEST_WITH_FLAGS(UWM_THREAD | UWM_SP), + TEST_WITH_FLAGS(UWM_THREAD | UWM_CALLER | UWM_SP), + TEST_WITH_FLAGS(UWM_IRQ), + TEST_WITH_FLAGS(UWM_IRQ | UWM_SWITCH_STACK), + TEST_WITH_FLAGS(UWM_IRQ | UWM_SP), + TEST_WITH_FLAGS(UWM_IRQ | UWM_REGS), + TEST_WITH_FLAGS(UWM_IRQ | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP), + TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK), + TEST_WITH_FLAGS(UWM_PGM), + TEST_WITH_FLAGS(UWM_PGM | UWM_SP), + TEST_WITH_FLAGS(UWM_PGM | UWM_REGS), + TEST_WITH_FLAGS(UWM_PGM | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE), + TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_SP), + TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_REGS), + TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_FTRACE), + TEST_WITH_FLAGS(UWM_FTRACE | UWM_SP), + TEST_WITH_FLAGS(UWM_FTRACE | UWM_REGS), + TEST_WITH_FLAGS(UWM_FTRACE | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_KRETPROBE), + 
TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_SP), + TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_REGS), + TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER), + TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_SP), + TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_REGS), + TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_SP | UWM_REGS), +}; + +/* + * Parameter description generator: required for KUNIT_ARRAY_PARAM() + */ +static void get_desc(const struct test_params *params, char *desc) +{ + strscpy(desc, params->name, KUNIT_PARAM_DESC_SIZE); +} + +/* + * Create test_unwind_gen_params + */ +KUNIT_ARRAY_PARAM(test_unwind, param_list, get_desc); + +static void test_unwind_flags(struct kunit *test) { struct unwindme u; + const struct test_params *params; - u.flags = flags; + current_test = test; + params = (const struct test_params *)test->param_value; + u.flags = params->flags; if (u.flags & UWM_THREAD) - return test_unwind_task(&u); + KUNIT_EXPECT_EQ(test, 0, test_unwind_task(&u)); else if (u.flags & UWM_IRQ) - return test_unwind_irq(&u); + KUNIT_EXPECT_EQ(test, 0, test_unwind_irq(&u)); else - return unwindme_func1(&u); + KUNIT_EXPECT_EQ(test, 0, unwindme_func1(&u)); } -static int test_unwind_init(void) -{ - int ret = 0; - -#define TEST(flags) \ -do { \ - pr_info("[ RUN ] " #flags "\n"); \ - if (!test_unwind_flags((flags))) { \ - pr_info("[ OK ] " #flags "\n"); \ - } else { \ - pr_err("[ FAILED ] " #flags "\n"); \ - ret = -EINVAL; \ - } \ -} while (0) - - TEST(UWM_DEFAULT); - TEST(UWM_SP); - TEST(UWM_REGS); - TEST(UWM_SWITCH_STACK); - TEST(UWM_SP | UWM_REGS); - TEST(UWM_CALLER | UWM_SP); - TEST(UWM_CALLER | UWM_SP | UWM_REGS); - TEST(UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK); - TEST(UWM_THREAD); - TEST(UWM_THREAD | UWM_SP); - TEST(UWM_THREAD | UWM_CALLER | UWM_SP); - TEST(UWM_IRQ); - TEST(UWM_IRQ | UWM_SWITCH_STACK); - TEST(UWM_IRQ | UWM_SP); - TEST(UWM_IRQ | UWM_REGS); - TEST(UWM_IRQ | UWM_SP | UWM_REGS); - TEST(UWM_IRQ | UWM_CALLER | UWM_SP); - TEST(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS); - TEST(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK); -#ifdef CONFIG_KPROBES - TEST(UWM_PGM); - TEST(UWM_PGM | UWM_SP); - TEST(UWM_PGM | UWM_REGS); - TEST(UWM_PGM | UWM_SP | UWM_REGS); -#endif -#undef TEST +static struct kunit_case unwind_test_cases[] = { + KUNIT_CASE_PARAM(test_unwind_flags, test_unwind_gen_params), + {} +}; - return ret; -} +static struct kunit_suite test_unwind_suite = { + .name = "test_unwind", + .test_cases = unwind_test_cases, +}; -static void test_unwind_exit(void) -{ -} +kunit_test_suites(&test_unwind_suite); -module_init(test_unwind_init); -module_exit(test_unwind_exit); MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/tishift.S b/arch/s390/lib/tishift.S new file mode 100644 index 000000000000..96214f51f49b --- /dev/null +++ b/arch/s390/lib/tishift.S @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/export.h> +#include <linux/linkage.h> +#include <asm/nospec-insn.h> + + .section .noinstr.text, "ax" + + GEN_BR_THUNK %r14 + +SYM_FUNC_START(__ashlti3) + lmg %r0,%r1,0(%r3) + cije %r4,0,1f + lhi %r3,64 + sr %r3,%r4 + jnh 0f + srlg %r3,%r1,0(%r3) + sllg %r0,%r0,0(%r4) + sllg %r1,%r1,0(%r4) + ogr %r0,%r3 + j 1f +0: sllg %r0,%r1,-64(%r4) + lghi %r1,0 +1: stmg %r0,%r1,0(%r2) + BR_EX %r14 +SYM_FUNC_END(__ashlti3) +EXPORT_SYMBOL(__ashlti3) + +SYM_FUNC_START(__ashrti3) + lmg %r0,%r1,0(%r3) + cije %r4,0,1f + lhi %r3,64 + sr %r3,%r4 + jnh 0f + sllg %r3,%r0,0(%r3) + srlg %r1,%r1,0(%r4) + srag %r0,%r0,0(%r4) + ogr %r1,%r3 + j 1f +0: 
srag %r1,%r0,-64(%r4) + srag %r0,%r0,63 +1: stmg %r0,%r1,0(%r2) + BR_EX %r14 +SYM_FUNC_END(__ashrti3) +EXPORT_SYMBOL(__ashrti3) + +SYM_FUNC_START(__lshrti3) + lmg %r0,%r1,0(%r3) + cije %r4,0,1f + lhi %r3,64 + sr %r3,%r4 + jnh 0f + sllg %r3,%r0,0(%r3) + srlg %r1,%r1,0(%r4) + srlg %r0,%r0,0(%r4) + ogr %r1,%r3 + j 1f +0: srlg %r1,%r0,-64(%r4) + lghi %r0,0 +1: stmg %r0,%r1,0(%r2) + BR_EX %r14 +SYM_FUNC_END(__lshrti3) +EXPORT_SYMBOL(__lshrti3) diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index c4f8039a35e8..61d8dcd95bbc 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -8,437 +8,179 @@ * Gerald Schaefer (gerald.schaefer@de.ibm.com) */ -#include <linux/jump_label.h> #include <linux/uaccess.h> #include <linux/export.h> -#include <linux/errno.h> #include <linux/mm.h> -#include <asm/mmu_context.h> -#include <asm/facility.h> +#include <asm/asm-extable.h> +#include <asm/ctlreg.h> -#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES -static DEFINE_STATIC_KEY_FALSE(have_mvcos); - -static int __init uaccess_init(void) +#ifdef CONFIG_DEBUG_ENTRY +void debug_user_asce(int exit) { - if (test_facility(27)) - static_branch_enable(&have_mvcos); - return 0; -} -early_initcall(uaccess_init); + struct ctlreg cr1, cr7; -static inline int copy_with_mvcos(void) -{ - if (static_branch_likely(&have_mvcos)) - return 1; - return 0; -} -#else -static inline int copy_with_mvcos(void) -{ - return 1; + local_ctl_store(1, &cr1); + local_ctl_store(7, &cr7); + if (cr1.val == S390_lowcore.kernel_asce.val && cr7.val == S390_lowcore.user_asce.val) + return; + panic("incorrect ASCE on kernel %s\n" + "cr1: %016lx cr7: %016lx\n" + "kernel: %016lx user: %016lx\n", + exit ? "exit" : "entry", cr1.val, cr7.val, + S390_lowcore.kernel_asce.val, S390_lowcore.user_asce.val); } -#endif +#endif /*CONFIG_DEBUG_ENTRY */ -void set_fs(mm_segment_t fs) +static unsigned long raw_copy_from_user_key(void *to, const void __user *from, + unsigned long size, unsigned long key) { - current->thread.mm_segment = fs; - if (fs == USER_DS) { - __ctl_load(S390_lowcore.user_asce, 1, 1); - clear_cpu_flag(CIF_ASCE_PRIMARY); - } else { - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - set_cpu_flag(CIF_ASCE_PRIMARY); - } - if (fs & 1) { - if (fs == USER_DS_SACF) - __ctl_load(S390_lowcore.user_asce, 7, 7); - else - __ctl_load(S390_lowcore.kernel_asce, 7, 7); - set_cpu_flag(CIF_ASCE_SECONDARY); - } -} -EXPORT_SYMBOL(set_fs); + unsigned long rem; + union oac spec = { + .oac2.key = key, + .oac2.as = PSW_BITS_AS_SECONDARY, + .oac2.k = 1, + .oac2.a = 1, + }; -mm_segment_t enable_sacf_uaccess(void) -{ - mm_segment_t old_fs; - unsigned long asce, cr; - - old_fs = current->thread.mm_segment; - if (old_fs & 1) - return old_fs; - current->thread.mm_segment |= 1; - asce = S390_lowcore.kernel_asce; - if (likely(old_fs == USER_DS)) { - __ctl_store(cr, 1, 1); - if (cr != S390_lowcore.kernel_asce) { - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - set_cpu_flag(CIF_ASCE_PRIMARY); - } - asce = S390_lowcore.user_asce; - } - __ctl_store(cr, 7, 7); - if (cr != asce) { - __ctl_load(asce, 7, 7); - set_cpu_flag(CIF_ASCE_SECONDARY); - } - return old_fs; -} -EXPORT_SYMBOL(enable_sacf_uaccess); - -void disable_sacf_uaccess(mm_segment_t old_fs) -{ - current->thread.mm_segment = old_fs; - if (old_fs == USER_DS && test_facility(27)) { - __ctl_load(S390_lowcore.user_asce, 1, 1); - clear_cpu_flag(CIF_ASCE_PRIMARY); - } -} -EXPORT_SYMBOL(disable_sacf_uaccess); - -static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr, - unsigned long size) -{ - 
register unsigned long reg0 asm("0") = 0x01UL; - unsigned long tmp1, tmp2; - - tmp1 = -4096UL; - asm volatile( - "0: .insn ss,0xc80000000000,0(%0,%2),0(%1),0\n" - "6: jz 4f\n" - "1: algr %0,%3\n" - " slgr %1,%3\n" - " slgr %2,%3\n" - " j 0b\n" - "2: la %4,4095(%1)\n"/* %4 = ptr + 4095 */ - " nr %4,%3\n" /* %4 = (ptr + 4095) & -4096 */ - " slgr %4,%1\n" - " clgr %0,%4\n" /* copy crosses next page boundary? */ - " jnh 5f\n" - "3: .insn ss,0xc80000000000,0(%4,%2),0(%1),0\n" - "7: slgr %0,%4\n" - " j 5f\n" - "4: slgr %0,%0\n" - "5:\n" - EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b) - : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : "d" (reg0) : "cc", "memory"); - return size; -} - -static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, - unsigned long size) -{ - unsigned long tmp1, tmp2; - mm_segment_t old_fs; - - old_fs = enable_sacf_uaccess(); - tmp1 = -256UL; asm volatile( - " sacf 0\n" - "0: mvcp 0(%0,%2),0(%1),%3\n" - "7: jz 5f\n" - "1: algr %0,%3\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - "2: mvcp 0(%0,%2),0(%1),%3\n" - "8: jnz 1b\n" - " j 5f\n" - "3: la %4,255(%1)\n" /* %4 = ptr + 255 */ - " lghi %3,-4096\n" - " nr %4,%3\n" /* %4 = (ptr + 255) & -4096 */ - " slgr %4,%1\n" - " clgr %0,%4\n" /* copy crosses next page boundary? */ - " jnh 6f\n" - "4: mvcp 0(%4,%2),0(%1),%3\n" - "9: slgr %0,%4\n" - " j 6f\n" - "5: slgr %0,%0\n" - "6: sacf 768\n" - EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,6b) - EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b) - : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : : "cc", "memory"); - disable_sacf_uaccess(old_fs); + " lr 0,%[spec]\n" + "0: mvcos 0(%[to]),0(%[from]),%[size]\n" + "1: jz 5f\n" + " algr %[size],%[val]\n" + " slgr %[from],%[val]\n" + " slgr %[to],%[val]\n" + " j 0b\n" + "2: la %[rem],4095(%[from])\n" /* rem = from + 4095 */ + " nr %[rem],%[val]\n" /* rem = (from + 4095) & -4096 */ + " slgr %[rem],%[from]\n" + " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */ + " jnh 6f\n" + "3: mvcos 0(%[to]),0(%[from]),%[rem]\n" + "4: slgr %[size],%[rem]\n" + " j 6f\n" + "5: slgr %[size],%[size]\n" + "6:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + EX_TABLE(3b, 6b) + EX_TABLE(4b, 6b) + : [size] "+&a" (size), [from] "+&a" (from), [to] "+&a" (to), [rem] "=&a" (rem) + : [val] "a" (-4096UL), [spec] "d" (spec.val) + : "cc", "memory", "0"); return size; } unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { - if (copy_with_mvcos()) - return copy_from_user_mvcos(to, from, n); - return copy_from_user_mvcp(to, from, n); + return raw_copy_from_user_key(to, from, n, 0); } EXPORT_SYMBOL(raw_copy_from_user); -static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x, - unsigned long size) +unsigned long _copy_from_user_key(void *to, const void __user *from, + unsigned long n, unsigned long key) { - register unsigned long reg0 asm("0") = 0x010000UL; - unsigned long tmp1, tmp2; + unsigned long res = n; - tmp1 = -4096UL; - asm volatile( - "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" - "6: jz 4f\n" - "1: algr %0,%3\n" - " slgr %1,%3\n" - " slgr %2,%3\n" - " j 0b\n" - "2: la %4,4095(%1)\n"/* %4 = ptr + 4095 */ - " nr %4,%3\n" /* %4 = (ptr + 4095) & -4096 */ - " slgr %4,%1\n" - " clgr %0,%4\n" /* copy crosses next page boundary? 
*/ - " jnh 5f\n" - "3: .insn ss,0xc80000000000,0(%4,%1),0(%2),0\n" - "7: slgr %0,%4\n" - " j 5f\n" - "4: slgr %0,%0\n" - "5:\n" - EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b) - : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : "d" (reg0) : "cc", "memory"); - return size; + might_fault(); + if (!should_fail_usercopy()) { + instrument_copy_from_user_before(to, from, n); + res = raw_copy_from_user_key(to, from, n, key); + instrument_copy_from_user_after(to, from, n, res); + } + if (unlikely(res)) + memset(to + (n - res), 0, res); + return res; } +EXPORT_SYMBOL(_copy_from_user_key); -static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x, - unsigned long size) +static unsigned long raw_copy_to_user_key(void __user *to, const void *from, + unsigned long size, unsigned long key) { - unsigned long tmp1, tmp2; - mm_segment_t old_fs; + unsigned long rem; + union oac spec = { + .oac1.key = key, + .oac1.as = PSW_BITS_AS_SECONDARY, + .oac1.k = 1, + .oac1.a = 1, + }; - old_fs = enable_sacf_uaccess(); - tmp1 = -256UL; asm volatile( - " sacf 0\n" - "0: mvcs 0(%0,%1),0(%2),%3\n" - "7: jz 5f\n" - "1: algr %0,%3\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - "2: mvcs 0(%0,%1),0(%2),%3\n" - "8: jnz 1b\n" - " j 5f\n" - "3: la %4,255(%1)\n" /* %4 = ptr + 255 */ - " lghi %3,-4096\n" - " nr %4,%3\n" /* %4 = (ptr + 255) & -4096 */ - " slgr %4,%1\n" - " clgr %0,%4\n" /* copy crosses next page boundary? */ - " jnh 6f\n" - "4: mvcs 0(%4,%1),0(%2),%3\n" - "9: slgr %0,%4\n" - " j 6f\n" - "5: slgr %0,%0\n" - "6: sacf 768\n" - EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,6b) - EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b) - : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : : "cc", "memory"); - disable_sacf_uaccess(old_fs); + " lr 0,%[spec]\n" + "0: mvcos 0(%[to]),0(%[from]),%[size]\n" + "1: jz 5f\n" + " algr %[size],%[val]\n" + " slgr %[to],%[val]\n" + " slgr %[from],%[val]\n" + " j 0b\n" + "2: la %[rem],4095(%[to])\n" /* rem = to + 4095 */ + " nr %[rem],%[val]\n" /* rem = (to + 4095) & -4096 */ + " slgr %[rem],%[to]\n" + " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */ + " jnh 6f\n" + "3: mvcos 0(%[to]),0(%[from]),%[rem]\n" + "4: slgr %[size],%[rem]\n" + " j 6f\n" + "5: slgr %[size],%[size]\n" + "6:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + EX_TABLE(3b, 6b) + EX_TABLE(4b, 6b) + : [size] "+&a" (size), [to] "+&a" (to), [from] "+&a" (from), [rem] "=&a" (rem) + : [val] "a" (-4096UL), [spec] "d" (spec.val) + : "cc", "memory", "0"); return size; } unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) { - if (copy_with_mvcos()) - return copy_to_user_mvcos(to, from, n); - return copy_to_user_mvcs(to, from, n); + return raw_copy_to_user_key(to, from, n, 0); } EXPORT_SYMBOL(raw_copy_to_user); -static inline unsigned long copy_in_user_mvcos(void __user *to, const void __user *from, - unsigned long size) -{ - register unsigned long reg0 asm("0") = 0x010001UL; - unsigned long tmp1, tmp2; - - tmp1 = -4096UL; - /* FIXME: copy with reduced length. 
*/ - asm volatile( - "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" - " jz 2f\n" - "1: algr %0,%3\n" - " slgr %1,%3\n" - " slgr %2,%3\n" - " j 0b\n" - "2:slgr %0,%0\n" - "3: \n" - EX_TABLE(0b,3b) - : "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2) - : "d" (reg0) : "cc", "memory"); - return size; -} - -static inline unsigned long copy_in_user_mvc(void __user *to, const void __user *from, - unsigned long size) +unsigned long _copy_to_user_key(void __user *to, const void *from, + unsigned long n, unsigned long key) { - mm_segment_t old_fs; - unsigned long tmp1; - - old_fs = enable_sacf_uaccess(); - asm volatile( - " sacf 256\n" - " aghi %0,-1\n" - " jo 5f\n" - " bras %3,3f\n" - "0: aghi %0,257\n" - "1: mvc 0(1,%1),0(%2)\n" - " la %1,1(%1)\n" - " la %2,1(%2)\n" - " aghi %0,-1\n" - " jnz 1b\n" - " j 5f\n" - "2: mvc 0(256,%1),0(%2)\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - "3: aghi %0,-256\n" - " jnm 2b\n" - "4: ex %0,1b-0b(%3)\n" - "5: slgr %0,%0\n" - "6: sacf 768\n" - EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) - : "+a" (size), "+a" (to), "+a" (from), "=a" (tmp1) - : : "cc", "memory"); - disable_sacf_uaccess(old_fs); - return size; -} - -unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - if (copy_with_mvcos()) - return copy_in_user_mvcos(to, from, n); - return copy_in_user_mvc(to, from, n); -} -EXPORT_SYMBOL(raw_copy_in_user); - -static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size) -{ - register unsigned long reg0 asm("0") = 0x010000UL; - unsigned long tmp1, tmp2; - - tmp1 = -4096UL; - asm volatile( - "0: .insn ss,0xc80000000000,0(%0,%1),0(%4),0\n" - " jz 4f\n" - "1: algr %0,%2\n" - " slgr %1,%2\n" - " j 0b\n" - "2: la %3,4095(%1)\n"/* %4 = to + 4095 */ - " nr %3,%2\n" /* %4 = (to + 4095) & -4096 */ - " slgr %3,%1\n" - " clgr %0,%3\n" /* copy crosses next page boundary? */ - " jnh 5f\n" - "3: .insn ss,0xc80000000000,0(%3,%1),0(%4),0\n" - " slgr %0,%3\n" - " j 5f\n" - "4: slgr %0,%0\n" - "5:\n" - EX_TABLE(0b,2b) EX_TABLE(3b,5b) - : "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2) - : "a" (empty_zero_page), "d" (reg0) : "cc", "memory"); - return size; -} - -static inline unsigned long clear_user_xc(void __user *to, unsigned long size) -{ - mm_segment_t old_fs; - unsigned long tmp1, tmp2; - - old_fs = enable_sacf_uaccess(); - asm volatile( - " sacf 256\n" - " aghi %0,-1\n" - " jo 5f\n" - " bras %3,3f\n" - " xc 0(1,%1),0(%1)\n" - "0: aghi %0,257\n" - " la %2,255(%1)\n" /* %2 = ptr + 255 */ - " srl %2,12\n" - " sll %2,12\n" /* %2 = (ptr + 255) & -4096 */ - " slgr %2,%1\n" - " clgr %0,%2\n" /* clear crosses next page boundary? 
*/ - " jnh 5f\n" - " aghi %2,-1\n" - "1: ex %2,0(%3)\n" - " aghi %2,1\n" - " slgr %0,%2\n" - " j 5f\n" - "2: xc 0(256,%1),0(%1)\n" - " la %1,256(%1)\n" - "3: aghi %0,-256\n" - " jnm 2b\n" - "4: ex %0,0(%3)\n" - "5: slgr %0,%0\n" - "6: sacf 768\n" - EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) - : "+a" (size), "+a" (to), "=a" (tmp1), "=a" (tmp2) - : : "cc", "memory"); - disable_sacf_uaccess(old_fs); - return size; + might_fault(); + if (should_fail_usercopy()) + return n; + instrument_copy_to_user(to, from, n); + return raw_copy_to_user_key(to, from, n, key); } +EXPORT_SYMBOL(_copy_to_user_key); unsigned long __clear_user(void __user *to, unsigned long size) { - if (copy_with_mvcos()) - return clear_user_mvcos(to, size); - return clear_user_xc(to, size); -} -EXPORT_SYMBOL(__clear_user); - -static inline unsigned long strnlen_user_srst(const char __user *src, - unsigned long size) -{ - register unsigned long reg0 asm("0") = 0; - unsigned long tmp1, tmp2; + unsigned long rem; + union oac spec = { + .oac1.as = PSW_BITS_AS_SECONDARY, + .oac1.a = 1, + }; asm volatile( - " la %2,0(%1)\n" - " la %3,0(%0,%1)\n" - " slgr %0,%0\n" - " sacf 256\n" - "0: srst %3,%2\n" - " jo 0b\n" - " la %0,1(%3)\n" /* strnlen_user results includes \0 */ - " slgr %0,%1\n" - "1: sacf 768\n" - EX_TABLE(0b,1b) - : "+a" (size), "+a" (src), "=a" (tmp1), "=a" (tmp2) - : "d" (reg0) : "cc", "memory"); + " lr 0,%[spec]\n" + "0: mvcos 0(%[to]),0(%[zeropg]),%[size]\n" + "1: jz 5f\n" + " algr %[size],%[val]\n" + " slgr %[to],%[val]\n" + " j 0b\n" + "2: la %[rem],4095(%[to])\n" /* rem = to + 4095 */ + " nr %[rem],%[val]\n" /* rem = (to + 4095) & -4096 */ + " slgr %[rem],%[to]\n" + " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */ + " jnh 6f\n" + "3: mvcos 0(%[to]),0(%[zeropg]),%[rem]\n" + "4: slgr %[size],%[rem]\n" + " j 6f\n" + "5: slgr %[size],%[size]\n" + "6:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + EX_TABLE(3b, 6b) + EX_TABLE(4b, 6b) + : [size] "+&a" (size), [to] "+&a" (to), [rem] "=&a" (rem) + : [val] "a" (-4096UL), [zeropg] "a" (empty_zero_page), [spec] "d" (spec.val) + : "cc", "memory", "0"); return size; } - -unsigned long __strnlen_user(const char __user *src, unsigned long size) -{ - mm_segment_t old_fs; - unsigned long len; - - if (unlikely(!size)) - return 0; - old_fs = enable_sacf_uaccess(); - len = strnlen_user_srst(src, size); - disable_sacf_uaccess(old_fs); - return len; -} -EXPORT_SYMBOL(__strnlen_user); - -long __strncpy_from_user(char *dst, const char __user *src, long size) -{ - size_t done, len, offset, len_str; - - if (unlikely(size <= 0)) - return 0; - done = 0; - do { - offset = (size_t)src & (L1_CACHE_BYTES - 1); - len = min(size - done, L1_CACHE_BYTES - offset); - if (copy_from_user(dst, src, len)) - return -EFAULT; - len_str = strnlen(dst, len); - done += len_str; - src += len_str; - dst += len_str; - } while ((len_str == len) && (done < size)); - return done; -} -EXPORT_SYMBOL(__strncpy_from_user); +EXPORT_SYMBOL(__clear_user); diff --git a/arch/s390/lib/xor.c b/arch/s390/lib/xor.c index 29d9470dbceb..fb924a8041dc 100644 --- a/arch/s390/lib/xor.c +++ b/arch/s390/lib/xor.c @@ -11,7 +11,8 @@ #include <linux/raid/xor.h> #include <asm/xor.h> -static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { asm volatile( " larl 1,2f\n" @@ -32,8 +33,9 @@ static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) : "0", "1", "cc", "memory"); } 
-static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +static void xor_xc_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { asm volatile( " larl 1,2f\n" @@ -58,8 +60,10 @@ static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, : : "0", "1", "cc", "memory"); } -static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +static void xor_xc_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { asm volatile( " larl 1,2f\n" @@ -88,12 +92,12 @@ static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, : : "0", "1", "cc", "memory"); } -static void xor_xc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { - /* Get around a gcc oddity */ - register unsigned long *reg7 asm ("7") = p5; - asm volatile( " larl 1,2f\n" " aghi %0,-1\n" @@ -122,7 +126,7 @@ static void xor_xc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, " xc 0(1,%1),0(%5)\n" "3:\n" : "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3), "+a" (p4), - "+a" (reg7) + "+a" (p5) : : "0", "1", "cc", "memory"); } diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 3175413186b9..352ff520fd94 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -4,12 +4,10 @@ # obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o -obj-y += page-states.o pageattr.o pgtable.o pgalloc.o +obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o obj-$(CONFIG_PGSTE) += gmap.o - -KASAN_SANITIZE_kasan_init.o := n -obj-$(CONFIG_KASAN) += kasan_init.o +obj-$(CONFIG_PFAULT) += pfault.o diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index a51c892f14f3..f8b13f247646 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -14,15 +14,13 @@ #include <linux/moduleparam.h> #include <linux/gfp.h> #include <linux/sched.h> +#include <linux/string_helpers.h> #include <linux/sysctl.h> -#include <linux/ctype.h> #include <linux/swap.h> #include <linux/kthread.h> #include <linux/oom.h> -#include <linux/suspend.h> #include <linux/uaccess.h> -#include <asm/pgalloc.h> #include <asm/diag.h> #ifdef CONFIG_CMM_IUCV @@ -49,7 +47,6 @@ static volatile long cmm_pages_target; static volatile long cmm_timed_pages_target; static long cmm_timeout_pages; static long cmm_timeout_seconds; -static int cmm_suspended; static struct cmm_page_array *cmm_page_list; static struct cmm_page_array *cmm_timed_page_list; @@ -93,7 +90,7 @@ static long cmm_alloc_pages(long nr, long *counter, } else free_page((unsigned long) npa); } - diag10_range(addr >> PAGE_SHIFT, 1); + diag10_range(virt_to_pfn((void *)addr), 1); pa->pages[pa->index++] = addr; (*counter)++; spin_unlock(&cmm_lock); @@ -151,9 +148,9 @@ static int cmm_thread(void *dummy) while (1) { rc = wait_event_interruptible(cmm_thread_wait, - (!cmm_suspended && (cmm_pages != cmm_pages_target || - 
cmm_timed_pages != cmm_timed_pages_target)) || - kthread_should_stop()); + cmm_pages != cmm_pages_target || + cmm_timed_pages != cmm_timed_pages_target || + kthread_should_stop()); if (kthread_should_stop() || rc == -ERESTARTSYS) { cmm_pages_target = cmm_pages; cmm_timed_pages_target = cmm_timed_pages; @@ -191,7 +188,7 @@ static void cmm_set_timer(void) del_timer(&cmm_timer); return; } - mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ); + mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC)); } static void cmm_timer_fn(struct timer_list *unused) @@ -247,7 +244,7 @@ static int cmm_skip_blanks(char *cp, char **endp) } static int cmm_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_pages(); struct ctl_table ctl_entry = { @@ -266,7 +263,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_timed_pages(); @@ -286,7 +283,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timeout_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; long nr, seconds; @@ -299,8 +296,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, if (write) { len = min(*lenp, sizeof(buf)); - if (copy_from_user(buf, buffer, len)) - return -EFAULT; + memcpy(buf, buffer, len); buf[len - 1] = '\0'; cmm_skip_blanks(buf, &p); nr = simple_strtoul(p, &p, 0); @@ -313,8 +309,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; + memcpy(buffer, buf, len); *lenp = len; *ppos += len; } @@ -337,17 +332,6 @@ static struct ctl_table cmm_table[] = { .mode = 0644, .proc_handler = cmm_timeout_handler, }, - { } -}; - -static struct ctl_table cmm_dir_table[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = cmm_table, - }, - { } }; #ifdef CONFIG_CMM_IUCV @@ -390,54 +374,19 @@ static void cmm_smsg_target(const char *from, char *msg) static struct ctl_table_header *cmm_sysctl_header; -static int cmm_suspend(void) -{ - cmm_suspended = 1; - cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); - cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); - return 0; -} - -static int cmm_resume(void) -{ - cmm_suspended = 0; - cmm_kick_thread(); - return 0; -} - -static int cmm_power_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - switch (event) { - case PM_POST_HIBERNATION: - return cmm_resume(); - case PM_HIBERNATION_PREPARE: - return cmm_suspend(); - default: - return NOTIFY_DONE; - } -} - -static struct notifier_block cmm_power_notifier = { - .notifier_call = cmm_power_event, -}; - static int __init cmm_init(void) { int rc = -ENOMEM; - cmm_sysctl_header = register_sysctl_table(cmm_dir_table); + cmm_sysctl_header = register_sysctl("vm", cmm_table); if (!cmm_sysctl_header) goto out_sysctl; #ifdef CONFIG_CMM_IUCV /* convert sender to uppercase characters */ - if (sender) { - int len = strlen(sender); - while (len--) - sender[len] = toupper(sender[len]); - } else { + if (sender) + string_upper(sender, sender); + else sender = cmm_default_sender; - } rc = 
smsg_register_callback(SMSG_PREFIX, cmm_smsg_target); if (rc < 0) @@ -446,16 +395,11 @@ static int __init cmm_init(void) rc = register_oom_notifier(&cmm_oom_nb); if (rc < 0) goto out_oom_notify; - rc = register_pm_notifier(&cmm_power_notifier); - if (rc) - goto out_pm; cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); if (!IS_ERR(cmm_thread_ptr)) return 0; rc = PTR_ERR(cmm_thread_ptr); - unregister_pm_notifier(&cmm_power_notifier); -out_pm: unregister_oom_notifier(&cmm_oom_nb); out_oom_notify: #ifdef CONFIG_CMM_IUCV @@ -475,7 +419,6 @@ static void __exit cmm_exit(void) #ifdef CONFIG_CMM_IUCV smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target); #endif - unregister_pm_notifier(&cmm_power_notifier); unregister_oom_notifier(&cmm_oom_nb); kthread_stop(cmm_thread_ptr); del_timer_sync(&cmm_timer); diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 5d67b81c704a..d37a8f607b71 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -1,12 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/set_memory.h> +#include <linux/ptdump.h> #include <linux/seq_file.h> #include <linux/debugfs.h> -#include <linux/sched.h> #include <linux/mm.h> +#include <linux/kfence.h> #include <linux/kasan.h> +#include <asm/ptdump.h> #include <asm/kasan.h> +#include <asm/abs_lowcore.h> +#include <asm/nospec-branch.h> #include <asm/sections.h> -#include <asm/pgtable.h> +#include <asm/maccess.h> static unsigned long max_addr; @@ -16,277 +21,295 @@ struct addr_marker { }; enum address_markers_idx { - IDENTITY_NR = 0, + IDENTITY_BEFORE_NR = 0, + IDENTITY_BEFORE_END_NR, + AMODE31_START_NR, + AMODE31_END_NR, KERNEL_START_NR, KERNEL_END_NR, -#ifdef CONFIG_KASAN - KASAN_SHADOW_START_NR, - KASAN_SHADOW_END_NR, +#ifdef CONFIG_KFENCE + KFENCE_START_NR, + KFENCE_END_NR, #endif + IDENTITY_AFTER_NR, + IDENTITY_AFTER_END_NR, VMEMMAP_NR, + VMEMMAP_END_NR, VMALLOC_NR, + VMALLOC_END_NR, MODULES_NR, + MODULES_END_NR, + ABS_LOWCORE_NR, + ABS_LOWCORE_END_NR, + MEMCPY_REAL_NR, + MEMCPY_REAL_END_NR, +#ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, +#endif }; static struct addr_marker address_markers[] = { - [IDENTITY_NR] = {0, "Identity Mapping"}, + [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"}, + [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"}, + [AMODE31_START_NR] = {0, "Amode31 Area Start"}, + [AMODE31_END_NR] = {0, "Amode31 Area End"}, [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"}, +#ifdef CONFIG_KFENCE + [KFENCE_START_NR] = {0, "KFence Pool Start"}, + [KFENCE_END_NR] = {0, "KFence Pool End"}, +#endif + [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"}, + [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"}, + [VMEMMAP_NR] = {0, "vmemmap Area Start"}, + [VMEMMAP_END_NR] = {0, "vmemmap Area End"}, + [VMALLOC_NR] = {0, "vmalloc Area Start"}, + [VMALLOC_END_NR] = {0, "vmalloc Area End"}, + [MODULES_NR] = {0, "Modules Area Start"}, + [MODULES_END_NR] = {0, "Modules Area End"}, + [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"}, + [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"}, + [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"}, + [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"}, #ifdef CONFIG_KASAN [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"}, [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"}, #endif - [VMEMMAP_NR] = {0, "vmemmap Area"}, - [VMALLOC_NR] = {0, "vmalloc 
Area"}, - [MODULES_NR] = {0, "Modules Area"}, { -1, NULL } }; struct pg_state { + struct ptdump_state ptdump; + struct seq_file *seq; int level; unsigned int current_prot; + bool check_wx; + unsigned long wx_pages; unsigned long start_address; - unsigned long current_address; const struct addr_marker *marker; }; +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + struct seq_file *__m = (m); \ + \ + if (__m) \ + seq_printf(__m, fmt, ##args); \ +}) + +#define pt_dump_seq_puts(m, fmt) \ +({ \ + struct seq_file *__m = (m); \ + \ + if (__m) \ + seq_printf(__m, fmt); \ +}) + static void print_prot(struct seq_file *m, unsigned int pr, int level) { static const char * const level_name[] = { "ASCE", "PGD", "PUD", "PMD", "PTE" }; - seq_printf(m, "%s ", level_name[level]); + pt_dump_seq_printf(m, "%s ", level_name[level]); if (pr & _PAGE_INVALID) { - seq_printf(m, "I\n"); + pt_dump_seq_printf(m, "I\n"); return; } - seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW "); - seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n"); + pt_dump_seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW "); + pt_dump_seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n"); } -static void note_page(struct seq_file *m, struct pg_state *st, - unsigned int new_prot, int level) +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ +#ifdef CONFIG_DEBUG_WX + if (!st->check_wx) + return; + if (st->current_prot & _PAGE_INVALID) + return; + if (st->current_prot & _PAGE_PROTECT) + return; + if (st->current_prot & _PAGE_NOEXEC) + return; + /* + * The first lowcore page is W+X if spectre mitigations are using + * trampolines or the BEAR enhancements facility is not installed, + * in which case we have two lpswe instructions in lowcore that need + * to be executable. + */ + if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear))) + return; + WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n", + (void *)st->start_address); + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +#endif /* CONFIG_DEBUG_WX */ +} + +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) { - static const char units[] = "KMGTPE"; int width = sizeof(unsigned long) * 2; + static const char units[] = "KMGTPE"; const char *unit = units; - unsigned int prot, cur; unsigned long delta; + struct pg_state *st; + struct seq_file *m; + unsigned int prot; - /* - * If we have a "break" in the series, we need to flush the state - * that we have now. "break" is either changing perms, levels or - * address space marker. - */ - prot = new_prot; - cur = st->current_prot; - - if (!st->level) { - /* First entry */ - st->current_prot = new_prot; + st = container_of(pt_st, struct pg_state, ptdump); + m = st->seq; + prot = val & (_PAGE_PROTECT | _PAGE_NOEXEC); + if (level == 4 && (val & _PAGE_INVALID)) + prot = _PAGE_INVALID; + /* For pmd_none() & friends val gets passed as zero. */ + if (level != 4 && !val) + prot = _PAGE_INVALID; + /* Final flush from generic code. 
*/ + if (level == -1) + addr = max_addr; + if (st->level == -1) { + pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); + st->start_address = addr; + st->current_prot = prot; st->level = level; - st->marker = address_markers; - seq_printf(m, "---[ %s ]---\n", st->marker->name); - } else if (prot != cur || level != st->level || - st->current_address >= st->marker[1].start_address) { - /* Print the actual finished series */ - seq_printf(m, "0x%0*lx-0x%0*lx ", - width, st->start_address, - width, st->current_address); - delta = (st->current_address - st->start_address) >> 10; + } else if (prot != st->current_prot || level != st->level || + addr >= st->marker[1].start_address) { + note_prot_wx(st, addr); + pt_dump_seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, addr); + delta = (addr - st->start_address) >> 10; while (!(delta & 0x3ff) && unit[1]) { delta >>= 10; unit++; } - seq_printf(m, "%9lu%c ", delta, *unit); + pt_dump_seq_printf(m, "%9lu%c ", delta, *unit); print_prot(m, st->current_prot, st->level); - while (st->current_address >= st->marker[1].start_address) { + while (addr >= st->marker[1].start_address) { st->marker++; - seq_printf(m, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); } - st->start_address = st->current_address; - st->current_prot = new_prot; + st->start_address = addr; + st->current_prot = prot; st->level = level; } } -#ifdef CONFIG_KASAN -static void note_kasan_early_shadow_page(struct seq_file *m, - struct pg_state *st) -{ - unsigned int prot; - - prot = pte_val(*kasan_early_shadow_pte) & - (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC); - note_page(m, st, prot, 4); -} -#endif - -/* - * The actual page table walker functions. In order to keep the - * implementation of print_prot() short, we only check and pass - * _PAGE_INVALID and _PAGE_PROTECT flags to note_page() if a region, - * segment or page table entry is invalid or read-only. - * After all it's just a hint that the current level being walked - * contains an invalid or read-only entry. 
- */ -static void walk_pte_level(struct seq_file *m, struct pg_state *st, - pmd_t *pmd, unsigned long addr) -{ - unsigned int prot; - pte_t *pte; - int i; - - for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) { - st->current_address = addr; - pte = pte_offset_kernel(pmd, addr); - prot = pte_val(*pte) & - (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC); - note_page(m, st, prot, 4); - addr += PAGE_SIZE; - } -} - -static void walk_pmd_level(struct seq_file *m, struct pg_state *st, - pud_t *pud, unsigned long addr) -{ - unsigned int prot; - pmd_t *pmd; - int i; - -#ifdef CONFIG_KASAN - if ((pud_val(*pud) & PAGE_MASK) == __pa(kasan_early_shadow_pmd)) { - note_kasan_early_shadow_page(m, st); - return; - } -#endif - - pmd = pmd_offset(pud, addr); - for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++, pmd++) { - st->current_address = addr; - if (!pmd_none(*pmd)) { - if (pmd_large(*pmd)) { - prot = pmd_val(*pmd) & - (_SEGMENT_ENTRY_PROTECT | - _SEGMENT_ENTRY_NOEXEC); - note_page(m, st, prot, 3); - } else - walk_pte_level(m, st, pmd, addr); - } else - note_page(m, st, _PAGE_INVALID, 3); - addr += PMD_SIZE; - } -} - -static void walk_pud_level(struct seq_file *m, struct pg_state *st, - p4d_t *p4d, unsigned long addr) +#ifdef CONFIG_DEBUG_WX +void ptdump_check_wx(void) { - unsigned int prot; - pud_t *pud; - int i; + struct pg_state st = { + .ptdump = { + .note_page = note_page, + .range = (struct ptdump_range[]) { + {.start = 0, .end = max_addr}, + {.start = 0, .end = 0}, + } + }, + .seq = NULL, + .level = -1, + .current_prot = 0, + .check_wx = true, + .wx_pages = 0, + .start_address = 0, + .marker = (struct addr_marker[]) { + { .start_address = 0, .name = NULL}, + { .start_address = -1, .name = NULL}, + }, + }; -#ifdef CONFIG_KASAN - if ((p4d_val(*p4d) & PAGE_MASK) == __pa(kasan_early_shadow_pud)) { - note_kasan_early_shadow_page(m, st); + if (!MACHINE_HAS_NX) return; - } -#endif - - pud = pud_offset(p4d, addr); - for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++, pud++) { - st->current_address = addr; - if (!pud_none(*pud)) - if (pud_large(*pud)) { - prot = pud_val(*pud) & - (_REGION_ENTRY_PROTECT | - _REGION_ENTRY_NOEXEC); - note_page(m, st, prot, 2); - } else - walk_pmd_level(m, st, pud, addr); - else - note_page(m, st, _PAGE_INVALID, 2); - addr += PUD_SIZE; - } + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + if (st.wx_pages) + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); + else + pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", + (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? 
+ "unexpected " : ""); } +#endif /* CONFIG_DEBUG_WX */ -static void walk_p4d_level(struct seq_file *m, struct pg_state *st, - pgd_t *pgd, unsigned long addr) +#ifdef CONFIG_PTDUMP_DEBUGFS +static int ptdump_show(struct seq_file *m, void *v) { - p4d_t *p4d; - int i; - -#ifdef CONFIG_KASAN - if ((pgd_val(*pgd) & PAGE_MASK) == __pa(kasan_early_shadow_p4d)) { - note_kasan_early_shadow_page(m, st); - return; - } -#endif + struct pg_state st = { + .ptdump = { + .note_page = note_page, + .range = (struct ptdump_range[]) { + {.start = 0, .end = max_addr}, + {.start = 0, .end = 0}, + } + }, + .seq = m, + .level = -1, + .current_prot = 0, + .check_wx = false, + .wx_pages = 0, + .start_address = 0, + .marker = address_markers, + }; - p4d = p4d_offset(pgd, addr); - for (i = 0; i < PTRS_PER_P4D && addr < max_addr; i++, p4d++) { - st->current_address = addr; - if (!p4d_none(*p4d)) - walk_pud_level(m, st, p4d, addr); - else - note_page(m, st, _PAGE_INVALID, 2); - addr += P4D_SIZE; - } + get_online_mems(); + mutex_lock(&cpa_mutex); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + mutex_unlock(&cpa_mutex); + put_online_mems(); + return 0; } +DEFINE_SHOW_ATTRIBUTE(ptdump); +#endif /* CONFIG_PTDUMP_DEBUGFS */ -static void walk_pgd_level(struct seq_file *m) +/* + * Heapsort from lib/sort.c is not a stable sorting algorithm, do a simple + * insertion sort to preserve the original order of markers with the same + * start address. + */ +static void sort_address_markers(void) { - unsigned long addr = 0; - struct pg_state st; - pgd_t *pgd; - int i; + struct addr_marker tmp; + int i, j; - memset(&st, 0, sizeof(st)); - for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) { - st.current_address = addr; - pgd = pgd_offset_k(addr); - if (!pgd_none(*pgd)) - walk_p4d_level(m, &st, pgd, addr); - else - note_page(m, &st, _PAGE_INVALID, 1); - addr += PGDIR_SIZE; - cond_resched(); + for (i = 1; i < ARRAY_SIZE(address_markers) - 1; i++) { + tmp = address_markers[i]; + for (j = i - 1; j >= 0 && address_markers[j].start_address > tmp.start_address; j--) + address_markers[j + 1] = address_markers[j]; + address_markers[j + 1] = tmp; } - /* Flush out the last page */ - st.current_address = max_addr; - note_page(m, &st, 0, 0); -} - -static int ptdump_show(struct seq_file *m, void *v) -{ - walk_pgd_level(m); - return 0; -} - -static int ptdump_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show, NULL); } -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int pt_dump_init(void) { +#ifdef CONFIG_KFENCE + unsigned long kfence_start = (unsigned long)__kfence_pool; +#endif /* * Figure out the maximum virtual address being accessible with the * kernel ASCE. We need this to keep the page table walker functions * from accessing non-existent entries. 
*/ - max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; + max_addr = (S390_lowcore.kernel_asce.val & _REGION_ENTRY_TYPE_MASK) >> 2; max_addr = 1UL << (max_addr * 11 + 31); + address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size; + address_markers[AMODE31_START_NR].start_address = (unsigned long)__samode31; + address_markers[AMODE31_END_NR].start_address = (unsigned long)__eamode31; address_markers[MODULES_NR].start_address = MODULES_VADDR; + address_markers[MODULES_END_NR].start_address = MODULES_END; + address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore; + address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE; + address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area; + address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + MEMCPY_REAL_SIZE; address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; + address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size; address_markers[VMALLOC_NR].start_address = VMALLOC_START; + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; +#ifdef CONFIG_KFENCE + address_markers[KFENCE_START_NR].start_address = kfence_start; + address_markers[KFENCE_END_NR].start_address = kfence_start + KFENCE_POOL_SIZE; +#endif + sort_address_markers(); +#ifdef CONFIG_PTDUMP_DEBUGFS debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); +#endif /* CONFIG_PTDUMP_DEBUGFS */ return 0; } device_initcall(pt_dump_init); diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c new file mode 100644 index 000000000000..0a0738a473af --- /dev/null +++ b/arch/s390/mm/extable.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bitfield.h> +#include <linux/extable.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/panic.h> +#include <asm/asm-extable.h> +#include <asm/extable.h> + +const struct exception_table_entry *s390_search_extables(unsigned long addr) +{ + const struct exception_table_entry *fixup; + size_t num; + + fixup = search_exception_tables(addr); + if (fixup) + return fixup; + num = __stop_amode31_ex_table - __start_amode31_ex_table; + return search_extable(__start_amode31_ex_table, num, addr); +} + +static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + + regs->gprs[reg_err] = -EFAULT; + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + size_t len = FIELD_GET(EX_DATA_LEN, ex->data); + + regs->gprs[reg_err] = -EFAULT; + memset((void *)regs->gprs[reg_addr], 0, len); + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, + bool pair, struct pt_regs *regs) +{ + unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + + regs->gprs[reg_err] = -EFAULT; + regs->gprs[reg_zero] = 0; + if (pair) + regs->gprs[reg_zero + 1] = 0; + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_zeropad(const struct 
exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_data = FIELD_GET(EX_DATA_REG_ERR, ex->data); + unsigned long data, addr, offset; + + addr = regs->gprs[reg_addr]; + offset = addr & (sizeof(unsigned long) - 1); + addr &= ~(sizeof(unsigned long) - 1); + data = *(unsigned long *)addr; + data <<= BITS_PER_BYTE * offset; + regs->gprs[reg_data] = data; + regs->psw.addr = extable_fixup(ex); + return true; +} + +bool fixup_exception(struct pt_regs *regs) +{ + const struct exception_table_entry *ex; + + ex = s390_search_extables(instruction_pointer(regs)); + if (!ex) + return false; + switch (ex->type) { + case EX_TYPE_FIXUP: + return ex_handler_fixup(ex, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(ex, regs); + case EX_TYPE_UA_STORE: + return ex_handler_ua_store(ex, regs); + case EX_TYPE_UA_LOAD_MEM: + return ex_handler_ua_load_mem(ex, regs); + case EX_TYPE_UA_LOAD_REG: + return ex_handler_ua_load_reg(ex, false, regs); + case EX_TYPE_UA_LOAD_REGPAIR: + return ex_handler_ua_load_reg(ex, true, regs); + case EX_TYPE_ZEROPAD: + return ex_handler_zeropad(ex, regs); + } + panic("invalid exception table entry"); +} diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index fd0dae9d10f4..e41869f5cc95 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -20,9 +20,9 @@ #include <linux/ctype.h> #include <linux/ioport.h> #include <linux/refcount.h> +#include <linux/pgtable.h> #include <asm/diag.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/ebcdic.h> #include <asm/errno.h> #include <asm/extmem.h> @@ -289,15 +289,17 @@ segment_overlaps_others (struct dcss_segment *seg) /* * real segment loading function, called from segment_load + * Must return either an error code < 0, or the segment type code >= 0 */ static int __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long *end) { unsigned long start_addr, end_addr, dummy; struct dcss_segment *seg; - int rc, diag_cc; + int rc, diag_cc, segtype; start_addr = end_addr = 0; + segtype = -1; seg = kmalloc(sizeof(*seg), GFP_KERNEL | GFP_DMA); if (seg == NULL) { rc = -ENOMEM; @@ -313,15 +315,10 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long goto out_free; } - rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); - - if (rc) - goto out_free; - seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (seg->res == NULL) { rc = -ENOMEM; - goto out_shared; + goto out_free; } seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; seg->res->start = seg->start_addr; @@ -331,16 +328,21 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long seg->res_name[8] = '\0'; strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name)); seg->res->name = seg->res_name; - rc = seg->vm_segtype; - if (rc == SEG_TYPE_SC || - ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared)) + segtype = seg->vm_segtype; + if (segtype == SEG_TYPE_SC || + ((segtype == SEG_TYPE_SR || segtype == SEG_TYPE_ER) && !do_nonshared)) seg->res->flags |= IORESOURCE_READONLY; + + /* Check for overlapping resources before adding the mapping. 
*/ if (request_resource(&iomem_resource, seg->res)) { rc = -EBUSY; - kfree(seg->res); - goto out_shared; + goto out_free_resource; } + rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + if (rc) + goto out_resource; + if (do_nonshared) diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name, &start_addr, &end_addr); @@ -351,14 +353,14 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); rc = diag_cc; - goto out_resource; + goto out_mapping; } if (diag_cc > 1) { pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr); rc = dcss_diag_translate_rc(end_addr); dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); - goto out_resource; + goto out_mapping; } seg->start_addr = start_addr; seg->end = end_addr; @@ -377,15 +379,16 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long (void*) seg->end, segtype_string[seg->vm_segtype]); } goto out; + out_mapping: + vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); out_resource: release_resource(seg->res); + out_free_resource: kfree(seg->res); - out_shared: - vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); out_free: kfree(seg); out: - return rc; + return rc < 0 ? rc : segtype; } /* @@ -400,8 +403,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long * -EIO : could not perform query or load diagnose * -ENOENT : no such segment * -EOPNOTSUPP: multi-part segment cannot be used with linux - * -ENOSPC : segment cannot be used (overlaps with storage) - * -EBUSY : segment can temporarily not be used (overlaps with dcss) + * -EBUSY : segment cannot be used (overlaps with dcss or storage) * -ERANGE : segment cannot be used (exceeds kernel mapping range) * -EPERM : segment is currently loaded with incompatible permissions * -ENOMEM : out of memory @@ -626,10 +628,6 @@ void segment_warning(int rc, char *seg_name) pr_err("DCSS %s has multiple page ranges and cannot be " "loaded or queried\n", seg_name); break; - case -ENOSPC: - pr_err("DCSS %s overlaps with used storage and cannot " - "be loaded\n", seg_name); - break; case -EBUSY: pr_err("%s needs used memory resources and cannot be " "loaded or queried\n", seg_name); @@ -642,10 +640,13 @@ void segment_warning(int rc, char *seg_name) pr_err("There is not enough memory to load or query " "DCSS %s\n", seg_name); break; - case -ERANGE: - pr_err("DCSS %s exceeds the kernel mapping range (%lu) " - "and cannot be loaded\n", seg_name, VMEM_MAX_PHYS); + case -ERANGE: { + struct range mhp_range = arch_get_mappable_range(); + + pr_err("DCSS %s exceeds the kernel mapping range (%llu) " + "and cannot be loaded\n", seg_name, mhp_range.end + 1); break; + } default: break; } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 7b0bb475c166..ac4c78546d97 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -3,17 +3,19 @@ * S390 version * Copyright IBM Corp. 
1999 * Author(s): Hartmut Penner (hp@de.ibm.com) - * Ulrich Weigand (uweigand@de.ibm.com) + * Ulrich Weigand (uweigand@de.ibm.com) * * Derived from "arch/i386/mm/fault.c" * Copyright (C) 1995 Linus Torvalds */ #include <linux/kernel_stat.h> +#include <linux/mmu_context.h> #include <linux/perf_event.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/sched/debug.h> +#include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> @@ -31,38 +33,30 @@ #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/hugetlb.h> +#include <linux/kfence.h> +#include <asm/asm-extable.h> #include <asm/asm-offsets.h> +#include <asm/ptrace.h> +#include <asm/fault.h> #include <asm/diag.h> -#include <asm/pgtable.h> #include <asm/gmap.h> #include <asm/irq.h> -#include <asm/mmu_context.h> #include <asm/facility.h> +#include <asm/uv.h> #include "../kernel/entry.h" -#define __FAIL_ADDR_MASK -4096L -#define __SUBCODE_MASK 0x0600 -#define __PF_RES_FIELD 0x8000000000000000ULL - -#define VM_FAULT_BADCONTEXT 0x010000 -#define VM_FAULT_BADMAP 0x020000 -#define VM_FAULT_BADACCESS 0x040000 -#define VM_FAULT_SIGNAL 0x080000 -#define VM_FAULT_PFAULT 0x100000 - enum fault_type { KERNEL_FAULT, USER_FAULT, - VDSO_FAULT, GMAP_FAULT, }; -static unsigned long store_indication __read_mostly; +static DEFINE_STATIC_KEY_FALSE(have_store_indication); static int __init fault_init(void) { if (test_facility(75)) - store_indication = 0xc00; + static_branch_enable(&have_store_indication); return 0; } early_initcall(fault_init); @@ -72,88 +66,88 @@ early_initcall(fault_init); */ static enum fault_type get_fault_type(struct pt_regs *regs) { - unsigned long trans_exc_code; + union teid teid = { .val = regs->int_parm_long }; - trans_exc_code = regs->int_parm_long & 3; - if (likely(trans_exc_code == 0)) { - /* primary space exception */ - if (IS_ENABLED(CONFIG_PGSTE) && - test_pt_regs_flag(regs, PIF_GUEST_FAULT)) - return GMAP_FAULT; - if (current->thread.mm_segment == USER_DS) + if (likely(teid.as == PSW_BITS_AS_PRIMARY)) { + if (user_mode(regs)) return USER_FAULT; - return KERNEL_FAULT; - } - if (trans_exc_code == 2) { - /* secondary space exception */ - if (current->thread.mm_segment & 1) { - if (current->thread.mm_segment == USER_DS_SACF) - return USER_FAULT; + if (!IS_ENABLED(CONFIG_PGSTE)) return KERNEL_FAULT; - } - return VDSO_FAULT; + if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) + return GMAP_FAULT; + return KERNEL_FAULT; } - if (trans_exc_code == 1) { - /* access register mode, not used in the kernel */ + if (teid.as == PSW_BITS_AS_SECONDARY) return USER_FAULT; - } - /* home space exception -> access via kernel ASCE */ + /* Access register mode, not used in the kernel */ + if (teid.as == PSW_BITS_AS_ACCREG) + return USER_FAULT; + /* Home space -> access via kernel ASCE */ return KERNEL_FAULT; } -static int bad_address(void *p) +static unsigned long get_fault_address(struct pt_regs *regs) { - unsigned long dummy; + union teid teid = { .val = regs->int_parm_long }; - return probe_kernel_address((unsigned long *)p, dummy); + return teid.addr * PAGE_SIZE; +} + +static __always_inline bool fault_is_write(struct pt_regs *regs) +{ + union teid teid = { .val = regs->int_parm_long }; + + if (static_branch_likely(&have_store_indication)) + return teid.fsi == TEID_FSI_STORE; + return false; } static void dump_pagetable(unsigned long asce, unsigned long address) { - unsigned long *table = __va(asce & _ASCE_ORIGIN); + unsigned long entry, *table = __va(asce & _ASCE_ORIGIN); 
pr_alert("AS:%016lx ", asce); switch (asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R1:%016lx ", *table); - if (*table & _REGION_ENTRY_INVALID) + pr_cont("R1:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_REGION2: table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R2:%016lx ", *table); - if (*table & _REGION_ENTRY_INVALID) + pr_cont("R2:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_REGION3: table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R3:%016lx ", *table); - if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) + pr_cont("R3:%016lx ", entry); + if (entry & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_SEGMENT: table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("S:%016lx ", *table); - if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) + pr_cont("S:%016lx ", entry); + if (entry & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) goto out; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table = __va(entry & _SEGMENT_ENTRY_ORIGIN); } table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("P:%016lx ", *table); + pr_cont("P:%016lx ", entry); out: pr_cont("\n"); return; @@ -163,212 +157,113 @@ bad: static void dump_fault_info(struct pt_regs *regs) { + union teid teid = { .val = regs->int_parm_long }; unsigned long asce; pr_alert("Failing address: %016lx TEID: %016lx\n", - regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long); + get_fault_address(regs), teid.val); pr_alert("Fault in "); - switch (regs->int_parm_long & 3) { - case 3: + switch (teid.as) { + case PSW_BITS_AS_HOME: pr_cont("home space "); break; - case 2: + case PSW_BITS_AS_SECONDARY: pr_cont("secondary space "); break; - case 1: + case PSW_BITS_AS_ACCREG: pr_cont("access register "); break; - case 0: + case PSW_BITS_AS_PRIMARY: pr_cont("primary space "); break; } pr_cont("mode while using "); switch (get_fault_type(regs)) { case USER_FAULT: - asce = S390_lowcore.user_asce; + asce = S390_lowcore.user_asce.val; pr_cont("user "); break; - case VDSO_FAULT: - asce = S390_lowcore.vdso_asce; - pr_cont("vdso "); - break; case GMAP_FAULT: - asce = ((struct gmap *) S390_lowcore.gmap)->asce; + asce = ((struct gmap *)S390_lowcore.gmap)->asce; pr_cont("gmap "); break; case KERNEL_FAULT: - asce = S390_lowcore.kernel_asce; + asce = S390_lowcore.kernel_asce.val; pr_cont("kernel "); break; default: unreachable(); } pr_cont("ASCE.\n"); - dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK); + dump_pagetable(asce, get_fault_address(regs)); } int show_unhandled_signals = 1; void 
report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault) { + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) return; if (!unhandled_signal(current, signr)) return; - if (!printk_ratelimit()) + if (!__ratelimit(&rs)) return; - printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ", - regs->int_code & 0xffff, regs->int_code >> 17); + pr_alert("User process fault: interruption code %04x ilc:%d ", + regs->int_code & 0xffff, regs->int_code >> 17); print_vma_addr(KERN_CONT "in ", regs->psw.addr); - printk(KERN_CONT "\n"); + pr_cont("\n"); if (is_mm_fault) dump_fault_info(regs); show_regs(regs); } -/* - * Send SIGSEGV to task. This is an external routine - * to keep the stack usage of do_page_fault small. - */ -static noinline void do_sigsegv(struct pt_regs *regs, int si_code) +static void do_sigsegv(struct pt_regs *regs, int si_code) { report_user_fault(regs, SIGSEGV, 1); - force_sig_fault(SIGSEGV, si_code, - (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); + force_sig_fault(SIGSEGV, si_code, (void __user *)get_fault_address(regs)); } -const struct exception_table_entry *s390_search_extables(unsigned long addr) +static void handle_fault_error_nolock(struct pt_regs *regs, int si_code) { - const struct exception_table_entry *fixup; - - fixup = search_extable(__start_dma_ex_table, - __stop_dma_ex_table - __start_dma_ex_table, - addr); - if (!fixup) - fixup = search_exception_tables(addr); - return fixup; -} - -static noinline void do_no_context(struct pt_regs *regs) -{ - const struct exception_table_entry *fixup; + enum fault_type fault_type; + unsigned long address; + bool is_write; - /* Are we prepared to handle this kernel fault? */ - fixup = s390_search_extables(regs->psw.addr); - if (fixup) { - regs->psw.addr = extable_fixup(fixup); + if (user_mode(regs)) { + if (WARN_ON_ONCE(!si_code)) + si_code = SEGV_MAPERR; + return do_sigsegv(regs, si_code); + } + if (fixup_exception(regs)) return; + fault_type = get_fault_type(regs); + if (fault_type == KERNEL_FAULT) { + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (kfence_handle_page_fault(address, is_write, regs)) + return; } - - /* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - if (get_fault_type(regs) == KERNEL_FAULT) - printk(KERN_ALERT "Unable to handle kernel pointer dereference" - " in virtual kernel address space\n"); + if (fault_type == KERNEL_FAULT) + pr_alert("Unable to handle kernel pointer dereference in virtual kernel address space\n"); else - printk(KERN_ALERT "Unable to handle kernel paging request" - " in virtual user address space\n"); + pr_alert("Unable to handle kernel paging request in virtual user address space\n"); dump_fault_info(regs); die(regs, "Oops"); - do_exit(SIGKILL); } -static noinline void do_low_address(struct pt_regs *regs) +static void handle_fault_error(struct pt_regs *regs, int si_code) { - /* Low-address protection hit in kernel mode means - NULL pointer write access in kernel mode. */ - if (regs->psw.mask & PSW_MASK_PSTATE) { - /* Low-address protection hit in user mode 'cannot happen'. */ - die (regs, "Low-address protection"); - do_exit(SIGKILL); - } + struct mm_struct *mm = current->mm; - do_no_context(regs); -} - -static noinline void do_sigbus(struct pt_regs *regs) -{ - /* - * Send a sigbus, regardless of whether we were in kernel - * or user mode. 
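The report_user_fault() hunk above replaces the global printk_ratelimit() with a ratelimit state private to this message, so unrelated noisy printk users can no longer consume this diagnostic's budget. A sketch of the idiom, using only the standard <linux/ratelimit.h> API (report_demo_fault is a hypothetical name):

#include <linux/printk.h>
#include <linux/ratelimit.h>

static void report_demo_fault(void)
{
        /* token bucket owned by this callsite, not shared kernel-wide */
        static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);

        if (!__ratelimit(&rs))
                return;
        pr_alert("demo: rate-limited diagnostic\n");
}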
- */ - force_sig_fault(SIGBUS, BUS_ADRERR, - (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); + mmap_read_unlock(mm); + handle_fault_error_nolock(regs, si_code); } -static noinline int signal_return(struct pt_regs *regs) +static void do_sigbus(struct pt_regs *regs) { - u16 instruction; - int rc; - - rc = __get_user(instruction, (u16 __user *) regs->psw.addr); - if (rc) - return rc; - if (instruction == 0x0a77) { - set_pt_regs_flag(regs, PIF_SYSCALL); - regs->int_code = 0x00040077; - return 0; - } else if (instruction == 0x0aad) { - set_pt_regs_flag(regs, PIF_SYSCALL); - regs->int_code = 0x000400ad; - return 0; - } - return -EACCES; -} - -static noinline void do_fault_error(struct pt_regs *regs, int access, - vm_fault_t fault) -{ - int si_code; - - switch (fault) { - case VM_FAULT_BADACCESS: - if (access == VM_EXEC && signal_return(regs) == 0) - break; - /* fallthrough */ - case VM_FAULT_BADMAP: - /* Bad memory access. Check if it is kernel or user space. */ - if (user_mode(regs)) { - /* User mode accesses just cause a SIGSEGV */ - si_code = (fault == VM_FAULT_BADMAP) ? - SEGV_MAPERR : SEGV_ACCERR; - do_sigsegv(regs, si_code); - break; - } - /* fallthrough */ - case VM_FAULT_BADCONTEXT: - /* fallthrough */ - case VM_FAULT_PFAULT: - do_no_context(regs); - break; - case VM_FAULT_SIGNAL: - if (!user_mode(regs)) - do_no_context(regs); - break; - default: /* fault & VM_FAULT_ERROR */ - if (fault & VM_FAULT_OOM) { - if (!user_mode(regs)) - do_no_context(regs); - else - pagefault_out_of_memory(); - } else if (fault & VM_FAULT_SIGSEGV) { - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigsegv(regs, SEGV_MAPERR); - } else if (fault & VM_FAULT_SIGBUS) { - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigbus(regs); - } else - BUG(); - break; - } + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)get_fault_address(regs)); } /* @@ -377,178 +272,178 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, * routines. * * interruption code (int_code): - * 04 Protection -> Write-Protection (suprression) - * 10 Segment translation -> Not present (nullification) - * 11 Page translation -> Not present (nullification) - * 3b Region third trans. -> Not present (nullification) + * 04 Protection -> Write-Protection (suppression) + * 10 Segment translation -> Not present (nullification) + * 11 Page translation -> Not present (nullification) + * 3b Region third trans. -> Not present (nullification) */ -static inline vm_fault_t do_exception(struct pt_regs *regs, int access) +static void do_exception(struct pt_regs *regs, int access) { - struct gmap *gmap; - struct task_struct *tsk; - struct mm_struct *mm; struct vm_area_struct *vma; - enum fault_type type; - unsigned long trans_exc_code; unsigned long address; + struct mm_struct *mm; + enum fault_type type; unsigned int flags; + struct gmap *gmap; vm_fault_t fault; + bool is_write; - tsk = current; /* * The instruction that caused the program check has * been nullified. Don't signal single step via SIGTRAP. */ - clear_pt_regs_flag(regs, PIF_PER_TRAP); - + clear_thread_flag(TIF_PER_TRAP); if (kprobe_page_fault(regs, 14)) - return 0; - - mm = tsk->mm; - trans_exc_code = regs->int_parm_long; - - /* - * Verify that the fault happened in user space, that - * we are not in an interrupt and that there is a - * user context. 
- */ - fault = VM_FAULT_BADCONTEXT; + return; + mm = current->mm; + address = get_fault_address(regs); + is_write = fault_is_write(regs); type = get_fault_type(regs); switch (type) { case KERNEL_FAULT: - goto out; - case VDSO_FAULT: - fault = VM_FAULT_BADMAP; - goto out; + return handle_fault_error_nolock(regs, 0); case USER_FAULT: case GMAP_FAULT: if (faulthandler_disabled() || !mm) - goto out; + return handle_fault_error_nolock(regs, 0); break; } - - address = trans_exc_code & __FAIL_ADDR_MASK; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + flags = FAULT_FLAG_DEFAULT; if (user_mode(regs)) flags |= FAULT_FLAG_USER; - if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) + if (is_write) + access = VM_WRITE; + if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; - down_read(&mm->mmap_sem); + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + if (!(vma->vm_flags & access)) { + vma_end_read(vma); + goto lock_mmap; + } + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + if (unlikely(fault & VM_FAULT_ERROR)) + goto error; + return; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; + } +lock_mmap: + mmap_read_lock(mm); gmap = NULL; if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) { - gmap = (struct gmap *) S390_lowcore.gmap; + gmap = (struct gmap *)S390_lowcore.gmap; current->thread.gmap_addr = address; current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); current->thread.gmap_int_code = regs->int_code & 0xffff; address = __gmap_translate(gmap, address); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } + if (address == -EFAULT) + return handle_fault_error(regs, SEGV_MAPERR); if (gmap->pfault_enabled) flags |= FAULT_FLAG_RETRY_NOWAIT; } - retry: - fault = VM_FAULT_BADMAP; vma = find_vma(mm, address); if (!vma) - goto out_up; - + return handle_fault_error(regs, SEGV_MAPERR); if (unlikely(vma->vm_start > address)) { if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out_up; - if (expand_stack(vma, address)) - goto out_up; + return handle_fault_error(regs, SEGV_MAPERR); + vma = expand_stack(mm, address); + if (!vma) + return handle_fault_error_nolock(regs, SEGV_MAPERR); } - - /* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ - fault = VM_FAULT_BADACCESS; if (unlikely(!(vma->vm_flags & access))) - goto out_up; - - if (is_vm_hugetlb_page(vma)) - address &= HPAGE_MASK; - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(vma, address, flags); - /* No reason to continue if interrupted by SIGKILL. 
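The lock_vma_under_rcu() fast path introduced above follows the generic per-VMA-lock shape that several architectures adopted: resolve the fault under a per-VMA read lock and fall back to the full mmap_read_lock() walk only when the VMA lookup fails, access is not permitted, or the fault must be retried. A condensed sketch of that control flow under those assumptions (error paths elided; demo_fault is a hypothetical wrapper):

static void demo_fault(struct mm_struct *mm, struct pt_regs *regs,
                       unsigned long address, unsigned int flags,
                       unsigned long access)
{
        struct vm_area_struct *vma;
        vm_fault_t fault;

        /* Fast path: no mmap_lock, only the per-VMA read lock. */
        vma = lock_vma_under_rcu(mm, address);
        if (!vma)
                goto lock_mmap;
        if (!(vma->vm_flags & access)) {
                vma_end_read(vma);
                goto lock_mmap;
        }
        fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
        if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
                vma_end_read(vma);
        if (!(fault & VM_FAULT_RETRY))
                return;         /* handled (or failed) without mmap_lock */
lock_mmap:
        /* Slow path: classic locked walk. */
        mmap_read_lock(mm);
        /* ... find_vma(), handle_mm_fault(), retry handling ... */
        mmap_read_unlock(mm);
}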
*/ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { - fault = VM_FAULT_SIGNAL; + return handle_fault_error(regs, SEGV_ACCERR); + fault = handle_mm_fault(vma, address, flags, regs); + if (fault_signal_pending(fault, regs)) { if (flags & FAULT_FLAG_RETRY_NOWAIT) - goto out_up; - goto out; + mmap_read_unlock(mm); + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; } - if (unlikely(fault & VM_FAULT_ERROR)) - goto out_up; - - /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. - */ - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) { + if (gmap) { + mmap_read_lock(mm); + goto gmap; } - if (fault & VM_FAULT_RETRY) { - if (IS_ENABLED(CONFIG_PGSTE) && gmap && - (flags & FAULT_FLAG_RETRY_NOWAIT)) { - /* FAULT_FLAG_RETRY_NOWAIT has been set, - * mmap_sem has not been released */ - current->thread.gmap_pfault = 1; - fault = VM_FAULT_PFAULT; - goto out_up; - } - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~(FAULT_FLAG_ALLOW_RETRY | - FAULT_FLAG_RETRY_NOWAIT); - flags |= FAULT_FLAG_TRIED; - down_read(&mm->mmap_sem); - goto retry; + return; + } + if (unlikely(fault & VM_FAULT_ERROR)) { + mmap_read_unlock(mm); + goto error; + } + if (fault & VM_FAULT_RETRY) { + if (IS_ENABLED(CONFIG_PGSTE) && gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* + * FAULT_FLAG_RETRY_NOWAIT has been set, + * mmap_lock has not been released + */ + current->thread.gmap_pfault = 1; + return handle_fault_error(regs, 0); } + flags &= ~FAULT_FLAG_RETRY_NOWAIT; + flags |= FAULT_FLAG_TRIED; + mmap_read_lock(mm); + goto retry; } +gmap: if (IS_ENABLED(CONFIG_PGSTE) && gmap) { address = __gmap_link(gmap, current->thread.gmap_addr, address); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } + if (address == -EFAULT) + return handle_fault_error(regs, SEGV_MAPERR); if (address == -ENOMEM) { fault = VM_FAULT_OOM; - goto out_up; + mmap_read_unlock(mm); + goto error; } } - fault = 0; -out_up: - up_read(&mm->mmap_sem); -out: - return fault; + mmap_read_unlock(mm); + return; +error: + if (fault & VM_FAULT_OOM) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + pagefault_out_of_memory(); + } else if (fault & VM_FAULT_SIGSEGV) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + do_sigsegv(regs, SEGV_MAPERR); + } else if (fault & VM_FAULT_SIGBUS) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + do_sigbus(regs); + } else { + BUG(); + } } void do_protection_exception(struct pt_regs *regs) { - unsigned long trans_exc_code; - int access; - vm_fault_t fault; + union teid teid = { .val = regs->int_parm_long }; - trans_exc_code = regs->int_parm_long; /* * Protection exceptions are suppressing, decrement psw address. * The exception to this rule are aborted transactions, for these @@ -561,258 +456,140 @@ void do_protection_exception(struct pt_regs *regs) * as a special case because the translation exception code * field is not guaranteed to contain valid data in this case. 
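The new error: label above centralizes the vm_fault_t decoding that the deleted do_fault_error() did with synthetic VM_FAULT_BAD* codes. The VM_FAULT_* error values are independent bits rather than an enum, so they have to be tested individually; a stripped-down, user-mode-only sketch of that dispatch (demo_fault_addr is a hypothetical helper returning the faulting user address):

static void demo_fault_error(struct pt_regs *regs, vm_fault_t fault)
{
        if (fault & VM_FAULT_OOM)
                pagefault_out_of_memory();      /* may OOM-kill the task */
        else if (fault & VM_FAULT_SIGSEGV)
                force_sig_fault(SIGSEGV, SEGV_MAPERR, demo_fault_addr(regs));
        else if (fault & VM_FAULT_SIGBUS)
                force_sig_fault(SIGBUS, BUS_ADRERR, demo_fault_addr(regs));
        else
                BUG();          /* unexpected VM_FAULT_ERROR bit */
}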
*/ - if (unlikely(!(trans_exc_code & 4))) { - do_low_address(regs); - return; + if (unlikely(!teid.b61)) { + if (user_mode(regs)) { + /* Low-address protection in user mode: cannot happen */ + die(regs, "Low-address protection"); + } + /* + * Low-address protection in kernel mode means + * NULL pointer write access in kernel mode. + */ + return handle_fault_error_nolock(regs, 0); } - if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) { - regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) | - (regs->psw.addr & PAGE_MASK); - access = VM_EXEC; - fault = VM_FAULT_BADACCESS; - } else { - access = VM_WRITE; - fault = do_exception(regs, access); + if (unlikely(MACHINE_HAS_NX && teid.b56)) { + regs->int_parm_long = (teid.addr * PAGE_SIZE) | (regs->psw.addr & PAGE_MASK); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } - if (unlikely(fault)) - do_fault_error(regs, access, fault); + do_exception(regs, VM_WRITE); } NOKPROBE_SYMBOL(do_protection_exception); void do_dat_exception(struct pt_regs *regs) { - int access; - vm_fault_t fault; - - access = VM_READ | VM_EXEC | VM_WRITE; - fault = do_exception(regs, access); - if (unlikely(fault)) - do_fault_error(regs, access, fault); + do_exception(regs, VM_ACCESS_FLAGS); } NOKPROBE_SYMBOL(do_dat_exception); -#ifdef CONFIG_PFAULT -/* - * 'pfault' pseudo page faults routines. - */ -static int pfault_disable; - -static int __init nopfault(char *str) -{ - pfault_disable = 1; - return 1; -} - -__setup("nopfault", nopfault); - -struct pfault_refbk { - u16 refdiagc; - u16 reffcode; - u16 refdwlen; - u16 refversn; - u64 refgaddr; - u64 refselmk; - u64 refcmpmk; - u64 reserved; -} __attribute__ ((packed, aligned(8))); - -static struct pfault_refbk pfault_init_refbk = { - .refdiagc = 0x258, - .reffcode = 0, - .refdwlen = 5, - .refversn = 2, - .refgaddr = __LC_LPP, - .refselmk = 1ULL << 48, - .refcmpmk = 1ULL << 48, - .reserved = __PF_RES_FIELD -}; - -int pfault_init(void) -{ - int rc; - - if (pfault_disable) - return -1; - diag_stat_inc(DIAG_STAT_X258); - asm volatile( - " diag %1,%0,0x258\n" - "0: j 2f\n" - "1: la %0,8\n" - "2:\n" - EX_TABLE(0b,1b) - : "=d" (rc) - : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc"); - return rc; -} - -static struct pfault_refbk pfault_fini_refbk = { - .refdiagc = 0x258, - .reffcode = 1, - .refdwlen = 5, - .refversn = 2, -}; +#if IS_ENABLED(CONFIG_PGSTE) -void pfault_fini(void) +void do_secure_storage_access(struct pt_regs *regs) { - - if (pfault_disable) - return; - diag_stat_inc(DIAG_STAT_X258); - asm volatile( - " diag %0,0,0x258\n" - "0: nopr %%r7\n" - EX_TABLE(0b,0b) - : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc"); -} - -static DEFINE_SPINLOCK(pfault_lock); -static LIST_HEAD(pfault_list); - -#define PF_COMPLETE 0x0080 - -/* - * The mechanism of our pfault code: if Linux is running as guest, runs a user - * space process and the user space process accesses a page that the host has - * paged out we get a pfault interrupt. - * - * This allows us, within the guest, to schedule a different process. Without - * this mechanism the host would have to suspend the whole virtual cpu until - * the page has been paged in. - * - * So when we get such an interrupt then we set the state of the current task - * to uninterruptible and also set the need_resched flag. Both happens within - * interrupt context(!). If we later on want to return to user space we - * recognize the need_resched flag and then call schedule(). It's not very - * obvious how this works... 
- * - * Of course we have a lot of additional fun with the completion interrupt (-> - * host signals that a page of a process has been paged in and the process can - * continue to run). This interrupt can arrive on any cpu and, since we have - * virtual cpus, actually appear before the interrupt that signals that a page - * is missing. - */ -static void pfault_interrupt(struct ext_code ext_code, - unsigned int param32, unsigned long param64) -{ - struct task_struct *tsk; - __u16 subcode; - pid_t pid; + union teid teid = { .val = regs->int_parm_long }; + unsigned long addr = get_fault_address(regs); + struct vm_area_struct *vma; + struct mm_struct *mm; + struct page *page; + struct gmap *gmap; + int rc; /* - * Get the external interruption subcode & pfault initial/completion - * signal bit. VM stores this in the 'cpu address' field associated - * with the external interrupt. + * Bit 61 indicates if the address is valid, if it is not the + * kernel should be stopped or SIGSEGV should be sent to the + * process. Bit 61 is not reliable without the misc UV feature, + * therefore this needs to be checked too. */ - subcode = ext_code.subcode; - if ((subcode & 0xff00) != __SUBCODE_MASK) - return; - inc_irq_stat(IRQEXT_PFL); - /* Get the token (= pid of the affected task). */ - pid = param64 & LPP_PID_MASK; - rcu_read_lock(); - tsk = find_task_by_pid_ns(pid, &init_pid_ns); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return; - spin_lock(&pfault_lock); - if (subcode & PF_COMPLETE) { - /* signal bit is set -> a page has been swapped in by VM */ - if (tsk->thread.pfault_wait == 1) { - /* Initial interrupt was faster than the completion - * interrupt. pfault_wait is valid. Set pfault_wait - * back to zero and wake up the process. This can - * safely be done because the task is still sleeping - * and can't produce new pfaults. */ - tsk->thread.pfault_wait = 0; - list_del(&tsk->thread.list); - wake_up_process(tsk); - put_task_struct(tsk); - } else { - /* Completion interrupt was faster than initial - * interrupt. Set pfault_wait to -1 so the initial - * interrupt doesn't put the task to sleep. - * If the task is not running, ignore the completion - * interrupt since it must be a leftover of a PFAULT - * CANCEL operation which didn't remove all pending - * completion interrupts. */ - if (tsk->state == TASK_RUNNING) - tsk->thread.pfault_wait = -1; + if (uv_has_feature(BIT_UV_FEAT_MISC) && !teid.b61) { + /* + * When this happens, userspace did something that it + * was not supposed to do, e.g. branching into secure + * memory. Trigger a segmentation fault. + */ + if (user_mode(regs)) { + send_sig(SIGSEGV, current, 0); + return; } - } else { - /* signal bit not set -> a real page is missing. */ - if (WARN_ON_ONCE(tsk != current)) - goto out; - if (tsk->thread.pfault_wait == 1) { - /* Already on the list with a reference: put to sleep */ - goto block; - } else if (tsk->thread.pfault_wait == -1) { - /* Completion interrupt was faster than the initial - * interrupt (pfault_wait == -1). Set pfault_wait - * back to zero and exit. */ - tsk->thread.pfault_wait = 0; - } else { - /* Initial interrupt arrived before completion - * interrupt. Let the task sleep. - * An extra task reference is needed since a different - * cpu may set the task state to TASK_RUNNING again - * before the scheduler is reached. 
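The removed comment above makes a subtle point that is easy to miss: before a task is published on a wait list for a remote wakeup, the list must own a task reference, because another CPU can wake the task, and the task can exit, before this CPU ever reaches schedule(). The pattern in isolation, assuming an external lock protects the list (the demo_* names are hypothetical):

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/sched/task.h>   /* get_task_struct()/put_task_struct() */

static LIST_HEAD(demo_waiters);         /* protected by a lock, not shown */

static void demo_block_current(struct list_head *entry)
{
        /*
         * Pin the task: once it is visible on the list, a remote CPU
         * may wake it and the task may run to exit before this CPU
         * gets to schedule(), so the list needs its own reference.
         */
        get_task_struct(current);
        list_add(entry, &demo_waiters);
        __set_current_state(TASK_UNINTERRUPTIBLE);
        /* the actual sleep happens on the way back to user space */
}

static void demo_wake(struct task_struct *tsk, struct list_head *entry)
{
        list_del(entry);
        wake_up_process(tsk);
        put_task_struct(tsk);           /* drop the list's reference */
}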
*/ - get_task_struct(tsk); - tsk->thread.pfault_wait = 1; - list_add(&tsk->thread.list, &pfault_list); -block: - /* Since this must be a userspace fault, there - * is no kernel task state to trample. Rely on the - * return to userspace schedule() to block. */ - __set_current_state(TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); - set_preempt_need_resched(); + /* + * The kernel should never run into this case and + * there is no way out of this situation. + */ + panic("Unexpected PGM 0x3d with TEID bit 61=0"); + } + switch (get_fault_type(regs)) { + case GMAP_FAULT: + mm = current->mm; + gmap = (struct gmap *)S390_lowcore.gmap; + mmap_read_lock(mm); + addr = __gmap_translate(gmap, addr); + mmap_read_unlock(mm); + if (IS_ERR_VALUE(addr)) + return handle_fault_error_nolock(regs, SEGV_MAPERR); + fallthrough; + case USER_FAULT: + mm = current->mm; + mmap_read_lock(mm); + vma = find_vma(mm, addr); + if (!vma) + return handle_fault_error(regs, SEGV_MAPERR); + page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET); + if (IS_ERR_OR_NULL(page)) { + mmap_read_unlock(mm); + break; } + if (arch_make_page_accessible(page)) + send_sig(SIGSEGV, current, 0); + put_page(page); + mmap_read_unlock(mm); + break; + case KERNEL_FAULT: + page = phys_to_page(addr); + if (unlikely(!try_get_page(page))) + break; + rc = arch_make_page_accessible(page); + put_page(page); + if (rc) + BUG(); + break; + default: + unreachable(); } -out: - spin_unlock(&pfault_lock); - put_task_struct(tsk); } +NOKPROBE_SYMBOL(do_secure_storage_access); -static int pfault_cpu_dead(unsigned int cpu) +void do_non_secure_storage_access(struct pt_regs *regs) { - struct thread_struct *thread, *next; - struct task_struct *tsk; - - spin_lock_irq(&pfault_lock); - list_for_each_entry_safe(thread, next, &pfault_list, list) { - thread->pfault_wait = 0; - list_del(&thread->list); - tsk = container_of(thread, struct task_struct, thread); - wake_up_process(tsk); - put_task_struct(tsk); - } - spin_unlock_irq(&pfault_lock); - return 0; + struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + unsigned long gaddr = get_fault_address(regs); + + if (WARN_ON_ONCE(get_fault_type(regs) != GMAP_FAULT)) + return handle_fault_error_nolock(regs, SEGV_MAPERR); + if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL) + send_sig(SIGSEGV, current, 0); } +NOKPROBE_SYMBOL(do_non_secure_storage_access); -static int __init pfault_irq_init(void) +void do_secure_storage_violation(struct pt_regs *regs) { - int rc; - - rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); - if (rc) - goto out_extint; - rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; - if (rc) - goto out_pfault; - irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); - cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", - NULL, pfault_cpu_dead); - return 0; + struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + unsigned long gaddr = get_fault_address(regs); -out_pfault: - unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); -out_extint: - pfault_disable = 1; - return rc; + /* + * If the VM has been rebooted, its address space might still contain + * secure pages from the previous boot. + * Clear the page so it can be reused. + */ + if (!gmap_destroy_page(gmap, gaddr)) + return; + /* + * Either KVM messed up the secure guest mapping or the same + * page is mapped into multiple secure guests. + * + * This exception is only triggered when a guest 2 is running + * and can therefore never occur in kernel context. 
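do_secure_storage_access() above follows a common shape: translate to a host address, take a page reference under mmap_lock, perform the possibly sleeping conversion, then drop the reference. A compressed sketch of just that pin, operate, unpin core as this patch uses it (demo_touch_user_page is hypothetical; arch_make_page_accessible is the s390 ultravisor helper the patch calls):

static int demo_touch_user_page(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma;
        struct page *page;
        int rc = -EFAULT;

        mmap_read_lock(mm);
        vma = find_vma(mm, addr);
        if (!vma || vma->vm_start > addr)
                goto out;
        /* FOLL_GET pins the page so it survives past the unlock below */
        page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
        if (IS_ERR_OR_NULL(page))
                goto out;
        rc = arch_make_page_accessible(page);
        put_page(page);
out:
        mmap_read_unlock(mm);
        return rc;
}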
+ */ + pr_warn_ratelimited("Secure storage violation in task: %s, pid %d\n", + current->comm, current->pid); + send_sig(SIGSEGV, current, 0); } -early_initcall(pfault_irq_init); -#endif /* CONFIG_PFAULT */ +#endif /* CONFIG_PGSTE */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index edcdca97e85e..6f96b5a71c63 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2,7 +2,7 @@ /* * KVM guest address space mapping code * - * Copyright IBM Corp. 2007, 2016, 2018 + * Copyright IBM Corp. 2007, 2020 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> * David Hildenbrand <david@redhat.com> * Janosch Frank <frankja@linux.vnet.ibm.com> @@ -17,17 +17,28 @@ #include <linux/swapops.h> #include <linux/ksm.h> #include <linux/mman.h> - -#include <asm/pgtable.h> +#include <linux/pgtable.h> +#include <asm/page-states.h> #include <asm/pgalloc.h> #include <asm/gmap.h> +#include <asm/page.h> #include <asm/tlb.h> #define GMAP_SHADOW_FAKE_TABLE 1ULL +static struct page *gmap_alloc_crst(void) +{ + struct page *page; + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return NULL; + __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER); + return page; +} + /** * gmap_alloc - allocate and initialize a guest address space - * @mm: pointer to the parent mm_struct * @limit: maximum address of the gmap address space * * Returns a guest address space structure. @@ -56,24 +67,24 @@ static struct gmap *gmap_alloc(unsigned long limit) atype = _ASCE_TYPE_REGION1; etype = _REGION1_ENTRY_EMPTY; } - gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL); + gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); if (!gmap) goto out; INIT_LIST_HEAD(&gmap->crst_list); INIT_LIST_HEAD(&gmap->children); INIT_LIST_HEAD(&gmap->pt_list); - INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); - INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); - INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC); + INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); + INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); spin_lock_init(&gmap->guest_table_lock); spin_lock_init(&gmap->shadow_lock); refcount_set(&gmap->ref_count, 1); - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) goto out_free; page->index = 0; list_add(&page->lru, &gmap->crst_list); - table = (unsigned long *) page_to_phys(page); + table = page_to_virt(page); crst_table_init(table, etype); gmap->table = table; gmap->asce = atype | _ASCE_TABLE_LENGTH | @@ -300,7 +311,7 @@ struct gmap *gmap_get_enabled(void) EXPORT_SYMBOL_GPL(gmap_get_enabled); /* - * gmap_alloc_table is assumed to be called with mmap_sem held + * gmap_alloc_table is assumed to be called with mmap_lock held */ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long init, unsigned long gaddr) @@ -309,15 +320,15 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long *new; /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - new = (unsigned long *) page_to_phys(page); + new = page_to_virt(page); crst_table_init(new, init); spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { list_add(&page->lru, &gmap->crst_list); - *table = (unsigned long) new | _REGION_ENTRY_LENGTH | + *table = __pa(new) | _REGION_ENTRY_LENGTH | (*table & 
_REGION_ENTRY_TYPE_MASK); page->index = gaddr; page = NULL; @@ -337,12 +348,11 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, static unsigned long __gmap_segment_gaddr(unsigned long *entry) { struct page *page; - unsigned long offset, mask; + unsigned long offset; offset = (unsigned long) entry / sizeof(unsigned long); offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; - mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); - page = virt_to_page((void *)((unsigned long) entry & mask)); + page = pmd_pgtable_page((pmd_t *) entry); return page->index + offset; } @@ -405,10 +415,10 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) return -EINVAL; flush = 0; - down_write(&gmap->mm->mmap_sem); + mmap_write_lock(gmap->mm); for (off = 0; off < len; off += PMD_SIZE) flush |= __gmap_unmap_by_gaddr(gmap, to + off); - up_write(&gmap->mm->mmap_sem); + mmap_write_unlock(gmap->mm); if (flush) gmap_flush_tlb(gmap); return 0; @@ -438,7 +448,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, return -EINVAL; flush = 0; - down_write(&gmap->mm->mmap_sem); + mmap_write_lock(gmap->mm); for (off = 0; off < len; off += PMD_SIZE) { /* Remove old translation */ flush |= __gmap_unmap_by_gaddr(gmap, to + off); @@ -448,7 +458,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, (void *) from + off)) break; } - up_write(&gmap->mm->mmap_sem); + mmap_write_unlock(gmap->mm); if (flush) gmap_flush_tlb(gmap); if (off >= len) @@ -466,7 +476,7 @@ EXPORT_SYMBOL_GPL(gmap_map_segment); * Returns user space address which corresponds to the guest address or * -EFAULT if no such mapping exists. * This function does not establish potentially missing page table entries. - * The mmap_sem of the mm that belongs to the address space must be held + * The mmap_lock of the mm that belongs to the address space must be held * when this function gets called. * * Note: Can also be called for shadow gmaps. @@ -495,16 +505,16 @@ unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) { unsigned long rc; - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); rc = __gmap_translate(gmap, gaddr); - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); return rc; } EXPORT_SYMBOL_GPL(gmap_translate); /** * gmap_unlink - disconnect a page table from the gmap shadow tables - * @gmap: pointer to guest mapping meta data structure + * @mm: pointer to the parent mm_struct * @table: pointer to the host page table * @vmaddr: vm address associated with the host page table */ @@ -527,14 +537,14 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, unsigned long gaddr); /** - * gmap_link - set up shadow page tables to connect a host to a guest address + * __gmap_link - set up shadow page tables to connect a host to a guest address * @gmap: pointer to guest mapping meta data structure * @gaddr: guest address * @vmaddr: vm address * * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT * if the vm address is already mapped to a different guest segment. - * The mmap_sem of the mm that belongs to the address space must be held + * The mmap_lock of the mm that belongs to the address space must be held * when this function gets called. 
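A recurring theme in the gmap.c hunks above is switching allocations to GFP_KERNEL_ACCOUNT (and adding __GFP_ACCOUNT to the atomic radix-tree flags), so guest page-table memory is charged to the owning process's memory cgroup instead of remaining invisible kernel overhead. The idiom in isolation (demo names are hypothetical):

#include <linux/gfp.h>
#include <linux/slab.h>

struct demo_obj {
        unsigned long data[64];
};

static struct demo_obj *demo_obj_alloc(void)
{
        /* charged to the current task's memcg */
        return kzalloc(sizeof(struct demo_obj), GFP_KERNEL_ACCOUNT);
}

static struct page *demo_table_alloc(unsigned int order)
{
        /* same accounting for multi-page table allocations */
        return alloc_pages(GFP_KERNEL_ACCOUNT, order);
}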
*/ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) @@ -558,7 +568,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, gaddr & _REGION1_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -566,7 +576,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, gaddr & _REGION2_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -574,7 +584,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, gaddr & _REGION3_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; /* Walk the parent mm page table */ @@ -594,7 +604,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) return -EFAULT; /* Link gmap segment table entry location to page table. */ - rc = radix_tree_preload(GFP_KERNEL); + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); if (rc) return rc; ptl = pmd_lock(mm, pmd); @@ -640,7 +650,7 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr, int rc; bool unlocked; - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); retry: unlocked = false; @@ -649,13 +659,13 @@ retry: rc = vmaddr; goto out_up; } - if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, + if (fixup_user_fault(gmap->mm, vmaddr, fault_flags, &unlocked)) { rc = -EFAULT; goto out_up; } /* - * In the case that fixup_user_fault unlocked the mmap_sem during + * In the case that fixup_user_fault unlocked the mmap_lock during * faultin redo __gmap_translate to not race with a map/unmap_segment. 
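gmap_fault() above illustrates the fixup_user_fault() contract: the helper may temporarily drop mmap_lock (reported through *unlocked), in which case any translation computed under the old critical section is stale and must be redone. A sketch of the retry loop under those assumptions (demo_translate and demo_resolve are hypothetical):

static unsigned long demo_translate(unsigned long gaddr)
{
        return gaddr;   /* hypothetical guest-to-host translation */
}

static int demo_resolve(struct mm_struct *mm, unsigned long gaddr)
{
        unsigned long vmaddr;
        bool unlocked;
        int rc;

        mmap_read_lock(mm);
retry:
        unlocked = false;
        vmaddr = demo_translate(gaddr);
        rc = fixup_user_fault(mm, vmaddr, FAULT_FLAG_WRITE, &unlocked);
        if (rc)
                goto out;
        /* lock was dropped and retaken: the translation may be stale */
        if (unlocked)
                goto retry;
out:
        mmap_read_unlock(mm);
        return rc;
}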
*/ if (unlocked) @@ -663,16 +673,17 @@ retry: rc = __gmap_link(gmap, gaddr, vmaddr); out_up: - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); return rc; } EXPORT_SYMBOL_GPL(gmap_fault); /* - * this function is assumed to be called with mmap_sem held + * this function is assumed to be called with mmap_lock held */ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) { + struct vm_area_struct *vma; unsigned long vmaddr; spinlock_t *ptl; pte_t *ptep; @@ -682,11 +693,17 @@ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) gaddr >> PMD_SHIFT); if (vmaddr) { vmaddr |= gaddr & ~PMD_MASK; + + vma = vma_lookup(gmap->mm, vmaddr); + if (!vma || is_vm_hugetlb_page(vma)) + return; + /* Get pointer to the page table entry */ ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); - if (likely(ptep)) + if (likely(ptep)) { ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); - pte_unmap_unlock(ptep, ptl); + pte_unmap_unlock(ptep, ptl); + } } } EXPORT_SYMBOL_GPL(__gmap_zap); @@ -696,7 +713,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) unsigned long gaddr, vmaddr, size; struct vm_area_struct *vma; - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); for (gaddr = from; gaddr < to; gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { /* Find the vm address for the guest address */ @@ -717,9 +734,9 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) if (is_vm_hugetlb_page(vma)) continue; size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size); + zap_page_range_single(vma, vmaddr, size, NULL); } - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); } EXPORT_SYMBOL_GPL(gmap_discard); @@ -787,47 +804,51 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start, static inline unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) { - unsigned long *table; + const int asce_type = gmap->asce & _ASCE_TYPE_MASK; + unsigned long *table = gmap->table; - if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4)) - return NULL; if (gmap_is_shadow(gmap) && gmap->removed) return NULL; - if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11))) + + if (WARN_ON_ONCE(level > (asce_type >> 2) + 1)) return NULL; - table = gmap->table; - switch (gmap->asce & _ASCE_TYPE_MASK) { + + if (asce_type != _ASCE_TYPE_REGION1 && + gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) + return NULL; + + switch (asce_type) { case _ASCE_TYPE_REGION1: table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; if (level == 4) break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_REGION2: table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if (level == 3) break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_REGION3: table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if (level == 2) break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_SEGMENT: table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (level == 1) break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + 
table = __va(*table & _SEGMENT_ENTRY_ORIGIN); table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; } return table; @@ -875,10 +896,10 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, BUG_ON(gmap_is_shadow(gmap)); fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; - if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked)) + if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) return -EFAULT; if (unlocked) - /* lost mmap_sem, caller has to retry __gmap_translate */ + /* lost mmap_lock, caller has to retry __gmap_translate */ return 0; /* Connect the page tables */ return __gmap_link(gmap, gaddr, vmaddr); @@ -886,12 +907,12 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, /** * gmap_pte_op_end - release the page table lock - * @ptl: pointer to the spinlock pointer + * @ptep: pointer to the locked pte + * @ptl: pointer to the page table spinlock */ -static void gmap_pte_op_end(spinlock_t *ptl) +static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) { - if (ptl) - spin_unlock(ptl); + pte_unmap_unlock(ptep, ptl); } /** @@ -949,7 +970,7 @@ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) * -EAGAIN if a fixup is needed * -EINVAL if unsupported notifier bits have been specified * - * Expected to be called with sg->mm->mmap_sem in read and + * Expected to be called with sg->mm->mmap_lock in read and * guest_table_lock held. */ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, @@ -964,18 +985,18 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, return -EAGAIN; if (prot == PROT_NONE && !pmd_i) { - pmd_val(new) |= _SEGMENT_ENTRY_INVALID; + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); gmap_pmdp_xchg(gmap, pmdp, new, gaddr); } if (prot == PROT_READ && !pmd_p) { - pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID; - pmd_val(new) |= _SEGMENT_ENTRY_PROTECT; + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); gmap_pmdp_xchg(gmap, pmdp, new, gaddr); } if (bits & GMAP_NOTIFY_MPROT) - pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN; + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); /* Shadow GMAP protection needs split PMDs */ if (bits & GMAP_NOTIFY_SHADOW) @@ -995,14 +1016,14 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, * Returns 0 if successfully protected, -ENOMEM if out of memory and * -EAGAIN if a fixup is needed. * - * Expected to be called with sg->mm->mmap_sem in read + * Expected to be called with sg->mm->mmap_lock in read */ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, pmd_t *pmdp, int prot, unsigned long bits) { int rc; pte_t *ptep; - spinlock_t *ptl = NULL; + spinlock_t *ptl; unsigned long pbits = 0; if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) @@ -1016,7 +1037,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; /* Protect and unlock. */ rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); return rc; } @@ -1031,7 +1052,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, * Returns 0 if successfully protected, -ENOMEM if out of memory and * -EFAULT if gaddr is invalid (or mapping for shadows is missing). * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. 
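Several hunks above (__gmap_zap(), gmap_protect_pte(), gmap_read_table()) rely on the get_locked_pte()/pte_unmap_unlock() pairing: the helper maps the PTE and returns with the page-table spinlock held, and the unlock must be given the same pte pointer so the page-table mapping is dropped along with the lock. In isolation (demo_update is a hypothetical operation):

static void demo_update(pte_t *ptep)
{
        /* hypothetical PTE manipulation, done under the PTL */
}

static void demo_with_pte(struct mm_struct *mm, unsigned long addr)
{
        spinlock_t *ptl;
        pte_t *ptep;

        ptep = get_locked_pte(mm, addr, &ptl);
        if (!ptep)
                return;         /* no page table, or it could not be mapped */
        demo_update(ptep);
        pte_unmap_unlock(ptep, ptl);    /* unlocks ptl and unmaps ptep */
}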
*/ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, unsigned long len, int prot, unsigned long bits) @@ -1102,9 +1123,9 @@ int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, return -EINVAL; if (!MACHINE_HAS_ESOP && prot == PROT_READ) return -EINVAL; - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); return rc; } EXPORT_SYMBOL_GPL(gmap_mprotect_notify); @@ -1120,7 +1141,7 @@ EXPORT_SYMBOL_GPL(gmap_mprotect_notify); * if reading using the virtual address failed. -EINVAL if called on a gmap * shadow. * - * Called with gmap->mm->mmap_sem in read. + * Called with gmap->mm->mmap_lock in read. */ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) { @@ -1140,12 +1161,12 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { address = pte_val(pte) & PAGE_MASK; address += gaddr & ~PAGE_MASK; - *val = *(unsigned long *) address; - pte_val(*ptep) |= _PAGE_YOUNG; + *val = *(unsigned long *)__va(address); + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); /* Do *NOT* clear the _PAGE_INVALID bit! */ rc = 0; } - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); } if (!rc) break; @@ -1173,6 +1194,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table); static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, struct gmap_rmap *rmap) { + struct gmap_rmap *temp; void __rcu **slot; BUG_ON(!gmap_is_shadow(sg)); @@ -1180,6 +1202,12 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->raddr == rmap->raddr) { + kfree(rmap); + return; + } + } radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; @@ -1214,11 +1242,11 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, vmaddr = __gmap_translate(parent, paddr); if (IS_ERR_VALUE(vmaddr)) return vmaddr; - rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); if (!rmap) return -ENOMEM; rmap->raddr = raddr; - rc = radix_tree_preload(GFP_KERNEL); + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); if (rc) { kfree(rmap); return rc; @@ -1232,7 +1260,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, if (!rc) gmap_insert_rmap(sg, vmaddr, rmap); spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); } radix_tree_preload_end(); if (rc) { @@ -1268,7 +1296,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) { asm volatile( - " .insn rrf,0xb98e0000,%0,%1,0,0" + " idte %0,0,%1" : : "a" (asce), "a" (vaddr) : "cc", "memory"); } @@ -1318,7 +1346,8 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) { - unsigned long sto, *ste, *pgt; + unsigned long *ste; + phys_addr_t sto, pgt; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1326,13 +1355,13 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) return; gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); - sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> 
_SEGMENT_SHIFT)); + sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); - pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + pgt = *ste & _SEGMENT_ENTRY_ORIGIN; *ste = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + page = phys_to_page(pgt); list_del(&page->lru); page_table_free_pgste(page); } @@ -1348,19 +1377,19 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, unsigned long *sgt) { - unsigned long *pgt; struct page *page; + phys_addr_t pgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; - pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + pgt = sgt[i] & _REGION_ENTRY_ORIGIN; sgt[i] = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + page = phys_to_page(pgt); list_del(&page->lru); page_table_free_pgste(page); } @@ -1375,7 +1404,8 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) { - unsigned long r3o, *r3e, *sgt; + unsigned long r3o, *r3e; + phys_addr_t sgt; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1384,12 +1414,12 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); - gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); - sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); + sgt = *r3e & _REGION_ENTRY_ORIGIN; *r3e = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + page = phys_to_page(sgt); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1405,19 +1435,19 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, unsigned long *r3t) { - unsigned long *sgt; struct page *page; + phys_addr_t sgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; - sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + sgt = r3t[i] & _REGION_ENTRY_ORIGIN; r3t[i] = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + page = phys_to_page(sgt); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1432,7 +1462,8 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) { - unsigned long r2o, *r2e, *r3t; + unsigned long r2o, *r2e; + phys_addr_t r3t; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1441,12 +1472,12 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); - gmap_idte_one(r2o | 
_ASCE_TYPE_REGION2, raddr); - r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); + r3t = *r2e & _REGION_ENTRY_ORIGIN; *r2e = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + page = phys_to_page(r3t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1462,7 +1493,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, unsigned long *r2t) { - unsigned long *r3t; + phys_addr_t r3t; struct page *page; int i; @@ -1470,11 +1501,11 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; - r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r3t = r2t[i] & _REGION_ENTRY_ORIGIN; r2t[i] = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + page = phys_to_page(r3t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1489,8 +1520,9 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) { - unsigned long r1o, *r1e, *r2t; + unsigned long r1o, *r1e; struct page *page; + phys_addr_t r2t; BUG_ON(!gmap_is_shadow(sg)); r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ @@ -1498,12 +1530,12 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); - gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); - r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); + r2t = *r1e & _REGION_ENTRY_ORIGIN; *r1e = _REGION1_ENTRY_EMPTY; - __gmap_unshadow_r2t(sg, raddr, r2t); + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + page = phys_to_page(r2t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1519,22 +1551,23 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, unsigned long *r1t) { - unsigned long asce, *r2t; + unsigned long asce; struct page *page; + phys_addr_t r2t; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; + asce = __pa(r1t) | _ASCE_TYPE_REGION1; for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) continue; - r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); - __gmap_unshadow_r2t(sg, raddr, r2t); + r2t = r1t[i] & _REGION_ENTRY_ORIGIN; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Clear entry and flush translation r1t -> r2t */ gmap_idte_one(asce, raddr); r1t[i] = _REGION1_ENTRY_EMPTY; /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + page = phys_to_page(r2t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1556,7 +1589,7 @@ static void gmap_unshadow(struct gmap *sg) sg->removed = 1; gmap_call_notifier(sg, 0, -1UL); gmap_flush_tlb(sg); - table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + table = __va(sg->asce & _ASCE_ORIGIN); switch 
(sg->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: __gmap_unshadow_r1t(sg, 0, table); @@ -1692,11 +1725,11 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, } spin_unlock(&parent->shadow_lock); /* protect after insertion, so it will get properly invalidated */ - down_read(&parent->mm->mmap_sem); + mmap_read_lock(parent->mm); rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, PROT_READ, GMAP_NOTIFY_SHADOW); - up_read(&parent->mm->mmap_sem); + mmap_read_unlock(parent->mm); spin_lock(&parent->shadow_lock); new->initialized = true; if (rc) { @@ -1719,31 +1752,32 @@ EXPORT_SYMBOL_GPL(gmap_shadow); * The r2t parameter specifies the address of the source table. The * four pages of the source table are made read-only in the parent gmap * address space. A write to the source table area @r2t will automatically - * remove the shadow r2 table and all of its decendents. + * remove the shadow r2 table and all of its descendants. * * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. */ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r2t, *table; + unsigned long *table; + phys_addr_t s_r2t; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; page->index = r2t & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r2t = (unsigned long *) page_to_phys(page); + s_r2t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ @@ -1758,9 +1792,9 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + *table = s_r2t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r2t & _REGION_ENTRY_PROTECT); @@ -1781,8 +1815,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 4); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r2t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1809,25 +1842,26 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r2t); * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. 
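The unshadow and shadow hunks above all make the same type-level change: table origins read from region and segment entries are kept as phys_addr_t and converted with __va() or phys_to_page() exactly where they are dereferenced or freed, instead of being cast through unsigned long pointers. A minimal sketch of the round-trips involved:

#include <linux/mm.h>
#include <asm/page.h>

static void demo_phys_virt(struct page *page)
{
        phys_addr_t phys = page_to_phys(page);  /* what a table entry stores */
        unsigned long *virt = __va(phys);       /* what the CPU dereferences */

        /* the conversions are inverses of each other */
        WARN_ON(phys_to_page(phys) != page);
        WARN_ON(__pa(virt) != phys);
}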
*/ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r3t, *table; + unsigned long *table; + phys_addr_t s_r3t; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; page->index = r3t & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r3t = (unsigned long *) page_to_phys(page); + s_r3t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ @@ -1840,10 +1874,11 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, goto out_free; } else if (*table & _REGION_ENTRY_ORIGIN) { rc = -EAGAIN; /* Race with shadow */ + goto out_free; } - crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + *table = s_r3t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r3t & _REGION_ENTRY_PROTECT); @@ -1864,8 +1899,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 3); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r3t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1892,25 +1926,26 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r3t); * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. 
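
[Editor's note] The gmap_shadow_r3t() hunk above carries a genuine fix: the -EAGAIN race branch gains a previously missing "goto out_free". Without it, control fell through, overwrote an entry another shadow had already installed, and leaked the fresh allocation. The pattern in isolation (names hypothetical):

#include <stdio.h>
#include <stdlib.h>

static int entry_already_set = 1;	/* pretend another thread won the race */

static int shadow_table(void)
{
	int rc = 0;
	void *page = malloc(4096);

	if (!page)
		return -12;		/* -ENOMEM */
	if (entry_already_set) {
		rc = -11;		/* -EAGAIN: race with shadow */
		goto out_free;		/* the fix: don't fall through and reinstall */
	}
	/* ... install page as the new table ... */
	return 0;
out_free:
	free(page);
	return rc;
}

int main(void)
{
	printf("rc=%d\n", shadow_table());
	return 0;
}
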
*/ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_sgt, *table; + unsigned long *table; + phys_addr_t s_sgt; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); /* Allocate a shadow segment table */ - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; page->index = sgt & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_sgt = (unsigned long *) page_to_phys(page); + s_sgt = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ @@ -1925,9 +1960,9 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + *table = s_sgt | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= sgt & _REGION_ENTRY_PROTECT; @@ -1948,8 +1983,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 2); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_sgt) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1966,7 +2000,7 @@ out_free: EXPORT_SYMBOL_GPL(gmap_shadow_sgt); /** - * gmap_shadow_lookup_pgtable - find a shadow page table + * gmap_shadow_pgt_lookup - find a shadow page table * @sg: pointer to the shadow guest address space structure * @saddr: the address in the shadow aguest address space * @pgt: parent gmap address of the page table to get shadowed @@ -1976,7 +2010,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_sgt); * Returns 0 if the shadow page table was found and -EAGAIN if the page * table was not found. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. 
*/ int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt, int *dat_protection, @@ -2016,14 +2050,15 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); * shadow table structure is incomplete, -ENOMEM if out of memory, * -EFAULT if an address in the parent gmap could not be resolved and * - * Called with gmap->mm->mmap_sem in read + * Called with gmap->mm->mmap_lock in read */ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, int fake) { unsigned long raddr, origin; - unsigned long *s_pgt, *table; + unsigned long *table; struct page *page; + phys_addr_t s_pgt; int rc; BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); @@ -2034,7 +2069,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, page->index = pgt & _SEGMENT_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_pgt = (unsigned long *) page_to_phys(page); + s_pgt = page_to_phys(page); /* Install shadow page table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ @@ -2067,8 +2102,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 1); - if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != - (unsigned long) s_pgt) + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_SEGMENT_ENTRY_INVALID; @@ -2095,7 +2129,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt); * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. */ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) { @@ -2111,7 +2145,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) parent = sg->parent; prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; - rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); if (!rmap) return -ENOMEM; rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; @@ -2123,7 +2157,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) rc = vmaddr; break; } - rc = radix_tree_preload(GFP_KERNEL); + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); if (rc) break; rc = -EAGAIN; @@ -2134,7 +2168,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); if (!tptep) { spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptl); + gmap_pte_op_end(sptep, ptl); radix_tree_preload_end(); break; } @@ -2145,7 +2179,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) rmap = NULL; rc = 0; } - gmap_pte_op_end(ptl); + gmap_pte_op_end(sptep, ptl); spin_unlock(&sg->guest_table_lock); } radix_tree_preload_end(); @@ -2160,7 +2194,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) } EXPORT_SYMBOL_GPL(gmap_shadow_page); -/** +/* * gmap_shadow_notify - handle notifications for shadow gmap * * Called with sg->parent->shadow_lock. @@ -2220,7 +2254,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, /** * ptep_notify - call all invalidation callbacks for a specific pte. 
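
[Editor's sketch] In gmap_shadow_page() above, the rmap allocation and the radix-tree preload move from GFP_KERNEL to GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT), charging the memory to the calling task's memory cgroup. A kernel-context fragment of the idiom, not standalone, with a hypothetical struct name:

#include <linux/slab.h>

struct demo_rmap {
	unsigned long raddr;	/* stand-in for the real shadow rmap */
};

static struct demo_rmap *demo_rmap_alloc(void)
{
	/*
	 * __GFP_ACCOUNT charges the allocation to current's memcg, so a
	 * guest driving shadow-table churn cannot pin unaccounted kernel
	 * memory on the host.
	 */
	return kzalloc(sizeof(struct demo_rmap), GFP_KERNEL_ACCOUNT);
}
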
* @mm: pointer to the process mm_struct - * @addr: virtual address in the process address space + * @vmaddr: virtual address in the process address space * @pte: pointer to the page table entry * @bits: bits from the pgste that caused the notify call * @@ -2264,7 +2298,7 @@ EXPORT_SYMBOL_GPL(ptep_notify); static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, unsigned long gaddr) { - pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN; + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); } @@ -2283,7 +2317,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, { gaddr &= HPAGE_MASK; pmdp_notify_gmap(gmap, pmdp, gaddr); - pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN; + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); @@ -2291,7 +2325,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); else __pmdp_csp(pmdp); - *pmdp = new; + set_pmd(pmdp, new); } static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, @@ -2313,7 +2347,7 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, _SEGMENT_ENTRY_GMAP_UC)); if (purge) __pmdp_csp(pmdp); - pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); } spin_unlock(&gmap->guest_table_lock); } @@ -2436,7 +2470,7 @@ static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, return false; /* Clear UC indication and reset protection */ - pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC; + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC))); gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); return true; } @@ -2473,36 +2507,55 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], continue; if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) set_bit(i, bitmap); - spin_unlock(ptl); + pte_unmap_unlock(ptep, ptl); } } gmap_pmd_op_end(gmap, pmdp); } EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + split_huge_pmd(vma, pmd, addr); + return 0; +} + +static const struct mm_walk_ops thp_split_walk_ops = { + .pmd_entry = thp_split_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, +}; + static inline void thp_split_mm(struct mm_struct *mm) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE struct vm_area_struct *vma; - unsigned long addr; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { - for (addr = vma->vm_start; - addr < vma->vm_end; - addr += PAGE_SIZE) - follow_page(vma, addr, FOLL_SPLIT); - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; + for_each_vma(vmi, vma) { + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); + walk_page_vma(vma, &thp_split_walk_ops, NULL); } mm->def_flags |= VM_NOHUGEPAGE; -#endif } +#else +static inline void thp_split_mm(struct mm_struct *mm) +{ +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * Remove all empty zero pages from the mapping for lazy refaulting * - This must be called after mm->context.has_pgste is set, to avoid * future creation of zero pages - * - This must be called after THP was enabled + * - This must be called after THP was disabled. 
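
[Editor's sketch] Several hunks above replace direct pmd_val(*pmdp) &= ... stores with set_pmd(pmdp, clear_pmd_bit(...)), funneling every page-table write through one accessor. A user-space model of the typed-wrapper pattern (bit values illustrative):

#include <stdio.h>

typedef struct { unsigned long val; } demo_pmd_t;	/* opaque entry type */
#define DEMO_GMAP_IN 0x8000UL				/* illustrative flag */

static demo_pmd_t demo_clear_pmd_bit(demo_pmd_t pmd, unsigned long bit)
{
	return (demo_pmd_t){ pmd.val & ~bit };		/* compute the new value */
}

static void demo_set_pmd(demo_pmd_t *pmdp, demo_pmd_t new)
{
	/* single choke point: the real set_pmd() can grow barriers or
	 * instrumentation without touching every caller */
	*pmdp = new;
}

int main(void)
{
	demo_pmd_t pmd = { 0x12345000UL | DEMO_GMAP_IN };

	demo_set_pmd(&pmd, demo_clear_pmd_bit(pmd, DEMO_GMAP_IN));
	printf("entry now %#lx\n", pmd.val);
	return 0;
}
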
+ * + * mm contracts with s390, that even if mm were to remove a page table, + * racing with the loop below and so causing pte_offset_map_lock() to fail, + * it will never insert a page table containing empty zero pages once + * mm_forbids_zeropage(mm) i.e. mm->context.has_pgste is set. */ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) @@ -2514,6 +2567,8 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, spinlock_t *ptl; ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!ptep) + break; if (is_zero_pfn(pte_pfn(*ptep))) ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); pte_unmap_unlock(ptep, ptl); @@ -2523,6 +2578,7 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, static const struct mm_walk_ops zap_zero_walk_ops = { .pmd_entry = __zap_zero_pages, + .walk_lock = PGWALK_WRLOCK, }; /* @@ -2538,16 +2594,27 @@ int s390_enable_sie(void) /* Fail if the page tables are 2K */ if (!mm_alloc_pgste(mm)) return -EINVAL; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return 0; } EXPORT_SYMBOL_GPL(s390_enable_sie); +int gmap_mark_unmergeable(void) +{ + /* + * Make sure to disable KSM (if enabled for the whole process or + * individual VMAs). Note that nothing currently hinders user space + * from re-enabling it. + */ + return ksm_disable(current->mm); +} +EXPORT_SYMBOL_GPL(gmap_mark_unmergeable); + /* * Enable storage key handling from now on and initialize the storage * keys with the default key. @@ -2560,6 +2627,18 @@ static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, return 0; } +/* + * Give a chance to schedule after setting a key to 256 pages. + * We only hold the mm lock, which is a rwsem and the kvm srcu. + * Both can sleep. 
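
[Editor's sketch] Throughout this file, open-coded down_read(&mm->mmap_sem)/up_read() pairs become mmap_read_lock(mm)/mmap_read_unlock(mm), with the write-side equivalents in s390_enable_sie() above. A stand-alone model of why such wrappers help: one function to change if the lock's type or instrumentation changes. A pthread rwlock stands in for the mmap lock:

#include <pthread.h>
#include <stdio.h>

struct demo_mm { pthread_rwlock_t mmap_lock; };

static void demo_mmap_read_lock(struct demo_mm *mm)    { pthread_rwlock_rdlock(&mm->mmap_lock); }
static void demo_mmap_read_unlock(struct demo_mm *mm)  { pthread_rwlock_unlock(&mm->mmap_lock); }
static void demo_mmap_write_lock(struct demo_mm *mm)   { pthread_rwlock_wrlock(&mm->mmap_lock); }
static void demo_mmap_write_unlock(struct demo_mm *mm) { pthread_rwlock_unlock(&mm->mmap_lock); }

int main(void)
{
	struct demo_mm mm = { PTHREAD_RWLOCK_INITIALIZER };

	demo_mmap_write_lock(&mm);	/* e.g. flipping mm-wide flags, as in s390_enable_sie() */
	demo_mmap_write_unlock(&mm);
	demo_mmap_read_lock(&mm);	/* e.g. walking VMAs for protection changes */
	demo_mmap_read_unlock(&mm);
	printf("lock discipline ok\n");
	return 0;
}
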
+ */ +static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + cond_resched(); + return 0; +} + static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, unsigned long hmask, unsigned long next, struct mm_walk *walk) @@ -2582,39 +2661,36 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, end = start + HPAGE_SIZE - 1; __storage_key_init_range(start, end); set_bit(PG_arch_1, &page->flags); + cond_resched(); return 0; } static const struct mm_walk_ops enable_skey_walk_ops = { .hugetlb_entry = __s390_enable_skey_hugetlb, .pte_entry = __s390_enable_skey_pte, + .pmd_entry = __s390_enable_skey_pmd, + .walk_lock = PGWALK_WRLOCK, }; int s390_enable_skey(void) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; int rc = 0; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); if (mm_uses_skeys(mm)) goto out_up; mm->context.uses_skeys = 1; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags)) { - mm->context.uses_skeys = 0; - rc = -ENOMEM; - goto out_up; - } + rc = gmap_mark_unmergeable(); + if (rc) { + mm->context.uses_skeys = 0; + goto out_up; } - mm->def_flags &= ~VM_MERGEABLE; - walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); out_up: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return rc; } EXPORT_SYMBOL_GPL(s390_enable_skey); @@ -2631,12 +2707,188 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr, static const struct mm_walk_ops reset_cmma_walk_ops = { .pte_entry = __s390_reset_cmma, + .walk_lock = PGWALK_WRLOCK, }; void s390_reset_cmma(struct mm_struct *mm) { - down_write(&mm->mmap_sem); + mmap_write_lock(mm); walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); } EXPORT_SYMBOL_GPL(s390_reset_cmma); + +#define GATHER_GET_PAGES 32 + +struct reset_walk_state { + unsigned long next; + unsigned long count; + unsigned long pfns[GATHER_GET_PAGES]; +}; + +static int s390_gather_pages(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct reset_walk_state *p = walk->private; + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + /* we have a reference from the mapping, take an extra one */ + get_page(phys_to_page(pte_val(pte))); + p->pfns[p->count] = phys_to_pfn(pte_val(pte)); + p->next = next; + p->count++; + } + return p->count >= GATHER_GET_PAGES; +} + +static const struct mm_walk_ops gather_pages_ops = { + .pte_entry = s390_gather_pages, + .walk_lock = PGWALK_RDLOCK, +}; + +/* + * Call the Destroy secure page UVC on each page in the given array of PFNs. + * Each page needs to have an extra reference, which will be released here. + */ +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) +{ + unsigned long i; + + for (i = 0; i < count; i++) { + /* we always have an extra reference */ + uv_destroy_owned_page(pfn_to_phys(pfns[i])); + /* get rid of the extra reference */ + put_page(pfn_to_page(pfns[i])); + cond_resched(); + } +} +EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); + +/** + * __s390_uv_destroy_range - Call the destroy secure page UVC on each page + * in the given range of the given address space. 
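
[Editor's note] The new __s390_enable_skey_pmd() callback above exists purely to cond_resched() once per pmd, which is what the "256 pages" in its comment refers to. The arithmetic, assuming s390's usual 1 MiB segment and 4 KiB base page:

#include <stdio.h>

#define PAGE_SHIFT    12UL	/* 4 KiB base pages */
#define SEGMENT_SHIFT 20UL	/* 1 MiB segment (pmd) on s390 */

int main(void)
{
	unsigned long pages_per_pmd = 1UL << (SEGMENT_SHIFT - PAGE_SHIFT);

	/* one storage key set per page, one voluntary reschedule per pmd */
	printf("pages per pmd: %lu\n", pages_per_pmd);	/* 256 */
	return 0;
}
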
+ * @mm: the mm to operate on + * @start: the start of the range + * @end: the end of the range + * @interruptible: if not 0, stop when a fatal signal is received + * + * Walk the given range of the given address space and call the destroy + * secure page UVC on each page. Optionally exit early if a fatal signal is + * pending. + * + * Return: 0 on success, -EINTR if the function stopped before completing + */ +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible) +{ + struct reset_walk_state state = { .next = start }; + int r = 1; + + while (r > 0) { + state.count = 0; + mmap_read_lock(mm); + r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); + mmap_read_unlock(mm); + cond_resched(); + s390_uv_destroy_pfns(state.count, state.pfns); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + } + return 0; +} +EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); + +/** + * s390_unlist_old_asce - Remove the topmost level of page tables from the + * list of page tables of the gmap. + * @gmap: the gmap whose table is to be removed + * + * On s390x, KVM keeps a list of all pages containing the page tables of the + * gmap (the CRST list). This list is used at tear down time to free all + * pages that are now not needed anymore. + * + * This function removes the topmost page of the tree (the one pointed to by + * the ASCE) from the CRST list. + * + * This means that it will not be freed when the VM is torn down, and needs + * to be handled separately by the caller, unless a leak is actually + * intended. Notice that this function will only remove the page from the + * list, the page will still be used as a top level page table (and ASCE). + */ +void s390_unlist_old_asce(struct gmap *gmap) +{ + struct page *old; + + old = virt_to_page(gmap->table); + spin_lock(&gmap->guest_table_lock); + list_del(&old->lru); + /* + * Sometimes the topmost page might need to be "removed" multiple + * times, for example if the VM is rebooted into secure mode several + * times concurrently, or if s390_replace_asce fails after calling + * s390_remove_old_asce and is attempted again later. In that case + * the old asce has been removed from the list, and therefore it + * will not be freed when the VM terminates, but the ASCE is still + * in use and still pointed to. + * A subsequent call to replace_asce will follow the pointer and try + * to remove the same page from the list again. + * Therefore it's necessary that the page of the ASCE has valid + * pointers, so list_del can work (and do nothing) without + * dereferencing stale or invalid pointers. + */ + INIT_LIST_HEAD(&old->lru); + spin_unlock(&gmap->guest_table_lock); +} +EXPORT_SYMBOL_GPL(s390_unlist_old_asce); + +/** + * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy + * @gmap: the gmap whose ASCE needs to be replaced + * + * If the ASCE is a SEGMENT type then this function will return -EINVAL, + * otherwise the pointers in the host_to_guest radix tree will keep pointing + * to the wrong pages, causing use-after-free and memory corruption. + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. 
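
[Editor's sketch] __s390_uv_destroy_range() above never holds the mmap lock across the slow destroy-page UVCs: it gathers up to GATHER_GET_PAGES pfns under the lock (the walk callback returns nonzero to stop early), drops the lock, destroys the batch, and resumes from state.next. A user-space model of that gather-then-process loop, with a pthread mutex standing in for mmap_lock:

#include <pthread.h>
#include <stdio.h>

#define BATCH 32
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* gather up to BATCH items from [*next, end); returns how many were taken */
static unsigned long gather(unsigned long *next, unsigned long end,
			    unsigned long *items)
{
	unsigned long n = 0;

	while (*next < end && n < BATCH)
		items[n++] = (*next)++;
	return n;
}

int main(void)
{
	unsigned long items[BATCH], next = 0, n;

	do {
		pthread_mutex_lock(&lock);	/* mmap_read_lock() in the kernel */
		n = gather(&next, 100, items);
		pthread_mutex_unlock(&lock);	/* dropped before the slow work */
		/* destroy the batch here: slow, sleepable, lock not held */
	} while (n == BATCH);			/* a partial batch means the range is done */
	printf("processed up to %lu\n", next);
	return 0;
}
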
+ */ +int s390_replace_asce(struct gmap *gmap) +{ + unsigned long asce; + struct page *page; + void *table; + + s390_unlist_old_asce(gmap); + + /* Replacing segment type ASCEs would cause serious issues */ + if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + page->index = 0; + table = page_to_virt(page); + memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); + + /* + * The caller has to deal with the old ASCE, but here we make sure + * the new one is properly added to the CRST list, so that + * it will be freed when the VM is torn down. + */ + spin_lock(&gmap->guest_table_lock); + list_add(&page->lru, &gmap->crst_list); + spin_unlock(&gmap->guest_table_lock); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); + WRITE_ONCE(gmap->asce, asce); + WRITE_ONCE(gmap->mm->context.gmap_asce, asce); + WRITE_ONCE(gmap->table, table); + + return 0; +} +EXPORT_SYMBOL_GPL(s390_replace_asce); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index b0246c705a19..297a6d897d5a 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -2,15 +2,19 @@ /* * IBM System z Huge TLB Page Support for Kernel. * - * Copyright IBM Corp. 2007,2016 + * Copyright IBM Corp. 2007,2020 * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com> */ #define KMSG_COMPONENT "hugetlb" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include <asm/pgalloc.h> #include <linux/mm.h> #include <linux/hugetlb.h> +#include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/security.h> /* * If the bit selected by single-bit bitmask "a" is set within "x", move @@ -69,8 +73,8 @@ static inline unsigned long __pte_to_rste(pte_t pte) static inline pte_t __rste_to_pte(unsigned long rste) { + unsigned long pteval; int present; - pte_t pte; if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) present = pud_present(__pud(rste)); @@ -98,29 +102,21 @@ static inline pte_t __rste_to_pte(unsigned long rste) * u unused, l large */ if (present) { - pte_val(pte) = rste & _SEGMENT_ENTRY_ORIGIN_LARGE; - pte_val(pte) |= _PAGE_LARGE | _PAGE_PRESENT; - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_READ, - _PAGE_READ); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, - _PAGE_WRITE); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, - _PAGE_INVALID); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, - _PAGE_PROTECT); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, - _PAGE_DIRTY); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, - _PAGE_YOUNG); + pteval = rste & _SEGMENT_ENTRY_ORIGIN_LARGE; + pteval |= _PAGE_LARGE | _PAGE_PRESENT; + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_READ, _PAGE_READ); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, _PAGE_WRITE); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, _PAGE_INVALID); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, _PAGE_PROTECT); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, _PAGE_DIRTY); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, _PAGE_YOUNG); #ifdef CONFIG_MEM_SOFT_DIRTY - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, - _PAGE_DIRTY); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY); #endif - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, - _PAGE_NOEXEC); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC); } else - 
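
[Editor's sketch] s390_replace_asce() above swaps in the copied top-level table by splicing the new origin into the old ASCE while keeping every control bit: (gmap->asce & ~_ASCE_ORIGIN) | __pa(table). The bit manipulation in isolation, with an illustrative mask and a 4 KiB-aligned origin assumed:

#include <stdio.h>

#define ASCE_ORIGIN_MASK 0xfffffffffffff000UL	/* illustrative origin field */

int main(void)
{
	unsigned long old_asce  = 0x0000000123456007UL;	/* origin | control bits */
	unsigned long new_table = 0x0000000987654000UL;	/* phys of the copy */
	unsigned long new_asce;

	new_asce = (old_asce & ~ASCE_ORIGIN_MASK) | new_table;
	/* the control bits (here 0x007) survive; only the origin changed */
	printf("old %#lx -> new %#lx\n", old_asce, new_asce);
	return 0;
}
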
pte_val(pte) = _PAGE_INVALID; - return pte; + pteval = _PAGE_INVALID; + return __pte(pteval); } static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) @@ -146,7 +142,7 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) __storage_key_init_range(paddr, paddr + size - 1); } -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { unsigned long rste; @@ -156,12 +152,21 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, rste &= ~_SEGMENT_ENTRY_NOEXEC; /* Set correct table type for 2G hugepages */ - if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE; - else + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + if (likely(pte_present(pte))) + rste |= _REGION3_ENTRY_LARGE; + rste |= _REGION_ENTRY_TYPE_R3; + } else if (likely(pte_present(pte))) rste |= _SEGMENT_ENTRY_LARGE; + clear_huge_pte_skeys(mm, rste); - pte_val(*ptep) = rste; + set_pte(ptep, __pte(rste)); +} + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + __set_huge_pte_at(mm, addr, ptep, pte); } pte_t huge_ptep_get(pte_t *ptep) @@ -183,7 +188,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, return pte; } -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgdp; @@ -238,32 +243,100 @@ int pud_huge(pud_t pud) return pud_large(pud); } -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags) +bool __init arch_hugetlb_valid_size(unsigned long size) { - if (flags & FOLL_GET) - return NULL; + if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) + return true; + else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) + return true; + else + return false; +} - return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); +static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.low_limit = current->mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); } -static __init int setup_hugepagesz(char *opt) +static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, + unsigned long addr0, unsigned long len, + unsigned long pgoff, unsigned long flags) { - unsigned long size; - char *string = opt; - - size = memparse(opt, &opt); - if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) { - hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) { - hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else { - hugetlb_bad_size(); - pr_err("hugepagesz= specifies an unsupported page size %s\n", - string); - return 0; + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + unsigned long addr; + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = current->mm->mmap_base; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * 
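
[Editor's sketch] The __rste_to_pte() rewrite just above keeps translating flags between the segment-entry and pte layouts with move_set_bit(): if the bit selected by a is set in x, set bit b in the result. A self-contained, behaviorally equivalent version for the single-bit case (the kernel's macro is shift-based; bit positions here are illustrative):

#include <stdio.h>

/* if the bit selected by a is set in x, return b, else 0 */
static unsigned long move_set_bit(unsigned long x, unsigned long a,
				  unsigned long b)
{
	return (x & a) ? b : 0;
}

#define SEG_READ  0x0002UL	/* illustrative _SEGMENT_ENTRY_READ */
#define PAGE_READ 0x0100UL	/* illustrative _PAGE_READ */

int main(void)
{
	unsigned long rste = SEG_READ;	/* segment entry with READ set */
	unsigned long pteval = 0;

	pteval |= move_set_bit(rste, SEG_READ, PAGE_READ);
	printf("pteval %#lx (READ carried over)\n", pteval);
	return 0;
}
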
so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); } - return 1; + + return addr; +} + +unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) { + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + goto check_asce_limit; + } + + if (addr) { + addr = ALIGN(addr, huge_page_size(h)); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) + goto check_asce_limit; + } + + if (mm->get_unmapped_area == arch_get_unmapped_area) + addr = hugetlb_get_unmapped_area_bottomup(file, addr, len, + pgoff, flags); + else + addr = hugetlb_get_unmapped_area_topdown(file, addr, len, + pgoff, flags); + if (offset_in_page(addr)) + return addr; + +check_asce_limit: + return check_asce_limit(mm, addr, len); } -__setup("hugepagesz=", setup_hugepagesz); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index f0ce22220565..43e612bc2bcd 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -31,30 +31,35 @@ #include <linux/cma.h> #include <linux/gfp.h> #include <linux/dma-direct.h> +#include <linux/percpu.h> #include <asm/processor.h> #include <linux/uaccess.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> +#include <asm/ctlreg.h> +#include <asm/kfence.h> +#include <asm/ptdump.h> #include <asm/dma.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/sections.h> -#include <asm/ctl_reg.h> #include <asm/sclp.h> #include <asm/set_memory.h> #include <asm/kasan.h> #include <asm/dma-mapping.h> #include <asm/uv.h> +#include <linux/virtio_anchor.h> +#include <linux/virtio_config.h> -pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir); +pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); +pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); + +struct ctlreg __bootdata_preserved(s390_invalid_asce); unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); -bool initmem_freed; - static void __init setup_zero_pages(void) { unsigned int order; @@ -88,70 +93,44 @@ static void __init setup_zero_pages(void) void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; - unsigned long pgd_type, asce_bits; - psw_t psw; - - init_mm.pgd = swapper_pg_dir; - if (VMALLOC_END > _REGION2_SIZE) { - asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION2_ENTRY_EMPTY; - } else { - asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION3_ENTRY_EMPTY; - } - init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.kernel_asce = init_mm.context.asce; - S390_lowcore.user_asce = S390_lowcore.kernel_asce; - crst_table_init((unsigned long *) init_mm.pgd, pgd_type); + vmem_map_init(); - kasan_copy_shadow(init_mm.pgd); - - /* enable virtual mapping in kernel mode */ - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - 
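
[Editor's sketch] The top-down allocator above relies on vm_unmapped_area() returning either a page-aligned address or a small negative errno, so offset_in_page(addr), i.e. addr & ~PAGE_MASK, distinguishes success from failure and triggers the bottom-up retry. The encoding in miniature (PAGE_SIZE illustrative):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define ENOMEM    12

static unsigned long search(int fail)
{
	/* success: an aligned address; failure: -errno, never page-aligned */
	return fail ? (unsigned long)-ENOMEM : 0x7f0000000000UL;
}

int main(void)
{
	unsigned long addr = search(1);

	if (addr & (PAGE_SIZE - 1)) {	/* offset_in_page(addr) != 0 */
		printf("top-down failed (%ld), retry bottom-up\n", (long)addr);
		addr = search(0);
	}
	printf("mapped at %#lx\n", addr);
	return 0;
}
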
__ctl_load(S390_lowcore.kernel_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); - kasan_free_early_identity(); - - sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); zone_dma_bits = 31; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS); + max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } void mark_rodata_ro(void) { unsigned long size = __end_ro_after_init - __start_ro_after_init; - set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT); + __set_memory_ro(__start_ro_after_init, __end_ro_after_init); pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); + debug_checkwx(); } -int set_memory_encrypted(unsigned long addr, int numpages) +int set_memory_encrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages unshared, (swiotlb, dma_free) */ for (i = 0; i < numpages; ++i) { - uv_remove_shared(addr); - addr += PAGE_SIZE; + uv_remove_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } -int set_memory_decrypted(unsigned long addr, int numpages) +int set_memory_decrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages shared (swiotlb, dma_alloca) */ for (i = 0; i < numpages; ++i) { - uv_set_shared(addr); - addr += PAGE_SIZE; + uv_set_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } @@ -168,10 +147,11 @@ static void pv_init(void) if (!is_prot_virt_guest()) return; + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); + /* make sure bounce buffers are shared */ - swiotlb_init(1); + swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE); swiotlb_update_mem_attributes(); - swiotlb_force = SWIOTLB_FORCE; } void __init mem_init(void) @@ -183,25 +163,17 @@ void __init mem_init(void) high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); pv_init(); - - /* Setup guest page hinting */ - cmma_init(); + kfence_split_mapping(); /* this will put all low memory onto the freelists */ memblock_free_all(); setup_zero_pages(); /* Setup zeroed pages. */ - - cmma_init_nodat(); - - mem_init_print_info(NULL); } void free_initmem(void) { - initmem_freed = true; - __set_memory((unsigned long)_sinittext, - (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, - SET_MEMORY_RW | SET_MEMORY_NX); + set_memory_rwnx((unsigned long)_sinittext, + (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT); free_initmem_default(POISON_FREE_INITMEM); } @@ -214,6 +186,41 @@ unsigned long memory_block_size_bytes(void) return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp.rzm); } +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) +{ + return LOCAL_DISTANCE; +} + +static int __init pcpu_cpu_to_node(int cpu) +{ + return 0; +} + +void __init setup_per_cpu_areas(void) +{ + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. 
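
[Editor's sketch] paging_init() above keeps s390's DMA zone at 31 bits (zone_dma_bits = 31): allocations for devices and legacy code limited to 31-bit addressing must come from below 2 GiB. The corresponding pfn ceiling, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT    12UL
#define ZONE_DMA_BITS 31UL

int main(void)
{
	unsigned long dma_limit   = 1UL << ZONE_DMA_BITS;	/* 2 GiB */
	unsigned long dma_max_pfn = dma_limit >> PAGE_SHIFT;

	printf("ZONE_DMA: below %#lx (%lu pfns)\n", dma_limit, dma_max_pfn);
	return 0;
}
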
+ */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, + pcpu_cpu_distance, + pcpu_cpu_to_node); + if (rc < 0) + panic("Failed to initialize percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; +} + #ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_CMA @@ -268,34 +275,35 @@ device_initcall(s390_cma_mem_init); #endif /* CONFIG_CMA */ int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_restrictions *restrictions) + struct mhp_params *params) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); int rc; - if (WARN_ON_ONCE(restrictions->altmap)) + if (WARN_ON_ONCE(params->altmap)) + return -EINVAL; + + if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) return -EINVAL; + VM_BUG_ON(!mhp_range_allowed(start, size, true)); rc = vmem_add_mapping(start, size); if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, restrictions); + rc = __add_pages(nid, start_pfn, size_pages, params); if (rc) vmem_remove_mapping(start, size); return rc; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; - zone = page_zone(pfn_to_page(start_pfn)); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); vmem_remove_mapping(start, size); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c deleted file mode 100644 index 460f25572940..000000000000 --- a/arch/s390/mm/kasan_init.c +++ /dev/null @@ -1,382 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/kasan.h> -#include <linux/sched/task.h> -#include <linux/memblock.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> -#include <asm/kasan.h> -#include <asm/mem_detect.h> -#include <asm/processor.h> -#include <asm/sclp.h> -#include <asm/facility.h> -#include <asm/sections.h> -#include <asm/setup.h> - -static unsigned long segment_pos __initdata; -static unsigned long segment_low __initdata; -static unsigned long pgalloc_pos __initdata; -static unsigned long pgalloc_low __initdata; -static unsigned long pgalloc_freeable __initdata; -static bool has_edat __initdata; -static bool has_nx __initdata; - -#define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) - -static pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); - -static void __init kasan_early_panic(const char *reason) -{ - sclp_early_printk("The Linux kernel failed to boot with the KernelAddressSanitizer:\n"); - sclp_early_printk(reason); - disabled_wait(); -} - -static void * __init kasan_early_alloc_segment(void) -{ - segment_pos -= _SEGMENT_SIZE; - - if (segment_pos < segment_low) - kasan_early_panic("out of memory during initialisation\n"); - - return (void *)segment_pos; -} - -static void * __init kasan_early_alloc_pages(unsigned int order) -{ - pgalloc_pos -= (PAGE_SIZE << order); - - if (pgalloc_pos < pgalloc_low) - kasan_early_panic("out of memory during initialisation\n"); - - return (void *)pgalloc_pos; -} - -static void * __init kasan_early_crst_alloc(unsigned long val) -{ - unsigned long *table; - - table = kasan_early_alloc_pages(CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); - return table; -} - -static pte_t * 
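
[Editor's sketch] The new setup_per_cpu_areas() above computes __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu], rebasing from the linker's percpu section to the chunk pcpu_embed_first_chunk() allocated. Per-cpu access then reduces to "static address plus this CPU's offset", as in this user-space model (the pointer gymnastics mirror what the kernel does, condensed for illustration):

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

static long counter;			/* stands in for a per-cpu variable */
static long unit[NR_CPUS];		/* one copy of the percpu area per cpu */
static uintptr_t per_cpu_offset[NR_CPUS];

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		per_cpu_offset[cpu] = (uintptr_t)&unit[cpu] - (uintptr_t)&counter;

	/* per_cpu(counter, 2): the variable's address plus cpu 2's offset */
	*(long *)((uintptr_t)&counter + per_cpu_offset[2]) = 42;
	printf("cpu 2 copy: %ld\n", unit[2]);
	return 0;
}
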
__init kasan_early_pte_alloc(void) -{ - static void *pte_leftover; - pte_t *pte; - - BUILD_BUG_ON(_PAGE_TABLE_SIZE * 2 != PAGE_SIZE); - - if (!pte_leftover) { - pte_leftover = kasan_early_alloc_pages(0); - pte = pte_leftover + _PAGE_TABLE_SIZE; - } else { - pte = pte_leftover; - pte_leftover = NULL; - } - memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); - return pte; -} - -enum populate_mode { - POPULATE_ONE2ONE, - POPULATE_MAP, - POPULATE_ZERO_SHADOW -}; -static void __init kasan_early_vmemmap_populate(unsigned long address, - unsigned long end, - enum populate_mode mode) -{ - unsigned long pgt_prot_zero, pgt_prot, sgt_prot; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - - pgt_prot_zero = pgprot_val(PAGE_KERNEL_RO); - if (!has_nx) - pgt_prot_zero &= ~_PAGE_NOEXEC; - pgt_prot = pgprot_val(PAGE_KERNEL_EXEC); - sgt_prot = pgprot_val(SEGMENT_KERNEL_EXEC); - - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PGDIR_SIZE) && - end - address >= PGDIR_SIZE) { - pgd_populate(&init_mm, pg_dir, - kasan_early_shadow_p4d); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - continue; - } - p4_dir = kasan_early_crst_alloc(_REGION2_ENTRY_EMPTY); - pgd_populate(&init_mm, pg_dir, p4_dir); - } - - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, P4D_SIZE) && - end - address >= P4D_SIZE) { - p4d_populate(&init_mm, p4_dir, - kasan_early_shadow_pud); - address = (address + P4D_SIZE) & P4D_MASK; - continue; - } - pu_dir = kasan_early_crst_alloc(_REGION3_ENTRY_EMPTY); - p4d_populate(&init_mm, p4_dir, pu_dir); - } - - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PUD_SIZE) && - end - address >= PUD_SIZE) { - pud_populate(&init_mm, pu_dir, - kasan_early_shadow_pmd); - address = (address + PUD_SIZE) & PUD_MASK; - continue; - } - pm_dir = kasan_early_crst_alloc(_SEGMENT_ENTRY_EMPTY); - pud_populate(&init_mm, pu_dir, pm_dir); - } - - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PMD_SIZE) && - end - address >= PMD_SIZE) { - pmd_populate(&init_mm, pm_dir, - kasan_early_shadow_pte); - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - /* the first megabyte of 1:1 is mapped with 4k pages */ - if (has_edat && address && end - address >= PMD_SIZE && - mode != POPULATE_ZERO_SHADOW) { - void *page; - - if (mode == POPULATE_ONE2ONE) { - page = (void *)address; - } else { - page = kasan_early_alloc_segment(); - memset(page, 0, _SEGMENT_SIZE); - } - pmd_val(*pm_dir) = __pa(page) | sgt_prot; - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - - pt_dir = kasan_early_pte_alloc(); - pmd_populate(&init_mm, pm_dir, pt_dir); - } else if (pmd_large(*pm_dir)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - - pt_dir = pte_offset_kernel(pm_dir, address); - if (pte_none(*pt_dir)) { - void *page; - - switch (mode) { - case POPULATE_ONE2ONE: - page = (void *)address; - pte_val(*pt_dir) = __pa(page) | pgt_prot; - break; - case POPULATE_MAP: - page = kasan_early_alloc_pages(0); - memset(page, 0, PAGE_SIZE); - pte_val(*pt_dir) = __pa(page) | pgt_prot; - break; - case POPULATE_ZERO_SHADOW: - page = kasan_early_shadow_page; - pte_val(*pt_dir) = __pa(page) | pgt_prot_zero; - break; - } - } - address += PAGE_SIZE; - } -} - -static void __init 
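
[Editor's sketch] Part of the kasan_init.c code being deleted here: kasan_early_pte_alloc() exploited the fact that an s390 pte table is half a page (the BUILD_BUG_ON pins _PAGE_TABLE_SIZE * 2 == PAGE_SIZE), so each 4 KiB allocation served two requests via the pte_leftover stash. The carving logic, standalone, with sizes matching that 2 KiB-table/4 KiB-page split:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE      4096UL
#define PTE_TABLE_SIZE (PAGE_SIZE / 2)	/* s390: two pte tables per page */

static void *pte_leftover;

static void *pte_alloc_demo(void)
{
	void *pte;

	if (!pte_leftover) {
		pte_leftover = malloc(PAGE_SIZE); /* fresh page: hand out the top half */
		pte = (char *)pte_leftover + PTE_TABLE_SIZE;
	} else {
		pte = pte_leftover;		  /* second call: the stashed half */
		pte_leftover = NULL;
	}
	return pte;	/* boot-time allocator: never freed, like the original */
}

int main(void)
{
	void *a = pte_alloc_demo(), *b = pte_alloc_demo();

	printf("halves %zu bytes apart\n", (size_t)((char *)a - (char *)b));
	return 0;
}
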
kasan_set_pgd(pgd_t *pgd, unsigned long asce_type) -{ - unsigned long asce_bits; - - asce_bits = asce_type | _ASCE_TABLE_LENGTH; - S390_lowcore.kernel_asce = (__pa(pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.user_asce = S390_lowcore.kernel_asce; - - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.kernel_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); -} - -static void __init kasan_enable_dat(void) -{ - psw_t psw; - - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); -} - -static void __init kasan_early_detect_facilities(void) -{ - if (test_facility(8)) { - has_edat = true; - __ctl_set_bit(0, 23); - } - if (!noexec_disabled && test_facility(130)) { - has_nx = true; - __ctl_set_bit(0, 20); - } -} - -void __init kasan_early_init(void) -{ - unsigned long untracked_mem_end; - unsigned long shadow_alloc_size; - unsigned long initrd_end; - unsigned long asce_type; - unsigned long memsize; - unsigned long vmax; - unsigned long pgt_prot = pgprot_val(PAGE_KERNEL_RO); - pte_t pte_z; - pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); - pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY); - p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY); - - kasan_early_detect_facilities(); - if (!has_nx) - pgt_prot &= ~_PAGE_NOEXEC; - pte_z = __pte(__pa(kasan_early_shadow_page) | pgt_prot); - - memsize = get_mem_detect_end(); - if (!memsize) - kasan_early_panic("cannot detect physical memory size\n"); - /* respect mem= cmdline parameter */ - if (memory_end_set && memsize > memory_end) - memsize = memory_end; - if (IS_ENABLED(CONFIG_CRASH_DUMP) && OLDMEM_BASE) - memsize = min(memsize, OLDMEM_SIZE); - memsize = min(memsize, KASAN_SHADOW_START); - - if (IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING)) { - /* 4 level paging */ - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE)); - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE)); - crst_table_init((unsigned long *)early_pg_dir, - _REGION2_ENTRY_EMPTY); - untracked_mem_end = vmax = _REGION1_SIZE; - asce_type = _ASCE_TYPE_REGION2; - } else { - /* 3 level paging */ - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PUD_SIZE)); - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PUD_SIZE)); - crst_table_init((unsigned long *)early_pg_dir, - _REGION3_ENTRY_EMPTY); - untracked_mem_end = vmax = _REGION2_SIZE; - asce_type = _ASCE_TYPE_REGION3; - } - - /* init kasan zero shadow */ - crst_table_init((unsigned long *)kasan_early_shadow_p4d, - p4d_val(p4d_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pud, - pud_val(pud_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pmd, - pmd_val(pmd_z)); - memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE); - - shadow_alloc_size = memsize >> KASAN_SHADOW_SCALE_SHIFT; - pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE); - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { - initrd_end = - round_up(INITRD_START + INITRD_SIZE, _SEGMENT_SIZE); - pgalloc_low = max(pgalloc_low, initrd_end); - } - - if (pgalloc_low + shadow_alloc_size > memsize) - kasan_early_panic("out of memory during initialisation\n"); - - if (has_edat) { - segment_pos = round_down(memsize, _SEGMENT_SIZE); - segment_low = segment_pos - shadow_alloc_size; - pgalloc_pos = segment_low; - } else { - pgalloc_pos = memsize; - } - init_mm.pgd = early_pg_dir; - /* - * Current memory layout: - * +- 0 -------------+ +- shadow start -+ - * | 1:1 ram mapping | /| 1/8 ram | - * +- end of ram ----+ / 
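
[Editor's sketch] The deleted early-init code above sizes the shadow as memsize >> KASAN_SHADOW_SCALE_SHIFT and maps addresses through __sha(): each 8 bytes of tracked memory get one shadow byte. Generic KASAN's mapping, with a purely illustrative offset:

#include <stdio.h>

#define KASAN_SHADOW_SCALE_SHIFT 3		/* 1 shadow byte per 8 bytes */
#define KASAN_SHADOW_OFFSET 0x10000000000000UL	/* illustrative placement */

static unsigned long mem_to_shadow(unsigned long addr)
{
	return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET;
}

int main(void)
{
	unsigned long memsize = 16UL << 30;	/* 16 GiB of tracked memory */

	printf("shadow size: %lu MiB\n",
	       (memsize >> KASAN_SHADOW_SCALE_SHIFT) >> 20);	/* 2048 */
	printf("shadow of 0x1000: %#lx\n", mem_to_shadow(0x1000));
	return 0;
}
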
+----------------+ - * | ... gap ... |/ | kasan | - * +- shadow start --+ | zero | - * | 1/8 addr space | | page | - * +- shadow end -+ | mapping | - * | ... gap ... |\ | (untracked) | - * +- modules vaddr -+ \ +----------------+ - * | 2Gb | \| unmapped | allocated per module - * +-----------------+ +- shadow end ---+ - */ - /* populate kasan shadow (for identity mapping and zero page mapping) */ - kasan_early_vmemmap_populate(__sha(0), __sha(memsize), POPULATE_MAP); - if (IS_ENABLED(CONFIG_MODULES)) - untracked_mem_end = vmax - MODULES_LEN; - kasan_early_vmemmap_populate(__sha(max_physmem_end), - __sha(untracked_mem_end), - POPULATE_ZERO_SHADOW); - /* memory allocated for identity mapping structs will be freed later */ - pgalloc_freeable = pgalloc_pos; - /* populate identity mapping */ - kasan_early_vmemmap_populate(0, memsize, POPULATE_ONE2ONE); - kasan_set_pgd(early_pg_dir, asce_type); - kasan_enable_dat(); - /* enable kasan */ - init_task.kasan_depth = 0; - memblock_reserve(pgalloc_pos, memsize - pgalloc_pos); - sclp_early_printk("KernelAddressSanitizer initialized\n"); -} - -void __init kasan_copy_shadow(pgd_t *pg_dir) -{ - /* - * At this point we are still running on early pages setup early_pg_dir, - * while swapper_pg_dir has just been initialized with identity mapping. - * Carry over shadow memory region from early_pg_dir to swapper_pg_dir. - */ - - pgd_t *pg_dir_src; - pgd_t *pg_dir_dst; - p4d_t *p4_dir_src; - p4d_t *p4_dir_dst; - pud_t *pu_dir_src; - pud_t *pu_dir_dst; - - pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START); - pg_dir_dst = pgd_offset_raw(pg_dir, KASAN_SHADOW_START); - p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START); - p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START); - if (!p4d_folded(*p4_dir_src)) { - /* 4 level paging */ - memcpy(p4_dir_dst, p4_dir_src, - (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t)); - return; - } - /* 3 level paging */ - pu_dir_src = pud_offset(p4_dir_src, KASAN_SHADOW_START); - pu_dir_dst = pud_offset(p4_dir_dst, KASAN_SHADOW_START); - memcpy(pu_dir_dst, pu_dir_src, - (KASAN_SHADOW_SIZE >> PUD_SHIFT) * sizeof(pud_t)); -} - -void __init kasan_free_early_identity(void) -{ - memblock_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); -} diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index de7ca4b6718f..632c3a55feed 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -4,8 +4,6 @@ * * Copyright IBM Corp. 
2009, 2015 * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, - * */ #include <linux/uaccess.h> @@ -14,9 +12,17 @@ #include <linux/errno.h> #include <linux/gfp.h> #include <linux/cpu.h> -#include <asm/ctl_reg.h> -#include <asm/io.h> +#include <linux/uio.h> +#include <linux/io.h> +#include <asm/asm-extable.h> +#include <asm/abs_lowcore.h> #include <asm/stacktrace.h> +#include <asm/maccess.h> +#include <asm/ctlreg.h> + +unsigned long __bootdata_preserved(__memcpy_real_area); +pte_t *__bootdata_preserved(memcpy_real_ptep); +static DEFINE_MUTEX(memcpy_real_mutex); static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) { @@ -55,155 +61,84 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz */ static DEFINE_SPINLOCK(s390_kernel_write_lock); -void notrace s390_kernel_write(void *dst, const void *src, size_t size) +notrace void *s390_kernel_write(void *dst, const void *src, size_t size) { + void *tmp = dst; unsigned long flags; long copied; spin_lock_irqsave(&s390_kernel_write_lock, flags); while (size) { - copied = s390_kernel_write_odd(dst, src, size); - dst += copied; + copied = s390_kernel_write_odd(tmp, src, size); + tmp += copied; src += copied; size -= copied; } spin_unlock_irqrestore(&s390_kernel_write_lock, flags); -} - -static int __no_sanitize_address __memcpy_real(void *dest, void *src, size_t count) -{ - register unsigned long _dest asm("2") = (unsigned long) dest; - register unsigned long _len1 asm("3") = (unsigned long) count; - register unsigned long _src asm("4") = (unsigned long) src; - register unsigned long _len2 asm("5") = (unsigned long) count; - int rc = -EFAULT; - - asm volatile ( - "0: mvcle %1,%2,0x0\n" - "1: jo 0b\n" - " lhi %0,0x0\n" - "2:\n" - EX_TABLE(1b,2b) - : "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1), - "+d" (_len2), "=m" (*((long *) dest)) - : "m" (*((long *) src)) - : "cc", "memory"); - return rc; -} - -static unsigned long __no_sanitize_address _memcpy_real(unsigned long dest, - unsigned long src, - unsigned long count) -{ - int irqs_disabled, rc; - unsigned long flags; - if (!count) - return 0; - flags = arch_local_irq_save(); - irqs_disabled = arch_irqs_disabled_flags(flags); - if (!irqs_disabled) - trace_hardirqs_off(); - __arch_local_irq_stnsm(0xf8); // disable DAT - rc = __memcpy_real((void *) dest, (void *) src, (size_t) count); - if (flags & PSW_MASK_DAT) - __arch_local_irq_stosm(0x04); // enable DAT - if (!irqs_disabled) - trace_hardirqs_on(); - __arch_local_irq_ssm(flags); - return rc; + return dst; } -/* - * Copy memory in real mode (kernel to kernel) - */ -int memcpy_real(void *dest, void *src, size_t count) +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) { - int rc; - - if (S390_lowcore.nodat_stack != 0) { - preempt_disable(); - rc = CALL_ON_STACK(_memcpy_real, S390_lowcore.nodat_stack, 3, - dest, src, count); - preempt_enable(); - return rc; - } - /* - * This is a really early memcpy_real call, the stacks are - * not set up yet. 
Just call _memcpy_real on the early boot - * stack - */ - return _memcpy_real((unsigned long) dest,(unsigned long) src, - (unsigned long) count); -} - -/* - * Copy memory in absolute mode (kernel to kernel) - */ -void memcpy_absolute(void *dest, void *src, size_t count) -{ - unsigned long cr0, flags, prefix; - - flags = arch_local_irq_save(); - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); /* disable lowcore protection */ - prefix = store_prefix(); - if (prefix) { - local_mcck_disable(); - set_prefix(0); - memcpy(dest, src, count); - set_prefix(prefix); - local_mcck_enable(); - } else { - memcpy(dest, src, count); + size_t len, copied, res = 0; + unsigned long phys, offset; + void *chunk; + pte_t pte; + + BUILD_BUG_ON(MEMCPY_REAL_SIZE != PAGE_SIZE); + while (count) { + phys = src & MEMCPY_REAL_MASK; + offset = src & ~MEMCPY_REAL_MASK; + chunk = (void *)(__memcpy_real_area + offset); + len = min(count, MEMCPY_REAL_SIZE - offset); + pte = mk_pte_phys(phys, PAGE_KERNEL_RO); + + mutex_lock(&memcpy_real_mutex); + if (pte_val(pte) != pte_val(*memcpy_real_ptep)) { + __ptep_ipte(__memcpy_real_area, memcpy_real_ptep, 0, 0, IPTE_GLOBAL); + set_pte(memcpy_real_ptep, pte); + } + copied = copy_to_iter(chunk, len, iter); + mutex_unlock(&memcpy_real_mutex); + + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; } - __ctl_load(cr0, 0, 0); - arch_local_irq_restore(flags); + return res; } -/* - * Copy memory from kernel (real) to user (virtual) - */ -int copy_to_user_real(void __user *dest, void *src, unsigned long count) +int memcpy_real(void *dest, unsigned long src, size_t count) { - int offs = 0, size, rc; - char *buf; - - buf = (char *) __get_free_page(GFP_KERNEL); - if (!buf) - return -ENOMEM; - rc = -EFAULT; - while (offs < count) { - size = min(PAGE_SIZE, count - offs); - if (memcpy_real(buf, src + offs, size)) - goto out; - if (copy_to_user(dest + offs, buf, size)) - goto out; - offs += size; - } - rc = 0; -out: - free_page((unsigned long) buf); - return rc; + struct iov_iter iter; + struct kvec kvec; + + kvec.iov_base = dest; + kvec.iov_len = count; + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + if (memcpy_real_iter(&iter, src, count) < count) + return -EFAULT; + return 0; } /* - * Check if physical address is within prefix or zero page + * Find CPU that owns swapped prefix page */ -static int is_swapped(unsigned long addr) +static int get_swapped_owner(phys_addr_t addr) { - unsigned long lc; + phys_addr_t lc; int cpu; - if (addr < sizeof(struct lowcore)) - return 1; for_each_online_cpu(cpu) { - lc = (unsigned long) lowcore_ptr[cpu]; + lc = virt_to_phys(lowcore_ptr[cpu]); if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc) continue; - return 1; + return cpu; } - return 0; + return -1; } /* @@ -214,27 +149,45 @@ static int is_swapped(unsigned long addr) */ void *xlate_dev_mem_ptr(phys_addr_t addr) { - void *bounce = (void *) addr; + void *ptr = phys_to_virt(addr); + void *bounce = ptr; + struct lowcore *abs_lc; unsigned long size; + int this_cpu, cpu; - get_online_cpus(); - preempt_disable(); - if (is_swapped(addr)) { - size = PAGE_SIZE - (addr & ~PAGE_MASK); - bounce = (void *) __get_free_page(GFP_ATOMIC); - if (bounce) - memcpy_absolute(bounce, (void *) addr, size); + cpus_read_lock(); + this_cpu = get_cpu(); + if (addr >= sizeof(struct lowcore)) { + cpu = get_swapped_owner(addr); + if (cpu < 0) + goto out; } - preempt_enable(); - put_online_cpus(); + bounce = (void *)__get_free_page(GFP_ATOMIC); + if (!bounce) + goto out; + size = PAGE_SIZE - (addr & 
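
[Editor's sketch] The new memcpy_real_iter() above replaces the old run-without-DAT copy: a single fixed virtual window (__memcpy_real_area, one page) is repointed at the source's physical page under memcpy_real_mutex, and the copy proceeds in at most page-sized chunks. The per-iteration offset and length arithmetic, standalone:

#include <stdio.h>

#define WINDOW_SIZE 4096UL		/* MEMCPY_REAL_SIZE == PAGE_SIZE */
#define WINDOW_MASK (~(WINDOW_SIZE - 1))

int main(void)
{
	unsigned long src = 0x12345678;	/* arbitrary physical source address */
	unsigned long count = 10000;

	while (count) {
		unsigned long phys   = src & WINDOW_MASK;	/* page to map */
		unsigned long offset = src & ~WINDOW_MASK;	/* start within it */
		unsigned long len    = count < WINDOW_SIZE - offset ?
				       count : WINDOW_SIZE - offset;

		printf("map %#lx, copy %lu bytes at offset %lu\n",
		       phys, len, offset);
		src += len;
		count -= len;
	}
	return 0;
}

Only the first and last chunks can be partial; everything in between is a full page behind one PTE rewrite.
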
~PAGE_MASK); + if (addr < sizeof(struct lowcore)) { + abs_lc = get_abs_lowcore(); + ptr = (void *)abs_lc + addr; + memcpy(bounce, ptr, size); + put_abs_lowcore(abs_lc); + } else if (cpu == this_cpu) { + ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu])); + memcpy(bounce, ptr, size); + } else { + memcpy(bounce, ptr, size); + } +out: + put_cpu(); + cpus_read_unlock(); return bounce; } /* * Free converted buffer for /dev/mem access (if necessary) */ -void unxlate_dev_mem_ptr(phys_addr_t addr, void *buf) +void unxlate_dev_mem_ptr(phys_addr_t addr, void *ptr) { - if ((void *) addr != buf) - free_page((unsigned long) buf); + if (addr != virt_to_phys(ptr)) + free_page((unsigned long)ptr); } diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index cbc718ba6d78..fc9a7dc26c5e 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -17,7 +17,6 @@ #include <linux/random.h> #include <linux/compat.h> #include <linux/security.h> -#include <asm/pgalloc.h> #include <asm/elf.h> static unsigned long stack_maxrandom_size(void) @@ -38,7 +37,7 @@ static inline int mmap_is_legacy(struct rlimit *rlim_stack) unsigned long arch_mmap_rnd(void) { - return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT; + return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT; } static unsigned long mmap_base_legacy(unsigned long rnd) @@ -59,9 +58,9 @@ static inline unsigned long mmap_base(unsigned long rnd, /* * Top of mmap area (just below the process stack). - * Leave at least a ~32 MB hole. + * Leave at least a ~128 MB hole. */ - gap_min = 32 * 1024 * 1024UL; + gap_min = SZ_128M; gap_max = (STACK_TOP / 6) * 5; if (gap < gap_min) @@ -72,14 +71,13 @@ static inline unsigned long mmap_base(unsigned long rnd, return PAGE_ALIGN(STACK_TOP - gap - rnd); } -unsigned long -arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct vm_unmapped_area_info info; - int rc; if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; @@ -105,30 +103,20 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; check_asce_limit: - if (addr + len > current->mm->context.asce_limit && - addr + len <= TASK_SIZE) { - rc = crst_table_upgrade(mm, addr + len); - if (rc) - return (unsigned long) rc; - } - - return addr; + return check_asce_limit(mm, addr, len); } -unsigned long -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) +unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; struct vm_unmapped_area_info info; - int rc; /* requested length too big for entire address space */ if (len > TASK_SIZE - mmap_min_addr) @@ -148,7 +136,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; if (filp || (flags & MAP_SHARED)) 
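
[Editor's sketch] mmap_base() above now leaves at least a 128 MiB hole below the stack (gap_min = SZ_128M) and at most 5/6 of STACK_TOP, then subtracts the randomization. The clamp in isolation; STACK_TOP and the inputs are illustrative:

#include <stdio.h>

#define SZ_128M   (128UL << 20)
#define STACK_TOP (1UL << 42)		/* illustrative task size */
#define PAGE_MASK (~4095UL)

static unsigned long mmap_base_demo(unsigned long rnd, unsigned long gap)
{
	unsigned long gap_min = SZ_128M;
	unsigned long gap_max = (STACK_TOP / 6) * 5;

	if (gap < gap_min)
		gap = gap_min;
	else if (gap > gap_max)
		gap = gap_max;
	/* PAGE_ALIGN: round up to the next page boundary */
	return (STACK_TOP - gap - rnd + 4095) & PAGE_MASK;
}

int main(void)
{
	/* an 8 MiB RLIMIT_STACK is clamped up to the 128 MiB minimum gap */
	printf("base %#lx\n", mmap_base_demo(0, 8UL << 20));
	return 0;
}
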
info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; @@ -163,25 +151,18 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, * can happen with large stack limits and large mmap() * allocations. */ - if (addr & ~PAGE_MASK) { + if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = TASK_SIZE; addr = vm_unmapped_area(&info); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; } check_asce_limit: - if (addr + len > current->mm->context.asce_limit && - addr + len <= TASK_SIZE) { - rc = crst_table_upgrade(mm, addr + len); - if (rc) - return (unsigned long) rc; - } - - return addr; + return check_asce_limit(mm, addr, len); } /* @@ -207,3 +188,23 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_RO, + [VM_WRITE] = PAGE_RO, + [VM_WRITE | VM_READ] = PAGE_RO, + [VM_EXEC] = PAGE_RX, + [VM_EXEC | VM_READ] = PAGE_RX, + [VM_EXEC | VM_WRITE] = PAGE_RX, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_RO, + [VM_SHARED | VM_WRITE] = PAGE_RW, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW, + [VM_SHARED | VM_EXEC] = PAGE_RX, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX +}; +DECLARE_VM_GET_PAGE_PROT diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index fc141893d028..01f9b39e65f5 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -7,211 +7,18 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> #include <linux/mm.h> -#include <linux/memblock.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <asm/facility.h> #include <asm/page-states.h> +#include <asm/sections.h> +#include <asm/page.h> -static int cmma_flag = 1; - -static int __init cmma(char *str) -{ - bool enabled; - - if (!kstrtobool(str, &enabled)) - cmma_flag = enabled; - return 1; -} -__setup("cmma=", cmma); - -static inline int cmma_test_essa(void) -{ - register unsigned long tmp asm("0") = 0; - register int rc asm("1"); - - /* test ESSA_GET_STATE */ - asm volatile( - " .insn rrf,0xb9ab0000,%1,%1,%2,0\n" - "0: la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "=&d" (rc), "+&d" (tmp) - : "i" (ESSA_GET_STATE), "0" (-EOPNOTSUPP)); - return rc; -} - -void __init cmma_init(void) -{ - if (!cmma_flag) - return; - if (cmma_test_essa()) { - cmma_flag = 0; - return; - } - if (test_facility(147)) - cmma_flag = 2; -} - -static inline unsigned char get_page_state(struct page *page) -{ - unsigned char state; - - asm volatile(" .insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (state) - : "a" (page_to_phys(page)), - "i" (ESSA_GET_STATE)); - return state & 0x3f; -} - -static inline void set_page_unused(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_UNUSED)); -} - -static inline void set_page_stable_dat(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE)); -} - -static inline void 
set_page_stable_nodat(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE_NODAT)); -} - -static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - pmd_t *pmd; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (pmd_none(*pmd) || pmd_large(*pmd)) - continue; - page = virt_to_page(pmd_val(*pmd)); - set_bit(PG_arch_1, &page->flags); - } while (pmd++, addr = next, addr != end); -} - -static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - pud_t *pud; - int i; - - pud = pud_offset(p4d, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none(*pud) || pud_large(*pud)) - continue; - if (!pud_folded(*pud)) { - page = virt_to_page(pud_val(*pud)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_pmd(pud, addr, next); - } while (pud++, addr = next, addr != end); -} - -static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - p4d_t *p4d; - int i; - - p4d = p4d_offset(pgd, addr); - do { - next = p4d_addr_end(addr, end); - if (p4d_none(*p4d)) - continue; - if (!p4d_folded(*p4d)) { - page = virt_to_page(p4d_val(*p4d)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_pud(p4d, addr, next); - } while (p4d++, addr = next, addr != end); -} - -static void mark_kernel_pgd(void) -{ - unsigned long addr, next; - struct page *page; - pgd_t *pgd; - int i; - - addr = 0; - pgd = pgd_offset_k(addr); - do { - next = pgd_addr_end(addr, MODULES_END); - if (pgd_none(*pgd)) - continue; - if (!pgd_folded(*pgd)) { - page = virt_to_page(pgd_val(*pgd)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_p4d(pgd, addr, next); - } while (pgd++, addr = next, addr != MODULES_END); -} - -void __init cmma_init_nodat(void) -{ - struct memblock_region *reg; - struct page *page; - unsigned long start, end, ix; - - if (cmma_flag < 2) - return; - /* Mark pages used in kernel page tables */ - mark_kernel_pgd(); - - /* Set all kernel pages not used for page tables to stable/no-dat */ - for_each_memblock(memory, reg) { - start = memblock_region_memory_base_pfn(reg); - end = memblock_region_memory_end_pfn(reg); - page = pfn_to_page(start); - for (ix = start; ix < end; ix++, page++) { - if (__test_and_clear_bit(PG_arch_1, &page->flags)) - continue; /* skip page table pages */ - if (!list_empty(&page->lru)) - continue; /* skip free pages */ - set_page_stable_nodat(page, 0); - } - } -} +int __bootdata_preserved(cmma_flag); void arch_free_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_unused(page, order); + __set_page_unused(page_to_virt(page), 1UL << order); } void arch_alloc_page(struct page *page, int order) @@ -219,57 +26,7 @@ void arch_alloc_page(struct page *page, int order) if (!cmma_flag) return; if (cmma_flag < 2) - set_page_stable_dat(page, order); + __set_page_stable_dat(page_to_virt(page), 1UL << order); else - set_page_stable_nodat(page, order); -} - -void arch_set_page_dat(struct page *page, int order) -{ - if (!cmma_flag) - return; - set_page_stable_dat(page, order); -} - -void arch_set_page_nodat(struct page *page, int order) -{ - if (cmma_flag < 2) - return; - set_page_stable_nodat(page, order); -} - -int 
arch_test_page_nodat(struct page *page) -{ - unsigned char state; - - if (cmma_flag < 2) - return 0; - state = get_page_state(page); - return !!(state & 0x20); -} - -void arch_set_page_states(int make_stable) -{ - unsigned long flags, order, t; - struct list_head *l; - struct page *page; - struct zone *zone; - - if (!cmma_flag) - return; - if (make_stable) - drain_local_pages(NULL); - for_each_populated_zone(zone) { - spin_lock_irqsave(&zone->lock, flags); - for_each_migratetype_order(order, t) { - list_for_each(l, &zone->free_area[order].free_list[t]) { - page = list_entry(l, struct page, lru); - if (make_stable) - set_page_stable_dat(page, order); - else - set_page_unused(page, order); - } - } - spin_unlock_irqrestore(&zone->lock, flags); - } + __set_page_stable_nodat(page_to_virt(page), 1UL << order); } diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index f8c6faab41f4..631e3a4ee2de 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -4,11 +4,13 @@ * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> */ #include <linux/hugetlb.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> #include <linux/mm.h> #include <asm/cacheflush.h> #include <asm/facility.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> +#include <asm/kfence.h> #include <asm/page.h> #include <asm/set_memory.h> @@ -41,7 +43,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end) } #ifdef CONFIG_PROC_FS -atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX]; +atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); void arch_report_meminfo(struct seq_file *m) { @@ -57,7 +59,7 @@ void arch_report_meminfo(struct seq_file *m) static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, unsigned long dtt) { - unsigned long table, mask; + unsigned long *table, mask; mask = 0; if (MACHINE_HAS_EDAT2) { @@ -72,8 +74,8 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1); break; } - table = (unsigned long)old & mask; - crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce); + table = (unsigned long *)((unsigned long)old & mask); + crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce.val); } else if (MACHINE_HAS_IDTE) { cspg(old, *old, new); } else { @@ -86,7 +88,9 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, { pte_t *ptep, new; - ptep = pte_offset(pmdp, addr); + if (flags == SET_MEMORY_4K) + return 0; + ptep = pte_offset_kernel(pmdp, addr); do { new = *ptep; if (pte_none(new)) @@ -94,11 +98,19 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, if (flags & SET_MEMORY_RO) new = pte_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pte_mkwrite(pte_mkdirty(new)); + new = pte_mkwrite_novma(pte_mkdirty(new)); if (flags & SET_MEMORY_NX) - pte_val(new) |= _PAGE_NOEXEC; + new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC)); else if (flags & SET_MEMORY_X) - pte_val(new) &= ~_PAGE_NOEXEC; + new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pte_bit(new, __pgprot(_PAGE_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pte(pte_val(new) & PAGE_MASK); + new = set_pte_bit(new, PAGE_KERNEL); + if (!MACHINE_HAS_NX) + new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + } pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE); ptep++; addr += PAGE_SIZE; @@ -125,11 +137,11 @@ static int split_pmd_page(pmd_t *pmdp, unsigned long addr) prot &= 
~_PAGE_NOEXEC; ptep = pt_dir; for (i = 0; i < PTRS_PER_PTE; i++) { - pte_val(*ptep) = pte_addr | prot; + set_pte(ptep, __pte(pte_addr | prot)); pte_addr += PAGE_SIZE; ptep++; } - pmd_val(new) = __pa(pt_dir) | _SEGMENT_ENTRY; + new = __pmd(__pa(pt_dir) | _SEGMENT_ENTRY); pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE); update_page_count(PG_DIRECT_MAP_1M, -1); @@ -144,11 +156,19 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, if (flags & SET_MEMORY_RO) new = pmd_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pmd_mkwrite(pmd_mkdirty(new)); + new = pmd_mkwrite_novma(pmd_mkdirty(new)); if (flags & SET_MEMORY_NX) - pmd_val(new) |= _SEGMENT_ENTRY_NOEXEC; + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) - pmd_val(new) &= ~_SEGMENT_ENTRY_NOEXEC; + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pmd(pmd_val(new) & PMD_MASK); + new = set_pmd_bit(new, SEGMENT_KERNEL); + if (!MACHINE_HAS_NX) + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + } pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); } @@ -156,6 +176,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, unsigned long flags) { unsigned long next; + int need_split; pmd_t *pmdp; int rc = 0; @@ -165,7 +186,10 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, return -EINVAL; next = pmd_addr_end(addr, end); if (pmd_large(*pmdp)) { - if (addr & ~PMD_MASK || addr + PMD_SIZE > next) { + need_split = !!(flags & SET_MEMORY_4K); + need_split |= !!(addr & ~PMD_MASK); + need_split |= !!(addr + PMD_SIZE > next); + if (need_split) { rc = split_pmd_page(pmdp, addr); if (rc) return rc; @@ -202,11 +226,11 @@ static int split_pud_page(pud_t *pudp, unsigned long addr) prot &= ~_SEGMENT_ENTRY_NOEXEC; pmdp = pm_dir; for (i = 0; i < PTRS_PER_PMD; i++) { - pmd_val(*pmdp) = pmd_addr | prot; + set_pmd(pmdp, __pmd(pmd_addr | prot)); pmd_addr += PMD_SIZE; pmdp++; } - pud_val(new) = __pa(pm_dir) | _REGION3_ENTRY; + new = __pud(__pa(pm_dir) | _REGION3_ENTRY); pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD); update_page_count(PG_DIRECT_MAP_2G, -1); @@ -223,9 +247,17 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr, else if (flags & SET_MEMORY_RW) new = pud_mkwrite(pud_mkdirty(new)); if (flags & SET_MEMORY_NX) - pud_val(new) |= _REGION_ENTRY_NOEXEC; + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) - pud_val(new) &= ~_REGION_ENTRY_NOEXEC; + new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pud(pud_val(new) & PUD_MASK); + new = set_pud_bit(new, REGION3_KERNEL); + if (!MACHINE_HAS_NX) + new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + } pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); } @@ -233,6 +265,7 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long flags) { unsigned long next; + int need_split; pud_t *pudp; int rc = 0; @@ -242,7 +275,10 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, return -EINVAL; next = 
pud_addr_end(addr, end); if (pud_large(*pudp)) { - if (addr & ~PUD_MASK || addr + PUD_SIZE > next) { + need_split = !!(flags & SET_MEMORY_4K); + need_split |= !!(addr & ~PUD_MASK); + need_split |= !!(addr + PUD_SIZE > next); + if (need_split) { rc = split_pud_page(pudp, addr); if (rc) break; @@ -279,7 +315,7 @@ static int walk_p4d_level(pgd_t *pgd, unsigned long addr, unsigned long end, return rc; } -static DEFINE_MUTEX(cpa_mutex); +DEFINE_MUTEX(cpa_mutex); static int change_page_attr(unsigned long addr, unsigned long end, unsigned long flags) @@ -288,11 +324,6 @@ static int change_page_attr(unsigned long addr, unsigned long end, int rc = -EINVAL; pgd_t *pgdp; - if (addr == end) - return 0; - if (end >= MODULES_END) - return -EINVAL; - mutex_lock(&cpa_mutex); pgdp = pgd_offset_k(addr); do { if (pgd_none(*pgdp)) @@ -303,21 +334,79 @@ static int change_page_attr(unsigned long addr, unsigned long end, break; cond_resched(); } while (pgdp++, addr = next, addr < end && !rc); - mutex_unlock(&cpa_mutex); return rc; } -int __set_memory(unsigned long addr, int numpages, unsigned long flags) +static int change_page_attr_alias(unsigned long addr, unsigned long end, + unsigned long flags) { + unsigned long alias, offset, va_start, va_end; + struct vm_struct *area; + int rc = 0; + + /* + * Changes to read-only permissions on kernel VA mappings are also + * applied to the kernel direct mapping. Execute permissions are + * intentionally not transferred to keep all allocated pages within + * the direct mapping non-executable. + */ + flags &= SET_MEMORY_RO | SET_MEMORY_RW; + if (!flags) + return 0; + area = NULL; + while (addr < end) { + if (!area) + area = find_vm_area((void *)addr); + if (!area || !(area->flags & VM_ALLOC)) + return 0; + va_start = (unsigned long)area->addr; + va_end = va_start + area->nr_pages * PAGE_SIZE; + offset = (addr - va_start) >> PAGE_SHIFT; + alias = (unsigned long)page_address(area->pages[offset]); + rc = change_page_attr(alias, alias + PAGE_SIZE, flags); + if (rc) + break; + addr += PAGE_SIZE; + if (addr >= va_end) + area = NULL; + } + return rc; +} + +int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags) +{ + unsigned long end; + int rc; + if (!MACHINE_HAS_NX) flags &= ~(SET_MEMORY_NX | SET_MEMORY_X); if (!flags) return 0; + if (!numpages) + return 0; addr &= PAGE_MASK; - return change_page_attr(addr, addr + numpages * PAGE_SIZE, flags); + end = addr + numpages * PAGE_SIZE; + mutex_lock(&cpa_mutex); + rc = change_page_attr(addr, end, flags); + if (rc) + goto out; + rc = change_page_attr_alias(addr, end, flags); +out: + mutex_unlock(&cpa_mutex); + return rc; +} + +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_INV); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF); } -#ifdef CONFIG_DEBUG_PAGEALLOC +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) static void ipte_range(pte_t *pte, unsigned long address, int nr) { @@ -337,50 +426,27 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) void __kernel_map_pages(struct page *page, int numpages, int enable) { unsigned long address; + pte_t *ptep, pte; int nr, i, j; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; for (i = 0; i < numpages;) { - address = page_to_phys(page + i); - pgd = pgd_offset_k(address); - p4d = p4d_offset(pgd, address); - pud = pud_offset(p4d, address); - pmd = 
pmd_offset(pud, address); - pte = pte_offset_kernel(pmd, address); - nr = (unsigned long)pte >> ilog2(sizeof(long)); + address = (unsigned long)page_to_virt(page + i); + ptep = virt_to_kpte(address); + nr = (unsigned long)ptep >> ilog2(sizeof(long)); nr = PTRS_PER_PTE - (nr & (PTRS_PER_PTE - 1)); nr = min(numpages - i, nr); if (enable) { for (j = 0; j < nr; j++) { - pte_val(*pte) &= ~_PAGE_INVALID; + pte = clear_pte_bit(*ptep, __pgprot(_PAGE_INVALID)); + set_pte(ptep, pte); address += PAGE_SIZE; - pte++; + ptep++; } } else { - ipte_range(pte, address, nr); + ipte_range(ptep, address, nr); } i += nr; } } -#ifdef CONFIG_HIBERNATION -bool kernel_page_present(struct page *page) -{ - unsigned long addr; - int cc; - - addr = page_to_phys(page); - asm volatile( - " lra %1,0(%1)\n" - " ipm %0\n" - " srl %0,28" - : "=d" (cc), "+a" (addr) : : "cc"); - return cc == 0; -} -#endif /* CONFIG_HIBERNATION */ - #endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c new file mode 100644 index 000000000000..1aac13bb8f53 --- /dev/null +++ b/arch/s390/mm/pfault.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 1999, 2023 + */ + +#include <linux/cpuhotplug.h> +#include <linux/sched/task.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <asm/asm-extable.h> +#include <asm/pfault.h> +#include <asm/diag.h> + +#define __SUBCODE_MASK 0x0600 +#define __PF_RES_FIELD 0x8000000000000000UL + +/* + * 'pfault' pseudo page faults routines. + */ +static int pfault_disable; + +static int __init nopfault(char *str) +{ + pfault_disable = 1; + return 1; +} +early_param("nopfault", nopfault); + +struct pfault_refbk { + u16 refdiagc; + u16 reffcode; + u16 refdwlen; + u16 refversn; + u64 refgaddr; + u64 refselmk; + u64 refcmpmk; + u64 reserved; +}; + +static struct pfault_refbk pfault_init_refbk = { + .refdiagc = 0x258, + .reffcode = 0, + .refdwlen = 5, + .refversn = 2, + .refgaddr = __LC_LPP, + .refselmk = 1UL << 48, + .refcmpmk = 1UL << 48, + .reserved = __PF_RES_FIELD +}; + +int __pfault_init(void) +{ + int rc = -EOPNOTSUPP; + + if (pfault_disable) + return rc; + diag_stat_inc(DIAG_STAT_X258); + asm volatile( + " diag %[refbk],%[rc],0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rc] "+d" (rc) + : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) + : "cc"); + return rc; +} + +static struct pfault_refbk pfault_fini_refbk = { + .refdiagc = 0x258, + .reffcode = 1, + .refdwlen = 5, + .refversn = 2, +}; + +void __pfault_fini(void) +{ + if (pfault_disable) + return; + diag_stat_inc(DIAG_STAT_X258); + asm volatile( + " diag %[refbk],0,0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : + : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) + : "cc"); +} + +static DEFINE_SPINLOCK(pfault_lock); +static LIST_HEAD(pfault_list); + +#define PF_COMPLETE 0x0080 + +/* + * The mechanism of our pfault code: if Linux is running as guest, runs a user + * space process and the user space process accesses a page that the host has + * paged out we get a pfault interrupt. + * + * This allows us, within the guest, to schedule a different process. Without + * this mechanism the host would have to suspend the whole virtual cpu until + * the page has been paged in. + * + * So when we get such an interrupt then we set the state of the current task + * to uninterruptible and also set the need_resched flag. Both happens within + * interrupt context(!). 
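The pfault_wait handshake described here is easier to follow as a tiny state machine. A minimal user-space sketch, assuming a made-up struct task and deliver_*() helpers (0 = idle, 1 = sleeping until completion, -1 = completion overtook the initial interrupt); this is illustrative only, not kernel code:

#include <stdio.h>

/* Hypothetical stand-in for the few task_struct fields that matter here. */
struct task {
	int pfault_wait;	/* 0: idle, 1: sleeping, -1: completion came first */
	int running;		/* 1 while the task is on a CPU */
};

/* Initial interrupt: the host reports that a page is missing. */
static void deliver_initial(struct task *t)
{
	if (t->pfault_wait == -1)
		t->pfault_wait = 0;	/* completion already arrived: do not sleep */
	else
		t->pfault_wait = 1;	/* go to sleep until the completion arrives */
}

/* Completion interrupt: the host reports that the page was paged in. */
static void deliver_completion(struct task *t)
{
	if (t->pfault_wait == 1)
		t->pfault_wait = 0;	/* normal order: wake the sleeper */
	else if (t->running)
		t->pfault_wait = -1;	/* out of order: remember it for the initial irq */
}

int main(void)
{
	struct task t = { .running = 1 };

	deliver_completion(&t);	/* completion overtakes the initial interrupt */
	deliver_initial(&t);
	printf("pfault_wait=%d (0 means the task never went to sleep)\n", t.pfault_wait);
	return 0;
}
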
If we later on want to return to user space we + * recognize the need_resched flag and then call schedule(). It's not very + * obvious how this works... + * + * Of course we have a lot of additional fun with the completion interrupt (-> + * host signals that a page of a process has been paged in and the process can + * continue to run). This interrupt can arrive on any cpu and, since we have + * virtual cpus, actually appear before the interrupt that signals that a page + * is missing. + */ +static void pfault_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct task_struct *tsk; + __u16 subcode; + pid_t pid; + + /* + * Get the external interruption subcode & pfault initial/completion + * signal bit. VM stores this in the 'cpu address' field associated + * with the external interrupt. + */ + subcode = ext_code.subcode; + if ((subcode & 0xff00) != __SUBCODE_MASK) + return; + inc_irq_stat(IRQEXT_PFL); + /* Get the token (= pid of the affected task). */ + pid = param64 & LPP_PID_MASK; + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return; + spin_lock(&pfault_lock); + if (subcode & PF_COMPLETE) { + /* signal bit is set -> a page has been swapped in by VM */ + if (tsk->thread.pfault_wait == 1) { + /* + * Initial interrupt was faster than the completion + * interrupt. pfault_wait is valid. Set pfault_wait + * back to zero and wake up the process. This can + * safely be done because the task is still sleeping + * and can't produce new pfaults. + */ + tsk->thread.pfault_wait = 0; + list_del(&tsk->thread.list); + wake_up_process(tsk); + put_task_struct(tsk); + } else { + /* + * Completion interrupt was faster than initial + * interrupt. Set pfault_wait to -1 so the initial + * interrupt doesn't put the task to sleep. + * If the task is not running, ignore the completion + * interrupt since it must be a leftover of a PFAULT + * CANCEL operation which didn't remove all pending + * completion interrupts. + */ + if (task_is_running(tsk)) + tsk->thread.pfault_wait = -1; + } + } else { + /* signal bit not set -> a real page is missing. */ + if (WARN_ON_ONCE(tsk != current)) + goto out; + if (tsk->thread.pfault_wait == 1) { + /* Already on the list with a reference: put to sleep */ + goto block; + } else if (tsk->thread.pfault_wait == -1) { + /* + * Completion interrupt was faster than the initial + * interrupt (pfault_wait == -1). Set pfault_wait + * back to zero and exit. + */ + tsk->thread.pfault_wait = 0; + } else { + /* + * Initial interrupt arrived before completion + * interrupt. Let the task sleep. + * An extra task reference is needed since a different + * cpu may set the task state to TASK_RUNNING again + * before the scheduler is reached. + */ + get_task_struct(tsk); + tsk->thread.pfault_wait = 1; + list_add(&tsk->thread.list, &pfault_list); +block: + /* + * Since this must be a userspace fault, there + * is no kernel task state to trample. Rely on the + * return to userspace schedule() to block. 
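The handler tells the two interrupt flavours apart purely by subcode bits, using the __SUBCODE_MASK and PF_COMPLETE values defined in this file. A standalone decoder that follows the same tests (the helper name is invented for the sketch):

#include <stdint.h>
#include <stdio.h>

#define SUBCODE_MASK	0x0600	/* pfault class in the high subcode byte */
#define PF_COMPLETE	0x0080	/* set: page arrived, clear: page missing */

/* Returns -1 if not a pfault interrupt, 1 for completion, 0 for initial. */
static int classify_subcode(uint16_t subcode)
{
	if ((subcode & 0xff00) != SUBCODE_MASK)
		return -1;
	return (subcode & PF_COMPLETE) ? 1 : 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       classify_subcode(0x0600),	/* initial: a page is missing */
	       classify_subcode(0x0680),	/* completion: page was swapped in */
	       classify_subcode(0x1202));	/* some other external interrupt */
	return 0;
}
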
+ */ + __set_current_state(TASK_UNINTERRUPTIBLE); + set_tsk_need_resched(tsk); + set_preempt_need_resched(); + } + } +out: + spin_unlock(&pfault_lock); + put_task_struct(tsk); +} + +static int pfault_cpu_dead(unsigned int cpu) +{ + struct thread_struct *thread, *next; + struct task_struct *tsk; + + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(thread, next, &pfault_list, list) { + thread->pfault_wait = 0; + list_del(&thread->list); + tsk = container_of(thread, struct task_struct, thread); + wake_up_process(tsk); + put_task_struct(tsk); + } + spin_unlock_irq(&pfault_lock); + return 0; +} + +static int __init pfault_irq_init(void) +{ + int rc; + + rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); + if (rc) + goto out_extint; + rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; + if (rc) + goto out_pfault; + irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); + cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", + NULL, pfault_cpu_dead); + return 0; + +out_pfault: + unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); +out_extint: + pfault_disable = 1; + return rc; +} +early_initcall(pfault_irq_init); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 3dd253f81a77..008e487c94a6 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -10,6 +10,7 @@ #include <linux/slab.h> #include <linux/mm.h> #include <asm/mmu_context.h> +#include <asm/page-states.h> #include <asm/pgalloc.h> #include <asm/gmap.h> #include <asm/tlb.h> @@ -30,22 +31,11 @@ static struct ctl_table page_table_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } -}; - -static struct ctl_table page_table_sysctl_dir[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = page_table_sysctl, - }, - { } }; static int __init page_table_register_sysctl(void) { - return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM; + return register_sysctl("vm", page_table_sysctl) ? 
0 : -ENOMEM; } __initcall(page_table_register_sysctl); @@ -53,266 +43,187 @@ __initcall(page_table_register_sysctl); unsigned long *crst_table_alloc(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_KERNEL, 2); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + unsigned long *table; - if (!page) + if (!ptdesc) return NULL; - arch_set_page_dat(page, 2); - return (unsigned long *) page_to_phys(page); + table = ptdesc_to_virt(ptdesc); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + return table; } void crst_table_free(struct mm_struct *mm, unsigned long *table) { - free_pages((unsigned long) table, 2); + pagetable_free(virt_to_ptdesc(table)); } static void __crst_table_upgrade(void *arg) { struct mm_struct *mm = arg; - if (current->active_mm == mm) - set_user_asce(mm); + /* change all active ASCEs to avoid the creation of new TLBs */ + if (current->active_mm == mm) { + S390_lowcore.user_asce.val = mm->context.asce; + local_ctl_load(7, &S390_lowcore.user_asce); + } __tlb_flush_local(); } int crst_table_upgrade(struct mm_struct *mm, unsigned long end) { - unsigned long *table, *pgd; - int rc, notify; + unsigned long *pgd = NULL, *p4d = NULL, *__pgd; + unsigned long asce_limit = mm->context.asce_limit; /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ - VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE); - rc = 0; - notify = 0; - while (mm->context.asce_limit < end) { - table = crst_table_alloc(mm); - if (!table) { - rc = -ENOMEM; - break; - } - spin_lock_bh(&mm->page_table_lock); - pgd = (unsigned long *) mm->pgd; - if (mm->context.asce_limit == _REGION2_SIZE) { - crst_table_init(table, _REGION2_ENTRY_EMPTY); - p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->context.asce_limit = _REGION1_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION2; - mm_inc_nr_puds(mm); - } else { - crst_table_init(table, _REGION1_ENTRY_EMPTY); - pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->context.asce_limit = -PAGE_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION1; - } - notify = 1; - spin_unlock_bh(&mm->page_table_lock); - } - if (notify) - on_each_cpu(__crst_table_upgrade, mm, 0); - return rc; -} + VM_BUG_ON(asce_limit < _REGION2_SIZE); -void crst_table_downgrade(struct mm_struct *mm) -{ - pgd_t *pgd; + if (end <= asce_limit) + return 0; - /* downgrade should only happen from 3 to 2 levels (compat only) */ - VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE); + if (asce_limit == _REGION2_SIZE) { + p4d = crst_table_alloc(mm); + if (unlikely(!p4d)) + goto err_p4d; + crst_table_init(p4d, _REGION2_ENTRY_EMPTY); + } + if (end > _REGION1_SIZE) { + pgd = crst_table_alloc(mm); + if (unlikely(!pgd)) + goto err_pgd; + crst_table_init(pgd, _REGION1_ENTRY_EMPTY); + } - if (current->active_mm == mm) { - clear_user_asce(); - __tlb_flush_mm(mm); + spin_lock_bh(&mm->page_table_lock); + + /* + * This routine gets called with mmap_lock lock held and there is + * no reason to optimize for the case of otherwise. However, if + * that would ever change, the below check will let us know. 
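The upgrade itself is a "grow upwards" step: a freshly initialized table becomes the new top level, and one of its entries points at the old top, so every existing translation stays reachable throughout. A toy user-space analogue with a plain pointer table standing in for region-table entries (all names invented, error handling reduced to the minimum):

#include <stdlib.h>
#include <stdio.h>

#define SLOTS 512

struct aspace {
	void **root;	/* top-level translation table */
	int levels;	/* current number of translation levels */
};

/* Grow by one level: the new root's slot 0 points at the old root. */
static int upgrade_one_level(struct aspace *as)
{
	void **table = calloc(SLOTS, sizeof(*table));

	if (!table)
		return -1;
	table[0] = as->root;	/* old translations remain reachable */
	as->root = table;
	as->levels++;
	return 0;
}

int main(void)
{
	struct aspace as = { .root = calloc(SLOTS, sizeof(void *)), .levels = 3 };

	while (as.levels < 5)	/* mirror the 3 -> 4 or 4 -> 5 level upgrade */
		if (upgrade_one_level(&as))
			return 1;
	printf("levels=%d\n", as.levels);
	return 0;
}
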
+ */ + VM_BUG_ON(asce_limit != mm->context.asce_limit); + + if (p4d) { + __pgd = (unsigned long *) mm->pgd; + p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd); + mm->pgd = (pgd_t *) p4d; + mm->context.asce_limit = _REGION1_SIZE; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + mm_inc_nr_puds(mm); + } + if (pgd) { + __pgd = (unsigned long *) mm->pgd; + pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd); + mm->pgd = (pgd_t *) pgd; + mm->context.asce_limit = TASK_SIZE_MAX; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION1; } - pgd = mm->pgd; - mm_dec_nr_pmds(mm); - mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); - mm->context.asce_limit = _REGION3_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; - crst_table_free(mm, (unsigned long *) pgd); + spin_unlock_bh(&mm->page_table_lock); - if (current->active_mm == mm) - set_user_asce(mm); -} + on_each_cpu(__crst_table_upgrade, mm, 0); -static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) -{ - unsigned int old, new; + return 0; - do { - old = atomic_read(v); - new = old ^ bits; - } while (atomic_cmpxchg(v, old, new) != old); - return new; +err_pgd: + crst_table_free(mm, p4d); +err_p4d: + return -ENOMEM; } #ifdef CONFIG_PGSTE struct page *page_table_alloc_pgste(struct mm_struct *mm) { - struct page *page; + struct ptdesc *ptdesc; u64 *table; - page = alloc_page(GFP_KERNEL); - if (page) { - table = (u64 *)page_to_phys(page); + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (ptdesc) { + table = (u64 *)ptdesc_to_virt(ptdesc); + __arch_set_page_dat(table, 1); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } - return page; + return ptdesc_page(ptdesc); } void page_table_free_pgste(struct page *page) { - __free_page(page); + pagetable_free(page_ptdesc(page)); } #endif /* CONFIG_PGSTE */ -/* - * page table entry allocation/free routines. 
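Both the PGSTE and the regular allocation path lay one 4K frame out as two halves, 256 page-table entries followed by 256 PGSTEs, initialized with a pair of memset64() calls. A self-contained sketch of that layout; the invalid-bit value below is a stand-in, the authoritative _PAGE_INVALID definition lives in asm/pgtable.h:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PTRS_PER_PTE	256
#define PAGE_INVALID	0x400UL	/* stand-in for _PAGE_INVALID */

static void memset64(uint64_t *s, uint64_t v, size_t count)
{
	while (count--)
		*s++ = v;
}

int main(void)
{
	static uint64_t frame[2 * PTRS_PER_PTE];	/* one 4K page */

	memset64(frame, PAGE_INVALID, PTRS_PER_PTE);		/* ptes: invalid */
	memset64(frame + PTRS_PER_PTE, 0, PTRS_PER_PTE);	/* pgstes: clear */
	printf("pte[0]=%#llx pgste[0]=%#llx\n",
	       (unsigned long long)frame[0],
	       (unsigned long long)frame[PTRS_PER_PTE]);
	return 0;
}
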
- */ unsigned long *page_table_alloc(struct mm_struct *mm) { + struct ptdesc *ptdesc; unsigned long *table; - struct page *page; - unsigned int mask, bit; - - /* Try to get a fragment of a 4K page as a 2K page table */ - if (!mm_alloc_pgste(mm)) { - table = NULL; - spin_lock_bh(&mm->context.lock); - if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - mask = atomic_read(&page->_refcount) >> 24; - mask = (mask | (mask >> 4)) & 3; - if (mask != 3) { - table = (unsigned long *) page_to_phys(page); - bit = mask & 1; /* =1 -> second 2K */ - if (bit) - table += PTRS_PER_PTE; - atomic_xor_bits(&page->_refcount, - 1U << (bit + 24)); - list_del(&page->lru); - } - } - spin_unlock_bh(&mm->context.lock); - if (table) - return table; - } - /* Allocate a fresh page */ - page = alloc_page(GFP_KERNEL); - if (!page) + + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - arch_set_page_dat(page, 0); - /* Initialize page table */ - table = (unsigned long *) page_to_phys(page); - if (mm_alloc_pgste(mm)) { - /* Return 4K page table with PGSTEs */ - atomic_xor_bits(&page->_refcount, 3 << 24); - memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); - memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); - } else { - /* Return the first 2K fragment of the page */ - atomic_xor_bits(&page->_refcount, 1 << 24); - memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); - spin_lock_bh(&mm->context.lock); - list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.lock); - } + table = ptdesc_to_virt(ptdesc); + __arch_set_page_dat(table, 1); + /* pt_list is used by gmap only */ + INIT_LIST_HEAD(&ptdesc->pt_list); + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); return table; } +static void pagetable_pte_dtor_free(struct ptdesc *ptdesc) +{ + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); +} + void page_table_free(struct mm_struct *mm, unsigned long *table) { - struct page *page; - unsigned int bit, mask; - - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - if (!mm_alloc_pgste(mm)) { - /* Free 2K page table fragment of a 4K page */ - bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24)); - mask >>= 24; - if (mask & 3) - list_add(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.lock); - if (mask != 0) - return; - } else { - atomic_xor_bits(&page->_refcount, 3U << 24); - } + struct ptdesc *ptdesc = virt_to_ptdesc(table); - pgtable_pte_page_dtor(page); - __free_page(page); + pagetable_pte_dtor_free(ptdesc); } -void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, - unsigned long vmaddr) +void __tlb_remove_table(void *table) { - struct mm_struct *mm; - struct page *page; - unsigned int bit, mask; - - mm = tlb->mm; - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - if (mm_alloc_pgste(mm)) { - gmap_unlink(mm, table, vmaddr); - table = (unsigned long *) (__pa(table) | 3); - tlb_remove_table(tlb, table); + struct ptdesc *ptdesc = virt_to_ptdesc(table); + struct page *page = ptdesc_page(ptdesc); + + if (compound_order(page) == CRST_ALLOC_ORDER) { + /* pmd, pud, or p4d */ + pagetable_free(ptdesc); return; } - bit = (__pa(table) & ~PAGE_MASK) / 
(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); - mask >>= 24; - if (mask & 3) - list_add_tail(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.lock); - table = (unsigned long *) (__pa(table) | (1U << bit)); - tlb_remove_table(tlb, table); + pagetable_pte_dtor_free(ptdesc); } -void __tlb_remove_table(void *_table) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pte_free_now(struct rcu_head *head) { - unsigned int mask = (unsigned long) _table & 3; - void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head); - switch (mask) { - case 0: /* pmd, pud, or p4d */ - free_pages((unsigned long) table, 2); - break; - case 1: /* lower 2K of a 4K page table */ - case 2: /* higher 2K of a 4K page table */ - mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); - mask >>= 24; - if (mask != 0) - break; - /* fallthrough */ - case 3: /* 4K page table with pgstes */ - if (mask & 3) - atomic_xor_bits(&page->_refcount, 3 << 24); - pgtable_pte_page_dtor(page); - __free_page(page); - break; - } + pagetable_pte_dtor_free(ptdesc); +} + +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); + + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); + /* + * THPs are not allowed for KVM guests. Warn if pgste ever reaches here. + * Turn to the generic pte_free_defer() version once gmap is removed. + */ + WARN_ON_ONCE(mm_has_pgste(mm)); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * Base infrastructure required to generate basic asces, region, segment, @@ -321,34 +232,37 @@ void __tlb_remove_table(void *_table) static struct kmem_cache *base_pgt_cache; -static unsigned long base_pgt_alloc(void) +static unsigned long *base_pgt_alloc(void) { - u64 *table; + unsigned long *table; table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL); if (table) - memset64(table, _PAGE_INVALID, PTRS_PER_PTE); - return (unsigned long) table; + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + return table; } -static void base_pgt_free(unsigned long table) +static void base_pgt_free(unsigned long *table) { - kmem_cache_free(base_pgt_cache, (void *) table); + kmem_cache_free(base_pgt_cache, table); } -static unsigned long base_crst_alloc(unsigned long val) +static unsigned long *base_crst_alloc(unsigned long val) { - unsigned long table; + unsigned long *table; + struct ptdesc *ptdesc; - table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); - if (table) - crst_table_init((unsigned long *)table, val); + ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + crst_table_init(table, val); return table; } -static void base_crst_free(unsigned long table) +static void base_crst_free(unsigned long *table) { - free_pages(table, CRST_ALLOC_ORDER); + pagetable_free(virt_to_ptdesc(table)); } #define BASE_ADDR_END_FUNC(NAME, SIZE) \ @@ -376,14 +290,14 @@ static inline unsigned long base_lra(unsigned long address) return real; } -static int base_page_walk(unsigned long origin, unsigned long addr, +static int base_page_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { unsigned long *pte, next; if (!alloc) return 0; - pte = (unsigned long *) origin; + pte = origin; pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT; do { next = 
base_page_addr_end(addr, end); @@ -392,13 +306,13 @@ static int base_page_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_segment_walk(unsigned long origin, unsigned long addr, +static int base_segment_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *ste, next, table; + unsigned long *ste, next, *table; int rc; - ste = (unsigned long *) origin; + ste = origin; ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; do { next = base_segment_addr_end(addr, end); @@ -408,9 +322,9 @@ static int base_segment_walk(unsigned long origin, unsigned long addr, table = base_pgt_alloc(); if (!table) return -ENOMEM; - *ste = table | _SEGMENT_ENTRY; + *ste = __pa(table) | _SEGMENT_ENTRY; } - table = *ste & _SEGMENT_ENTRY_ORIGIN; + table = __va(*ste & _SEGMENT_ENTRY_ORIGIN); rc = base_page_walk(table, addr, next, alloc); if (rc) return rc; @@ -421,13 +335,13 @@ static int base_segment_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_region3_walk(unsigned long origin, unsigned long addr, +static int base_region3_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *rtte, next, table; + unsigned long *rtte, next, *table; int rc; - rtte = (unsigned long *) origin; + rtte = origin; rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT; do { next = base_region3_addr_end(addr, end); @@ -437,9 +351,9 @@ static int base_region3_walk(unsigned long origin, unsigned long addr, table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!table) return -ENOMEM; - *rtte = table | _REGION3_ENTRY; + *rtte = __pa(table) | _REGION3_ENTRY; } - table = *rtte & _REGION_ENTRY_ORIGIN; + table = __va(*rtte & _REGION_ENTRY_ORIGIN); rc = base_segment_walk(table, addr, next, alloc); if (rc) return rc; @@ -449,13 +363,13 @@ static int base_region3_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_region2_walk(unsigned long origin, unsigned long addr, +static int base_region2_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *rste, next, table; + unsigned long *rste, next, *table; int rc; - rste = (unsigned long *) origin; + rste = origin; rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT; do { next = base_region2_addr_end(addr, end); @@ -465,9 +379,9 @@ static int base_region2_walk(unsigned long origin, unsigned long addr, table = base_crst_alloc(_REGION3_ENTRY_EMPTY); if (!table) return -ENOMEM; - *rste = table | _REGION2_ENTRY; + *rste = __pa(table) | _REGION2_ENTRY; } - table = *rste & _REGION_ENTRY_ORIGIN; + table = __va(*rste & _REGION_ENTRY_ORIGIN); rc = base_region3_walk(table, addr, next, alloc); if (rc) return rc; @@ -477,13 +391,13 @@ static int base_region2_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_region1_walk(unsigned long origin, unsigned long addr, +static int base_region1_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *rfte, next, table; + unsigned long *rfte, next, *table; int rc; - rfte = (unsigned long *) origin; + rfte = origin; rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT; do { next = base_region1_addr_end(addr, end); @@ -493,9 +407,9 @@ static int base_region1_walk(unsigned long origin, unsigned long addr, table = base_crst_alloc(_REGION2_ENTRY_EMPTY); if (!table) return -ENOMEM; - *rfte = table | _REGION1_ENTRY; + *rfte = __pa(table) | _REGION1_ENTRY; } - table = *rfte & _REGION_ENTRY_ORIGIN; + table = __va(*rfte & 
_REGION_ENTRY_ORIGIN); rc = base_region2_walk(table, addr, next, alloc); if (rc) return rc; @@ -514,7 +428,7 @@ static int base_region1_walk(unsigned long origin, unsigned long addr, */ void base_asce_free(unsigned long asce) { - unsigned long table = asce & _ASCE_ORIGIN; + unsigned long *table = __va(asce & _ASCE_ORIGIN); if (!asce) return; @@ -529,7 +443,7 @@ void base_asce_free(unsigned long asce) base_region2_walk(table, 0, _REGION1_SIZE, 0); break; case _ASCE_TYPE_REGION1: - base_region1_walk(table, 0, -_PAGE_SIZE, 0); + base_region1_walk(table, 0, TASK_SIZE_MAX, 0); break; } base_crst_free(table); @@ -566,7 +480,7 @@ static int base_pgt_cache_init(void) */ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages) { - unsigned long asce, table, end; + unsigned long asce, *table, end; int rc; if (base_pgt_cache_init()) @@ -577,25 +491,25 @@ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages) if (!table) return 0; rc = base_segment_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH; } else if (end <= _REGION2_SIZE) { table = base_crst_alloc(_REGION3_ENTRY_EMPTY); if (!table) return 0; rc = base_region3_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; } else if (end <= _REGION1_SIZE) { table = base_crst_alloc(_REGION2_ENTRY_EMPTY); if (!table) return 0; rc = base_region2_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; } else { table = base_crst_alloc(_REGION1_ENTRY_EMPTY); if (!table) return 0; rc = base_region1_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH; } if (rc) { base_asce_free(asce); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 9ebd01219812..99422926efe1 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -19,13 +19,31 @@ #include <linux/ksm.h> #include <linux/mman.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/page-states.h> +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + /* + * mio_wb_bit_mask may be set on a different CPU, but it is only set + * once at init and only read afterwards. + */ + return __pgprot(pgprot_val(prot) | mio_wb_bit_mask); +} +EXPORT_SYMBOL_GPL(pgprot_writecombine); + +pgprot_t pgprot_writethrough(pgprot_t prot) +{ + /* + * mio_wb_bit_mask may be set on a different CPU, but it is only set + * once at init and only read afterwards. 
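The synchronization story for mio_wb_bit_mask is exactly the comment's claim: written once before any reader exists, read-only afterwards, so the read side needs no lock or barrier. A user-space sketch of the same publish-once discipline (names and the chosen bit are invented):

#include <stdio.h>

/* Set exactly once before any reader runs; never written again. */
static unsigned long wb_bit_mask;

static void init_once(void)	/* runs single-threaded, like an early initcall */
{
	wb_bit_mask = 1UL << 11;	/* arbitrary bit for the sketch */
}

static unsigned long make_writecombine(unsigned long prot)
{
	return prot | wb_bit_mask;	/* plain read is safe: no writer exists */
}

static unsigned long make_writethrough(unsigned long prot)
{
	return prot & ~wb_bit_mask;
}

int main(void)
{
	init_once();
	printf("%#lx %#lx\n", make_writecombine(0x3), make_writethrough(0x803));
	return 0;
}
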
+ */ + return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask); +} +EXPORT_SYMBOL_GPL(pgprot_writethrough); + static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int nodat) { @@ -97,7 +115,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (cpumask_equal(&mm->context.cpu_attach_mask, cpumask_of(smp_processor_id()))) { - pte_val(*ptep) |= _PAGE_INVALID; + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID))); mm->context.flush_mm = 1; } else ptep_ipte_global(mm, addr, ptep, nodat); @@ -107,32 +125,23 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, static inline pgste_t pgste_get_lock(pte_t *ptep) { - unsigned long new = 0; + unsigned long value = 0; #ifdef CONFIG_PGSTE - unsigned long old; - - asm( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " nihh %0,0xff7f\n" /* clear PCL bit in old */ - " oihh %1,0x0080\n" /* set PCL bit in new */ - " csg %0,%1,%2\n" - " jl 0b\n" - : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE]) - : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory"); + unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE); + + do { + value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr); + } while (value & PGSTE_PCL_BIT); + value |= PGSTE_PCL_BIT; #endif - return __pgste(new); + return __pgste(value); } static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) { #ifdef CONFIG_PGSTE - asm( - " nihh %1,0xff7f\n" /* clear PCL bit */ - " stg %1,%0\n" - : "=Q" (ptep[PTRS_PER_PTE]) - : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) - : "cc", "memory"); + barrier(); + WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT); #endif } @@ -206,15 +215,15 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) * Without enhanced suppression-on-protection force * the dirty bit on for all writable ptes. */ - pte_val(entry) |= _PAGE_DIRTY; - pte_val(entry) &= ~_PAGE_PROTECT; + entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY)); + entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT)); } if (!(pte_val(entry) & _PAGE_PROTECT)) /* This pte allows write access, set user-dirty */ pgste_val(pgste) |= PGSTE_UC_BIT; } #endif - *ptep = entry; + set_pte(ptep, entry); return pgste; } @@ -257,12 +266,12 @@ static inline pte_t ptep_xchg_commit(struct mm_struct *mm, pgste = pgste_update_all(old, pgste, mm); if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) - pte_val(old) |= _PAGE_UNUSED; + old = set_pte_bit(old, __pgprot(_PAGE_UNUSED)); } pgste = pgste_set_pte(ptep, pgste, new); pgste_set_unlock(ptep, pgste); } else { - *ptep = new; + set_pte(ptep, new); } return old; } @@ -284,6 +293,31 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_xchg_direct); +/* + * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that + * RDP can be used instead of IPTE. See also comments at pte_allow_rdp(). + */ +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new) +{ + preempt_disable(); + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + __ptep_rdp(addr, ptep, 0, 0, 1); + else + __ptep_rdp(addr, ptep, 0, 0, 0); + /* + * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That + * means it is still valid and active, and must not be changed according + * to the architecture. But writing a new value that only differs in SW + * bits is allowed. 
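In other words, RDP is only legal when the old and new PTE differ in the DAT-protection bit or in software bits; any other change still requires a full IPTE. A hedged sketch of such an eligibility test; the mask values here are placeholders, and the authoritative rules live in pte_allow_rdp():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_PROTECT	0x200UL		/* stand-in for _PAGE_PROTECT */
#define PAGE_SW_BITS	0x0ffUL		/* stand-in for the software-defined bits */

/* True if old -> new only touches bits that RDP is allowed to change. */
static bool allow_rdp(uint64_t old, uint64_t new)
{
	return ((old ^ new) & ~(PAGE_PROTECT | PAGE_SW_BITS)) == 0;
}

int main(void)
{
	printf("%d\n", allow_rdp(0x1000 | PAGE_PROTECT, 0x1000));	/* 1: unprotect */
	printf("%d\n", allow_rdp(0x1000, 0x2000));			/* 0: new frame */
	return 0;
}
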
+ */ + set_pte(ptep, new); + atomic_dec(&mm->context.flush_count); + preempt_enable(); +} +EXPORT_SYMBOL(ptep_reset_dat_prot); + pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t new) { @@ -327,14 +361,14 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, struct mm_struct *mm = vma->vm_mm; if (!MACHINE_HAS_NX) - pte_val(pte) &= ~_PAGE_NOEXEC; + pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC)); if (mm_has_pgste(mm)) { pgste = pgste_get(ptep); pgste_set_key(ptep, pgste, pte, mm); pgste = pgste_set_pte(ptep, pgste, pte); pgste_set_unlock(ptep, pgste); } else { - *ptep = pte; + set_pte(ptep, pte); } preempt_enable(); } @@ -399,7 +433,7 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (cpumask_equal(&mm->context.cpu_attach_mask, cpumask_of(smp_processor_id()))) { - pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID))); mm->context.flush_mm = 1; if (mm_has_pgste(mm)) gmap_pmdp_invalidate(mm, addr); @@ -411,22 +445,36 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, } #ifdef CONFIG_PGSTE -static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) +static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) { + struct vm_area_struct *vma; pgd_t *pgd; p4d_t *p4d; pud_t *pud; - pmd_t *pmd; + + /* We need a valid VMA, otherwise this is clearly a fault. */ + vma = vma_lookup(mm, addr); + if (!vma) + return -EFAULT; pgd = pgd_offset(mm, addr); - p4d = p4d_alloc(mm, pgd, addr); - if (!p4d) - return NULL; - pud = pud_alloc(mm, p4d, addr); - if (!pud) - return NULL; - pmd = pmd_alloc(mm, pud, addr); - return pmd; + if (!pgd_present(*pgd)) + return -ENOENT; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return -ENOENT; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return -ENOENT; + + /* Large PUDs are not supported yet. 
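pmd_lookup() establishes a tri-state convention: -EFAULT when the address can never be valid, -ENOENT when nothing is mapped yet (so the storage key is implicitly zero), and 0 when a pmd exists. A toy lookup over a one-level table with the same convention (the table shape and names are invented):

#include <errno.h>
#include <stdio.h>

#define NSLOTS 16

static void *table[NSLOTS];
static int limit = 8;	/* addresses >= limit are not part of the space */

static int toy_lookup(int addr, void ***slotp)
{
	if (addr < 0 || addr >= limit)
		return -EFAULT;	/* no VMA: a real fault */
	if (!table[addr])
		return -ENOENT;	/* valid but unpopulated: caller may treat as key 0 */
	*slotp = &table[addr];
	return 0;
}

int main(void)
{
	void **slot;

	table[2] = &limit;	/* populate one slot with an arbitrary pointer */
	printf("%d %d %d\n", toy_lookup(12, &slot), toy_lookup(3, &slot),
	       toy_lookup(2, &slot));	/* on Linux: -14 -2 0 */
	return 0;
}
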
*/ + if (pud_large(*pud)) + return -EFAULT; + + *pmdp = pmd_offset(pud, addr); + return 0; } #endif @@ -437,7 +485,7 @@ pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, preempt_disable(); old = pmdp_flush_direct(mm, addr, pmdp); - *pmdp = new; + set_pmd(pmdp, new); preempt_enable(); return old; } @@ -450,7 +498,7 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, preempt_disable(); old = pmdp_flush_lazy(mm, addr, pmdp); - *pmdp = new; + set_pmd(pmdp, new); preempt_enable(); return old; } @@ -507,7 +555,7 @@ pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr, preempt_disable(); old = pudp_flush_direct(mm, addr, pudp); - *pudp = new; + set_pud(pudp, new); preempt_enable(); return old; } @@ -547,9 +595,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) list_del(lh); } ptep = (pte_t *) pgtable; - pte_val(*ptep) = _PAGE_INVALID; + set_pte(ptep, __pte(_PAGE_INVALID)); ptep++; - pte_val(*ptep) = _PAGE_INVALID; + set_pte(ptep, __pte(_PAGE_INVALID)); return pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -614,12 +662,12 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, if (prot == PROT_NONE && !pte_i) { ptep_flush_direct(mm, addr, ptep, nodat); pgste = pgste_update_all(entry, pgste, mm); - pte_val(entry) |= _PAGE_INVALID; + entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID)); } if (prot == PROT_READ && !pte_p) { ptep_flush_direct(mm, addr, ptep, nodat); - pte_val(entry) &= ~_PAGE_INVALID; - pte_val(entry) |= _PAGE_PROTECT; + entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID)); + entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT)); } pgste_val(pgste) |= bit; pgste = pgste_set_pte(ptep, pgste, entry); @@ -643,8 +691,8 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, !(pte_val(pte) & _PAGE_PROTECT))) { pgste_val(spgste) |= PGSTE_VSIE_BIT; tpgste = pgste_get_lock(tptep); - pte_val(tpte) = (pte_val(spte) & PAGE_MASK) | - (pte_val(pte) & _PAGE_PROTECT); + tpte = __pte((pte_val(spte) & PAGE_MASK) | + (pte_val(pte) & _PAGE_PROTECT)); /* don't touch the storage key - it belongs to parent pgste */ tpgste = pgste_set_pte(tptep, tpgste, tpte); pgste_set_unlock(tptep, tpgste); @@ -673,7 +721,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) if (!non_swap_entry(entry)) dec_mm_counter(mm, MM_SWAPENTS); else if (is_migration_entry(entry)) { - struct page *page = migration_entry_to_page(entry); + struct page *page = pfn_swap_entry_to_page(entry); dec_mm_counter(mm, mm_counter(page)); } @@ -699,7 +747,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, pte_clear(mm, addr, ptep); } if (reset) - pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; + pgste_val(pgste) &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); pgste_set_unlock(ptep, pgste); preempt_enable(); } @@ -716,7 +764,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT; ptev = pte_val(*ptep); if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) - page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1); + page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); pgste_set_unlock(ptep, pgste); preempt_enable(); } @@ -741,10 +789,10 @@ bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); ptep_ipte_global(mm, addr, ptep, nodat); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) - pte_val(pte) |= _PAGE_PROTECT; + pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); else - 
pte_val(pte) |= _PAGE_INVALID; - *ptep = pte; + pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID)); + set_pte(ptep, pte); } pgste_set_unlock(ptep, pgste); return dirty; @@ -760,14 +808,23 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp; pte_t *ptep; - pmdp = pmd_alloc_map(mm, addr); - if (unlikely(!pmdp)) + /* + * If we don't have a PTE table and if there is no huge page mapped, + * we can ignore attempts to set the key to 0, because it already is 0. + */ + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return key ? -EFAULT : 0; + case 0: + break; + default: return -EFAULT; - + } +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); - return -EFAULT; + return key ? -EFAULT : 0; } if (pmd_large(*pmdp)) { @@ -783,10 +840,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } spin_unlock(ptl); - ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; new = old = pgste_get_lock(ptep); pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | PGSTE_ACC_BITS | PGSTE_FP_BIT); @@ -816,7 +872,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(set_guest_storage_key); -/** +/* * Conditionally set a guest storage key (handling csske). * oldkey will be updated when either mr or mc is set and a pointer is given. * @@ -849,7 +905,7 @@ int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(cond_set_guest_storage_key); -/** +/* * Reset a guest reference bit (rrbe), returning the reference and changed bit. * * Returns < 0 in case of error, otherwise the cc to be reported to the guest. @@ -863,14 +919,23 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) pte_t *ptep; int cc = 0; - pmdp = pmd_alloc_map(mm, addr); - if (unlikely(!pmdp)) + /* + * If we don't have a PTE table and if there is no huge page mapped, + * the storage key is 0 and there is nothing for us to do. + */ + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return 0; + case 0: + break; + default: return -EFAULT; - + } +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); - return -EFAULT; + return 0; } if (pmd_large(*pmdp)) { @@ -882,10 +947,9 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) } spin_unlock(ptl); - ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; new = old = pgste_get_lock(ptep); /* Reset guest reference bit only */ pgste_val(new) &= ~PGSTE_GR_BIT; @@ -917,15 +981,24 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp; pte_t *ptep; - pmdp = pmd_alloc_map(mm, addr); - if (unlikely(!pmdp)) - return -EFAULT; + /* + * If we don't have a PTE table and if there is no huge page mapped, + * the storage key is 0. 
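The key accessors now share one retry shape: walk to the pmd once, then loop, and if the PTE table disappears between dropping the pmd lock and pte_offset_map_lock(), restart at the again: label. A compact user-space rendering of that optimistic retry, with a flag standing in for the racing free (all names invented):

#include <stdbool.h>
#include <stdio.h>

static bool table_present = true;

/* Pretend mapper: fails once to simulate a concurrently freed PTE table. */
static bool map_pte_table(void)
{
	if (!table_present) {
		table_present = true;	/* somebody re-created the table */
		return false;		/* caller must retry from the top */
	}
	return true;
}

static int access_key(void)
{
	int tries = 0;

again:
	tries++;
	/* lock pmd, re-check state ... */
	if (!map_pte_table())
		goto again;	/* raced with a free: restart the walk */
	/* ... read or update the storage key under the pte lock */
	return tries;
}

int main(void)
{
	table_present = false;	/* force one retry */
	printf("succeeded after %d tries\n", access_key());
	return 0;
}
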
+ */ + *key = 0; + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return 0; + case 0: + break; + default: + return -EFAULT; + } +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { - /* Not yet mapped memory has a zero key */ spin_unlock(ptl); - *key = 0; return 0; } @@ -938,10 +1011,9 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, } spin_unlock(ptl); - ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; pgste = pgste_get_lock(ptep); *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; paddr = pte_val(*ptep) & PAGE_MASK; @@ -970,6 +1042,7 @@ EXPORT_SYMBOL(get_guest_storage_key); int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, unsigned long *oldpte, unsigned long *oldpgste) { + struct vm_area_struct *vma; unsigned long pgstev; spinlock_t *ptl; pgste_t pgste; @@ -979,6 +1052,10 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, WARN_ON_ONCE(orc > ESSA_MAX); if (unlikely(orc > ESSA_MAX)) return -EINVAL; + + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -1071,10 +1148,14 @@ EXPORT_SYMBOL(pgste_perform_essa); int set_pgste_bits(struct mm_struct *mm, unsigned long hva, unsigned long bits, unsigned long value) { + struct vm_area_struct *vma; spinlock_t *ptl; pgste_t new; pte_t *ptep; + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -1099,9 +1180,13 @@ EXPORT_SYMBOL(set_pgste_bits); */ int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) { + struct vm_area_struct *vma; spinlock_t *ptl; pte_t *ptep; + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index b403fa14847d..186a020857cf 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 2006 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ +#include <linux/memory_hotplug.h> #include <linux/memblock.h> #include <linux/pfn.h> #include <linux/mm.h> @@ -11,9 +11,12 @@ #include <linux/list.h> #include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/sort.h> +#include <asm/page-states.h> #include <asm/cacheflush.h> +#include <asm/nospec-branch.h> +#include <asm/ctlreg.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/setup.h> #include <asm/tlbflush.h> #include <asm/sections.h> @@ -21,21 +24,22 @@ static DEFINE_MUTEX(vmem_mutex); -struct memory_segment { - struct list_head list; - unsigned long start; - unsigned long size; -}; - -static LIST_HEAD(mem_segs); - static void __ref *vmem_alloc_pages(unsigned int order) { unsigned long size = PAGE_SIZE << order; if (slab_is_available()) return (void *)__get_free_pages(GFP_KERNEL, order); - return (void *) memblock_phys_alloc(size, size); + return memblock_alloc(size, size); +} + +static void vmem_free_pages(unsigned long addr, int order) +{ + /* We don't expect boot memory to be removed ever. 
*/ + if (!slab_is_available() || + WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr)))) + return; + free_pages(addr, order); } void *vmem_crst_alloc(unsigned long val) @@ -43,8 +47,10 @@ void *vmem_crst_alloc(unsigned long val) unsigned long *table; table = vmem_alloc_pages(CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + if (!table) + return NULL; + crst_table_init(table, val); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); return table; } @@ -56,389 +62,613 @@ pte_t __ref *vmem_pte_alloc(void) if (slab_is_available()) pte = (pte_t *) page_table_alloc(&init_mm); else - pte = (pte_t *) memblock_phys_alloc(size, size); + pte = (pte_t *) memblock_alloc(size, size); if (!pte) return NULL; memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + __arch_set_page_dat(pte, 1); return pte; } +static void vmem_pte_free(unsigned long *table) +{ + /* We don't expect boot memory to be removed ever. */ + if (!slab_is_available() || + WARN_ON_ONCE(PageReserved(virt_to_page(table)))) + return; + page_table_free(&init_mm, table); +} + +#define PAGE_UNUSED 0xFD + /* - * Add a physical memory range to the 1:1 mapping. + * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges + * from unused_sub_pmd_start to next PMD_SIZE boundary. */ -static int vmem_add_mem(unsigned long start, unsigned long size) -{ - unsigned long pgt_prot, sgt_prot, r3_prot; - unsigned long pages4k, pages1m, pages2g; - unsigned long end = start + size; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - int ret = -ENOMEM; +static unsigned long unused_sub_pmd_start; + +static void vmemmap_flush_unused_sub_pmd(void) +{ + if (!unused_sub_pmd_start) + return; + memset((void *)unused_sub_pmd_start, PAGE_UNUSED, + ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start); + unused_sub_pmd_start = 0; +} - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - r3_prot = pgprot_val(REGION3_KERNEL); - if (!MACHINE_HAS_NX) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; - r3_prot &= ~_REGION_ENTRY_NOEXEC; +static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end) +{ + /* + * As we expect to add in the same granularity as we remove, it's + * sufficient to mark only some piece used to block the memmap page from + * getting removed (just in case the memmap never gets initialized, + * e.g., because the memory block never gets onlined). + */ + memset((void *)start, 0, sizeof(struct page)); +} + +static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end) +{ + /* + * We only optimize if the new used range directly follows the + * previously unused range (esp., when populating consecutive sections). 
+ */ + if (unused_sub_pmd_start == start) { + unused_sub_pmd_start = end; + if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE))) + unused_sub_pmd_start = 0; + return; } - pages4k = pages1m = pages2g = 0; - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); - if (!p4_dir) - goto out; - pgd_populate(&init_mm, pg_dir, p4_dir); - } - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); - if (!pu_dir) - goto out; - p4d_populate(&init_mm, p4_dir, pu_dir); - } - pu_dir = pud_offset(p4_dir, address); - if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address && - !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) && - !debug_pagealloc_enabled()) { - pud_val(*pu_dir) = address | r3_prot; - address += PUD_SIZE; - pages2g++; - continue; - } - if (pud_none(*pu_dir)) { - pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); - if (!pm_dir) - goto out; - pud_populate(&init_mm, pu_dir, pm_dir); - } - pm_dir = pmd_offset(pu_dir, address); - if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address && - !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) && - !debug_pagealloc_enabled()) { - pmd_val(*pm_dir) = address | sgt_prot; - address += PMD_SIZE; - pages1m++; + vmemmap_flush_unused_sub_pmd(); + vmemmap_mark_sub_pmd_used(start, end); +} + +static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) +{ + unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + + vmemmap_flush_unused_sub_pmd(); + + /* Could be our memmap page is filled with PAGE_UNUSED already ... */ + vmemmap_mark_sub_pmd_used(start, end); + + /* Mark the unused parts of the new memmap page PAGE_UNUSED. */ + if (!IS_ALIGNED(start, PMD_SIZE)) + memset((void *)page, PAGE_UNUSED, start - page); + /* + * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of + * consecutive sections. Remember for the last added PMD the last + * unused range in the populated PMD. + */ + if (!IS_ALIGNED(end, PMD_SIZE)) + unused_sub_pmd_start = end; +} + +/* Returns true if the PMD is completely unused and can be freed. 
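/*
 * Illustration (not part of the patch): the sub-PMD bookkeeping added here
 * can be modelled in user space. A "PMD" is a PMD_SIZE buffer backing part of
 * the vmemmap; sub-ranges whose memmap goes away are memset to PAGE_UNUSED,
 * and vmemmap_unuse_sub_pmd() below frees the whole PMD once memchr_inv()
 * finds nothing but PAGE_UNUSED bytes. is_all_unused() stands in for
 * memchr_inv(); everything here is a hedged sketch, not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SK_PMD_SIZE     (1UL << 20)     /* 1 MiB segment, as with EDAT1 */
#define SK_PAGE_UNUSED  0xFD

static bool is_all_unused(const unsigned char *pmd, size_t size)
{
        for (size_t i = 0; i < size; i++)
                if (pmd[i] != SK_PAGE_UNUSED)
                        return false;   /* some struct page still lives here */
        return true;
}

int main(void)
{
        unsigned char *pmd = malloc(SK_PMD_SIZE);

        /* Populate: one section uses the first half of the PMD. */
        memset(pmd, 0, SK_PMD_SIZE / 2);
        /* The rest was never used; mark it, as vmemmap_use_new_sub_pmd() does. */
        memset(pmd + SK_PMD_SIZE / 2, SK_PAGE_UNUSED, SK_PMD_SIZE / 2);
        printf("after populate: free PMD? %d\n", is_all_unused(pmd, SK_PMD_SIZE));

        /* Unpopulate the first half, as vmemmap_unuse_sub_pmd() does. */
        memset(pmd, SK_PAGE_UNUSED, SK_PMD_SIZE / 2);
        printf("after unuse:    free PMD? %d\n", is_all_unused(pmd, SK_PMD_SIZE));
        free(pmd);
        return 0;
}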
*/ +static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) +{ + unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + + vmemmap_flush_unused_sub_pmd(); + memset((void *)start, PAGE_UNUSED, end - start); + return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE); +} + +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, + unsigned long end, bool add, bool direct) +{ + unsigned long prot, pages = 0; + int ret = -ENOMEM; + pte_t *pte; + + prot = pgprot_val(PAGE_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_PAGE_NOEXEC; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (!add) { + if (pte_none(*pte)) + continue; + if (!direct) + vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0); + pte_clear(&init_mm, addr, pte); + } else if (pte_none(*pte)) { + if (!direct) { + void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + + if (!new_page) + goto out; + set_pte(pte, __pte(__pa(new_page) | prot)); + } else { + set_pte(pte, __pte(__pa(addr) | prot)); + } + } else { continue; } - if (pmd_none(*pm_dir)) { - pt_dir = vmem_pte_alloc(); - if (!pt_dir) - goto out; - pmd_populate(&init_mm, pm_dir, pt_dir); - } - - pt_dir = pte_offset_kernel(pm_dir, address); - pte_val(*pt_dir) = address | pgt_prot; - address += PAGE_SIZE; - pages4k++; + pages++; } ret = 0; out: - update_page_count(PG_DIRECT_MAP_4K, pages4k); - update_page_count(PG_DIRECT_MAP_1M, pages1m); - update_page_count(PG_DIRECT_MAP_2G, pages2g); + if (direct) + update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages); return ret; } -/* - * Remove a physical memory range from the 1:1 mapping. - * Currently only invalidates page table entries. - */ -static void vmem_remove_range(unsigned long start, unsigned long size) +static void try_free_pte_table(pmd_t *pmd, unsigned long start) { - unsigned long pages4k, pages1m, pages2g; - unsigned long end = start + size; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - - pages4k = pages1m = pages2g = 0; - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - address += PGDIR_SIZE; - continue; - } - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - address += P4D_SIZE; - continue; - } - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - address += PUD_SIZE; - continue; - } - if (pud_large(*pu_dir)) { - pud_clear(pu_dir); - address += PUD_SIZE; - pages2g++; - continue; - } - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - address += PMD_SIZE; - continue; - } - if (pmd_large(*pm_dir)) { - pmd_clear(pm_dir); - address += PMD_SIZE; - pages1m++; - continue; - } - pt_dir = pte_offset_kernel(pm_dir, address); - pte_clear(&init_mm, address, pt_dir); - address += PAGE_SIZE; - pages4k++; + pte_t *pte; + int i; + + /* We can safely assume this is fully in 1:1 mapping & vmemmap area */ + pte = pte_offset_kernel(pmd, start); + for (i = 0; i < PTRS_PER_PTE; i++, pte++) { + if (!pte_none(*pte)) + return; } - flush_tlb_kernel_range(start, end); - update_page_count(PG_DIRECT_MAP_4K, -pages4k); - update_page_count(PG_DIRECT_MAP_1M, -pages1m); - update_page_count(PG_DIRECT_MAP_2G, -pages2g); + vmem_pte_free((unsigned long *) pmd_deref(*pmd)); + pmd_clear(pmd); } -/* - * Add a backed mem_map array to the virtual mem_map array. 
- */ -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, - struct vmem_altmap *altmap) -{ - unsigned long pgt_prot, sgt_prot; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, + unsigned long end, bool add, bool direct) +{ + unsigned long next, prot, pages = 0; int ret = -ENOMEM; + pmd_t *pmd; + pte_t *pte; - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - if (!MACHINE_HAS_NX) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; - } - for (address = start; address < end;) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); - if (!p4_dir) - goto out; - pgd_populate(&init_mm, pg_dir, p4_dir); - } + prot = pgprot_val(SEGMENT_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_SEGMENT_ENTRY_NOEXEC; - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); - if (!pu_dir) - goto out; - p4d_populate(&init_mm, p4_dir, pu_dir); - } + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (!add) { + if (pmd_none(*pmd)) + continue; + if (pmd_large(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + pmd_clear(pmd); + pages++; + } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) { + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + pmd_clear(pmd); + } + continue; + } + } else if (pmd_none(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE) && + MACHINE_HAS_EDAT1 && direct && + !debug_pagealloc_enabled()) { + set_pmd(pmd, __pmd(__pa(addr) | prot)); + pages++; + continue; + } else if (!direct && MACHINE_HAS_EDAT1) { + void *new_page; - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); - if (!pm_dir) + /* + * Use 1MB frames for vmemmap if available. We + * always use large frames even if they are only + * partially used. Otherwise we would have also + * page tables since vmemmap_populate gets + * called for each section separately. + */ + new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); + if (new_page) { + set_pmd(pmd, __pmd(__pa(new_page) | prot)); + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) { + vmemmap_use_new_sub_pmd(addr, next); + } + continue; + } + } + pte = vmem_pte_alloc(); + if (!pte) goto out; - pud_populate(&init_mm, pu_dir, pm_dir); + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_large(*pmd)) { + if (!direct) + vmemmap_use_sub_pmd(addr, next); + continue; } + ret = modify_pte_table(pmd, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pte_table(pmd, addr & PMD_MASK); + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages); + return ret; +} - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - /* Use 1MB frames for vmemmap if available. We always - * use large frames even if they are only partially - * used. - * Otherwise we would have also page tables since - * vmemmap_populate gets called for each section - * separately. 
*/ - if (MACHINE_HAS_EDAT1) { - void *new_page; +static void try_free_pmd_table(pud_t *pud, unsigned long start) +{ + pmd_t *pmd; + int i; + + pmd = pmd_offset(pud, start); + for (i = 0; i < PTRS_PER_PMD; i++, pmd++) + if (!pmd_none(*pmd)) + return; + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); + pud_clear(pud); +} - new_page = vmemmap_alloc_block(PMD_SIZE, node); - if (!new_page) - goto out; - pmd_val(*pm_dir) = __pa(new_page) | sgt_prot; - address = (address + PMD_SIZE) & PMD_MASK; +static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, + bool add, bool direct) +{ + unsigned long next, prot, pages = 0; + int ret = -ENOMEM; + pud_t *pud; + pmd_t *pmd; + + prot = pgprot_val(REGION3_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_REGION_ENTRY_NOEXEC; + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (!add) { + if (pud_none(*pud)) + continue; + if (pud_large(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + pud_clear(pud); + pages++; + } + continue; + } + } else if (pud_none(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE) && + MACHINE_HAS_EDAT2 && direct && + !debug_pagealloc_enabled()) { + set_pud(pud, __pud(__pa(addr) | prot)); + pages++; continue; } - pt_dir = vmem_pte_alloc(); - if (!pt_dir) + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) goto out; - pmd_populate(&init_mm, pm_dir, pt_dir); - } else if (pmd_large(*pm_dir)) { - address = (address + PMD_SIZE) & PMD_MASK; + pud_populate(&init_mm, pud, pmd); + } else if (pud_large(*pud)) { continue; } + ret = modify_pmd_table(pud, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pmd_table(pud, addr & PUD_MASK); + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages); + return ret; +} - pt_dir = pte_offset_kernel(pm_dir, address); - if (pte_none(*pt_dir)) { - void *new_page; +static void try_free_pud_table(p4d_t *p4d, unsigned long start) +{ + pud_t *pud; + int i; + + pud = pud_offset(p4d, start); + for (i = 0; i < PTRS_PER_PUD; i++, pud++) { + if (!pud_none(*pud)) + return; + } + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); + p4d_clear(p4d); +} - new_page = vmemmap_alloc_block(PAGE_SIZE, node); - if (!new_page) +static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, + bool add, bool direct) +{ + unsigned long next; + int ret = -ENOMEM; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (!add) { + if (p4d_none(*p4d)) + continue; + } else if (p4d_none(*p4d)) { + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) goto out; - pte_val(*pt_dir) = __pa(new_page) | pgt_prot; + p4d_populate(&init_mm, p4d, pud); } - address += PAGE_SIZE; + ret = modify_pud_table(p4d, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pud_table(p4d, addr & P4D_MASK); } ret = 0; out: return ret; } -void vmemmap_free(unsigned long start, unsigned long end, - struct vmem_altmap *altmap) +static void try_free_p4d_table(pgd_t *pgd, unsigned long start) { + p4d_t *p4d; + int i; + + p4d = p4d_offset(pgd, start); + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { + if (!p4d_none(*p4d)) + return; + } + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); + pgd_clear(pgd); } -/* - * Add memory segment to the segment list if it doesn't overlap with - * an already present segment. 
- */ -static int insert_memory_segment(struct memory_segment *seg) +static int modify_pagetable(unsigned long start, unsigned long end, bool add, + bool direct) { - struct memory_segment *tmp; + unsigned long addr, next; + int ret = -ENOMEM; + pgd_t *pgd; + p4d_t *p4d; + + if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) + return -EINVAL; + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (WARN_ON_ONCE(end > VMALLOC_START)) + return -EINVAL; + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + pgd = pgd_offset_k(addr); + + if (!add) { + if (pgd_none(*pgd)) + continue; + } else if (pgd_none(*pgd)) { + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + ret = modify_p4d_table(pgd, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_p4d_table(pgd, addr & PGDIR_MASK); + } + ret = 0; +out: + if (!add) + flush_tlb_kernel_range(start, end); + return ret; +} - if (seg->start + seg->size > VMEM_MAX_PHYS || - seg->start + seg->size < seg->start) - return -ERANGE; +static int add_pagetable(unsigned long start, unsigned long end, bool direct) +{ + return modify_pagetable(start, end, true, direct); +} - list_for_each_entry(tmp, &mem_segs, list) { - if (seg->start >= tmp->start + tmp->size) - continue; - if (seg->start + seg->size <= tmp->start) - continue; - return -ENOSPC; - } - list_add(&seg->list, &mem_segs); - return 0; +static int remove_pagetable(unsigned long start, unsigned long end, bool direct) +{ + return modify_pagetable(start, end, false, direct); } /* - * Remove memory segment from the segment list. + * Add a physical memory range to the 1:1 mapping. */ -static void remove_memory_segment(struct memory_segment *seg) +static int vmem_add_range(unsigned long start, unsigned long size) { - list_del(&seg->list); + start = (unsigned long)__va(start); + return add_pagetable(start, start + size, true); } -static void __remove_shared_memory(struct memory_segment *seg) +/* + * Remove a physical memory range from the 1:1 mapping. + */ +static void vmem_remove_range(unsigned long start, unsigned long size) { - remove_memory_segment(seg); - vmem_remove_range(seg->start, seg->size); + start = (unsigned long)__va(start); + remove_pagetable(start, start + size, true); } -int vmem_remove_mapping(unsigned long start, unsigned long size) +/* + * Add a backed mem_map array to the virtual mem_map array. 
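/*
 * Illustration (not part of the patch): after this rework a single
 * modify_pagetable() walker serves both users, selected by two flags --
 * "add" picks populate vs. unpopulate, "direct" picks the kernel 1:1 mapping
 * (count PG_DIRECT_MAP_* pages, never free backing memory) vs. the vmemmap
 * (allocate and free the backing pages). vmemmap_populate() below is then
 * just a locked wrapper. A hedged, table-free model of that dispatch:
 */
#include <stdbool.h>
#include <stdio.h>

static int modify(unsigned long start, unsigned long end, bool add, bool direct)
{
        /* the real code walks pgd -> p4d -> pud -> pmd -> pte here */
        printf("%s %s mapping: %#lx-%#lx\n",
               add ? "add" : "remove",
               direct ? "direct (1:1)" : "vmemmap",
               start, end);
        return 0;
}

static int add_pagetable(unsigned long s, unsigned long e, bool direct)
{
        return modify(s, e, true, direct);
}

static int remove_pagetable(unsigned long s, unsigned long e, bool direct)
{
        return modify(s, e, false, direct);
}

int main(void)
{
        add_pagetable(0x1000, 0x2000, true);     /* vmem_add_range() path */
        add_pagetable(0x3000, 0x4000, false);    /* vmemmap_populate() path */
        remove_pagetable(0x3000, 0x4000, false); /* vmemmap_free() path */
        return 0;
}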
+ */ +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) { - struct memory_segment *seg; int ret; mutex_lock(&vmem_mutex); + /* We don't care about the node, just use NUMA_NO_NODE on allocations */ + ret = add_pagetable(start, end, false); + if (ret) + remove_pagetable(start, end, false); + mutex_unlock(&vmem_mutex); + return ret; +} - ret = -ENOENT; - list_for_each_entry(seg, &mem_segs, list) { - if (seg->start == start && seg->size == size) - break; - } +#ifdef CONFIG_MEMORY_HOTPLUG - if (seg->start != start || seg->size != size) - goto out; +void vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + mutex_lock(&vmem_mutex); + remove_pagetable(start, end, false); + mutex_unlock(&vmem_mutex); +} - ret = 0; - __remove_shared_memory(seg); - kfree(seg); -out: +#endif + +void vmem_remove_mapping(unsigned long start, unsigned long size) +{ + mutex_lock(&vmem_mutex); + vmem_remove_range(start, size); mutex_unlock(&vmem_mutex); - return ret; +} + +struct range arch_get_mappable_range(void) +{ + struct range mhp_range; + + mhp_range.start = 0; + mhp_range.end = max_mappable - 1; + return mhp_range; } int vmem_add_mapping(unsigned long start, unsigned long size) { - struct memory_segment *seg; + struct range range = arch_get_mappable_range(); int ret; - mutex_lock(&vmem_mutex); - ret = -ENOMEM; - seg = kzalloc(sizeof(*seg), GFP_KERNEL); - if (!seg) - goto out; - seg->start = start; - seg->size = size; - - ret = insert_memory_segment(seg); - if (ret) - goto out_free; + if (start < range.start || + start + size > range.end + 1 || + start + size < start) + return -ERANGE; - ret = vmem_add_mem(start, size); + mutex_lock(&vmem_mutex); + ret = vmem_add_range(start, size); if (ret) - goto out_remove; - goto out; - -out_remove: - __remove_shared_memory(seg); -out_free: - kfree(seg); -out: + vmem_remove_range(start, size); mutex_unlock(&vmem_mutex); return ret; } /* - * map whole physical memory to virtual memory (identity mapping) - * we reserve enough space in the vmalloc area for vmemmap to hotplug - * additional memory segments. + * Allocate new or return existing page-table entry, but do not map it + * to any physical address. If missing, allocate segment- and region- + * table entries along. Meeting a large segment- or region-table entry + * while traversing is an error, since the function is expected to be + * called against virtual regions reserved for 4KB mappings only. 
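/*
 * Illustration (not part of the patch): the vmem_add_mapping() bounds check
 * above guards against unsigned wrap-around with "start + size < start".
 * A standalone rendering of the same test (sk_ names are stand-ins; the
 * kernel's range end is inclusive, hence the "+ 1"):
 */
#include <stdbool.h>
#include <stdio.h>

struct sk_range { unsigned long start, end; };  /* end is inclusive */

static bool range_ok(struct sk_range r, unsigned long start, unsigned long size)
{
        if (start < r.start)
                return false;
        if (start + size < start)       /* unsigned overflow: size too large */
                return false;
        if (start + size > r.end + 1)
                return false;
        return true;
}

int main(void)
{
        struct sk_range r = { 0, ~0UL - 1 };    /* like arch_get_mappable_range() */

        printf("%d\n", range_ok(r, 0x1000, 0x1000));    /* 1: fits */
        printf("%d\n", range_ok(r, ~0UL, 0x1000));      /* 0: wraps around */
        return 0;
}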
*/ -void __init vmem_map_init(void) +pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) { - struct memblock_region *reg; - - for_each_memblock(memory, reg) - vmem_add_mem(reg->base, reg->size); - __set_memory((unsigned long)_stext, - (unsigned long)(_etext - _stext) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - __set_memory((unsigned long)_etext, - (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT, - SET_MEMORY_RO); - __set_memory((unsigned long)_sinittext, - (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - __set_memory(__stext_dma, (__etext_dma - __stext_dma) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - pr_info("Write protected kernel read-only data: %luk\n", - (unsigned long)(__end_rodata - _stext) >> 10); + pte_t *ptep = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + if (!alloc) + goto out; + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + if (!alloc) + goto out; + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) + goto out; + p4d_populate(&init_mm, p4d, pud); + } + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + if (!alloc) + goto out; + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) + goto out; + pud_populate(&init_mm, pud, pmd); + } else if (WARN_ON_ONCE(pud_large(*pud))) { + goto out; + } + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + if (!alloc) + goto out; + pte = vmem_pte_alloc(); + if (!pte) + goto out; + pmd_populate(&init_mm, pmd, pte); + } else if (WARN_ON_ONCE(pmd_large(*pmd))) { + goto out; + } + ptep = pte_offset_kernel(pmd, addr); +out: + return ptep; } -/* - * Convert memblock.memory to a memory segment list so there is a single - * list that contains all memory segments. - */ -static int __init vmem_convert_memory_chunk(void) +int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc) { - struct memblock_region *reg; - struct memory_segment *seg; + pte_t *ptep, pte; + + if (!IS_ALIGNED(addr, PAGE_SIZE)) + return -EINVAL; + ptep = vmem_get_alloc_pte(addr, alloc); + if (!ptep) + return -ENOMEM; + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte = mk_pte_phys(phys, prot); + set_pte(ptep, pte); + return 0; +} + +int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot) +{ + int rc; mutex_lock(&vmem_mutex); - for_each_memblock(memory, reg) { - seg = kzalloc(sizeof(*seg), GFP_KERNEL); - if (!seg) - panic("Out of memory...\n"); - seg->start = reg->base; - seg->size = reg->size; - insert_memory_segment(seg); - } + rc = __vmem_map_4k_page(addr, phys, prot, true); mutex_unlock(&vmem_mutex); - return 0; + return rc; } -core_initcall(vmem_convert_memory_chunk); +void vmem_unmap_4k_page(unsigned long addr) +{ + pte_t *ptep; + + mutex_lock(&vmem_mutex); + ptep = virt_to_kpte(addr); + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte_clear(&init_mm, addr, ptep); + mutex_unlock(&vmem_mutex); +} + +void __init vmem_map_init(void) +{ + __set_memory_rox(_stext, _etext); + __set_memory_ro(_etext, __end_rodata); + __set_memory_rox(_sinittext, _einittext); + __set_memory_rox(__stext_amode31, __etext_amode31); + /* + * If the BEAR-enhancement facility is not installed the first + * prefix page is used to return to the previous context with + * an LPSWE instruction and therefore must be executable. 
+ */ + if (!static_key_enabled(&cpu_has_bear)) + set_memory_x(0, 1); + if (debug_pagealloc_enabled()) { + /* + * Use RELOC_HIDE() as long as __va(0) translates to NULL, + * since performing pointer arithmetic on a NULL pointer + * has undefined behavior and generates compiler warnings. + */ + __set_memory_4k(__va(0), RELOC_HIDE(__va(0), ident_map_size)); + } + if (MACHINE_HAS_NX) + system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT); + pr_info("Write protected kernel read-only data: %luk\n", + (unsigned long)(__end_rodata - _stext) >> 10); +} diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 8d2134136290..b418333bb086 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -7,7 +7,6 @@ * - HAVE_MARCH_Z196_FEATURES: laal, laalg * - HAVE_MARCH_Z10_FEATURES: msfi, cgrj, clgrj * - HAVE_MARCH_Z9_109_FEATURES: alfi, llilf, clfi, oilf, nilf - * - PACK_STACK * - 64BIT * * Copyright IBM Corp. 2012,2015 @@ -26,10 +25,12 @@ #include <linux/mm.h> #include <linux/kernel.h> #include <asm/cacheflush.h> +#include <asm/extable.h> #include <asm/dis.h> #include <asm/facility.h> #include <asm/nospec-branch.h> #include <asm/set_memory.h> +#include <asm/text-patching.h> #include "bpf_jit.h" struct bpf_jit { @@ -49,13 +50,14 @@ struct bpf_jit { int r1_thunk_ip; /* Address of expoline thunk for 'br %r1' */ int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ int tail_call_start; /* Tail call start offset */ - int labels[1]; /* Labels for local jumps */ + int excnt; /* Number of exception table entries */ + int prologue_plt_ret; /* Return address for prologue hotpatch PLT */ + int prologue_plt; /* Start of prologue hotpatch PLT */ }; #define SEEN_MEM BIT(0) /* use mem[] for temporary storage */ #define SEEN_LITERAL BIT(1) /* code uses literals */ #define SEEN_FUNC BIT(2) /* calls C functions */ -#define SEEN_TAIL_CALL BIT(3) /* code uses tail calls */ #define SEEN_STACK (SEEN_FUNC | SEEN_MEM) /* @@ -68,6 +70,10 @@ struct bpf_jit { #define REG_0 REG_W0 /* Register 0 */ #define REG_1 REG_W1 /* Register 1 */ #define REG_2 BPF_REG_1 /* Register 2 */ +#define REG_3 BPF_REG_2 /* Register 3 */ +#define REG_4 BPF_REG_3 /* Register 4 */ +#define REG_7 BPF_REG_6 /* Register 7 */ +#define REG_8 BPF_REG_7 /* Register 8 */ #define REG_14 BPF_REG_0 /* Register 14 */ /* @@ -112,7 +118,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) { u32 r1 = reg2hex[b1]; - if (!jit->seen_reg[r1] && r1 >= 6 && r1 <= 15) + if (r1 >= 6 && r1 <= 15 && !jit->seen_reg[r1]) jit->seen_reg[r1] = 1; } @@ -228,18 +234,18 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) REG_SET_SEEN(b3); \ }) -#define EMIT6_PCREL_LABEL(op1, op2, b1, b2, label, mask) \ +#define EMIT6_PCREL_RIEB(op1, op2, b1, b2, mask, target) \ ({ \ - int rel = (jit->labels[label] - jit->prg) >> 1; \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), \ (op2) | (mask) << 12); \ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ }) -#define EMIT6_PCREL_IMM_LABEL(op1, op2, b1, imm, label, mask) \ +#define EMIT6_PCREL_RIEC(op1, op2, b1, imm, mask, target) \ ({ \ - int rel = (jit->labels[label] - jit->prg) >> 1; \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ _EMIT6((op1) | (reg_high(b1) | (mask)) << 16 | \ (rel & 0xffff), (op2) | ((imm) & 0xff) << 8); \ REG_SET_SEEN(b1); \ @@ -248,8 +254,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask) \ ({ \ - /* Branch instruction needs 
6 bytes */ \ - int rel = (addrs[(i) + (off) + 1] - (addrs[(i) + 1] - 6)) / 2;\ + int rel = (addrs[(i) + (off) + 1] - jit->prg) / 2; \ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), (op2) | (mask));\ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ @@ -489,21 +494,79 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth) } while (re <= last); } +static void bpf_skip(struct bpf_jit *jit, int size) +{ + if (size >= 6 && !is_valid_rel(size)) { + /* brcl 0xf,size */ + EMIT6_PCREL_RIL(0xc0f4000000, size); + size -= 6; + } else if (size >= 4 && is_valid_rel(size)) { + /* brc 0xf,size */ + EMIT4_PCREL(0xa7f40000, size); + size -= 4; + } + while (size >= 2) { + /* bcr 0,%0 */ + _EMIT2(0x0700); + size -= 2; + } +} + +/* + * PLT for hotpatchable calls. The calling convention is the same as for the + * ftrace hotpatch trampolines: %r0 is return address, %r1 is clobbered. + */ +extern const char bpf_plt[]; +extern const char bpf_plt_ret[]; +extern const char bpf_plt_target[]; +extern const char bpf_plt_end[]; +#define BPF_PLT_SIZE 32 +asm( + ".pushsection .rodata\n" + " .balign 8\n" + "bpf_plt:\n" + " lgrl %r0,bpf_plt_ret\n" + " lgrl %r1,bpf_plt_target\n" + " br %r1\n" + " .balign 8\n" + "bpf_plt_ret: .quad 0\n" + "bpf_plt_target: .quad 0\n" + "bpf_plt_end:\n" + " .popsection\n" +); + +static void bpf_jit_plt(void *plt, void *ret, void *target) +{ + memcpy(plt, bpf_plt, BPF_PLT_SIZE); + *(void **)((char *)plt + (bpf_plt_ret - bpf_plt)) = ret; + *(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target ?: ret; +} + /* * Emit function prologue * * Save registers and create stack frame if necessary. - * See stack frame layout desription in "bpf_jit.h"! + * See stack frame layout description in "bpf_jit.h"! */ -static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) +static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp, + u32 stack_depth) { - if (jit->seen & SEEN_TAIL_CALL) { + /* No-op for hotpatching */ + /* brcl 0,prologue_plt */ + EMIT6_PCREL_RILC(0xc0040000, 0, jit->prologue_plt); + jit->prologue_plt_ret = jit->prg; + + if (!bpf_is_subprog(fp)) { + /* Initialize the tail call counter in the main program. */ /* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */ _EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT); } else { - /* j tail_call_start: NOP if no tail calls are used */ - EMIT4_PCREL(0xa7f40000, 6); - _EMIT2(0); + /* + * Skip the tail call counter initialization in subprograms. + * Insert nops in order to have tail_call_start at a + * predictable offset. + */ + bpf_skip(jit, 6); } /* Tail calls have to skip above initialization */ jit->tail_call_start = jit->prg; @@ -539,6 +602,43 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) } /* + * Emit an expoline for a jump that follows + */ +static void emit_expoline(struct bpf_jit *jit) +{ + /* exrl %r0,.+10 */ + EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); + /* j . 
*/ + EMIT4_PCREL(0xa7f40000, 0); +} + +/* + * Emit __s390_indirect_jump_r1 thunk if necessary + */ +static void emit_r1_thunk(struct bpf_jit *jit) +{ + if (nospec_uses_trampoline()) { + jit->r1_thunk_ip = jit->prg; + emit_expoline(jit); + /* br %r1 */ + _EMIT2(0x07f1); + } +} + +/* + * Call r1 either directly or via __s390_indirect_jump_r1 thunk + */ +static void call_r1(struct bpf_jit *jit) +{ + if (nospec_uses_trampoline()) + /* brasl %r14,__s390_indirect_jump_r1 */ + EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); + else + /* basr %r14,%r1 */ + EMIT2(0x0d00, REG_14, REG_1); +} + +/* * Function epilogue */ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) @@ -548,42 +648,124 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) EMIT4(0xb9040000, REG_2, BPF_REG_0); /* Restore registers */ save_restore_regs(jit, REGS_RESTORE, stack_depth); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { + if (nospec_uses_trampoline()) { jit->r14_thunk_ip = jit->prg; /* Generate __s390_indirect_jump_r14 thunk */ - if (test_facility(35)) { - /* exrl %r0,.+10 */ - EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); - } else { - /* larl %r1,.+14 */ - EMIT6_PCREL_RILB(0xc0000000, REG_1, jit->prg + 14); - /* ex 0,0(%r1) */ - EMIT4_DISP(0x44000000, REG_0, REG_1, 0); - } - /* j . */ - EMIT4_PCREL(0xa7f40000, 0); + emit_expoline(jit); } /* br %r14 */ _EMIT2(0x07fe); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable && - (is_first_pass(jit) || (jit->seen & SEEN_FUNC))) { - jit->r1_thunk_ip = jit->prg; - /* Generate __s390_indirect_jump_r1 thunk */ - if (test_facility(35)) { - /* exrl %r0,.+10 */ - EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); - /* j . */ - EMIT4_PCREL(0xa7f40000, 0); - /* br %r1 */ - _EMIT2(0x07f1); - } else { - /* ex 0,S390_lowcore.br_r1_tampoline */ - EMIT4_DISP(0x44000000, REG_0, REG_0, - offsetof(struct lowcore, br_r1_trampoline)); - /* j . */ - EMIT4_PCREL(0xa7f40000, 0); - } + if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) + emit_r1_thunk(jit); + + jit->prg = ALIGN(jit->prg, 8); + jit->prologue_plt = jit->prg; + if (jit->prg_buf) + bpf_jit_plt(jit->prg_buf + jit->prg, + jit->prg_buf + jit->prologue_plt_ret, NULL); + jit->prg += BPF_PLT_SIZE; +} + +static int get_probe_mem_regno(const u8 *insn) +{ + /* + * insn must point to llgc, llgh, llgf, lg, lgb, lgh or lgf, which have + * destination register at the same position. + */ + if (insn[0] != 0xe3) /* common prefix */ + return -1; + if (insn[5] != 0x90 && /* llgc */ + insn[5] != 0x91 && /* llgh */ + insn[5] != 0x16 && /* llgf */ + insn[5] != 0x04 && /* lg */ + insn[5] != 0x77 && /* lgb */ + insn[5] != 0x15 && /* lgh */ + insn[5] != 0x14) /* lgf */ + return -1; + return insn[1] >> 4; +} + +bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) +{ + regs->psw.addr = extable_fixup(x); + regs->gprs[x->data] = 0; + return true; +} + +static int bpf_jit_probe_mem(struct bpf_jit *jit, struct bpf_prog *fp, + int probe_prg, int nop_prg) +{ + struct exception_table_entry *ex; + int reg, prg; + s64 delta; + u8 *insn; + int i; + + if (!fp->aux->extable) + /* Do nothing during early JIT passes. */ + return 0; + insn = jit->prg_buf + probe_prg; + reg = get_probe_mem_regno(insn); + if (WARN_ON_ONCE(reg < 0)) + /* JIT bug - unexpected probe instruction. */ + return -1; + if (WARN_ON_ONCE(probe_prg + insn_length(*insn) != nop_prg)) + /* JIT bug - gap between probe and nop instructions. 
*/ + return -1; + for (i = 0; i < 2; i++) { + if (WARN_ON_ONCE(jit->excnt >= fp->aux->num_exentries)) + /* Verifier bug - not enough entries. */ + return -1; + ex = &fp->aux->extable[jit->excnt]; + /* Add extable entries for probe and nop instructions. */ + prg = i == 0 ? probe_prg : nop_prg; + delta = jit->prg_buf + prg - (u8 *)&ex->insn; + if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX)) + /* JIT bug - code and extable must be close. */ + return -1; + ex->insn = delta; + /* + * Always land on the nop. Note that extable infrastructure + * ignores fixup field, it is handled by ex_handler_bpf(). + */ + delta = jit->prg_buf + nop_prg - (u8 *)&ex->fixup; + if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX)) + /* JIT bug - landing pad and extable must be close. */ + return -1; + ex->fixup = delta; + ex->type = EX_TYPE_BPF; + ex->data = reg; + jit->excnt++; + } + return 0; +} + +/* + * Sign-extend the register if necessary + */ +static int sign_extend(struct bpf_jit *jit, int r, u8 size, u8 flags) +{ + if (!(flags & BTF_FMODEL_SIGNED_ARG)) + return 0; + + switch (size) { + case 1: + /* lgbr %r,%r */ + EMIT4(0xb9060000, r, r); + return 0; + case 2: + /* lghr %r,%r */ + EMIT4(0xb9070000, r, r); + return 0; + case 4: + /* lgfr %r,%r */ + EMIT4(0xb9140000, r, r); + return 0; + case 8: + return 0; + default: + return -1; } } @@ -594,30 +776,71 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) * stack space for the large switch statement. */ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, - int i, bool extra_pass) + int i, bool extra_pass, u32 stack_depth) { struct bpf_insn *insn = &fp->insnsi[i]; + s32 branch_oc_off = insn->off; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; int last, insn_count = 1; u32 *addrs = jit->addrs; s32 imm = insn->imm; s16 off = insn->off; + int probe_prg = -1; unsigned int mask; + int nop_prg; + int err; + + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) + probe_prg = jit->prg; switch (insn->code) { /* * BPF_MOV */ - case BPF_ALU | BPF_MOV | BPF_X: /* dst = (u32) src */ - /* llgfr %dst,%src */ - EMIT4(0xb9160000, dst_reg, src_reg); - if (insn_is_zext(&insn[1])) - insn_count = 2; + case BPF_ALU | BPF_MOV | BPF_X: + switch (insn->off) { + case 0: /* DST = (u32) SRC */ + /* llgfr %dst,%src */ + EMIT4(0xb9160000, dst_reg, src_reg); + if (insn_is_zext(&insn[1])) + insn_count = 2; + break; + case 8: /* DST = (u32)(s8) SRC */ + /* lbr %dst,%src */ + EMIT4(0xb9260000, dst_reg, src_reg); + /* llgfr %dst,%dst */ + EMIT4(0xb9160000, dst_reg, dst_reg); + break; + case 16: /* DST = (u32)(s16) SRC */ + /* lhr %dst,%src */ + EMIT4(0xb9270000, dst_reg, src_reg); + /* llgfr %dst,%dst */ + EMIT4(0xb9160000, dst_reg, dst_reg); + break; + } break; - case BPF_ALU64 | BPF_MOV | BPF_X: /* dst = src */ - /* lgr %dst,%src */ - EMIT4(0xb9040000, dst_reg, src_reg); + case BPF_ALU64 | BPF_MOV | BPF_X: + switch (insn->off) { + case 0: /* DST = SRC */ + /* lgr %dst,%src */ + EMIT4(0xb9040000, dst_reg, src_reg); + break; + case 8: /* DST = (s8) SRC */ + /* lgbr %dst,%src */ + EMIT4(0xb9060000, dst_reg, src_reg); + break; + case 16: /* DST = (s16) SRC */ + /* lghr %dst,%src */ + EMIT4(0xb9070000, dst_reg, src_reg); + break; + case 32: /* DST = (s32) SRC */ + /* lgfr %dst,%src */ + EMIT4(0xb9140000, dst_reg, src_reg); + break; + } break; case BPF_ALU | BPF_MOV | BPF_K: /* dst = (u32) imm */ /* llilf %dst,imm */ @@ -656,10 +879,10 @@ static noinline int 
bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9080000, dst_reg, src_reg); break; case BPF_ALU | BPF_ADD | BPF_K: /* dst = (u32) dst + (u32) imm */ - if (!imm) - break; - /* alfi %dst,imm */ - EMIT6_IMM(0xc20b0000, dst_reg, imm); + if (imm != 0) { + /* alfi %dst,imm */ + EMIT6_IMM(0xc20b0000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_ADD | BPF_K: /* dst = dst + imm */ @@ -681,17 +904,22 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9090000, dst_reg, src_reg); break; case BPF_ALU | BPF_SUB | BPF_K: /* dst = (u32) dst - (u32) imm */ - if (!imm) - break; - /* alfi %dst,-imm */ - EMIT6_IMM(0xc20b0000, dst_reg, -imm); + if (imm != 0) { + /* alfi %dst,-imm */ + EMIT6_IMM(0xc20b0000, dst_reg, -imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_SUB | BPF_K: /* dst = dst - imm */ if (!imm) break; - /* agfi %dst,-imm */ - EMIT6_IMM(0xc2080000, dst_reg, -imm); + if (imm == -0x80000000) { + /* algfi %dst,0x80000000 */ + EMIT6_IMM(0xc20a0000, dst_reg, 0x80000000); + } else { + /* agfi %dst,-imm */ + EMIT6_IMM(0xc2080000, dst_reg, -imm); + } break; /* * BPF_MUL @@ -706,10 +934,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb90c0000, dst_reg, src_reg); break; case BPF_ALU | BPF_MUL | BPF_K: /* dst = (u32) dst * (u32) imm */ - if (imm == 1) - break; - /* msfi %r5,imm */ - EMIT6_IMM(0xc2010000, dst_reg, imm); + if (imm != 1) { + /* msfi %r5,imm */ + EMIT6_IMM(0xc2010000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_MUL | BPF_K: /* dst = dst * imm */ @@ -721,64 +949,115 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* * BPF_DIV / BPF_MOD */ - case BPF_ALU | BPF_DIV | BPF_X: /* dst = (u32) dst / (u32) src */ - case BPF_ALU | BPF_MOD | BPF_X: /* dst = (u32) dst % (u32) src */ + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_MOD | BPF_X: { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; - /* lhi %w0,0 */ - EMIT4_IMM(0xa7080000, REG_W0, 0); - /* lr %w1,%dst */ - EMIT2(0x1800, REG_W1, dst_reg); - /* dlr %w0,%src */ - EMIT4(0xb9970000, REG_W0, src_reg); + switch (off) { + case 0: /* dst = (u32) dst {/,%} (u32) src */ + /* xr %w0,%w0 */ + EMIT2(0x1700, REG_W0, REG_W0); + /* lr %w1,%dst */ + EMIT2(0x1800, REG_W1, dst_reg); + /* dlr %w0,%src */ + EMIT4(0xb9970000, REG_W0, src_reg); + break; + case 1: /* dst = (u32) ((s32) dst {/,%} (s32) src) */ + /* lgfr %r1,%dst */ + EMIT4(0xb9140000, REG_W1, dst_reg); + /* dsgfr %r0,%src */ + EMIT4(0xb91d0000, REG_W0, src_reg); + break; + } /* llgfr %dst,%rc */ EMIT4(0xb9160000, dst_reg, rc_reg); if (insn_is_zext(&insn[1])) insn_count = 2; break; } - case BPF_ALU64 | BPF_DIV | BPF_X: /* dst = dst / src */ - case BPF_ALU64 | BPF_MOD | BPF_X: /* dst = dst % src */ + case BPF_ALU64 | BPF_DIV | BPF_X: + case BPF_ALU64 | BPF_MOD | BPF_X: { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? 
REG_W1 : REG_W0; - /* lghi %w0,0 */ - EMIT4_IMM(0xa7090000, REG_W0, 0); - /* lgr %w1,%dst */ - EMIT4(0xb9040000, REG_W1, dst_reg); - /* dlgr %w0,%dst */ - EMIT4(0xb9870000, REG_W0, src_reg); + switch (off) { + case 0: /* dst = dst {/,%} src */ + /* lghi %w0,0 */ + EMIT4_IMM(0xa7090000, REG_W0, 0); + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* dlgr %w0,%src */ + EMIT4(0xb9870000, REG_W0, src_reg); + break; + case 1: /* dst = (s64) dst {/,%} (s64) src */ + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* dsgr %w0,%src */ + EMIT4(0xb90d0000, REG_W0, src_reg); + break; + } /* lgr %dst,%rc */ EMIT4(0xb9040000, dst_reg, rc_reg); break; } - case BPF_ALU | BPF_DIV | BPF_K: /* dst = (u32) dst / (u32) imm */ - case BPF_ALU | BPF_MOD | BPF_K: /* dst = (u32) dst % (u32) imm */ + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_K: { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; if (imm == 1) { if (BPF_OP(insn->code) == BPF_MOD) - /* lhgi %dst,0 */ + /* lghi %dst,0 */ EMIT4_IMM(0xa7090000, dst_reg, 0); + else + EMIT_ZERO(dst_reg); break; } - /* lhi %w0,0 */ - EMIT4_IMM(0xa7080000, REG_W0, 0); - /* lr %w1,%dst */ - EMIT2(0x1800, REG_W1, dst_reg); if (!is_first_pass(jit) && can_use_ldisp_for_lit32(jit)) { - /* dl %w0,<d(imm)>(%l) */ - EMIT6_DISP_LH(0xe3000000, 0x0097, REG_W0, REG_0, REG_L, - EMIT_CONST_U32(imm)); + switch (off) { + case 0: /* dst = (u32) dst {/,%} (u32) imm */ + /* xr %w0,%w0 */ + EMIT2(0x1700, REG_W0, REG_W0); + /* lr %w1,%dst */ + EMIT2(0x1800, REG_W1, dst_reg); + /* dl %w0,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0097, REG_W0, REG_0, + REG_L, EMIT_CONST_U32(imm)); + break; + case 1: /* dst = (s32) dst {/,%} (s32) imm */ + /* lgfr %r1,%dst */ + EMIT4(0xb9140000, REG_W1, dst_reg); + /* dsgf %r0,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x001d, REG_W0, REG_0, + REG_L, EMIT_CONST_U32(imm)); + break; + } } else { - /* lgfrl %dst,imm */ - EMIT6_PCREL_RILB(0xc40c0000, dst_reg, - _EMIT_CONST_U32(imm)); - jit->seen |= SEEN_LITERAL; - /* dlr %w0,%dst */ - EMIT4(0xb9970000, REG_W0, dst_reg); + switch (off) { + case 0: /* dst = (u32) dst {/,%} (u32) imm */ + /* xr %w0,%w0 */ + EMIT2(0x1700, REG_W0, REG_W0); + /* lr %w1,%dst */ + EMIT2(0x1800, REG_W1, dst_reg); + /* lrl %dst,imm */ + EMIT6_PCREL_RILB(0xc40d0000, dst_reg, + _EMIT_CONST_U32(imm)); + jit->seen |= SEEN_LITERAL; + /* dlr %w0,%dst */ + EMIT4(0xb9970000, REG_W0, dst_reg); + break; + case 1: /* dst = (s32) dst {/,%} (s32) imm */ + /* lgfr %w1,%dst */ + EMIT4(0xb9140000, REG_W1, dst_reg); + /* lgfrl %dst,imm */ + EMIT6_PCREL_RILB(0xc40c0000, dst_reg, + _EMIT_CONST_U32(imm)); + jit->seen |= SEEN_LITERAL; + /* dsgr %w0,%dst */ + EMIT4(0xb90d0000, REG_W0, dst_reg); + break; + } } /* llgfr %dst,%rc */ EMIT4(0xb9160000, dst_reg, rc_reg); @@ -786,8 +1065,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, insn_count = 2; break; } - case BPF_ALU64 | BPF_DIV | BPF_K: /* dst = dst / imm */ - case BPF_ALU64 | BPF_MOD | BPF_K: /* dst = dst % imm */ + case BPF_ALU64 | BPF_DIV | BPF_K: + case BPF_ALU64 | BPF_MOD | BPF_K: { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? 
REG_W1 : REG_W0; @@ -797,21 +1076,50 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4_IMM(0xa7090000, dst_reg, 0); break; } - /* lghi %w0,0 */ - EMIT4_IMM(0xa7090000, REG_W0, 0); - /* lgr %w1,%dst */ - EMIT4(0xb9040000, REG_W1, dst_reg); if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) { - /* dlg %w0,<d(imm)>(%l) */ - EMIT6_DISP_LH(0xe3000000, 0x0087, REG_W0, REG_0, REG_L, - EMIT_CONST_U64(imm)); + switch (off) { + case 0: /* dst = dst {/,%} imm */ + /* lghi %w0,0 */ + EMIT4_IMM(0xa7090000, REG_W0, 0); + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* dlg %w0,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0087, REG_W0, REG_0, + REG_L, EMIT_CONST_U64(imm)); + break; + case 1: /* dst = (s64) dst {/,%} (s64) imm */ + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* dsg %w0,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x000d, REG_W0, REG_0, + REG_L, EMIT_CONST_U64(imm)); + break; + } } else { - /* lgrl %dst,imm */ - EMIT6_PCREL_RILB(0xc4080000, dst_reg, - _EMIT_CONST_U64(imm)); - jit->seen |= SEEN_LITERAL; - /* dlgr %w0,%dst */ - EMIT4(0xb9870000, REG_W0, dst_reg); + switch (off) { + case 0: /* dst = dst {/,%} imm */ + /* lghi %w0,0 */ + EMIT4_IMM(0xa7090000, REG_W0, 0); + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* lgrl %dst,imm */ + EMIT6_PCREL_RILB(0xc4080000, dst_reg, + _EMIT_CONST_U64(imm)); + jit->seen |= SEEN_LITERAL; + /* dlgr %w0,%dst */ + EMIT4(0xb9870000, REG_W0, dst_reg); + break; + case 1: /* dst = (s64) dst {/,%} (s64) imm */ + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* lgrl %dst,imm */ + EMIT6_PCREL_RILB(0xc4080000, dst_reg, + _EMIT_CONST_U64(imm)); + jit->seen |= SEEN_LITERAL; + /* dsgr %w0,%dst */ + EMIT4(0xb90d0000, REG_W0, dst_reg); + break; + } } /* lgr %dst,%rc */ EMIT4(0xb9040000, dst_reg, rc_reg); @@ -894,10 +1202,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9820000, dst_reg, src_reg); break; case BPF_ALU | BPF_XOR | BPF_K: /* dst = (u32) dst ^ (u32) imm */ - if (!imm) - break; - /* xilf %dst,imm */ - EMIT6_IMM(0xc0070000, dst_reg, imm); + if (imm != 0) { + /* xilf %dst,imm */ + EMIT6_IMM(0xc0070000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_XOR | BPF_K: /* dst = dst ^ imm */ @@ -928,10 +1236,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000d, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_LSH | BPF_K: /* dst = (u32) dst << (u32) imm */ - if (imm == 0) - break; - /* sll %dst,imm(%r0) */ - EMIT4_DISP(0x89000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* sll %dst,imm(%r0) */ + EMIT4_DISP(0x89000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_LSH | BPF_K: /* dst = dst << imm */ @@ -953,10 +1261,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000c, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_RSH | BPF_K: /* dst = (u32) dst >> (u32) imm */ - if (imm == 0) - break; - /* srl %dst,imm(%r0) */ - EMIT4_DISP(0x88000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* srl %dst,imm(%r0) */ + EMIT4_DISP(0x88000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_RSH | BPF_K: /* dst = dst >> imm */ @@ -978,10 +1286,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000a, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_ARSH | BPF_K: /* ((s32) dst >> 
imm */ - if (imm == 0) - break; - /* sra %dst,imm(%r0) */ - EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* sra %dst,imm(%r0) */ + EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_ARSH | BPF_K: /* ((s64) dst) >>= imm */ @@ -1024,6 +1332,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, } break; case BPF_ALU | BPF_END | BPF_FROM_LE: + case BPF_ALU64 | BPF_END | BPF_FROM_LE: switch (imm) { case 16: /* dst = (u16) cpu_to_le16(dst) */ /* lrvr %dst,%dst */ @@ -1049,6 +1358,11 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, } break; /* + * BPF_NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; + /* * BPF_ST(X) */ case BPF_STX | BPF_MEM | BPF_B: /* *(u8 *)(dst + off) = src_reg */ @@ -1100,45 +1414,118 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, jit->seen |= SEEN_MEM; break; /* - * BPF_STX XADD (atomic_add) + * BPF_ATOMIC */ - case BPF_STX | BPF_XADD | BPF_W: /* *(u32 *)(dst + off) += src */ - /* laal %w0,%src,off(%dst) */ - EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W0, src_reg, - dst_reg, off); - jit->seen |= SEEN_MEM; - break; - case BPF_STX | BPF_XADD | BPF_DW: /* *(u64 *)(dst + off) += src */ - /* laalg %w0,%src,off(%dst) */ - EMIT6_DISP_LH(0xeb000000, 0x00ea, REG_W0, src_reg, - dst_reg, off); + case BPF_STX | BPF_ATOMIC | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + { + bool is32 = BPF_SIZE(insn->code) == BPF_W; + + switch (insn->imm) { +/* {op32|op64} {%w0|%src},%src,off(%dst) */ +#define EMIT_ATOMIC(op32, op64) do { \ + EMIT6_DISP_LH(0xeb000000, is32 ? (op32) : (op64), \ + (insn->imm & BPF_FETCH) ? src_reg : REG_W0, \ + src_reg, dst_reg, off); \ + if (is32 && (insn->imm & BPF_FETCH)) \ + EMIT_ZERO(src_reg); \ +} while (0) + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + /* {laal|laalg} */ + EMIT_ATOMIC(0x00fa, 0x00ea); + break; + case BPF_AND: + case BPF_AND | BPF_FETCH: + /* {lan|lang} */ + EMIT_ATOMIC(0x00f4, 0x00e4); + break; + case BPF_OR: + case BPF_OR | BPF_FETCH: + /* {lao|laog} */ + EMIT_ATOMIC(0x00f6, 0x00e6); + break; + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + /* {lax|laxg} */ + EMIT_ATOMIC(0x00f7, 0x00e7); + break; +#undef EMIT_ATOMIC + case BPF_XCHG: + /* {ly|lg} %w0,off(%dst) */ + EMIT6_DISP_LH(0xe3000000, + is32 ? 0x0058 : 0x0004, REG_W0, REG_0, + dst_reg, off); + /* 0: {csy|csg} %w0,%src,off(%dst) */ + EMIT6_DISP_LH(0xeb000000, is32 ? 0x0014 : 0x0030, + REG_W0, src_reg, dst_reg, off); + /* brc 4,0b */ + EMIT4_PCREL_RIC(0xa7040000, 4, jit->prg - 6); + /* {llgfr|lgr} %src,%w0 */ + EMIT4(is32 ? 0xb9160000 : 0xb9040000, src_reg, REG_W0); + if (is32 && insn_is_zext(&insn[1])) + insn_count = 2; + break; + case BPF_CMPXCHG: + /* 0: {csy|csg} %b0,%src,off(%dst) */ + EMIT6_DISP_LH(0xeb000000, is32 ? 
0x0014 : 0x0030, + BPF_REG_0, src_reg, dst_reg, off); + break; + default: + pr_err("Unknown atomic operation %02x\n", insn->imm); + return -1; + } + jit->seen |= SEEN_MEM; break; + } /* * BPF_LDX */ case BPF_LDX | BPF_MEM | BPF_B: /* dst = *(u8 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_B: /* llgc %dst,0(off,%src) */ EMIT6_DISP_LH(0xe3000000, 0x0090, dst_reg, src_reg, REG_0, off); jit->seen |= SEEN_MEM; if (insn_is_zext(&insn[1])) insn_count = 2; break; + case BPF_LDX | BPF_MEMSX | BPF_B: /* dst = *(s8 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEMSX | BPF_B: + /* lgb %dst,0(off,%src) */ + EMIT6_DISP_LH(0xe3000000, 0x0077, dst_reg, src_reg, REG_0, off); + jit->seen |= SEEN_MEM; + break; case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_H: /* llgh %dst,0(off,%src) */ EMIT6_DISP_LH(0xe3000000, 0x0091, dst_reg, src_reg, REG_0, off); jit->seen |= SEEN_MEM; if (insn_is_zext(&insn[1])) insn_count = 2; break; + case BPF_LDX | BPF_MEMSX | BPF_H: /* dst = *(s16 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEMSX | BPF_H: + /* lgh %dst,0(off,%src) */ + EMIT6_DISP_LH(0xe3000000, 0x0015, dst_reg, src_reg, REG_0, off); + jit->seen |= SEEN_MEM; + break; case BPF_LDX | BPF_MEM | BPF_W: /* dst = *(u32 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_W: /* llgf %dst,off(%src) */ jit->seen |= SEEN_MEM; EMIT6_DISP_LH(0xe3000000, 0x0016, dst_reg, src_reg, REG_0, off); if (insn_is_zext(&insn[1])) insn_count = 2; break; + case BPF_LDX | BPF_MEMSX | BPF_W: /* dst = *(s32 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEMSX | BPF_W: + /* lgf %dst,off(%src) */ + jit->seen |= SEEN_MEM; + EMIT6_DISP_LH(0xe3000000, 0x0014, dst_reg, src_reg, REG_0, off); + break; case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_DW: /* lg %dst,0(off,%src) */ jit->seen |= SEEN_MEM; EMIT6_DISP_LH(0xe3000000, 0x0004, dst_reg, src_reg, REG_0, off); @@ -1148,9 +1535,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, */ case BPF_JMP | BPF_CALL: { - u64 func; + const struct btf_func_model *m; bool func_addr_fixed; - int ret; + int j, ret; + u64 func; ret = bpf_jit_get_func_addr(fp, insn, extra_pass, &func, &func_addr_fixed); @@ -1159,29 +1547,51 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, REG_SET_SEEN(BPF_REG_5); jit->seen |= SEEN_FUNC; + /* + * Copy the tail call counter to where the callee expects it. + * + * Note 1: The callee can increment the tail call counter, but + * we do not load it back, since the x86 JIT does not do this + * either. + * + * Note 2: We assume that the verifier does not let us call the + * main program, which clears the tail call counter on entry. + */ + /* mvc STK_OFF_TCCNT(4,%r15),N(%r15) */ + _EMIT6(0xd203f000 | STK_OFF_TCCNT, + 0xf000 | (STK_OFF_TCCNT + STK_OFF + stack_depth)); + + /* Sign-extend the kfunc arguments. 
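/*
 * Illustration (not part of the patch): the sign_extend() helper above emits
 * lgbr/lghr/lgfr to widen sub-64-bit signed kfunc arguments to full
 * registers. In C, the same effect is a cast through the matching signed
 * type; a hedged user-space model:
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t sign_extend64(uint64_t v, unsigned int size)
{
        switch (size) {
        case 1: return (uint64_t)(int64_t)(int8_t)v;    /* lgbr */
        case 2: return (uint64_t)(int64_t)(int16_t)v;   /* lghr */
        case 4: return (uint64_t)(int64_t)(int32_t)v;   /* lgfr */
        default: return v;                              /* already 64 bit */
        }
}

int main(void)
{
        /* 0x80 as a signed byte is -128; widening must propagate the sign */
        printf("%#llx\n", (unsigned long long)sign_extend64(0x80, 1));
        /* -> 0xffffffffffffff80, matching what lgbr leaves in the register */
        return 0;
}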
*/ + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + m = bpf_jit_find_kfunc_model(fp, insn); + if (!m) + return -1; + + for (j = 0; j < m->nr_args; j++) { + if (sign_extend(jit, BPF_REG_1 + j, + m->arg_size[j], + m->arg_flags[j])) + return -1; + } + } + /* lgrl %w1,func */ EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func)); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { - /* brasl %r14,__s390_indirect_jump_r1 */ - EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); - } else { - /* basr %r14,%w1 */ - EMIT2(0x0d00, REG_14, REG_W1); - } + /* %r1() */ + call_r1(jit); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); break; } - case BPF_JMP | BPF_TAIL_CALL: + case BPF_JMP | BPF_TAIL_CALL: { + int patch_1_clrj, patch_2_clij, patch_3_brc; + /* * Implicit input: * B1: pointer to ctx * B2: pointer to bpf_array * B3: index in bpf_array - */ - jit->seen |= SEEN_TAIL_CALL; - - /* + * * if (index >= array->map.max_entries) * goto out; */ @@ -1190,40 +1600,28 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_2, offsetof(struct bpf_array, map.max_entries)); /* if ((u32)%b3 >= (u32)%w1) goto out; */ - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* clrj %b3,%w1,0xa,label0 */ - EMIT6_PCREL_LABEL(0xec000000, 0x0077, BPF_REG_3, - REG_W1, 0, 0xa); - } else { - /* clr %b3,%w1 */ - EMIT2(0x1500, BPF_REG_3, REG_W1); - /* brcl 0xa,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0xa, jit->labels[0]); - } + /* clrj %b3,%w1,0xa,out */ + patch_1_clrj = jit->prg; + EMIT6_PCREL_RIEB(0xec000000, 0x0077, BPF_REG_3, REG_W1, 0xa, + jit->prg); /* - * if (tail_call_cnt++ > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; */ if (jit->seen & SEEN_STACK) - off = STK_OFF_TCCNT + STK_OFF + fp->aux->stack_depth; + off = STK_OFF_TCCNT + STK_OFF + stack_depth; else off = STK_OFF_TCCNT; /* lhi %w0,1 */ EMIT4_IMM(0xa7080000, REG_W0, 1); /* laal %w1,%w0,off(%r15) */ EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off); - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* clij %w1,MAX_TAIL_CALL_CNT,0x2,label0 */ - EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007f, REG_W1, - MAX_TAIL_CALL_CNT, 0, 0x2); - } else { - /* clfi %w1,MAX_TAIL_CALL_CNT */ - EMIT6_IMM(0xc20f0000, REG_W1, MAX_TAIL_CALL_CNT); - /* brcl 0x2,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0x2, jit->labels[0]); - } + /* clij %w1,MAX_TAIL_CALL_CNT-1,0x2,out */ + patch_2_clij = jit->prg; + EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT - 1, + 2, jit->prg); /* * prog = array->ptrs[index]; @@ -1238,18 +1636,14 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* ltg %r1,prog(%b2,%r1) */ EMIT6_DISP_LH(0xe3000000, 0x0002, REG_1, BPF_REG_2, REG_1, offsetof(struct bpf_array, ptrs)); - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* brc 0x8,label0 */ - EMIT4_PCREL_RIC(0xa7040000, 0x8, jit->labels[0]); - } else { - /* brcl 0x8,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0x8, jit->labels[0]); - } + /* brc 0x8,out */ + patch_3_brc = jit->prg; + EMIT4_PCREL_RIC(0xa7040000, 8, jit->prg); /* * Restore registers before calling function */ - save_restore_regs(jit, REGS_RESTORE, fp->aux->stack_depth); + save_restore_regs(jit, REGS_RESTORE, stack_depth); /* * goto *(prog->bpf_func + tail_call_start); @@ -1258,17 +1652,37 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* lg %r1,bpf_func(%r1) */ 
EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_1, REG_0, offsetof(struct bpf_prog, bpf_func)); - /* bc 0xf,tail_call_start(%r1) */ - _EMIT4(0x47f01000 + jit->tail_call_start); + if (nospec_uses_trampoline()) { + jit->seen |= SEEN_FUNC; + /* aghi %r1,tail_call_start */ + EMIT4_IMM(0xa70b0000, REG_1, jit->tail_call_start); + /* brcl 0xf,__s390_indirect_jump_r1 */ + EMIT6_PCREL_RILC(0xc0040000, 0xf, jit->r1_thunk_ip); + } else { + /* bc 0xf,tail_call_start(%r1) */ + _EMIT4(0x47f01000 + jit->tail_call_start); + } /* out: */ - jit->labels[0] = jit->prg; + if (jit->prg_buf) { + *(u16 *)(jit->prg_buf + patch_1_clrj + 2) = + (jit->prg - patch_1_clrj) >> 1; + *(u16 *)(jit->prg_buf + patch_2_clij + 2) = + (jit->prg - patch_2_clij) >> 1; + *(u16 *)(jit->prg_buf + patch_3_brc + 2) = + (jit->prg - patch_3_brc) >> 1; + } break; + } case BPF_JMP | BPF_EXIT: /* return b0 */ last = (i == fp->len - 1) ? 1 : 0; if (last) break; - /* j <exit> */ - EMIT4_PCREL(0xa7f40000, jit->exit_ip - jit->prg); + if (!is_first_pass(jit) && can_use_rel(jit, jit->exit_ip)) + /* brc 0xf, <exit> */ + EMIT4_PCREL_RIC(0xa7040000, 0xf, jit->exit_ip); + else + /* brcl 0xf, <exit> */ + EMIT6_PCREL_RILC(0xc0040000, 0xf, jit->exit_ip); break; /* * Branch relative (number of skipped instructions) to offset on @@ -1290,6 +1704,9 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, * instruction itself (loop) and for BPF with offset 0 we * branch to the instruction behind the branch. */ + case BPF_JMP32 | BPF_JA: /* if (true) */ + branch_oc_off = imm; + fallthrough; case BPF_JMP | BPF_JA: /* if (true) */ mask = 0xf000; /* j */ goto branch_oc; @@ -1416,21 +1833,10 @@ branch_ks: } break; branch_ku: - is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - /* clfi or clgfi %dst,imm */ - EMIT6_IMM(is_jmp32 ? 0xc20f0000 : 0xc20e0000, - dst_reg, imm); - if (!is_first_pass(jit) && - can_use_rel(jit, addrs[i + off + 1])) { - /* brc mask,off */ - EMIT4_PCREL_RIC(0xa7040000, - mask >> 12, addrs[i + off + 1]); - } else { - /* brcl mask,off */ - EMIT6_PCREL_RILC(0xc0040000, - mask >> 12, addrs[i + off + 1]); - } - break; + /* lgfi %w1,imm (load sign extend imm) */ + src_reg = REG_1; + EMIT6_IMM(0xc0010000, src_reg, imm); + goto branch_xu; branch_xs: is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; if (!is_first_pass(jit) && @@ -1469,14 +1875,16 @@ branch_xu: break; branch_oc: if (!is_first_pass(jit) && - can_use_rel(jit, addrs[i + off + 1])) { + can_use_rel(jit, addrs[i + branch_oc_off + 1])) { /* brc mask,off */ EMIT4_PCREL_RIC(0xa7040000, - mask >> 12, addrs[i + off + 1]); + mask >> 12, + addrs[i + branch_oc_off + 1]); } else { /* brcl mask,off */ EMIT6_PCREL_RILC(0xc0040000, - mask >> 12, addrs[i + off + 1]); + mask >> 12, + addrs[i + branch_oc_off + 1]); } break; } @@ -1484,6 +1892,23 @@ branch_oc: pr_err("Unknown opcode %02x\n", insn->code); return -1; } + + if (probe_prg != -1) { + /* + * Handlers of certain exceptions leave psw.addr pointing to + * the instruction directly after the failing one. Therefore, + * create two exception table entries and also add a nop in + * case two probing instructions come directly after each + * other. 
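The asymmetry called out in this comment is why one probing load produces two exception-table records: depending on the exception type, the low-level handler may report psw.addr as either the failing instruction or the one after it, and the trailing bcr 0,%0 keeps two adjacent probes from ever sharing that second address. A flat model of the pair that bpf_jit_probe_mem() (its body is outside this hunk) sets up - the real s390 entries store relative offsets rather than absolute addresses:

struct ex_sketch {
	unsigned long insn;	/* address a fault may be reported at */
	unsigned long fixup;	/* where execution resumes afterwards */
};

static void probe_mem_entries(unsigned long probe_prg,
			      unsigned long nop_prg,
			      struct ex_sketch out[2])
{
	/* Fault reported at the probing load itself. */
	out[0] = (struct ex_sketch){ .insn = probe_prg, .fixup = nop_prg };
	/* Fault reported at the address directly after the load. */
	out[1] = (struct ex_sketch){ .insn = nop_prg, .fixup = nop_prg };
}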
+ */ + nop_prg = jit->prg; + /* bcr 0,%0 */ + _EMIT2(0x0700); + err = bpf_jit_probe_mem(jit, fp, probe_prg, nop_prg); + if (err < 0) + return err; + } + return insn_count; } @@ -1509,7 +1934,14 @@ static bool bpf_is_new_addr_sane(struct bpf_jit *jit, int i) */ static int bpf_set_addr(struct bpf_jit *jit, int i) { - if (!bpf_is_new_addr_sane(jit, i)) + int delta; + + if (is_codegen_pass(jit)) { + delta = jit->prg - jit->addrs[i]; + if (delta < 0) + bpf_skip(jit, -delta); + } + if (WARN_ON_ONCE(!bpf_is_new_addr_sane(jit, i))) return -1; jit->addrs[i] = jit->prg; return 0; @@ -1519,26 +1951,27 @@ static int bpf_set_addr(struct bpf_jit *jit, int i) * Compile eBPF program into s390x code */ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, - bool extra_pass) + bool extra_pass, u32 stack_depth) { int i, insn_count, lit32_size, lit64_size; jit->lit32 = jit->lit32_start; jit->lit64 = jit->lit64_start; jit->prg = 0; + jit->excnt = 0; - bpf_jit_prologue(jit, fp->aux->stack_depth); + bpf_jit_prologue(jit, fp, stack_depth); if (bpf_set_addr(jit, 0) < 0) return -1; for (i = 0; i < fp->len; i += insn_count) { - insn_count = bpf_jit_insn(jit, fp, i, extra_pass); + insn_count = bpf_jit_insn(jit, fp, i, extra_pass, stack_depth); if (insn_count < 0) return -1; /* Next instruction address */ if (bpf_set_addr(jit, i + insn_count) < 0) return -1; } - bpf_jit_epilogue(jit, fp->aux->stack_depth); + bpf_jit_epilogue(jit, stack_depth); lit32_size = jit->lit32 - jit->lit32_start; lit64_size = jit->lit64 - jit->lit64_start; @@ -1550,6 +1983,12 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, jit->lit64_start = ALIGN(jit->lit64_start, 8); jit->size = jit->lit64_start + lit64_size; jit->size_prg = jit->prg; + + if (WARN_ON_ONCE(fp->aux->extable && + jit->excnt != fp->aux->num_exentries)) + /* Verifier bug - too many entries. */ + return -1; + return 0; } @@ -1564,11 +2003,35 @@ struct s390_jit_data { int pass; }; +static struct bpf_binary_header *bpf_jit_alloc(struct bpf_jit *jit, + struct bpf_prog *fp) +{ + struct bpf_binary_header *header; + u32 extable_size; + u32 code_size; + + /* We need two entries per insn. 
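The doubling at the start of bpf_jit_alloc() below is the allocation-side counterpart of those paired entries. The image layout is: JITed code, padded up to the alignment of struct exception_table_entry, followed by the table itself. A standalone recomputation of the size (the 8-byte stand-in reflects the two relative int fields; exact entry layout varies by kernel version):

#include <stddef.h>

struct ex_entry_sketch { int insn, fixup; };	/* stand-in entry */

static size_t image_size(size_t code_bytes, size_t num_exentries)
{
	size_t a = _Alignof(struct ex_entry_sketch);
	size_t code_size = (code_bytes + a - 1) / a * a;   /* roundup() */

	num_exentries *= 2;	/* two entries per probing instruction */
	return code_size + num_exentries * sizeof(struct ex_entry_sketch);
}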
*/ + fp->aux->num_exentries *= 2; + + code_size = roundup(jit->size, + __alignof__(struct exception_table_entry)); + extable_size = fp->aux->num_exentries * + sizeof(struct exception_table_entry); + header = bpf_jit_binary_alloc(code_size + extable_size, &jit->prg_buf, + 8, jit_fill_hole); + if (!header) + return NULL; + fp->aux->extable = (struct exception_table_entry *) + (jit->prg_buf + code_size); + return header; +} + /* * Compile eBPF program "fp" */ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) { + u32 stack_depth = round_up(fp->aux->stack_depth, 8); struct bpf_prog *tmp, *orig_fp = fp; struct bpf_binary_header *header; struct s390_jit_data *jit_data; @@ -1577,6 +2040,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) struct bpf_jit jit; int pass; + if (WARN_ON_ONCE(bpf_plt_end - bpf_plt != BPF_PLT_SIZE)) + return orig_fp; + if (!fp->jit_requested) return orig_fp; @@ -1613,15 +2079,15 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) jit.addrs = kvcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL); if (jit.addrs == NULL) { fp = orig_fp; - goto out; + goto free_addrs; } /* * Three initial passes: * - 1/2: Determine clobbered registers - * - 3: Calculate program size and addrs arrray + * - 3: Calculate program size and addrs array */ for (pass = 1; pass <= 3; pass++) { - if (bpf_jit_prog(&jit, fp, extra_pass)) { + if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) { fp = orig_fp; goto free_addrs; } @@ -1629,13 +2095,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) /* * Final pass: Allocate and generate program */ - header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 8, jit_fill_hole); + header = bpf_jit_alloc(&jit, fp); if (!header) { fp = orig_fp; goto free_addrs; } skip_init_ctx: - if (bpf_jit_prog(&jit, fp, extra_pass)) { + if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) { bpf_jit_binary_free(header); fp = orig_fp; goto free_addrs; @@ -1668,3 +2134,556 @@ out: tmp : orig_fp); return fp; } + +bool bpf_jit_supports_kfunc_call(void) +{ + return true; +} + +bool bpf_jit_supports_far_kfunc_call(void) +{ + return true; +} + +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr) +{ + struct { + u16 opc; + s32 disp; + } __packed insn; + char expected_plt[BPF_PLT_SIZE]; + char current_plt[BPF_PLT_SIZE]; + char new_plt[BPF_PLT_SIZE]; + char *plt; + char *ret; + int err; + + /* Verify the branch to be patched. */ + err = copy_from_kernel_nofault(&insn, ip, sizeof(insn)); + if (err < 0) + return err; + if (insn.opc != (0xc004 | (old_addr ? 0xf0 : 0))) + return -EINVAL; + + if (t == BPF_MOD_JUMP && + insn.disp == ((char *)new_addr - (char *)ip) >> 1) { + /* + * The branch already points to the destination, + * there is no PLT. + */ + } else { + /* Verify the PLT. */ + plt = (char *)ip + (insn.disp << 1); + err = copy_from_kernel_nofault(current_plt, plt, BPF_PLT_SIZE); + if (err < 0) + return err; + ret = (char *)ip + 6; + bpf_jit_plt(expected_plt, ret, old_addr); + if (memcmp(current_plt, expected_plt, BPF_PLT_SIZE)) + return -EINVAL; + /* Adjust the call address. */ + bpf_jit_plt(new_plt, ret, new_addr); + s390_kernel_write(plt + (bpf_plt_target - bpf_plt), + new_plt + (bpf_plt_target - bpf_plt), + sizeof(void *)); + } + + /* Adjust the mask of the branch. */ + insn.opc = 0xc004 | (new_addr ? 0xf0 : 0); + s390_kernel_write((char *)ip + 1, (char *)&insn.opc + 1, 1); + + /* Make the new code visible to the other CPUs. 
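The patching above leans on two properties of the 6-byte brcl (RIL-c) instruction: its condition mask occupies a fixed nibble in the second opcode byte - 0xc0f4 is "branch always", 0xc004 is "branch never", i.e. a nop that still carries a target - and its 32-bit displacement counts halfwords, hence the >> 1. A freestanding encoder mirroring those rules (illustrative only, not kernel code):

#include <stdint.h>

/* Encode "brcl mask,target" at address ip into buf. */
static void encode_brcl(uint8_t buf[6], unsigned int mask,
			long ip, long target)
{
	uint16_t opc = 0xc004 | (mask << 4);	/* mask 0xf: always taken */
	int32_t disp = (int32_t)((target - ip) >> 1);	/* in halfwords */

	buf[0] = opc >> 8;	/* 0xc0 */
	buf[1] = opc & 0xff;	/* mask nibble lives here */
	buf[2] = disp >> 24;
	buf[3] = disp >> 16;
	buf[4] = disp >> 8;
	buf[5] = disp;
}

Arming or disarming the branch never touches the displacement, which is why the single-byte s390_kernel_write() on the mask byte in the hunk is sufficient.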
*/ + text_poke_sync_lock(); + + return 0; +} + +struct bpf_tramp_jit { + struct bpf_jit common; + int orig_stack_args_off;/* Offset of arguments placed on stack by the + * func_addr's original caller + */ + int stack_size; /* Trampoline stack size */ + int backchain_off; /* Offset of backchain */ + int stack_args_off; /* Offset of stack arguments for calling + * func_addr, has to be at the top + */ + int reg_args_off; /* Offset of register arguments for calling + * func_addr + */ + int ip_off; /* For bpf_get_func_ip(), has to be at + * (ctx - 16) + */ + int arg_cnt_off; /* For bpf_get_func_arg_cnt(), has to be at + * (ctx - 8) + */ + int bpf_args_off; /* Offset of BPF_PROG context, which consists + * of BPF arguments followed by return value + */ + int retval_off; /* Offset of return value (see above) */ + int r7_r8_off; /* Offset of saved %r7 and %r8, which are used + * for __bpf_prog_enter() return value and + * func_addr respectively + */ + int run_ctx_off; /* Offset of struct bpf_tramp_run_ctx */ + int tccnt_off; /* Offset of saved tailcall counter */ + int r14_off; /* Offset of saved %r14, has to be at the + * bottom */ + int do_fexit; /* do_fexit: label */ +}; + +static void load_imm64(struct bpf_jit *jit, int dst_reg, u64 val) +{ + /* llihf %dst_reg,val_hi */ + EMIT6_IMM(0xc00e0000, dst_reg, (val >> 32)); + /* oilf %rdst_reg,val_lo */ + EMIT6_IMM(0xc00d0000, dst_reg, val); +} + +static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, + const struct btf_func_model *m, + struct bpf_tramp_link *tlink, bool save_ret) +{ + struct bpf_jit *jit = &tjit->common; + int cookie_off = tjit->run_ctx_off + + offsetof(struct bpf_tramp_run_ctx, bpf_cookie); + struct bpf_prog *p = tlink->link.prog; + int patch; + + /* + * run_ctx.cookie = tlink->cookie; + */ + + /* %r0 = tlink->cookie */ + load_imm64(jit, REG_W0, tlink->cookie); + /* stg %r0,cookie_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W0, REG_0, REG_15, cookie_off); + + /* + * if ((start = __bpf_prog_enter(p, &run_ctx)) == 0) + * goto skip; + */ + + /* %r1 = __bpf_prog_enter */ + load_imm64(jit, REG_1, (u64)bpf_trampoline_enter(p)); + /* %r2 = p */ + load_imm64(jit, REG_2, (u64)p); + /* la %r3,run_ctx_off(%r15) */ + EMIT4_DISP(0x41000000, REG_3, REG_15, tjit->run_ctx_off); + /* %r1() */ + call_r1(jit); + /* ltgr %r7,%r2 */ + EMIT4(0xb9020000, REG_7, REG_2); + /* brcl 8,skip */ + patch = jit->prg; + EMIT6_PCREL_RILC(0xc0040000, 8, 0); + + /* + * retval = bpf_func(args, p->insnsi); + */ + + /* %r1 = p->bpf_func */ + load_imm64(jit, REG_1, (u64)p->bpf_func); + /* la %r2,bpf_args_off(%r15) */ + EMIT4_DISP(0x41000000, REG_2, REG_15, tjit->bpf_args_off); + /* %r3 = p->insnsi */ + if (!p->jited) + load_imm64(jit, REG_3, (u64)p->insnsi); + /* %r1() */ + call_r1(jit); + /* stg %r2,retval_off(%r15) */ + if (save_ret) { + if (sign_extend(jit, REG_2, m->ret_size, m->ret_flags)) + return -1; + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, + tjit->retval_off); + } + + /* skip: */ + if (jit->prg_buf) + *(u32 *)&jit->prg_buf[patch + 2] = (jit->prg - patch) >> 1; + + /* + * __bpf_prog_exit(p, start, &run_ctx); + */ + + /* %r1 = __bpf_prog_exit */ + load_imm64(jit, REG_1, (u64)bpf_trampoline_exit(p)); + /* %r2 = p */ + load_imm64(jit, REG_2, (u64)p); + /* lgr %r3,%r7 */ + EMIT4(0xb9040000, REG_3, REG_7); + /* la %r4,run_ctx_off(%r15) */ + EMIT4_DISP(0x41000000, REG_4, REG_15, tjit->run_ctx_off); + /* %r1() */ + call_r1(jit); + + return 0; +} + +static int alloc_stack(struct bpf_tramp_jit *tjit, size_t size) +{ + int stack_offset = 
tjit->stack_size; + + tjit->stack_size += size; + return stack_offset; +} + +/* ABI uses %r2 - %r6 for parameter passing. */ +#define MAX_NR_REG_ARGS 5 + +/* The "L" field of the "mvc" instruction is 8 bits. */ +#define MAX_MVC_SIZE 256 +#define MAX_NR_STACK_ARGS (MAX_MVC_SIZE / sizeof(u64)) + +/* -mfentry generates a 6-byte nop on s390x. */ +#define S390X_PATCH_SIZE 6 + +static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, + struct bpf_tramp_jit *tjit, + const struct btf_func_model *m, + u32 flags, + struct bpf_tramp_links *tlinks, + void *func_addr) +{ + struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; + struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + int nr_bpf_args, nr_reg_args, nr_stack_args; + struct bpf_jit *jit = &tjit->common; + int arg, bpf_arg_off; + int i, j; + + /* Support as many stack arguments as "mvc" instruction can handle. */ + nr_reg_args = min_t(int, m->nr_args, MAX_NR_REG_ARGS); + nr_stack_args = m->nr_args - nr_reg_args; + if (nr_stack_args > MAX_NR_STACK_ARGS) + return -ENOTSUPP; + + /* Return to %r14, since func_addr and %r0 are not available. */ + if ((!func_addr && !(flags & BPF_TRAMP_F_ORIG_STACK)) || + (flags & BPF_TRAMP_F_INDIRECT)) + flags |= BPF_TRAMP_F_SKIP_FRAME; + + /* + * Compute how many arguments we need to pass to BPF programs. + * BPF ABI mirrors that of x86_64: arguments that are 16 bytes or + * smaller are packed into 1 or 2 registers; larger arguments are + * passed via pointers. + * In s390x ABI, arguments that are 8 bytes or smaller are packed into + * a register; larger arguments are passed via pointers. + * We need to deal with this difference. + */ + nr_bpf_args = 0; + for (i = 0; i < m->nr_args; i++) { + if (m->arg_size[i] <= 8) + nr_bpf_args += 1; + else if (m->arg_size[i] <= 16) + nr_bpf_args += 2; + else + return -ENOTSUPP; + } + + /* + * Calculate the stack layout. + */ + + /* + * Allocate STACK_FRAME_OVERHEAD bytes for the callees. As the s390x + * ABI requires, put our backchain at the end of the allocated memory. + */ + tjit->stack_size = STACK_FRAME_OVERHEAD; + tjit->backchain_off = tjit->stack_size - sizeof(u64); + tjit->stack_args_off = alloc_stack(tjit, nr_stack_args * sizeof(u64)); + tjit->reg_args_off = alloc_stack(tjit, nr_reg_args * sizeof(u64)); + tjit->ip_off = alloc_stack(tjit, sizeof(u64)); + tjit->arg_cnt_off = alloc_stack(tjit, sizeof(u64)); + tjit->bpf_args_off = alloc_stack(tjit, nr_bpf_args * sizeof(u64)); + tjit->retval_off = alloc_stack(tjit, sizeof(u64)); + tjit->r7_r8_off = alloc_stack(tjit, 2 * sizeof(u64)); + tjit->run_ctx_off = alloc_stack(tjit, + sizeof(struct bpf_tramp_run_ctx)); + tjit->tccnt_off = alloc_stack(tjit, sizeof(u64)); + tjit->r14_off = alloc_stack(tjit, sizeof(u64) * 2); + /* + * In accordance with the s390x ABI, the caller has allocated + * STACK_FRAME_OVERHEAD bytes for us. 8 of them contain the caller's + * backchain, and the rest we can use. 
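alloc_stack(), defined at the top of this hunk, is a plain bump allocator: each call returns the current frame size as the new field's offset and grows the frame by the requested amount. A toy rerun outside the kernel (STACK_FRAME_OVERHEAD is 160 bytes on s390x; the field sizes are examples):

#include <stdio.h>

static int stack_size = 160;	/* STACK_FRAME_OVERHEAD */

static int alloc_stack(int size)
{
	int off = stack_size;

	stack_size += size;
	return off;
}

int main(void)
{
	printf("reg_args_off = %d\n", alloc_stack(5 * 8));
	printf("ip_off       = %d\n", alloc_stack(8));
	printf("arg_cnt_off  = %d\n", alloc_stack(8));
	printf("frame size   = %d\n", stack_size);
	return 0;
}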
+ */ + tjit->stack_size -= STACK_FRAME_OVERHEAD - sizeof(u64); + tjit->orig_stack_args_off = tjit->stack_size + STACK_FRAME_OVERHEAD; + + /* lgr %r1,%r15 */ + EMIT4(0xb9040000, REG_1, REG_15); + /* aghi %r15,-stack_size */ + EMIT4_IMM(0xa70b0000, REG_15, -tjit->stack_size); + /* stg %r1,backchain_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_1, REG_0, REG_15, + tjit->backchain_off); + /* mvc tccnt_off(4,%r15),stack_size+STK_OFF_TCCNT(%r15) */ + _EMIT6(0xd203f000 | tjit->tccnt_off, + 0xf000 | (tjit->stack_size + STK_OFF_TCCNT)); + /* stmg %r2,%rN,fwd_reg_args_off(%r15) */ + if (nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0024, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off); + for (i = 0, j = 0; i < m->nr_args; i++) { + if (i < MAX_NR_REG_ARGS) + arg = REG_2 + i; + else + arg = tjit->orig_stack_args_off + + (i - MAX_NR_REG_ARGS) * sizeof(u64); + bpf_arg_off = tjit->bpf_args_off + j * sizeof(u64); + if (m->arg_size[i] <= 8) { + if (i < MAX_NR_REG_ARGS) + /* stg %arg,bpf_arg_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, arg, + REG_0, REG_15, bpf_arg_off); + else + /* mvc bpf_arg_off(8,%r15),arg(%r15) */ + _EMIT6(0xd207f000 | bpf_arg_off, + 0xf000 | arg); + j += 1; + } else { + if (i < MAX_NR_REG_ARGS) { + /* mvc bpf_arg_off(16,%r15),0(%arg) */ + _EMIT6(0xd20ff000 | bpf_arg_off, + reg2hex[arg] << 12); + } else { + /* lg %r1,arg(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_0, + REG_15, arg); + /* mvc bpf_arg_off(16,%r15),0(%r1) */ + _EMIT6(0xd20ff000 | bpf_arg_off, 0x1000); + } + j += 2; + } + } + /* stmg %r7,%r8,r7_r8_off(%r15) */ + EMIT6_DISP_LH(0xeb000000, 0x0024, REG_7, REG_8, REG_15, + tjit->r7_r8_off); + /* stg %r14,r14_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_14, REG_0, REG_15, tjit->r14_off); + + if (flags & BPF_TRAMP_F_ORIG_STACK) { + /* + * The ftrace trampoline puts the return address (which is the + * address of the original function + S390X_PATCH_SIZE) into + * %r0; see ftrace_shared_hotpatch_trampoline_br and + * ftrace_init_nop() for details. 
+ */ + + /* lgr %r8,%r0 */ + EMIT4(0xb9040000, REG_8, REG_0); + } else { + /* %r8 = func_addr + S390X_PATCH_SIZE */ + load_imm64(jit, REG_8, (u64)func_addr + S390X_PATCH_SIZE); + } + + /* + * ip = func_addr; + * arg_cnt = m->nr_args; + */ + + if (flags & BPF_TRAMP_F_IP_ARG) { + /* %r0 = func_addr */ + load_imm64(jit, REG_0, (u64)func_addr); + /* stg %r0,ip_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15, + tjit->ip_off); + } + /* lghi %r0,nr_bpf_args */ + EMIT4_IMM(0xa7090000, REG_0, nr_bpf_args); + /* stg %r0,arg_cnt_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15, + tjit->arg_cnt_off); + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* + * __bpf_tramp_enter(im); + */ + + /* %r1 = __bpf_tramp_enter */ + load_imm64(jit, REG_1, (u64)__bpf_tramp_enter); + /* %r2 = im */ + load_imm64(jit, REG_2, (u64)im); + /* %r1() */ + call_r1(jit); + } + + for (i = 0; i < fentry->nr_links; i++) + if (invoke_bpf_prog(tjit, m, fentry->links[i], + flags & BPF_TRAMP_F_RET_FENTRY_RET)) + return -EINVAL; + + if (fmod_ret->nr_links) { + /* + * retval = 0; + */ + + /* xc retval_off(8,%r15),retval_off(%r15) */ + _EMIT6(0xd707f000 | tjit->retval_off, + 0xf000 | tjit->retval_off); + + for (i = 0; i < fmod_ret->nr_links; i++) { + if (invoke_bpf_prog(tjit, m, fmod_ret->links[i], true)) + return -EINVAL; + + /* + * if (retval) + * goto do_fexit; + */ + + /* ltg %r0,retval_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0002, REG_0, REG_0, REG_15, + tjit->retval_off); + /* brcl 7,do_fexit */ + EMIT6_PCREL_RILC(0xc0040000, 7, tjit->do_fexit); + } + } + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* + * retval = func_addr(args); + */ + + /* lmg %r2,%rN,reg_args_off(%r15) */ + if (nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off); + /* mvc stack_args_off(N,%r15),orig_stack_args_off(%r15) */ + if (nr_stack_args) + _EMIT6(0xd200f000 | + (nr_stack_args * sizeof(u64) - 1) << 16 | + tjit->stack_args_off, + 0xf000 | tjit->orig_stack_args_off); + /* mvc STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */ + _EMIT6(0xd203f000 | STK_OFF_TCCNT, 0xf000 | tjit->tccnt_off); + /* lgr %r1,%r8 */ + EMIT4(0xb9040000, REG_1, REG_8); + /* %r1() */ + call_r1(jit); + /* stg %r2,retval_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, + tjit->retval_off); + + im->ip_after_call = jit->prg_buf + jit->prg; + + /* + * The following nop will be patched by bpf_tramp_image_put(). 
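The nop emitted below is the same mask trick used by bpf_arch_text_poke() earlier: brcl 0 falls through but already encodes the epilogue displacement, so bpf_tramp_image_put() can later arm it by rewriting the mask byte alone. In terms of the illustrative encoder sketched after the poke hunk (ip and ip_epilogue stand for the nop's own address and the epilogue label):

uint8_t as_nop[6], as_branch[6];

encode_brcl(as_nop,    0x0, ip, ip_epilogue);	/* emitted here */
encode_brcl(as_branch, 0xf, ip, ip_epilogue);	/* after patching */
/* Bytes 2..5 are identical; only the mask byte differs. */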
+ */ + + /* brcl 0,im->ip_epilogue */ + EMIT6_PCREL_RILC(0xc0040000, 0, (u64)im->ip_epilogue); + } + + /* do_fexit: */ + tjit->do_fexit = jit->prg; + for (i = 0; i < fexit->nr_links; i++) + if (invoke_bpf_prog(tjit, m, fexit->links[i], false)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = jit->prg_buf + jit->prg; + + /* + * __bpf_tramp_exit(im); + */ + + /* %r1 = __bpf_tramp_exit */ + load_imm64(jit, REG_1, (u64)__bpf_tramp_exit); + /* %r2 = im */ + load_imm64(jit, REG_2, (u64)im); + /* %r1() */ + call_r1(jit); + } + + /* lmg %r2,%rN,reg_args_off(%r15) */ + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off); + /* lgr %r1,%r8 */ + if (!(flags & BPF_TRAMP_F_SKIP_FRAME)) + EMIT4(0xb9040000, REG_1, REG_8); + /* lmg %r7,%r8,r7_r8_off(%r15) */ + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_7, REG_8, REG_15, + tjit->r7_r8_off); + /* lg %r14,r14_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_14, REG_0, REG_15, tjit->r14_off); + /* lg %r2,retval_off(%r15) */ + if (flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET)) + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_2, REG_0, REG_15, + tjit->retval_off); + /* mvc stack_size+STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */ + _EMIT6(0xd203f000 | (tjit->stack_size + STK_OFF_TCCNT), + 0xf000 | tjit->tccnt_off); + /* aghi %r15,stack_size */ + EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size); + /* Emit an expoline for the following indirect jump. */ + if (nospec_uses_trampoline()) + emit_expoline(jit); + if (flags & BPF_TRAMP_F_SKIP_FRAME) + /* br %r14 */ + _EMIT2(0x07fe); + else + /* br %r1 */ + _EMIT2(0x07f1); + + emit_r1_thunk(jit); + + return 0; +} + +int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *orig_call) +{ + struct bpf_tramp_image im; + struct bpf_tramp_jit tjit; + int ret; + + memset(&tjit, 0, sizeof(tjit)); + + ret = __arch_prepare_bpf_trampoline(&im, &tjit, m, flags, + tlinks, orig_call); + + return ret < 0 ? ret : tjit.common.prg; +} + +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, + void *image_end, const struct btf_func_model *m, + u32 flags, struct bpf_tramp_links *tlinks, + void *func_addr) +{ + struct bpf_tramp_jit tjit; + int ret; + + /* Compute offsets, check whether the code fits. */ + memset(&tjit, 0, sizeof(tjit)); + ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, + tlinks, func_addr); + + if (ret < 0) + return ret; + if (tjit.common.prg > (char *)image_end - (char *)image) + /* + * Use the same error code as for exceeding + * BPF_MAX_TRAMP_LINKS. + */ + return -E2BIG; + + tjit.common.prg = 0; + tjit.common.prg_buf = image; + ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, + tlinks, func_addr); + + return ret < 0 ? 
ret : tjit.common.prg; +} + +bool bpf_jit_supports_subprog_tailcalls(void) +{ + return true; +} diff --git a/arch/s390/numa/Makefile b/arch/s390/numa/Makefile deleted file mode 100644 index 66c2dff74895..000000000000 --- a/arch/s390/numa/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -obj-y += numa.o -obj-y += toptree.o -obj-$(CONFIG_NUMA_EMU) += mode_emu.o diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c deleted file mode 100644 index 72d742bb2d17..000000000000 --- a/arch/s390/numa/mode_emu.c +++ /dev/null @@ -1,577 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NUMA support for s390 - * - * NUMA emulation (aka fake NUMA) distributes the available memory to nodes - * without using real topology information about the physical memory of the - * machine. - * - * It distributes the available CPUs to nodes while respecting the original - * machine topology information. This is done by trying to avoid to separate - * CPUs which reside on the same book or even on the same MC. - * - * Because the current Linux scheduler code requires a stable cpu to node - * mapping, cores are pinned to nodes when the first CPU thread is set online. - * - * Copyright IBM Corp. 2015 - */ - -#define KMSG_COMPONENT "numa_emu" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/cpumask.h> -#include <linux/memblock.h> -#include <linux/node.h> -#include <linux/memory.h> -#include <linux/slab.h> -#include <asm/smp.h> -#include <asm/topology.h> -#include "numa_mode.h" -#include "toptree.h" - -/* Distances between the different system components */ -#define DIST_EMPTY 0 -#define DIST_CORE 1 -#define DIST_MC 2 -#define DIST_BOOK 3 -#define DIST_DRAWER 4 -#define DIST_MAX 5 - -/* Node distance reported to common code */ -#define EMU_NODE_DIST 10 - -/* Node ID for free (not yet pinned) cores */ -#define NODE_ID_FREE -1 - -/* Different levels of toptree */ -enum toptree_level {CORE, MC, BOOK, DRAWER, NODE, TOPOLOGY}; - -/* The two toptree IDs */ -enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA}; - -/* Number of NUMA nodes */ -static int emu_nodes = 1; -/* NUMA stripe size */ -static unsigned long emu_size; - -/* - * Node to core pinning information updates are protected by - * "sched_domains_mutex". 
- */ -static struct { - s32 to_node_id[CONFIG_NR_CPUS]; /* Pinned core to node mapping */ - int total; /* Total number of pinned cores */ - int per_node_target; /* Cores per node without extra cores */ - int per_node[MAX_NUMNODES]; /* Number of cores pinned to node */ -} *emu_cores; - -/* - * Pin a core to a node - */ -static void pin_core_to_node(int core_id, int node_id) -{ - if (emu_cores->to_node_id[core_id] == NODE_ID_FREE) { - emu_cores->per_node[node_id]++; - emu_cores->to_node_id[core_id] = node_id; - emu_cores->total++; - } else { - WARN_ON(emu_cores->to_node_id[core_id] != node_id); - } -} - -/* - * Number of pinned cores of a node - */ -static int cores_pinned(struct toptree *node) -{ - return emu_cores->per_node[node->id]; -} - -/* - * ID of the node where the core is pinned (or NODE_ID_FREE) - */ -static int core_pinned_to_node_id(struct toptree *core) -{ - return emu_cores->to_node_id[core->id]; -} - -/* - * Number of cores in the tree that are not yet pinned - */ -static int cores_free(struct toptree *tree) -{ - struct toptree *core; - int count = 0; - - toptree_for_each(core, tree, CORE) { - if (core_pinned_to_node_id(core) == NODE_ID_FREE) - count++; - } - return count; -} - -/* - * Return node of core - */ -static struct toptree *core_node(struct toptree *core) -{ - return core->parent->parent->parent->parent; -} - -/* - * Return drawer of core - */ -static struct toptree *core_drawer(struct toptree *core) -{ - return core->parent->parent->parent; -} - -/* - * Return book of core - */ -static struct toptree *core_book(struct toptree *core) -{ - return core->parent->parent; -} - -/* - * Return mc of core - */ -static struct toptree *core_mc(struct toptree *core) -{ - return core->parent; -} - -/* - * Distance between two cores - */ -static int dist_core_to_core(struct toptree *core1, struct toptree *core2) -{ - if (core_drawer(core1)->id != core_drawer(core2)->id) - return DIST_DRAWER; - if (core_book(core1)->id != core_book(core2)->id) - return DIST_BOOK; - if (core_mc(core1)->id != core_mc(core2)->id) - return DIST_MC; - /* Same core or sibling on same MC */ - return DIST_CORE; -} - -/* - * Distance of a node to a core - */ -static int dist_node_to_core(struct toptree *node, struct toptree *core) -{ - struct toptree *core_node; - int dist_min = DIST_MAX; - - toptree_for_each(core_node, node, CORE) - dist_min = min(dist_min, dist_core_to_core(core_node, core)); - return dist_min == DIST_MAX ? DIST_EMPTY : dist_min; -} - -/* - * Unify will delete empty nodes, therefore recreate nodes. - */ -static void toptree_unify_tree(struct toptree *tree) -{ - int nid; - - toptree_unify(tree); - for (nid = 0; nid < emu_nodes; nid++) - toptree_get_child(tree, nid); -} - -/* - * Find the best/nearest node for a given core and ensure that no node - * gets more than "emu_cores->per_node_target + extra" cores. 
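The "extra" slack explains why the caller of this selection (toptree_to_numa(), further down) runs the placement twice, first with extra = 0 and then with extra = 1: round one fills every node up to the even per-node target, round two distributes the leftover cores, at most one more per node. The arithmetic as a standalone example:

#include <stdio.h>

int main(void)
{
	int cores = 10, nodes = 4;
	int target = cores / nodes;		/* per_node_target = 2   */
	int leftover = cores - target * nodes;	/* 2 cores for round two */

	printf("round 1 cap %d, round 2 cap %d, leftover %d\n",
	       target, target + 1, leftover);
	return 0;
}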
- */ -static struct toptree *node_for_core(struct toptree *numa, struct toptree *core, - int extra) -{ - struct toptree *node, *node_best = NULL; - int dist_cur, dist_best, cores_target; - - cores_target = emu_cores->per_node_target + extra; - dist_best = DIST_MAX; - node_best = NULL; - toptree_for_each(node, numa, NODE) { - /* Already pinned cores must use their nodes */ - if (core_pinned_to_node_id(core) == node->id) { - node_best = node; - break; - } - /* Skip nodes that already have enough cores */ - if (cores_pinned(node) >= cores_target) - continue; - dist_cur = dist_node_to_core(node, core); - if (dist_cur < dist_best) { - dist_best = dist_cur; - node_best = node; - } - } - return node_best; -} - -/* - * Find the best node for each core with respect to "extra" core count - */ -static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys, - int extra) -{ - struct toptree *node, *core, *tmp; - - toptree_for_each_safe(core, tmp, phys, CORE) { - node = node_for_core(numa, core, extra); - if (!node) - return; - toptree_move(core, node); - pin_core_to_node(core->id, node->id); - } -} - -/* - * Move structures of given level to specified NUMA node - */ -static void move_level_to_numa_node(struct toptree *node, struct toptree *phys, - enum toptree_level level, bool perfect) -{ - int cores_free, cores_target = emu_cores->per_node_target; - struct toptree *cur, *tmp; - - toptree_for_each_safe(cur, tmp, phys, level) { - cores_free = cores_target - toptree_count(node, CORE); - if (perfect) { - if (cores_free == toptree_count(cur, CORE)) - toptree_move(cur, node); - } else { - if (cores_free >= toptree_count(cur, CORE)) - toptree_move(cur, node); - } - } -} - -/* - * Move structures of a given level to NUMA nodes. If "perfect" is specified - * move only perfectly fitting structures. Otherwise move also smaller - * than needed structures. 
- */ -static void move_level_to_numa(struct toptree *numa, struct toptree *phys, - enum toptree_level level, bool perfect) -{ - struct toptree *node; - - toptree_for_each(node, numa, NODE) - move_level_to_numa_node(node, phys, level, perfect); -} - -/* - * For the first run try to move the big structures - */ -static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys) -{ - struct toptree *core; - - /* Always try to move perfectly fitting structures first */ - move_level_to_numa(numa, phys, DRAWER, true); - move_level_to_numa(numa, phys, DRAWER, false); - move_level_to_numa(numa, phys, BOOK, true); - move_level_to_numa(numa, phys, BOOK, false); - move_level_to_numa(numa, phys, MC, true); - move_level_to_numa(numa, phys, MC, false); - /* Now pin all the moved cores */ - toptree_for_each(core, numa, CORE) - pin_core_to_node(core->id, core_node(core)->id); -} - -/* - * Allocate new topology and create required nodes - */ -static struct toptree *toptree_new(int id, int nodes) -{ - struct toptree *tree; - int nid; - - tree = toptree_alloc(TOPOLOGY, id); - if (!tree) - goto fail; - for (nid = 0; nid < nodes; nid++) { - if (!toptree_get_child(tree, nid)) - goto fail; - } - return tree; -fail: - panic("NUMA emulation could not allocate topology"); -} - -/* - * Allocate and initialize core to node mapping - */ -static void __ref create_core_to_node_map(void) -{ - int i; - - emu_cores = memblock_alloc(sizeof(*emu_cores), 8); - if (!emu_cores) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*emu_cores), 8); - for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++) - emu_cores->to_node_id[i] = NODE_ID_FREE; -} - -/* - * Move cores from physical topology into NUMA target topology - * and try to keep as much of the physical topology as possible. 
- */ -static struct toptree *toptree_to_numa(struct toptree *phys) -{ - static int first = 1; - struct toptree *numa; - int cores_total; - - cores_total = emu_cores->total + cores_free(phys); - emu_cores->per_node_target = cores_total / emu_nodes; - numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes); - if (first) { - toptree_to_numa_first(numa, phys); - first = 0; - } - toptree_to_numa_single(numa, phys, 0); - toptree_to_numa_single(numa, phys, 1); - toptree_unify_tree(numa); - - WARN_ON(cpumask_weight(&phys->mask)); - return numa; -} - -/* - * Create a toptree out of the physical topology that we got from the hypervisor - */ -static struct toptree *toptree_from_topology(void) -{ - struct toptree *phys, *node, *drawer, *book, *mc, *core; - struct cpu_topology_s390 *top; - int cpu; - - phys = toptree_new(TOPTREE_ID_PHYS, 1); - - for_each_cpu(cpu, &cpus_with_topology) { - top = &cpu_topology[cpu]; - node = toptree_get_child(phys, 0); - drawer = toptree_get_child(node, top->drawer_id); - book = toptree_get_child(drawer, top->book_id); - mc = toptree_get_child(book, top->socket_id); - core = toptree_get_child(mc, smp_get_base_cpu(cpu)); - if (!drawer || !book || !mc || !core) - panic("NUMA emulation could not allocate memory"); - cpumask_set_cpu(cpu, &core->mask); - toptree_update_mask(mc); - } - return phys; -} - -/* - * Add toptree core to topology and create correct CPU masks - */ -static void topology_add_core(struct toptree *core) -{ - struct cpu_topology_s390 *top; - int cpu; - - for_each_cpu(cpu, &core->mask) { - top = &cpu_topology[cpu]; - cpumask_copy(&top->thread_mask, &core->mask); - cpumask_copy(&top->core_mask, &core_mc(core)->mask); - cpumask_copy(&top->book_mask, &core_book(core)->mask); - cpumask_copy(&top->drawer_mask, &core_drawer(core)->mask); - cpumask_set_cpu(cpu, &node_to_cpumask_map[core_node(core)->id]); - top->node_id = core_node(core)->id; - } -} - -/* - * Apply toptree to topology and create CPU masks - */ -static void toptree_to_topology(struct toptree *numa) -{ - struct toptree *core; - int i; - - /* Clear all node masks */ - for (i = 0; i < MAX_NUMNODES; i++) - cpumask_clear(&node_to_cpumask_map[i]); - - /* Rebuild all masks */ - toptree_for_each(core, numa, CORE) - topology_add_core(core); -} - -/* - * Show the node to core mapping - */ -static void print_node_to_core_map(void) -{ - int nid, cid; - - if (!numa_debug_enabled) - return; - printk(KERN_DEBUG "NUMA node to core mapping\n"); - for (nid = 0; nid < emu_nodes; nid++) { - printk(KERN_DEBUG " node %3d: ", nid); - for (cid = 0; cid < ARRAY_SIZE(emu_cores->to_node_id); cid++) { - if (emu_cores->to_node_id[cid] == nid) - printk(KERN_CONT "%d ", cid); - } - printk(KERN_CONT "\n"); - } -} - -static void pin_all_possible_cpus(void) -{ - int core_id, node_id, cpu; - static int initialized; - - if (initialized) - return; - print_node_to_core_map(); - node_id = 0; - for_each_possible_cpu(cpu) { - core_id = smp_get_base_cpu(cpu); - if (emu_cores->to_node_id[core_id] != NODE_ID_FREE) - continue; - pin_core_to_node(core_id, node_id); - cpu_topology[cpu].node_id = node_id; - node_id = (node_id + 1) % emu_nodes; - } - print_node_to_core_map(); - initialized = 1; -} - -/* - * Transfer physical topology into a NUMA topology and modify CPU masks - * according to the NUMA topology. - * - * Must be called with "sched_domains_mutex" lock held. 
- */ -static void emu_update_cpu_topology(void) -{ - struct toptree *phys, *numa; - - if (emu_cores == NULL) - create_core_to_node_map(); - phys = toptree_from_topology(); - numa = toptree_to_numa(phys); - toptree_free(phys); - toptree_to_topology(numa); - toptree_free(numa); - pin_all_possible_cpus(); -} - -/* - * If emu_size is not set, use CONFIG_EMU_SIZE. Then round to minimum - * alignment (needed for memory hotplug). - */ -static unsigned long emu_setup_size_adjust(unsigned long size) -{ - unsigned long size_new; - - size = size ? : CONFIG_EMU_SIZE; - size_new = roundup(size, memory_block_size_bytes()); - if (size_new == size) - return size; - pr_warn("Increasing memory stripe size from %ld MB to %ld MB\n", - size >> 20, size_new >> 20); - return size_new; -} - -/* - * If we have not enough memory for the specified nodes, reduce the node count. - */ -static int emu_setup_nodes_adjust(int nodes) -{ - int nodes_max; - - nodes_max = memblock.memory.total_size / emu_size; - nodes_max = max(nodes_max, 1); - if (nodes_max >= nodes) - return nodes; - pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes); - return nodes_max; -} - -/* - * Early emu setup - */ -static void emu_setup(void) -{ - int nid; - - emu_size = emu_setup_size_adjust(emu_size); - emu_nodes = emu_setup_nodes_adjust(emu_nodes); - for (nid = 0; nid < emu_nodes; nid++) - node_set(nid, node_possible_map); - pr_info("Creating %d nodes with memory stripe size %ld MB\n", - emu_nodes, emu_size >> 20); -} - -/* - * Return node id for given page number - */ -static int emu_pfn_to_nid(unsigned long pfn) -{ - return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes; -} - -/* - * Return stripe size - */ -static unsigned long emu_align(void) -{ - return emu_size; -} - -/* - * Return distance between two nodes - */ -static int emu_distance(int node1, int node2) -{ - return (node1 != node2) * EMU_NODE_DIST; -} - -/* - * Define callbacks for generic s390 NUMA infrastructure - */ -const struct numa_mode numa_mode_emu = { - .name = "emu", - .setup = emu_setup, - .update_cpu_topology = emu_update_cpu_topology, - .__pfn_to_nid = emu_pfn_to_nid, - .align = emu_align, - .distance = emu_distance, -}; - -/* - * Kernel parameter: emu_nodes=<n> - */ -static int __init early_parse_emu_nodes(char *p) -{ - int count; - - if (!p || kstrtoint(p, 0, &count) != 0 || count <= 0) - return 0; - emu_nodes = min(count, MAX_NUMNODES); - return 0; -} -early_param("emu_nodes", early_parse_emu_nodes); - -/* - * Kernel parameter: emu_size=[<n>[k|M|G|T]] - */ -static int __init early_parse_emu_size(char *p) -{ - if (p) - emu_size = memparse(p, NULL); - return 0; -} -early_param("emu_size", early_parse_emu_size); diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c deleted file mode 100644 index d2910fa834c8..000000000000 --- a/arch/s390/numa/numa.c +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NUMA support for s390 - * - * Implement NUMA core code. - * - * Copyright IBM Corp. 
2015 - */ - -#define KMSG_COMPONENT "numa" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/mmzone.h> -#include <linux/cpumask.h> -#include <linux/memblock.h> -#include <linux/slab.h> -#include <linux/node.h> - -#include <asm/numa.h> -#include "numa_mode.h" - -pg_data_t *node_data[MAX_NUMNODES]; -EXPORT_SYMBOL(node_data); - -cpumask_t node_to_cpumask_map[MAX_NUMNODES]; -EXPORT_SYMBOL(node_to_cpumask_map); - -static void plain_setup(void) -{ - node_set(0, node_possible_map); -} - -const struct numa_mode numa_mode_plain = { - .name = "plain", - .setup = plain_setup, -}; - -static const struct numa_mode *mode = &numa_mode_plain; - -int numa_pfn_to_nid(unsigned long pfn) -{ - return mode->__pfn_to_nid ? mode->__pfn_to_nid(pfn) : 0; -} - -void numa_update_cpu_topology(void) -{ - if (mode->update_cpu_topology) - mode->update_cpu_topology(); -} - -int __node_distance(int a, int b) -{ - return mode->distance ? mode->distance(a, b) : 0; -} -EXPORT_SYMBOL(__node_distance); - -int numa_debug_enabled; - -/* - * numa_setup_memory() - Assign bootmem to nodes - * - * The memory is first added to memblock without any respect to nodes. - * This is fixed before remaining memblock memory is handed over to the - * buddy allocator. - * An important side effect is that large bootmem allocations might easily - * cross node boundaries, which can be needed for large allocations with - * smaller memory stripes in each node (i.e. when using NUMA emulation). - * - * Memory defines nodes: - * Therefore this routine also sets the nodes online with memory. - */ -static void __init numa_setup_memory(void) -{ - unsigned long cur_base, align, end_of_dram; - int nid = 0; - - end_of_dram = memblock_end_of_DRAM(); - align = mode->align ? mode->align() : ULONG_MAX; - - /* - * Step through all available memory and assign it to the nodes - * indicated by the mode implementation. - * All nodes which are seen here will be set online. - */ - cur_base = 0; - do { - nid = numa_pfn_to_nid(PFN_DOWN(cur_base)); - node_set_online(nid); - memblock_set_node(cur_base, align, &memblock.memory, nid); - cur_base += align; - } while (cur_base < end_of_dram); - - /* Allocate and fill out node_data */ - for (nid = 0; nid < MAX_NUMNODES; nid++) { - NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8); - if (!NODE_DATA(nid)) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(pg_data_t), 8); - } - - for_each_online_node(nid) { - unsigned long start_pfn, end_pfn; - unsigned long t_start, t_end; - int i; - - start_pfn = ULONG_MAX; - end_pfn = 0; - for_each_mem_pfn_range(i, nid, &t_start, &t_end, NULL) { - if (t_start < start_pfn) - start_pfn = t_start; - if (t_end > end_pfn) - end_pfn = t_end; - } - NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; - NODE_DATA(nid)->node_id = nid; - } -} - -/* - * numa_setup() - Earliest initialization - * - * Assign the mode and call the mode's setup routine. - */ -void __init numa_setup(void) -{ - pr_info("NUMA mode: %s\n", mode->name); - nodes_clear(node_possible_map); - /* Initially attach all possible CPUs to node 0. */ - cpumask_copy(&node_to_cpumask_map[0], cpu_possible_mask); - if (mode->setup) - mode->setup(); - numa_setup_memory(); - memblock_dump_all(); -} - -/* - * numa_init_late() - Initialization initcall - * - * Register NUMA nodes. 
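numa_setup_memory() above assigns memory to nodes in mode->align() sized steps; for the emulation mode being removed, numa_pfn_to_nid() resolved to (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes, a plain round-robin stripe. A standalone rerun with example parameters:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long emu_size = 512UL << 20;	/* one 512 MiB stripe */
	unsigned long stripe_pfns = emu_size >> PAGE_SHIFT;
	int emu_nodes = 4;

	for (unsigned long addr = 0; addr < 8 * emu_size; addr += emu_size) {
		unsigned long pfn = addr >> PAGE_SHIFT;

		printf("%#10lx -> node %lu\n", addr,
		       (pfn / stripe_pfns) % emu_nodes);
	}
	return 0;
}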
- */ -static int __init numa_init_late(void) -{ - int nid; - - for_each_online_node(nid) - register_one_node(nid); - return 0; -} -arch_initcall(numa_init_late); - -static int __init parse_debug(char *parm) -{ - numa_debug_enabled = 1; - return 0; -} -early_param("numa_debug", parse_debug); - -static int __init parse_numa(char *parm) -{ - if (!parm) - return 1; - if (strcmp(parm, numa_mode_plain.name) == 0) - mode = &numa_mode_plain; -#ifdef CONFIG_NUMA_EMU - if (strcmp(parm, numa_mode_emu.name) == 0) - mode = &numa_mode_emu; -#endif - return 0; -} -early_param("numa", parse_numa); diff --git a/arch/s390/numa/numa_mode.h b/arch/s390/numa/numa_mode.h deleted file mode 100644 index dfd3e2784081..000000000000 --- a/arch/s390/numa/numa_mode.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * NUMA support for s390 - * - * Define declarations used for communication between NUMA mode - * implementations and NUMA core functionality. - * - * Copyright IBM Corp. 2015 - */ -#ifndef __S390_NUMA_MODE_H -#define __S390_NUMA_MODE_H - -struct numa_mode { - char *name; /* Name of mode */ - void (*setup)(void); /* Initizalize mode */ - void (*update_cpu_topology)(void); /* Called by topology code */ - int (*__pfn_to_nid)(unsigned long pfn); /* PFN to node ID */ - unsigned long (*align)(void); /* Minimum node alignment */ - int (*distance)(int a, int b); /* Distance between two nodes */ -}; - -extern const struct numa_mode numa_mode_plain; -extern const struct numa_mode numa_mode_emu; - -#endif /* __S390_NUMA_MODE_H */ diff --git a/arch/s390/numa/toptree.c b/arch/s390/numa/toptree.c deleted file mode 100644 index 71a608cd4f61..000000000000 --- a/arch/s390/numa/toptree.c +++ /dev/null @@ -1,351 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NUMA support for s390 - * - * A tree structure used for machine topology mangling - * - * Copyright IBM Corp. 2015 - */ - -#include <linux/kernel.h> -#include <linux/memblock.h> -#include <linux/cpumask.h> -#include <linux/list.h> -#include <linux/list_sort.h> -#include <linux/slab.h> -#include <asm/numa.h> - -#include "toptree.h" - -/** - * toptree_alloc - Allocate and initialize a new tree node. - * @level: The node's vertical level; level 0 contains the leaves. - * @id: ID number, explicitly not unique beyond scope of node's siblings - * - * Allocate a new tree node and initialize it. - * - * RETURNS: - * Pointer to the new tree node or NULL on error - */ -struct toptree __ref *toptree_alloc(int level, int id) -{ - struct toptree *res; - - if (slab_is_available()) - res = kzalloc(sizeof(*res), GFP_KERNEL); - else - res = memblock_alloc(sizeof(*res), 8); - if (!res) - return res; - - INIT_LIST_HEAD(&res->children); - INIT_LIST_HEAD(&res->sibling); - cpumask_clear(&res->mask); - res->level = level; - res->id = id; - return res; -} - -/** - * toptree_remove - Remove a tree node from a tree - * @cand: Pointer to the node to remove - * - * The node is detached from its parent node. The parent node's - * masks will be updated to reflect the loss of the child. - */ -static void toptree_remove(struct toptree *cand) -{ - struct toptree *oldparent; - - list_del_init(&cand->sibling); - oldparent = cand->parent; - cand->parent = NULL; - toptree_update_mask(oldparent); -} - -/** - * toptree_free - discard a tree node - * @cand: Pointer to the tree node to discard - * - * Checks if @cand is attached to a parent node. Detaches it - * cleanly using toptree_remove. Possible children are freed - * recursively. In the end @cand itself is freed. 
- */ -void __ref toptree_free(struct toptree *cand) -{ - struct toptree *child, *tmp; - - if (cand->parent) - toptree_remove(cand); - toptree_for_each_child_safe(child, tmp, cand) - toptree_free(child); - if (slab_is_available()) - kfree(cand); - else - memblock_free_early((unsigned long)cand, sizeof(*cand)); -} - -/** - * toptree_update_mask - Update node bitmasks - * @cand: Pointer to a tree node - * - * The node's cpumask will be updated by combining all children's - * masks. Then toptree_update_mask is called recursively for the - * parent if applicable. - * - * NOTE: - * This must not be called on leaves. If called on a leaf, its - * CPU mask is cleared and lost. - */ -void toptree_update_mask(struct toptree *cand) -{ - struct toptree *child; - - cpumask_clear(&cand->mask); - list_for_each_entry(child, &cand->children, sibling) - cpumask_or(&cand->mask, &cand->mask, &child->mask); - if (cand->parent) - toptree_update_mask(cand->parent); -} - -/** - * toptree_insert - Insert a tree node into tree - * @cand: Pointer to the node to insert - * @target: Pointer to the node to which @cand will added as a child - * - * Insert a tree node into a tree. Masks will be updated automatically. - * - * RETURNS: - * 0 on success, -1 if NULL is passed as argument or the node levels - * don't fit. - */ -static int toptree_insert(struct toptree *cand, struct toptree *target) -{ - if (!cand || !target) - return -1; - if (target->level != (cand->level + 1)) - return -1; - list_add_tail(&cand->sibling, &target->children); - cand->parent = target; - toptree_update_mask(target); - return 0; -} - -/** - * toptree_move_children - Move all child nodes of a node to a new place - * @cand: Pointer to the node whose children are to be moved - * @target: Pointer to the node to which @cand's children will be attached - * - * Take all child nodes of @cand and move them using toptree_move. - */ -static void toptree_move_children(struct toptree *cand, struct toptree *target) -{ - struct toptree *child, *tmp; - - toptree_for_each_child_safe(child, tmp, cand) - toptree_move(child, target); -} - -/** - * toptree_unify - Merge children with same ID - * @cand: Pointer to node whose direct children should be made unique - * - * When mangling the tree it is possible that a node has two or more children - * which have the same ID. This routine merges these children into one and - * moves all children of the merged nodes into the unified node. - */ -void toptree_unify(struct toptree *cand) -{ - struct toptree *child, *tmp, *cand_copy; - - /* Threads cannot be split, cores are not split */ - if (cand->level < 2) - return; - - cand_copy = toptree_alloc(cand->level, 0); - toptree_for_each_child_safe(child, tmp, cand) { - struct toptree *tmpchild; - - if (!cpumask_empty(&child->mask)) { - tmpchild = toptree_get_child(cand_copy, child->id); - toptree_move_children(child, tmpchild); - } - toptree_free(child); - } - toptree_move_children(cand_copy, cand); - toptree_free(cand_copy); - - toptree_for_each_child(child, cand) - toptree_unify(child); -} - -/** - * toptree_move - Move a node to another context - * @cand: Pointer to the node to move - * @target: Pointer to the node where @cand should go - * - * In the easiest case @cand is exactly on the level below @target - * and will be immediately moved to the target. - * - * If @target's level is not the direct parent level of @cand, - * nodes for the missing levels are created and put between - * @cand and @target. The "stacking" nodes' IDs are taken from - * @cand's parents. 
- * - * After this it is likely to have redundant nodes in the tree - * which are addressed by means of toptree_unify. - */ -void toptree_move(struct toptree *cand, struct toptree *target) -{ - struct toptree *stack_target, *real_insert_point, *ptr, *tmp; - - if (cand->level + 1 == target->level) { - toptree_remove(cand); - toptree_insert(cand, target); - return; - } - - real_insert_point = NULL; - ptr = cand; - stack_target = NULL; - - do { - tmp = stack_target; - stack_target = toptree_alloc(ptr->level + 1, - ptr->parent->id); - toptree_insert(tmp, stack_target); - if (!real_insert_point) - real_insert_point = stack_target; - ptr = ptr->parent; - } while (stack_target->level < (target->level - 1)); - - toptree_remove(cand); - toptree_insert(cand, real_insert_point); - toptree_insert(stack_target, target); -} - -/** - * toptree_get_child - Access a tree node's child by its ID - * @cand: Pointer to tree node whose child is to access - * @id: The desired child's ID - * - * @cand's children are searched for a child with matching ID. - * If no match can be found, a new child with the desired ID - * is created and returned. - */ -struct toptree *toptree_get_child(struct toptree *cand, int id) -{ - struct toptree *child; - - toptree_for_each_child(child, cand) - if (child->id == id) - return child; - child = toptree_alloc(cand->level-1, id); - toptree_insert(child, cand); - return child; -} - -/** - * toptree_first - Find the first descendant on specified level - * @context: Pointer to tree node whose descendants are to be used - * @level: The level of interest - * - * RETURNS: - * @context's first descendant on the specified level, or NULL - * if there is no matching descendant - */ -struct toptree *toptree_first(struct toptree *context, int level) -{ - struct toptree *child, *tmp; - - if (context->level == level) - return context; - - if (!list_empty(&context->children)) { - list_for_each_entry(child, &context->children, sibling) { - tmp = toptree_first(child, level); - if (tmp) - return tmp; - } - } - return NULL; -} - -/** - * toptree_next_sibling - Return next sibling - * @cur: Pointer to a tree node - * - * RETURNS: - * If @cur has a parent and is not the last in the parent's children list, - * the next sibling is returned. Or NULL when there are no siblings left. - */ -static struct toptree *toptree_next_sibling(struct toptree *cur) -{ - if (cur->parent == NULL) - return NULL; - - if (cur == list_last_entry(&cur->parent->children, - struct toptree, sibling)) - return NULL; - return (struct toptree *) list_next_entry(cur, sibling); -} - -/** - * toptree_next - Tree traversal function - * @cur: Pointer to current element - * @context: Pointer to the root node of the tree or subtree to - * be traversed. - * @level: The level of interest. - * - * RETURNS: - * Pointer to the next node on level @level - * or NULL when there is no next node. 
- */ -struct toptree *toptree_next(struct toptree *cur, struct toptree *context, - int level) -{ - struct toptree *cur_context, *tmp; - - if (!cur) - return NULL; - - if (context->level == level) - return NULL; - - tmp = toptree_next_sibling(cur); - if (tmp != NULL) - return tmp; - - cur_context = cur; - while (cur_context->level < context->level - 1) { - /* Step up */ - cur_context = cur_context->parent; - /* Step aside */ - tmp = toptree_next_sibling(cur_context); - if (tmp != NULL) { - /* Step down */ - tmp = toptree_first(tmp, level); - if (tmp != NULL) - return tmp; - } - } - return NULL; -} - -/** - * toptree_count - Count descendants on specified level - * @context: Pointer to node whose descendants are to be considered - * @level: Only descendants on the specified level will be counted - * - * RETURNS: - * Number of descendants on the specified level - */ -int toptree_count(struct toptree *context, int level) -{ - struct toptree *cur; - int cnt = 0; - - toptree_for_each(cur, context, level) - cnt++; - return cnt; -} diff --git a/arch/s390/numa/toptree.h b/arch/s390/numa/toptree.h deleted file mode 100644 index 5246371ec713..000000000000 --- a/arch/s390/numa/toptree.h +++ /dev/null @@ -1,61 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * NUMA support for s390 - * - * A tree structure used for machine topology mangling - * - * Copyright IBM Corp. 2015 - */ -#ifndef S390_TOPTREE_H -#define S390_TOPTREE_H - -#include <linux/cpumask.h> -#include <linux/list.h> - -struct toptree { - int level; - int id; - cpumask_t mask; - struct toptree *parent; - struct list_head sibling; - struct list_head children; -}; - -struct toptree *toptree_alloc(int level, int id); -void toptree_free(struct toptree *cand); -void toptree_update_mask(struct toptree *cand); -void toptree_unify(struct toptree *cand); -struct toptree *toptree_get_child(struct toptree *cand, int id); -void toptree_move(struct toptree *cand, struct toptree *target); -int toptree_count(struct toptree *context, int level); - -struct toptree *toptree_first(struct toptree *context, int level); -struct toptree *toptree_next(struct toptree *cur, struct toptree *context, - int level); - -#define toptree_for_each_child(child, ptree) \ - list_for_each_entry(child, &ptree->children, sibling) - -#define toptree_for_each_child_safe(child, ptmp, ptree) \ - list_for_each_entry_safe(child, ptmp, &ptree->children, sibling) - -#define toptree_is_last(ptree) \ - ((ptree->parent == NULL) || \ - (ptree->parent->children.prev == &ptree->sibling)) - -#define toptree_for_each(ptree, cont, ttype) \ - for (ptree = toptree_first(cont, ttype); \ - ptree != NULL; \ - ptree = toptree_next(ptree, cont, ttype)) - -#define toptree_for_each_safe(ptree, tmp, cont, ttype) \ - for (ptree = toptree_first(cont, ttype), \ - tmp = toptree_next(ptree, cont, ttype); \ - ptree != NULL; \ - ptree = tmp, \ - tmp = toptree_next(ptree, cont, ttype)) - -#define toptree_for_each_sibling(ptree, start) \ - toptree_for_each(ptree, start->parent, start->level) - -#endif /* S390_TOPTREE_H */ diff --git a/arch/s390/oprofile/Makefile b/arch/s390/oprofile/Makefile deleted file mode 100644 index 36261f9d360b..000000000000 --- a/arch/s390/oprofile/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_OPROFILE) += oprofile.o - -DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ - oprof.o cpu_buffer.o buffer_sync.o \ - event_buffer.o oprofile_files.o \ - oprofilefs.o oprofile_stats.o \ - timer_int.o ) - -oprofile-y := $(DRIVER_OBJS) init.o diff 
--git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c deleted file mode 100644 index 7441857df51b..000000000000 --- a/arch/s390/oprofile/init.c +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * S390 Version - * Copyright IBM Corp. 2002, 2011 - * Author(s): Thomas Spatzier (tspat@de.ibm.com) - * Author(s): Mahesh Salgaonkar (mahesh@linux.vnet.ibm.com) - * Author(s): Heinz Graalfs (graalfs@linux.vnet.ibm.com) - * Author(s): Andreas Krebbel (krebbel@linux.vnet.ibm.com) - * - * @remark Copyright 2002-2011 OProfile authors - */ - -#include <linux/oprofile.h> -#include <linux/init.h> -#include <asm/processor.h> -#include <asm/unwind.h> - -static void s390_backtrace(struct pt_regs *regs, unsigned int depth) -{ - struct unwind_state state; - - unwind_for_each_frame(&state, current, regs, 0) { - if (depth-- == 0) - break; - oprofile_add_trace(state.ip); - } -} - -int __init oprofile_arch_init(struct oprofile_operations *ops) -{ - ops->backtrace = s390_backtrace; - return 0; -} - -void oprofile_arch_exit(void) -{ -} diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile index 748626a33028..0547a10406e7 100644 --- a/arch/s390/pci/Makefile +++ b/arch/s390/pci/Makefile @@ -3,5 +3,7 @@ # Makefile for the s390 PCI subsystem. # -obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_dma.o pci_clp.o pci_sysfs.o \ - pci_event.o pci_debug.o pci_insn.o pci_mmio.o +obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_clp.o pci_sysfs.o \ + pci_event.o pci_debug.o pci_insn.o pci_mmio.o \ + pci_bus.o pci_kvm_hook.o +obj-$(CONFIG_PCI_IOV) += pci_iov.o diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 8e872951c07b..676ac74026a8 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -36,17 +36,22 @@ #include <asm/pci_clp.h> #include <asm/pci_dma.h> +#include "pci_bus.h" +#include "pci_iov.h" + /* list of all detected zpci devices */ static LIST_HEAD(zpci_list); static DEFINE_SPINLOCK(zpci_list_lock); -static DECLARE_BITMAP(zpci_domain, ZPCI_NR_DEVICES); +static DECLARE_BITMAP(zpci_domain, ZPCI_DOMAIN_BITMAP_SIZE); static DEFINE_SPINLOCK(zpci_domain_lock); #define ZPCI_IOMAP_ENTRIES \ min(((unsigned long) ZPCI_NR_DEVICES * PCI_STD_NUM_BARS / 2), \ ZPCI_IOMAP_MAX_ENTRIES) +unsigned int s390_pci_no_rid; + static DEFINE_SPINLOCK(zpci_iomap_lock); static unsigned long *zpci_iomap_bitmap; struct zpci_iomap_entry *zpci_iomap_start; @@ -56,6 +61,12 @@ DEFINE_STATIC_KEY_FALSE(have_mio); static struct kmem_cache *zdev_fmb_cache; +/* AEN structures that must be preserved over KVM module re-insertion */ +union zpci_sic_iib *zpci_aipb; +EXPORT_SYMBOL_GPL(zpci_aipb); +struct airq_iv *zpci_aif_sbv; +EXPORT_SYMBOL_GPL(zpci_aif_sbv); + struct zpci_dev *get_zdev_by_fid(u32 fid) { struct zpci_dev *tmp, *zdev = NULL; @@ -64,6 +75,7 @@ struct zpci_dev *get_zdev_by_fid(u32 fid) list_for_each_entry(tmp, &zpci_list, entry) { if (tmp->fid == fid) { zdev = tmp; + zpci_zdev_get(zdev); break; } } @@ -87,17 +99,12 @@ void zpci_remove_reserved_devices(void) spin_unlock(&zpci_list_lock); list_for_each_entry_safe(zdev, tmp, &remove, entry) - zpci_remove_device(zdev); -} - -static struct zpci_dev *get_zdev_by_bus(struct pci_bus *bus) -{ - return (bus && bus->sysdata) ? 
(struct zpci_dev *) bus->sysdata : NULL; + zpci_device_reserved(zdev); } int pci_domain_nr(struct pci_bus *bus) { - return ((struct zpci_dev *) bus->sysdata)->domain; + return ((struct zpci_bus *) bus->sysdata)->domain_nr; } EXPORT_SYMBOL_GPL(pci_domain_nr); @@ -109,18 +116,27 @@ EXPORT_SYMBOL_GPL(pci_proc_domain); /* Modify PCI: Register I/O address translation parameters */ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, - u64 base, u64 limit, u64 iota) + u64 base, u64 limit, u64 iota, u8 *status) { u64 req = ZPCI_CREATE_REQ(zdev->fh, dmaas, ZPCI_MOD_FC_REG_IOAT); struct zpci_fib fib = {0}; - u8 status; + u8 cc; WARN_ON_ONCE(iota & 0x3fff); fib.pba = base; - fib.pal = limit; + /* Work around off by one in ISM virt device */ + if (zdev->pft == PCI_FUNC_TYPE_ISM && limit > base) + fib.pal = limit + (1 << 12); + else + fib.pal = limit; fib.iota = iota | ZPCI_IOTA_RTTO_FLAG; - return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, status); + if (cc) + zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, *status); + return cc; } +EXPORT_SYMBOL_GPL(zpci_register_ioat); /* Modify PCI: Unregister I/O address translation parameters */ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) @@ -129,16 +145,19 @@ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) struct zpci_fib fib = {0}; u8 cc, status; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, &status); - if (cc == 3) /* Function already gone. */ - cc = 0; - return cc ? -EIO : 0; + if (cc) + zpci_dbg(3, "unreg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status); + return cc; } /* Modify PCI: Set PCI function measurement parameters */ int zpci_fmb_enable_device(struct zpci_dev *zdev) { u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_SET_MEASURE); + struct zpci_iommu_ctrs *ctrs; struct zpci_fib fib = {0}; u8 cc, status; @@ -151,11 +170,18 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev) WARN_ON((u64) zdev->fmb & 0xf); /* reset software counters */ - atomic64_set(&zdev->allocated_pages, 0); - atomic64_set(&zdev->mapped_pages, 0); - atomic64_set(&zdev->unmapped_pages, 0); + ctrs = zpci_get_iommu_ctrs(zdev); + if (ctrs) { + atomic64_set(&ctrs->mapped_pages, 0); + atomic64_set(&ctrs->unmapped_pages, 0); + atomic64_set(&ctrs->global_rpcits, 0); + atomic64_set(&ctrs->sync_map_rpcits, 0); + atomic64_set(&ctrs->sync_rpcits, 0); + } + fib.fmb_addr = virt_to_phys(zdev->fmb); + fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, &status); if (cc) { kmem_cache_free(zdev_fmb_cache, zdev->fmb); @@ -174,6 +200,8 @@ int zpci_fmb_disable_device(struct zpci_dev *zdev) if (!zdev->fmb) return -EINVAL; + fib.gd = zdev->gisa; + /* Function measurement is disabled if fmb address is zero */ cc = zpci_mod_fc(req, &fib, &status); if (cc == 3) /* Function already gone. */ @@ -227,38 +255,25 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count) zpci_memcpy_toio(to, from, count); } -void __iomem *ioremap(unsigned long ioaddr, unsigned long size) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { - struct vm_struct *area; - unsigned long offset; - - if (!size) - return NULL; - + /* + * When PCI MIO instructions are unavailable the "physical" address + * encodes a hint for accessing the PCI memory space it represents. + * Just pass it unchanged such that ioread/iowrite can decode it. 
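+ * + * A minimal sketch of the non-MIO consequence (illustrative; "res" is a + * hypothetical MMIO resource): + * + *	void __iomem *p = ioremap(res->start, resource_size(res)); + * + * simply yields p == (void __iomem *)res->start, with no page tables + * being set up.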
+ */ if (!static_branch_unlikely(&have_mio)) - return (void __iomem *) ioaddr; + return (void __iomem *)phys_addr; - offset = ioaddr & ~PAGE_MASK; - ioaddr &= PAGE_MASK; - size = PAGE_ALIGN(size + offset); - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - - if (ioremap_page_range((unsigned long) area->addr, - (unsigned long) area->addr + size, - ioaddr, PAGE_KERNEL)) { - vunmap(area->addr); - return NULL; - } - return (void __iomem *) ((unsigned long) area->addr + offset); + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } -EXPORT_SYMBOL(ioremap); +EXPORT_SYMBOL(ioremap_prot); void iounmap(volatile void __iomem *addr) { if (static_branch_likely(&have_mio)) - vunmap((__force void *) ((unsigned long) addr & PAGE_MASK)); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); @@ -372,29 +387,17 @@ EXPORT_SYMBOL(pci_iounmap); static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { - struct zpci_dev *zdev = get_zdev_by_bus(bus); - int ret; - - if (!zdev || devfn != ZPCI_DEVFN) - ret = -ENODEV; - else - ret = zpci_cfg_load(zdev, where, val, size); + struct zpci_dev *zdev = zdev_from_bus(bus, devfn); - return ret; + return (zdev) ? zpci_cfg_load(zdev, where, val, size) : -ENODEV; } static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) { - struct zpci_dev *zdev = get_zdev_by_bus(bus); - int ret; + struct zpci_dev *zdev = zdev_from_bus(bus, devfn); - if (!zdev || devfn != ZPCI_DEVFN) - ret = -ENODEV; - else - ret = zpci_cfg_store(zdev, where, val, size); - - return ret; + return (zdev) ? zpci_cfg_store(zdev, where, val, size) : -ENODEV; } static struct pci_ops pci_root_ops = { @@ -402,15 +405,6 @@ static struct pci_ops pci_root_ops = { .write = pci_write, }; -#ifdef CONFIG_PCI_IOV -static struct resource iov_res = { - .name = "PCI IOV res", - .start = 0, - .end = -1, - .flags = IORESOURCE_MEM, -}; -#endif - static void zpci_map_resources(struct pci_dev *pdev) { struct zpci_dev *zdev = to_zpci(pdev); @@ -424,23 +418,14 @@ static void zpci_map_resources(struct pci_dev *pdev) if (zpci_use_mio(zdev)) pdev->resource[i].start = - (resource_size_t __force) zdev->bars[i].mio_wb; + (resource_size_t __force) zdev->bars[i].mio_wt; else pdev->resource[i].start = (resource_size_t __force) pci_iomap_range_fh(pdev, i, 0, 0); pdev->resource[i].end = pdev->resource[i].start + len - 1; } -#ifdef CONFIG_PCI_IOV - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - int bar = i + PCI_IOV_RESOURCES; - - len = pci_resource_len(pdev, bar); - if (!len) - continue; - pdev->resource[bar].parent = &iov_res; - } -#endif + zpci_iov_map_resources(pdev); } static void zpci_unmap_resources(struct pci_dev *pdev) @@ -484,6 +469,34 @@ static void zpci_free_iomap(struct zpci_dev *zdev, int entry) spin_unlock(&zpci_iomap_lock); } +static void zpci_do_update_iomap_fh(struct zpci_dev *zdev, u32 fh) +{ + int bar, idx; + + spin_lock(&zpci_iomap_lock); + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { + if (!zdev->bars[bar].size) + continue; + idx = zdev->bars[bar].map_idx; + if (!zpci_iomap_start[idx].count) + continue; + WRITE_ONCE(zpci_iomap_start[idx].fh, zdev->fh); + } + spin_unlock(&zpci_iomap_lock); +} + +void zpci_update_fh(struct zpci_dev *zdev, u32 fh) +{ + if (!fh || zdev->fh == fh) + return; + + zdev->fh = fh; + if (zpci_use_mio(zdev)) + return; + if (zdev->has_resources && zdev_enabled(zdev)) + zpci_do_update_iomap_fh(zdev, fh); +} + static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start, unsigned long size, 
unsigned long flags) { @@ -505,15 +518,14 @@ static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start, return r; } -static int zpci_setup_bus_resources(struct zpci_dev *zdev, - struct list_head *resources) +int zpci_setup_bus_resources(struct zpci_dev *zdev) { unsigned long addr, size, flags; struct resource *res; int i, entry; snprintf(zdev->res_name, sizeof(zdev->res_name), - "PCI Bus %04x:%02x", zdev->domain, ZPCI_BUS_NR); + "PCI Bus %04x:%02x", zdev->uid, ZPCI_BUS_NR); for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (!zdev->bars[i].size) @@ -531,7 +543,7 @@ static int zpci_setup_bus_resources(struct zpci_dev *zdev, flags |= IORESOURCE_MEM_64; if (zpci_use_mio(zdev)) - addr = (unsigned long) zdev->bars[i].mio_wb; + addr = (unsigned long) zdev->bars[i].mio_wt; else addr = ZPCI_ADDR(entry); size = 1UL << zdev->bars[i].size; @@ -542,36 +554,45 @@ static int zpci_setup_bus_resources(struct zpci_dev *zdev, return -ENOMEM; } zdev->bars[i].res = res; - pci_add_resource(resources, res); } + zdev->has_resources = 1; return 0; } static void zpci_cleanup_bus_resources(struct zpci_dev *zdev) { + struct resource *res; int i; + pci_lock_rescan_remove(); for (i = 0; i < PCI_STD_NUM_BARS; i++) { - if (!zdev->bars[i].size || !zdev->bars[i].res) + res = zdev->bars[i].res; + if (!res) continue; + release_resource(res); + pci_bus_remove_resource(zdev->zbus->bus, res); zpci_free_iomap(zdev, zdev->bars[i].map_idx); - release_resource(zdev->bars[i].res); - kfree(zdev->bars[i].res); + zdev->bars[i].res = NULL; + kfree(res); } + zdev->has_resources = 0; + pci_unlock_rescan_remove(); } -int pcibios_add_device(struct pci_dev *pdev) +int pcibios_device_add(struct pci_dev *pdev) { + struct zpci_dev *zdev = to_zpci(pdev); struct resource *res; int i; + /* The pdev has a reference to the zdev via its bus */ + zpci_zdev_get(zdev); if (pdev->is_physfn) pdev->no_vf_scan = 1; pdev->dev.groups = zpci_attr_groups; - pdev->dev.dma_ops = &s390_pci_dma_ops; zpci_map_resources(pdev); for (i = 0; i < PCI_STD_NUM_BARS; i++) { @@ -586,7 +607,10 @@ int pcibios_add_device(struct pci_dev *pdev) void pcibios_release_device(struct pci_dev *pdev) { + struct zpci_dev *zdev = to_zpci(pdev); + zpci_unmap_resources(pdev); + zpci_zdev_put(zdev); } int pcibios_enable_device(struct pci_dev *pdev, int mask) @@ -607,210 +631,323 @@ void pcibios_disable_device(struct pci_dev *pdev) zpci_debug_exit_device(zdev); } -#ifdef CONFIG_HIBERNATE_CALLBACKS -static int zpci_restore(struct device *dev) +static int __zpci_register_domain(int domain) { - struct pci_dev *pdev = to_pci_dev(dev); - struct zpci_dev *zdev = to_zpci(pdev); - int ret = 0; - - if (zdev->state != ZPCI_FN_STATE_ONLINE) - goto out; - - ret = clp_enable_fh(zdev, ZPCI_NR_DMA_SPACES); - if (ret) - goto out; - - zpci_map_resources(pdev); - zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - (u64) zdev->dma_table); - -out: - return ret; + spin_lock(&zpci_domain_lock); + if (test_bit(domain, zpci_domain)) { + spin_unlock(&zpci_domain_lock); + pr_err("Domain %04x is already assigned\n", domain); + return -EEXIST; + } + set_bit(domain, zpci_domain); + spin_unlock(&zpci_domain_lock); + return domain; } -static int zpci_freeze(struct device *dev) +static int __zpci_alloc_domain(void) { - struct pci_dev *pdev = to_pci_dev(dev); - struct zpci_dev *zdev = to_zpci(pdev); + int domain; - if (zdev->state != ZPCI_FN_STATE_ONLINE) - return 0; - - zpci_unregister_ioat(zdev, 0); - zpci_unmap_resources(pdev); - return clp_disable_fh(zdev); + spin_lock(&zpci_domain_lock); + /* + 
* We can always auto allocate domains below ZPCI_NR_DEVICES. + * There is either a free domain or we have reached the maximum in + * which case we would have bailed earlier. + */ + domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES); + set_bit(domain, zpci_domain); + spin_unlock(&zpci_domain_lock); + return domain; } -struct dev_pm_ops pcibios_pm_ops = { - .thaw_noirq = zpci_restore, - .freeze_noirq = zpci_freeze, - .restore_noirq = zpci_restore, - .poweroff_noirq = zpci_freeze, -}; -#endif /* CONFIG_HIBERNATE_CALLBACKS */ - -static int zpci_alloc_domain(struct zpci_dev *zdev) +int zpci_alloc_domain(int domain) { if (zpci_unique_uid) { - zdev->domain = (u16) zdev->uid; - if (zdev->domain >= ZPCI_NR_DEVICES) - return 0; - - spin_lock(&zpci_domain_lock); - if (test_bit(zdev->domain, zpci_domain)) { - spin_unlock(&zpci_domain_lock); - pr_err("Adding PCI function %08x failed because domain %04x is already assigned\n", - zdev->fid, zdev->domain); - return -EEXIST; - } - set_bit(zdev->domain, zpci_domain); - spin_unlock(&zpci_domain_lock); - return 0; - } - - spin_lock(&zpci_domain_lock); - zdev->domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES); - if (zdev->domain == ZPCI_NR_DEVICES) { - spin_unlock(&zpci_domain_lock); - pr_err("Adding PCI function %08x failed because the configured limit of %d is reached\n", - zdev->fid, ZPCI_NR_DEVICES); - return -ENOSPC; + if (domain) + return __zpci_register_domain(domain); + pr_warn("UID checking was active but no UID is provided: switching to automatic domain allocation\n"); + update_uid_checking(false); } - set_bit(zdev->domain, zpci_domain); - spin_unlock(&zpci_domain_lock); - return 0; + return __zpci_alloc_domain(); } -static void zpci_free_domain(struct zpci_dev *zdev) +void zpci_free_domain(int domain) { - if (zdev->domain >= ZPCI_NR_DEVICES) - return; - spin_lock(&zpci_domain_lock); - clear_bit(zdev->domain, zpci_domain); + clear_bit(domain, zpci_domain); spin_unlock(&zpci_domain_lock); } -void pcibios_remove_bus(struct pci_bus *bus) + +int zpci_enable_device(struct zpci_dev *zdev) { - struct zpci_dev *zdev = get_zdev_by_bus(bus); + u32 fh = zdev->fh; + int rc = 0; - zpci_exit_slot(zdev); - zpci_cleanup_bus_resources(zdev); - zpci_destroy_iommu(zdev); - zpci_free_domain(zdev); + if (clp_enable_fh(zdev, &fh, ZPCI_NR_DMA_SPACES)) + rc = -EIO; + else + zpci_update_fh(zdev, fh); + return rc; +} +EXPORT_SYMBOL_GPL(zpci_enable_device); - spin_lock(&zpci_list_lock); - list_del(&zdev->entry); - spin_unlock(&zpci_list_lock); +int zpci_disable_device(struct zpci_dev *zdev) +{ + u32 fh = zdev->fh; + int cc, rc = 0; - zpci_dbg(3, "rem fid:%x\n", zdev->fid); - kfree(zdev); + cc = clp_disable_fh(zdev, &fh); + if (!cc) { + zpci_update_fh(zdev, fh); + } else if (cc == CLP_RC_SETPCIFN_ALRDY) { + pr_info("Disabling PCI function %08x had no effect as it was already disabled\n", + zdev->fid); + /* Function is already disabled - update handle */ + rc = clp_refresh_fh(zdev->fid, &fh); + if (!rc) { + zpci_update_fh(zdev, fh); + rc = -EINVAL; + } + } else { + rc = -EIO; + } + return rc; } +EXPORT_SYMBOL_GPL(zpci_disable_device); -static int zpci_scan_bus(struct zpci_dev *zdev) +/** + * zpci_hot_reset_device - perform a reset of the given zPCI function + * @zdev: the slot which should be reset + * + * Performs a low level reset of the zPCI function. The reset is low level in + * the sense that the zPCI function can be reset without detaching it from the + * common PCI subsystem. 
The reset may be performed while under control of + * either DMA or IOMMU APIs in which case the existing DMA/IOMMU translation + * table is reinstated at the end of the reset. + * + * After the reset the function's internal state is reset to an initial state + * equivalent to its state during boot when first probing a driver. + * Consequently, after reset the PCI function requires re-initialization via the + * common PCI code including re-enabling IRQs via pci_alloc_irq_vectors() + * and enabling the function via e.g. pci_enable_device_flags(). The caller + * must guard against concurrent reset attempts. + * + * In most cases this function should not be called directly but through + * pci_reset_function() or pci_reset_bus() which handle the save/restore and + * locking. + * + * Return: 0 on success and an error value otherwise + */ +int zpci_hot_reset_device(struct zpci_dev *zdev) { - LIST_HEAD(resources); - int ret; + u8 status; + int rc; - ret = zpci_setup_bus_resources(zdev, &resources); - if (ret) - goto error; + zpci_dbg(3, "rst fid:%x, fh:%x\n", zdev->fid, zdev->fh); + if (zdev_enabled(zdev)) { + /* Disables device access, DMAs and IRQs (reset state) */ + rc = zpci_disable_device(zdev); + /* + * Due to a z/VM vs LPAR inconsistency in the error state, the + * FH may indicate an enabled device while disable says the + * device is already disabled; don't treat this as an error here. + */ + if (rc == -EINVAL) + rc = 0; + if (rc) + return rc; + } - zdev->bus = pci_scan_root_bus(NULL, ZPCI_BUS_NR, &pci_root_ops, - zdev, &resources); - if (!zdev->bus) { - ret = -EIO; - goto error; + rc = zpci_enable_device(zdev); + if (rc) + return rc; + + if (zdev->dma_table) + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status); + if (rc) { + zpci_disable_device(zdev); + return rc; } - zdev->bus->max_bus_speed = zdev->max_bus_speed; - pci_bus_add_devices(zdev->bus); - return 0; -error: - zpci_cleanup_bus_resources(zdev); - pci_free_resource_list(&resources); - return ret; + return 0; } -int zpci_enable_device(struct zpci_dev *zdev) +/** + * zpci_create_device() - Create a new zpci_dev and add it to the zbus + * @fid: Function ID of the device to be created + * @fh: Current Function Handle of the device to be created + * @state: Initial state after creation, either Standby or Configured + * + * Creates a new zpci device and adds it to its, possibly newly created, zbus + * as well as zpci_list.
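+ * + * A minimal usage sketch (illustrative only; "entry" stands for a CLP function + * list entry, as in __clp_add() in pci_clp.c, with error handling elided): + * + *	zdev = zpci_create_device(entry->fid, entry->fh, entry->config_state); + *	if (IS_ERR(zdev)) + *		return PTR_ERR(zdev);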
+ * + * Returns: the zdev on success or an error pointer otherwise + */ +struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) { + struct zpci_dev *zdev; int rc; - rc = clp_enable_fh(zdev, ZPCI_NR_DMA_SPACES); + zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", fid, fh, state); + zdev = kzalloc(sizeof(*zdev), GFP_KERNEL); + if (!zdev) + return ERR_PTR(-ENOMEM); + + /* FID and Function Handle are the static/dynamic identifiers */ + zdev->fid = fid; + zdev->fh = fh; + + /* Query function properties and update zdev */ + rc = clp_query_pci_fn(zdev); if (rc) - goto out; + goto error; + zdev->state = state; - rc = zpci_dma_init_device(zdev); + kref_init(&zdev->kref); + mutex_init(&zdev->lock); + mutex_init(&zdev->kzdev_lock); + + rc = zpci_init_iommu(zdev); if (rc) - goto out_dma; + goto error; - zdev->state = ZPCI_FN_STATE_ONLINE; - return 0; + rc = zpci_bus_device_register(zdev, &pci_root_ops); + if (rc) + goto error_destroy_iommu; -out_dma: - clp_disable_fh(zdev); -out: - return rc; + spin_lock(&zpci_list_lock); + list_add_tail(&zdev->entry, &zpci_list); + spin_unlock(&zpci_list_lock); + + return zdev; + +error_destroy_iommu: + zpci_destroy_iommu(zdev); +error: + zpci_dbg(0, "add fid:%x, rc:%d\n", fid, rc); + kfree(zdev); + return ERR_PTR(rc); } -EXPORT_SYMBOL_GPL(zpci_enable_device); -int zpci_disable_device(struct zpci_dev *zdev) +bool zpci_is_device_configured(struct zpci_dev *zdev) { - zpci_dma_exit_device(zdev); - return clp_disable_fh(zdev); + enum zpci_state state = zdev->state; + + return state != ZPCI_FN_STATE_RESERVED && + state != ZPCI_FN_STATE_STANDBY; } -EXPORT_SYMBOL_GPL(zpci_disable_device); -int zpci_create_device(struct zpci_dev *zdev) +/** + * zpci_scan_configured_device() - Scan a freshly configured zpci_dev + * @zdev: The zpci_dev to be scanned + * @fh: The general function handle supplied by the platform + * + * Given a device in the configuration state Configured, enables, scans and + * adds it to the common code PCI subsystem if possible. If any failure occurs, + * the zpci_dev is left disabled. + * + * Return: 0 on success, or an error code otherwise + */ +int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh) { - int rc; + zpci_update_fh(zdev, fh); + return zpci_bus_scan_device(zdev); +} - rc = zpci_alloc_domain(zdev); - if (rc) - goto out; +/** + * zpci_deconfigure_device() - Deconfigure a zpci_dev + * @zdev: The zpci_dev to deconfigure + * + * Deconfigure a zPCI function that is currently configured and possibly known + * to the common code PCI subsystem. + * If any failure occurs the device is left as is. + * + * Return: 0 on success, or an error code otherwise + */ +int zpci_deconfigure_device(struct zpci_dev *zdev) +{ + int rc; - rc = zpci_init_iommu(zdev); - if (rc) - goto out_free; + if (zdev->zbus->bus) + zpci_bus_remove_device(zdev, false); - mutex_init(&zdev->lock); - if (zdev->state == ZPCI_FN_STATE_CONFIGURED) { - rc = zpci_enable_device(zdev); + if (zdev_enabled(zdev)) { + rc = zpci_disable_device(zdev); if (rc) - goto out_destroy_iommu; + return rc; } - rc = zpci_scan_bus(zdev); + + rc = sclp_pci_deconfigure(zdev->fid); + zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, rc); if (rc) - goto out_disable; + return rc; + zdev->state = ZPCI_FN_STATE_STANDBY; + + return 0; +} +/** + * zpci_device_reserved() - Mark device as reserved + * @zdev: the zpci_dev that was reserved + * + * Handle the case that a given zPCI function was reserved by another system.
+ * After a call to this function the zpci_dev can not be found via + * get_zdev_by_fid() anymore but may still be accessible via existing + * references though it will not be functional anymore. + */ +void zpci_device_reserved(struct zpci_dev *zdev) +{ + if (zdev->has_hp_slot) + zpci_exit_slot(zdev); + /* + * Remove device from zpci_list as it is going away. This also + * makes sure we ignore subsequent zPCI events for this device. + */ spin_lock(&zpci_list_lock); - list_add_tail(&zdev->entry, &zpci_list); + list_del(&zdev->entry); spin_unlock(&zpci_list_lock); + zdev->state = ZPCI_FN_STATE_RESERVED; + zpci_dbg(3, "rsv fid:%x\n", zdev->fid); + zpci_zdev_put(zdev); +} - zpci_init_slot(zdev); +void zpci_release_device(struct kref *kref) +{ + struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref); + int ret; - return 0; + if (zdev->zbus->bus) + zpci_bus_remove_device(zdev, false); -out_disable: - if (zdev->state == ZPCI_FN_STATE_ONLINE) + if (zdev_enabled(zdev)) zpci_disable_device(zdev); -out_destroy_iommu: - zpci_destroy_iommu(zdev); -out_free: - zpci_free_domain(zdev); -out: - return rc; -} -void zpci_remove_device(struct zpci_dev *zdev) -{ - if (!zdev->bus) - return; - - pci_stop_root_bus(zdev->bus); - pci_remove_root_bus(zdev->bus); + switch (zdev->state) { + case ZPCI_FN_STATE_CONFIGURED: + ret = sclp_pci_deconfigure(zdev->fid); + zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, ret); + fallthrough; + case ZPCI_FN_STATE_STANDBY: + if (zdev->has_hp_slot) + zpci_exit_slot(zdev); + spin_lock(&zpci_list_lock); + list_del(&zdev->entry); + spin_unlock(&zpci_list_lock); + zpci_dbg(3, "rsv fid:%x\n", zdev->fid); + fallthrough; + case ZPCI_FN_STATE_RESERVED: + if (zdev->has_resources) + zpci_cleanup_bus_resources(zdev); + zpci_bus_device_unregister(zdev); + zpci_destroy_iommu(zdev); + fallthrough; + default: + break; + } + zpci_dbg(3, "rem fid:%x\n", zdev->fid); + kfree_rcu(zdev, rcu); } int zpci_report_error(struct pci_dev *pdev, @@ -822,6 +959,59 @@ int zpci_report_error(struct pci_dev *pdev, } EXPORT_SYMBOL(zpci_report_error); +/** + * zpci_clear_error_state() - Clears the zPCI error state of the device + * @zdev: The zdev for which the zPCI error state should be reset + * + * Clear the zPCI error state of the device. If clearing the zPCI error state + * fails the device is left in the error state. In this case it may make sense + * to call zpci_io_perm_failure() on the associated pdev if it exists. + * + * Returns: 0 on success, -EIO otherwise + */ +int zpci_clear_error_state(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_RESET_ERROR); + struct zpci_fib fib = {0}; + u8 status; + int cc; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc) { + zpci_dbg(3, "ces fid:%x, cc:%d, status:%x\n", zdev->fid, cc, status); + return -EIO; + } + + return 0; +} + +/** + * zpci_reset_load_store_blocked() - Re-enables L/S from error state + * @zdev: The zdev for which to unblock load/store access + * + * Re-enables load/store access for a PCI function in the error state while + * keeping DMA blocked. In this state drivers can poke MMIO space to determine + * if error recovery is possible while catching any rogue DMA access from the + * device. 
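+ * + * A possible recovery sketch (illustrative; driver_probe_mmio() is a + * hypothetical driver helper, and zpci_io_perm_failure() is the fallback + * suggested for zpci_clear_error_state() above): + * + *	rc = zpci_reset_load_store_blocked(zdev); + *	if (rc) + *		zpci_io_perm_failure(pdev); + *	else + *		recoverable = driver_probe_mmio(pdev);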
+ * + * Returns: 0 on success, -EIO otherwise + */ +int zpci_reset_load_store_blocked(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_RESET_BLOCK); + struct zpci_fib fib = {0}; + u8 status; + int cc; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc) { + zpci_dbg(3, "rls fid:%x, cc:%d, status:%x\n", zdev->fid, cc, status); + return -EIO; + } + + return 0; +} + static int zpci_mem_init(void) { BUILD_BUG_ON(!is_power_of_2(__alignof__(struct zpci_fmb)) || @@ -842,6 +1032,9 @@ static int zpci_mem_init(void) if (!zpci_iomap_bitmap) goto error_iomap_bitmap; + if (static_branch_likely(&have_mio)) + clp_setup_writeback_mio(); + return 0; error_iomap_bitmap: kfree(zpci_iomap_start); @@ -859,7 +1052,6 @@ static void zpci_mem_exit(void) } static unsigned int s390_pci_probe __initdata = 1; -static unsigned int s390_pci_no_mio __initdata; unsigned int s390_pci_force_floating __initdata; static unsigned int s390_pci_initialized; @@ -870,13 +1062,17 @@ char * __init pcibios_setup(char *str) return NULL; } if (!strcmp(str, "nomio")) { - s390_pci_no_mio = 1; + S390_lowcore.machine_flags &= ~MACHINE_FLAG_PCI_MIO; return NULL; } if (!strcmp(str, "force_floating")) { s390_pci_force_floating = 1; return NULL; } + if (!strcmp(str, "norid")) { + s390_pci_no_rid = 1; + return NULL; + } return str; } @@ -892,12 +1088,14 @@ static int __init pci_base_init(void) if (!s390_pci_probe) return 0; - if (!test_facility(69) || !test_facility(71)) + if (!test_facility(69) || !test_facility(71)) { + pr_info("PCI is not supported because CPU facilities 69 or 71 are not available\n"); return 0; + } - if (test_facility(153) && !s390_pci_no_mio) { + if (MACHINE_HAS_PCI_MIO) { static_branch_enable(&have_mio); - ctl_set_bit(2, 5); + system_ctl_set_bit(2, CR2_MIO_ADDRESSING_BIT); } rc = zpci_debug_init(); @@ -912,20 +1110,15 @@ static int __init pci_base_init(void) if (rc) goto out_irq; - rc = zpci_dma_init(); - if (rc) - goto out_dma; - rc = clp_scan_pci_devices(); if (rc) goto out_find; + zpci_bus_scan_busses(); s390_pci_initialized = 1; return 0; out_find: - zpci_dma_exit(); -out_dma: zpci_irq_exit(); out_irq: zpci_mem_exit(); @@ -935,9 +1128,3 @@ out: return rc; } subsys_initcall_sync(pci_base_init); - -void zpci_rescan(void) -{ - if (zpci_is_enabled()) - clp_rescan_pci_devices_simple(); -} diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c new file mode 100644 index 000000000000..daa5d7450c7d --- /dev/null +++ b/arch/s390/pci/pci_bus.c @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2020 + * + * Author(s): + * Pierre Morel <pmorel@linux.ibm.com> + * + */ + +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/export.h> +#include <linux/delay.h> +#include <linux/seq_file.h> +#include <linux/jump_label.h> +#include <linux/pci.h> +#include <linux/printk.h> + +#include <asm/pci_clp.h> +#include <asm/pci_dma.h> + +#include "pci_bus.h" +#include "pci_iov.h" + +static LIST_HEAD(zbus_list); +static DEFINE_MUTEX(zbus_list_lock); +static int zpci_nb_devices; + +/* zpci_bus_prepare_device - Prepare a zPCI function for scanning + * @zdev: the zPCI function to be prepared + * + * The PCI resources for the function are set up and added to its zbus and the + * function is enabled. The function must be added to a zbus which must have + * a PCI bus created. If an error occurs the zPCI function is not enabled. 
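+ * + * The scan paths below use it roughly as follows (sketch, error handling + * elided): + * + *	rc = zpci_bus_prepare_device(zdev); + *	if (!rc) + *		pdev = pci_scan_single_device(zdev->zbus->bus, zdev->devfn);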
+ * + * Return: 0 on success, an error code otherwise + */ +static int zpci_bus_prepare_device(struct zpci_dev *zdev) +{ + int rc, i; + + if (!zdev_enabled(zdev)) { + rc = zpci_enable_device(zdev); + if (rc) + return rc; + } + + if (!zdev->has_resources) { + zpci_setup_bus_resources(zdev); + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (zdev->bars[i].res) + pci_bus_add_resource(zdev->zbus->bus, zdev->bars[i].res, 0); + } + } + + return 0; +} + +/* zpci_bus_scan_device - Scan a single device adding it to the PCI core + * @zdev: the zdev to be scanned + * + * Scans the PCI function making it available to the common PCI code. + * + * Return: 0 on success, an error value otherwise + */ +int zpci_bus_scan_device(struct zpci_dev *zdev) +{ + struct pci_dev *pdev; + int rc; + + rc = zpci_bus_prepare_device(zdev); + if (rc) + return rc; + + pdev = pci_scan_single_device(zdev->zbus->bus, zdev->devfn); + if (!pdev) + return -ENODEV; + + pci_lock_rescan_remove(); + pci_bus_add_device(pdev); + pci_unlock_rescan_remove(); + + return 0; +} + +/* zpci_bus_remove_device - Removes the given zdev from the PCI core + * @zdev: the zdev to be removed from the PCI core + * @set_error: if true the device's error state is set to permanent failure + * + * Sets a zPCI device to a configured but offline state; the zPCI + * device is still accessible through its hotplug slot and the zPCI + * API but is removed from the common code PCI bus, making it + * no longer available to drivers. + */ +void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error) +{ + struct zpci_bus *zbus = zdev->zbus; + struct pci_dev *pdev; + + if (!zdev->zbus->bus) + return; + + pdev = pci_get_slot(zbus->bus, zdev->devfn); + if (pdev) { + if (set_error) + pdev->error_state = pci_channel_io_perm_failure; + if (pdev->is_virtfn) { + zpci_iov_remove_virtfn(pdev, zdev->vfn); + /* balance pci_get_slot */ + pci_dev_put(pdev); + return; + } + pci_stop_and_remove_bus_device_locked(pdev); + /* balance pci_get_slot */ + pci_dev_put(pdev); + } +} + +/* zpci_bus_scan_bus - Scan all configured zPCI functions on the bus + * @zbus: the zbus to be scanned + * + * Enables and scans all PCI functions on the bus making them available to the + * common PCI code. If a PCI function fails to be initialized an error will be + * returned but attempts will still be made for all other functions on the bus. 
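+ * + * Usage sketch, mirroring zpci_bus_scan_busses() below: + * + *	mutex_lock(&zbus_list_lock); + *	list_for_each_entry(zbus, &zbus_list, bus_next) + *		zpci_bus_scan_bus(zbus); + *	mutex_unlock(&zbus_list_lock);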
+ * + * Return: 0 on success, an error value otherwise + */ +int zpci_bus_scan_bus(struct zpci_bus *zbus) +{ + struct zpci_dev *zdev; + int devfn, rc, ret = 0; + + for (devfn = 0; devfn < ZPCI_FUNCTIONS_PER_BUS; devfn++) { + zdev = zbus->function[devfn]; + if (zdev && zdev->state == ZPCI_FN_STATE_CONFIGURED) { + rc = zpci_bus_prepare_device(zdev); + if (rc) + ret = -EIO; + } + } + + pci_lock_rescan_remove(); + pci_scan_child_bus(zbus->bus); + pci_bus_add_devices(zbus->bus); + pci_unlock_rescan_remove(); + + return ret; +} + +/* zpci_bus_scan_busses - Scan all registered busses + * + * Scan all available zbusses + * + */ +void zpci_bus_scan_busses(void) +{ + struct zpci_bus *zbus = NULL; + + mutex_lock(&zbus_list_lock); + list_for_each_entry(zbus, &zbus_list, bus_next) { + zpci_bus_scan_bus(zbus); + cond_resched(); + } + mutex_unlock(&zbus_list_lock); +} + +/* zpci_bus_create_pci_bus - Create the PCI bus associated with this zbus + * @zbus: the zbus holding the zdevices + * @fr: PCI root function that will determine the bus's domain, and bus speed + * @ops: the pci operations + * + * The PCI function @fr determines the domain (its UID), multifunction property + * and maximum bus speed of the entire bus. + * + * Return: 0 on success, an error code otherwise + */ +static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, struct pci_ops *ops) +{ + struct pci_bus *bus; + int domain; + + domain = zpci_alloc_domain((u16)fr->uid); + if (domain < 0) + return domain; + + zbus->domain_nr = domain; + zbus->multifunction = fr->rid_available; + zbus->max_bus_speed = fr->max_bus_speed; + + /* + * Note that zbus->resources is taken over by pci_create_root_bus() and + * is empty after a successful call. + */ + bus = pci_create_root_bus(NULL, ZPCI_BUS_NR, ops, zbus, &zbus->resources); + if (!bus) { + zpci_free_domain(zbus->domain_nr); + return -EFAULT; + } + + zbus->bus = bus; + + return 0; +} + +static void zpci_bus_release(struct kref *kref) +{ + struct zpci_bus *zbus = container_of(kref, struct zpci_bus, kref); + + if (zbus->bus) { + pci_lock_rescan_remove(); + pci_stop_root_bus(zbus->bus); + + zpci_free_domain(zbus->domain_nr); + pci_free_resource_list(&zbus->resources); + + pci_remove_root_bus(zbus->bus); + pci_unlock_rescan_remove(); + } + + mutex_lock(&zbus_list_lock); + list_del(&zbus->bus_next); + mutex_unlock(&zbus_list_lock); + kfree(zbus); +} + +static void zpci_bus_put(struct zpci_bus *zbus) +{ + kref_put(&zbus->kref, zpci_bus_release); +} + +static struct zpci_bus *zpci_bus_get(int pchid) +{ + struct zpci_bus *zbus; + + mutex_lock(&zbus_list_lock); + list_for_each_entry(zbus, &zbus_list, bus_next) { + if (pchid == zbus->pchid) { + kref_get(&zbus->kref); + goto out_unlock; + } + } + zbus = NULL; +out_unlock: + mutex_unlock(&zbus_list_lock); + return zbus; +} + +static struct zpci_bus *zpci_bus_alloc(int pchid) +{ + struct zpci_bus *zbus; + + zbus = kzalloc(sizeof(*zbus), GFP_KERNEL); + if (!zbus) + return NULL; + + zbus->pchid = pchid; + INIT_LIST_HEAD(&zbus->bus_next); + mutex_lock(&zbus_list_lock); + list_add_tail(&zbus->bus_next, &zbus_list); + mutex_unlock(&zbus_list_lock); + + kref_init(&zbus->kref); + INIT_LIST_HEAD(&zbus->resources); + + zbus->bus_resource.start = 0; + zbus->bus_resource.end = ZPCI_BUS_NR; + zbus->bus_resource.flags = IORESOURCE_BUS; + pci_add_resource(&zbus->resources, &zbus->bus_resource); + + return zbus; +} + +void pcibios_bus_add_device(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + + /* + * With pdev->no_vf_scan the common PCI
probing code does not + * perform PF/VF linking. + */ + if (zdev->vfn) { + zpci_iov_setup_virtfn(zdev->zbus, pdev, zdev->vfn); + pdev->no_command_memory = 1; + } +} + +static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) +{ + int rc = -EINVAL; + + if (zbus->function[zdev->devfn]) { + pr_err("devfn %04x is already assigned\n", zdev->devfn); + return rc; + } + + zdev->zbus = zbus; + zbus->function[zdev->devfn] = zdev; + zpci_nb_devices++; + + if (zbus->multifunction && !zdev->rid_available) { + WARN_ONCE(1, "rid_available not set for multifunction\n"); + goto error; + } + rc = zpci_init_slot(zdev); + if (rc) + goto error; + zdev->has_hp_slot = 1; + + return 0; + +error: + zbus->function[zdev->devfn] = NULL; + zdev->zbus = NULL; + zpci_nb_devices--; + return rc; +} + +int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) +{ + struct zpci_bus *zbus = NULL; + int rc = -EBADF; + + if (zpci_nb_devices == ZPCI_NR_DEVICES) { + pr_warn("Adding PCI function %08x failed because the configured limit of %d is reached\n", + zdev->fid, ZPCI_NR_DEVICES); + return -ENOSPC; + } + + if (zdev->devfn >= ZPCI_FUNCTIONS_PER_BUS) + return -EINVAL; + + if (!s390_pci_no_rid && zdev->rid_available) + zbus = zpci_bus_get(zdev->pchid); + + if (!zbus) { + zbus = zpci_bus_alloc(zdev->pchid); + if (!zbus) + return -ENOMEM; + } + + if (!zbus->bus) { + /* The UID of the first PCI function registered with a zpci_bus + * is used as the domain number for that bus. Currently there + * is exactly one zpci_bus per domain. + */ + rc = zpci_bus_create_pci_bus(zbus, zdev, ops); + if (rc) + goto error; + } + + rc = zpci_bus_add_device(zbus, zdev); + if (rc) + goto error; + + return 0; + +error: + pr_err("Adding PCI function %08x failed\n", zdev->fid); + zpci_bus_put(zbus); + return rc; +} + +void zpci_bus_device_unregister(struct zpci_dev *zdev) +{ + struct zpci_bus *zbus = zdev->zbus; + + zpci_nb_devices--; + zbus->function[zdev->devfn] = NULL; + zpci_bus_put(zbus); +} diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h new file mode 100644 index 000000000000..af9f0ac79a1b --- /dev/null +++ b/arch/s390/pci/pci_bus.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2020 + * + * Author(s): + * Pierre Morel <pmorel@linux.ibm.com> + * + */ + +int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops); +void zpci_bus_device_unregister(struct zpci_dev *zdev); + +int zpci_bus_scan_bus(struct zpci_bus *zbus); +void zpci_bus_scan_busses(void); + +int zpci_bus_scan_device(struct zpci_dev *zdev); +void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error); + +void zpci_release_device(struct kref *kref); +static inline void zpci_zdev_put(struct zpci_dev *zdev) +{ + if (zdev) + kref_put(&zdev->kref, zpci_release_device); +} + +static inline void zpci_zdev_get(struct zpci_dev *zdev) +{ + kref_get(&zdev->kref); +} + +int zpci_alloc_domain(int domain); +void zpci_free_domain(int domain); +int zpci_setup_bus_resources(struct zpci_dev *zdev); + +static inline struct zpci_dev *zdev_from_bus(struct pci_bus *bus, + unsigned int devfn) +{ + struct zpci_bus *zbus = bus->sysdata; + + return (devfn >= ZPCI_FUNCTIONS_PER_BUS) ? 
NULL : zbus->function[devfn]; +} + diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 4c613e569fe0..ee90a91ed888 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -17,17 +17,20 @@ #include <linux/delay.h> #include <linux/pci.h> #include <linux/uaccess.h> +#include <asm/asm-extable.h> #include <asm/pci_debug.h> #include <asm/pci_clp.h> #include <asm/clp.h> #include <uapi/asm/clp.h> +#include "pci_bus.h" + bool zpci_unique_uid; -static void update_uid_checking(bool new) +void update_uid_checking(bool new) { if (zpci_unique_uid != new) - zpci_dbg(1, "uid checking:%d\n", new); + zpci_dbg(3, "uid checking:%d\n", new); zpci_unique_uid = new; } @@ -102,6 +105,9 @@ static void clp_store_query_pci_fngrp(struct zpci_dev *zdev, zdev->msi_addr = response->msia; zdev->max_msi = response->noi; zdev->fmb_update = response->mui; + zdev->version = response->version; + zdev->maxstbl = response->maxstbl; + zdev->dtsm = response->dtsm; switch (response->version) { case 1: @@ -155,13 +161,19 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, zdev->pfgid = response->pfgid; zdev->pft = response->pft; zdev->vfn = response->vfn; + zdev->port = response->port; zdev->uid = response->uid; zdev->fmb_length = sizeof(u32) * response->fmb_len; + zdev->rid_available = response->rid_avail; + zdev->is_physfn = response->is_physfn; + if (!s390_pci_no_rid && zdev->rid_available) + zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN; memcpy(zdev->pfip, response->pfip, sizeof(zdev->pfip)); if (response->util_str_avail) { memcpy(zdev->util_str, response->util_str, sizeof(zdev->util_str)); + zdev->util_str_avail = 1; } zdev->mio_capable = response->mio_addr_avail; for (i = 0; i < PCI_STD_NUM_BARS; i++) { @@ -174,7 +186,7 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, return 0; } -static int clp_query_pci_fn(struct zpci_dev *zdev, u32 fh) +int clp_query_pci_fn(struct zpci_dev *zdev) { struct clp_req_rsp_query_pci *rrb; int rc; @@ -187,7 +199,7 @@ static int clp_query_pci_fn(struct zpci_dev *zdev, u32 fh) rrb->request.hdr.len = sizeof(rrb->request); rrb->request.hdr.cmd = CLP_QUERY_PCI_FN; rrb->response.hdr.len = sizeof(rrb->response); - rrb->request.fh = fh; + rrb->request.fh = zdev->fh; rc = clp_req(rrb, CLP_LPS_PCI); if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) { @@ -205,60 +217,39 @@ out: return rc; } -int clp_add_pci_device(u32 fid, u32 fh, int configured) -{ - struct zpci_dev *zdev; - int rc = -ENOMEM; - - zpci_dbg(3, "add fid:%x, fh:%x, c:%d\n", fid, fh, configured); - zdev = kzalloc(sizeof(*zdev), GFP_KERNEL); - if (!zdev) - goto error; - - zdev->fh = fh; - zdev->fid = fid; - - /* Query function properties and update zdev */ - rc = clp_query_pci_fn(zdev, fh); - if (rc) - goto error; - - if (configured) - zdev->state = ZPCI_FN_STATE_CONFIGURED; - else - zdev->state = ZPCI_FN_STATE_STANDBY; - - rc = zpci_create_device(zdev); - if (rc) - goto error; - return 0; - -error: - zpci_dbg(0, "add fid:%x, rc:%d\n", fid, rc); - kfree(zdev); - return rc; -} - -/* - * Enable/Disable a given PCI function defined by its function handle. +/** + * clp_set_pci_fn() - Execute a command on a PCI function + * @zdev: Function that will be affected + * @fh: Out parameter for updated function handle + * @nr_dma_as: DMA address space number + * @command: The command code to execute + * + * Returns: 0 on success, < 0 for Linux errors (e.g. 
-ENOMEM), and + * > 0 for non-success platform responses */ -static int clp_set_pci_fn(u32 *fh, u8 nr_dma_as, u8 command) +static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 command) { struct clp_req_rsp_set_pci *rrb; int rc, retries = 100; + u32 gisa = 0; + *fh = 0; rrb = clp_alloc_block(GFP_KERNEL); if (!rrb) return -ENOMEM; + if (command != CLP_SET_DISABLE_PCI_FN) + gisa = zdev->gisa; + do { memset(rrb, 0, sizeof(*rrb)); rrb->request.hdr.len = sizeof(rrb->request); rrb->request.hdr.cmd = CLP_SET_PCI_FN; rrb->response.hdr.len = sizeof(rrb->response); - rrb->request.fh = *fh; + rrb->request.fh = zdev->fh; rrb->request.oc = command; rrb->request.ndas = nr_dma_as; + rrb->request.gisa = gisa; rc = clp_req(rrb, CLP_LPS_PCI); if (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY) { @@ -269,50 +260,107 @@ static int clp_set_pci_fn(u32 *fh, u8 nr_dma_as, u8 command) } } while (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY); - if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) + if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) { *fh = rrb->response.fh; - else { + } else { zpci_err("Set PCI FN:\n"); zpci_err_clp(rrb->response.hdr.rsp, rc); - rc = -EIO; + if (!rc) + rc = rrb->response.hdr.rsp; } clp_free_block(rrb); return rc; } -int clp_enable_fh(struct zpci_dev *zdev, u8 nr_dma_as) +int clp_setup_writeback_mio(void) { - u32 fh = zdev->fh; + struct clp_req_rsp_slpc_pci *rrb; + u8 wb_bit_pos; int rc; - rc = clp_set_pci_fn(&fh, nr_dma_as, CLP_SET_ENABLE_PCI_FN); - zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); - if (rc) - goto out; + rrb = clp_alloc_block(GFP_KERNEL); + if (!rrb) + return -ENOMEM; + + memset(rrb, 0, sizeof(*rrb)); + rrb->request.hdr.len = sizeof(rrb->request); + rrb->request.hdr.cmd = CLP_SLPC; + rrb->response.hdr.len = sizeof(rrb->response); + + rc = clp_req(rrb, CLP_LPS_PCI); + if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) { + if (rrb->response.vwb) { + wb_bit_pos = rrb->response.mio_wb; + set_bit_inv(wb_bit_pos, &mio_wb_bit_mask); + zpci_dbg(3, "wb bit: %d\n", wb_bit_pos); + } else { + zpci_dbg(3, "wb bit: n.a.\n"); + } + + } else { + zpci_err("SLPC PCI:\n"); + zpci_err_clp(rrb->response.hdr.rsp, rc); + rc = -EIO; + } + clp_free_block(rrb); + return rc; +} + +int clp_enable_fh(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as) +{ + int rc; - zdev->fh = fh; - if (zpci_use_mio(zdev)) { - rc = clp_set_pci_fn(&fh, nr_dma_as, CLP_SET_ENABLE_MIO); - zpci_dbg(3, "ena mio fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); + rc = clp_set_pci_fn(zdev, fh, nr_dma_as, CLP_SET_ENABLE_PCI_FN); + zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, *fh, rc); + if (!rc && zpci_use_mio(zdev)) { + rc = clp_set_pci_fn(zdev, fh, nr_dma_as, CLP_SET_ENABLE_MIO); + zpci_dbg(3, "ena mio fid:%x, fh:%x, rc:%d\n", + zdev->fid, *fh, rc); if (rc) - clp_disable_fh(zdev); + clp_disable_fh(zdev, fh); } -out: return rc; } -int clp_disable_fh(struct zpci_dev *zdev) +int clp_disable_fh(struct zpci_dev *zdev, u32 *fh) { - u32 fh = zdev->fh; int rc; if (!zdev_enabled(zdev)) return 0; - rc = clp_set_pci_fn(&fh, 0, CLP_SET_DISABLE_PCI_FN); - zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); - if (!rc) - zdev->fh = fh; + rc = clp_set_pci_fn(zdev, fh, 0, CLP_SET_DISABLE_PCI_FN); + zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, *fh, rc); + return rc; +} + +static int clp_list_pci_req(struct clp_req_rsp_list_pci *rrb, + u64 *resume_token, int *nentries) +{ + int rc; + + memset(rrb, 0, sizeof(*rrb)); + rrb->request.hdr.len = sizeof(rrb->request); + rrb->request.hdr.cmd = CLP_LIST_PCI; + 
/* store as many entries as possible */ + rrb->response.hdr.len = CLP_BLK_SIZE - LIST_PCI_HDR_LEN; + rrb->request.resume_token = *resume_token; + + /* Get PCI function handle list */ + rc = clp_req(rrb, CLP_LPS_PCI); + if (rc || rrb->response.hdr.rsp != CLP_RC_OK) { + zpci_err("List PCI FN:\n"); + zpci_err_clp(rrb->response.hdr.rsp, rc); + return -EIO; + } + + update_uid_checking(rrb->response.uid_checking); + WARN_ON_ONCE(rrb->response.entry_size != + sizeof(struct clp_fh_list_entry)); + + *nentries = (rrb->response.hdr.len - LIST_PCI_HDR_LEN) / + rrb->response.entry_size; + *resume_token = rrb->response.resume_token; return rc; } @@ -321,53 +369,43 @@ static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, void *data, void (*cb)(struct clp_fh_list_entry *, void *)) { u64 resume_token = 0; - int entries, i, rc; + int nentries, i, rc; do { - memset(rrb, 0, sizeof(*rrb)); - rrb->request.hdr.len = sizeof(rrb->request); - rrb->request.hdr.cmd = CLP_LIST_PCI; - /* store as many entries as possible */ - rrb->response.hdr.len = CLP_BLK_SIZE - LIST_PCI_HDR_LEN; - rrb->request.resume_token = resume_token; - - /* Get PCI function handle list */ - rc = clp_req(rrb, CLP_LPS_PCI); - if (rc || rrb->response.hdr.rsp != CLP_RC_OK) { - zpci_err("List PCI FN:\n"); - zpci_err_clp(rrb->response.hdr.rsp, rc); - rc = -EIO; - goto out; - } - - update_uid_checking(rrb->response.uid_checking); - WARN_ON_ONCE(rrb->response.entry_size != - sizeof(struct clp_fh_list_entry)); - - entries = (rrb->response.hdr.len - LIST_PCI_HDR_LEN) / - rrb->response.entry_size; - - resume_token = rrb->response.resume_token; - for (i = 0; i < entries; i++) + rc = clp_list_pci_req(rrb, &resume_token, &nentries); + if (rc) + return rc; + for (i = 0; i < nentries; i++) cb(&rrb->response.fh_list[i], data); } while (resume_token); -out: + return rc; } -static void __clp_add(struct clp_fh_list_entry *entry, void *data) +static int clp_find_pci(struct clp_req_rsp_list_pci *rrb, u32 fid, + struct clp_fh_list_entry *entry) { - struct zpci_dev *zdev; + struct clp_fh_list_entry *fh_list; + u64 resume_token = 0; + int nentries, i, rc; - if (!entry->vendor_id) - return; + do { + rc = clp_list_pci_req(rrb, &resume_token, &nentries); + if (rc) + return rc; + fh_list = rrb->response.fh_list; + for (i = 0; i < nentries; i++) { + if (fh_list[i].fid == fid) { + *entry = fh_list[i]; + return 0; + } + } + } while (resume_token); - zdev = get_zdev_by_fid(entry->fid); - if (!zdev) - clp_add_pci_device(entry->fid, entry->fh, entry->config_state); + return -ENODEV; } -static void __clp_update(struct clp_fh_list_entry *entry, void *data) +static void __clp_add(struct clp_fh_list_entry *entry, void *data) { struct zpci_dev *zdev; @@ -375,10 +413,11 @@ static void __clp_update(struct clp_fh_list_entry *entry, void *data) return; zdev = get_zdev_by_fid(entry->fid); - if (!zdev) + if (zdev) { + zpci_zdev_put(zdev); return; - - zdev->fh = entry->fh; + } + zpci_create_device(entry->fid, entry->fh, entry->config_state); } int clp_scan_pci_devices(void) @@ -396,66 +435,44 @@ int clp_scan_pci_devices(void) return rc; } -int clp_rescan_pci_devices(void) -{ - struct clp_req_rsp_list_pci *rrb; - int rc; - - zpci_remove_reserved_devices(); - - rrb = clp_alloc_block(GFP_KERNEL); - if (!rrb) - return -ENOMEM; - - rc = clp_list_pci(rrb, NULL, __clp_add); - - clp_free_block(rrb); - return rc; -} - -int clp_rescan_pci_devices_simple(void) +/* + * Get the current function handle of the function matching @fid + */ +int clp_refresh_fh(u32 fid, u32 *fh) { struct 
clp_req_rsp_list_pci *rrb; + struct clp_fh_list_entry entry; int rc; rrb = clp_alloc_block(GFP_NOWAIT); if (!rrb) return -ENOMEM; - rc = clp_list_pci(rrb, NULL, __clp_update); + rc = clp_find_pci(rrb, fid, &entry); + if (!rc) + *fh = entry.fh; clp_free_block(rrb); return rc; } -struct clp_state_data { - u32 fid; - enum zpci_state state; -}; - -static void __clp_get_state(struct clp_fh_list_entry *entry, void *data) -{ - struct clp_state_data *sd = data; - - if (entry->fid != sd->fid) - return; - - sd->state = entry->config_state; -} - int clp_get_state(u32 fid, enum zpci_state *state) { struct clp_req_rsp_list_pci *rrb; - struct clp_state_data sd = {fid, ZPCI_FN_STATE_RESERVED}; + struct clp_fh_list_entry entry; int rc; rrb = clp_alloc_block(GFP_ATOMIC); if (!rrb) return -ENOMEM; - rc = clp_list_pci(rrb, &sd, __clp_get_state); - if (!rc) - *state = sd.state; + rc = clp_find_pci(rrb, fid, &entry); + if (!rc) { + *state = entry.config_state; + } else if (rc == -ENODEV) { + *state = ZPCI_FN_STATE_RESERVED; + rc = 0; + } clp_free_block(rrb); return rc; @@ -481,7 +498,7 @@ static int clp_base_command(struct clp_req *req, struct clp_req_hdr *lpcb) } } -static int clp_pci_slpc(struct clp_req *req, struct clp_req_rsp_slpc *lpcb) +static int clp_pci_slpc(struct clp_req *req, struct clp_req_rsp_slpc_pci *lpcb) { unsigned long limit = PAGE_SIZE - sizeof(lpcb->request); @@ -649,9 +666,4 @@ static struct miscdevice clp_misc_device = { .fops = &clp_misc_fops, }; -static int __init clp_misc_init(void) -{ - return misc_register(&clp_misc_device); -} - -device_initcall(clp_misc_init); +builtin_misc_device(clp_misc_device); diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index 3408c0df3ebf..6dde2263c79d 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -53,9 +53,11 @@ static char *pci_fmt3_names[] = { }; static char *pci_sw_names[] = { - "Allocated pages", "Mapped pages", "Unmapped pages", + "Global RPCITs", + "Sync Map RPCITs", + "Sync RPCITs", }; static void pci_fmb_show(struct seq_file *m, char *name[], int length, @@ -69,10 +71,14 @@ static void pci_fmb_show(struct seq_file *m, char *name[], int length, static void pci_sw_counter_show(struct seq_file *m) { - struct zpci_dev *zdev = m->private; - atomic64_t *counter = &zdev->allocated_pages; + struct zpci_iommu_ctrs *ctrs = zpci_get_iommu_ctrs(m->private); + atomic64_t *counter; int i; + if (!ctrs) + return; + + counter = &ctrs->mapped_pages; for (i = 0; i < ARRAY_SIZE(pci_sw_names); i++, counter++) seq_printf(m, "%26s:\t%llu\n", pci_sw_names[i], atomic64_read(counter)); @@ -196,7 +202,7 @@ int __init zpci_debug_init(void) if (!pci_debug_err_id) return -EINVAL; debug_register_view(pci_debug_err_id, &debug_hex_ascii_view); - debug_set_level(pci_debug_err_id, 6); + debug_set_level(pci_debug_err_id, 3); debugfs_root = debugfs_create_dir("pci", NULL); return 0; diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c deleted file mode 100644 index 64b1399a73f0..000000000000 --- a/arch/s390/pci/pci_dma.c +++ /dev/null @@ -1,684 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright IBM Corp. 
2012 - * - * Author(s): - * Jan Glauber <jang@linux.vnet.ibm.com> - */ - -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <linux/iommu-helper.h> -#include <linux/dma-mapping.h> -#include <linux/vmalloc.h> -#include <linux/pci.h> -#include <asm/pci_dma.h> - -static struct kmem_cache *dma_region_table_cache; -static struct kmem_cache *dma_page_table_cache; -static int s390_iommu_strict; - -static int zpci_refresh_global(struct zpci_dev *zdev) -{ - return zpci_refresh_trans((u64) zdev->fh << 32, zdev->start_dma, - zdev->iommu_pages * PAGE_SIZE); -} - -unsigned long *dma_alloc_cpu_table(void) -{ - unsigned long *table, *entry; - - table = kmem_cache_alloc(dma_region_table_cache, GFP_ATOMIC); - if (!table) - return NULL; - - for (entry = table; entry < table + ZPCI_TABLE_ENTRIES; entry++) - *entry = ZPCI_TABLE_INVALID; - return table; -} - -static void dma_free_cpu_table(void *table) -{ - kmem_cache_free(dma_region_table_cache, table); -} - -static unsigned long *dma_alloc_page_table(void) -{ - unsigned long *table, *entry; - - table = kmem_cache_alloc(dma_page_table_cache, GFP_ATOMIC); - if (!table) - return NULL; - - for (entry = table; entry < table + ZPCI_PT_ENTRIES; entry++) - *entry = ZPCI_PTE_INVALID; - return table; -} - -static void dma_free_page_table(void *table) -{ - kmem_cache_free(dma_page_table_cache, table); -} - -static unsigned long *dma_get_seg_table_origin(unsigned long *entry) -{ - unsigned long *sto; - - if (reg_entry_isvalid(*entry)) - sto = get_rt_sto(*entry); - else { - sto = dma_alloc_cpu_table(); - if (!sto) - return NULL; - - set_rt_sto(entry, sto); - validate_rt_entry(entry); - entry_clr_protected(entry); - } - return sto; -} - -static unsigned long *dma_get_page_table_origin(unsigned long *entry) -{ - unsigned long *pto; - - if (reg_entry_isvalid(*entry)) - pto = get_st_pto(*entry); - else { - pto = dma_alloc_page_table(); - if (!pto) - return NULL; - set_st_pto(entry, pto); - validate_st_entry(entry); - entry_clr_protected(entry); - } - return pto; -} - -unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr) -{ - unsigned long *sto, *pto; - unsigned int rtx, sx, px; - - rtx = calc_rtx(dma_addr); - sto = dma_get_seg_table_origin(&rto[rtx]); - if (!sto) - return NULL; - - sx = calc_sx(dma_addr); - pto = dma_get_page_table_origin(&sto[sx]); - if (!pto) - return NULL; - - px = calc_px(dma_addr); - return &pto[px]; -} - -void dma_update_cpu_trans(unsigned long *entry, void *page_addr, int flags) -{ - if (flags & ZPCI_PTE_INVALID) { - invalidate_pt_entry(entry); - } else { - set_pt_pfaa(entry, page_addr); - validate_pt_entry(entry); - } - - if (flags & ZPCI_TABLE_PROTECTED) - entry_set_protected(entry); - else - entry_clr_protected(entry); -} - -static int __dma_update_trans(struct zpci_dev *zdev, unsigned long pa, - dma_addr_t dma_addr, size_t size, int flags) -{ - unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - u8 *page_addr = (u8 *) (pa & PAGE_MASK); - unsigned long irq_flags; - unsigned long *entry; - int i, rc = 0; - - if (!nr_pages) - return -EINVAL; - - spin_lock_irqsave(&zdev->dma_table_lock, irq_flags); - if (!zdev->dma_table) { - rc = -EINVAL; - goto out_unlock; - } - - for (i = 0; i < nr_pages; i++) { - entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr); - if (!entry) { - rc = -ENOMEM; - goto undo_cpu_trans; - } - dma_update_cpu_trans(entry, page_addr, flags); - page_addr += PAGE_SIZE; - dma_addr += PAGE_SIZE; - } - -undo_cpu_trans: - if (rc && ((flags & ZPCI_PTE_VALID_MASK) == 
ZPCI_PTE_VALID)) { - flags = ZPCI_PTE_INVALID; - while (i-- > 0) { - page_addr -= PAGE_SIZE; - dma_addr -= PAGE_SIZE; - entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr); - if (!entry) - break; - dma_update_cpu_trans(entry, page_addr, flags); - } - } -out_unlock: - spin_unlock_irqrestore(&zdev->dma_table_lock, irq_flags); - return rc; -} - -static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr, - size_t size, int flags) -{ - unsigned long irqflags; - int ret; - - /* - * With zdev->tlb_refresh == 0, rpcit is not required to establish new - * translations when previously invalid translation-table entries are - * validated. With lazy unmap, rpcit is skipped for previously valid - * entries, but a global rpcit is then required before any address can - * be re-used, i.e. after each iommu bitmap wrap-around. - */ - if ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID) { - if (!zdev->tlb_refresh) - return 0; - } else { - if (!s390_iommu_strict) - return 0; - } - - ret = zpci_refresh_trans((u64) zdev->fh << 32, dma_addr, - PAGE_ALIGN(size)); - if (ret == -ENOMEM && !s390_iommu_strict) { - /* enable the hypervisor to free some resources */ - if (zpci_refresh_global(zdev)) - goto out; - - spin_lock_irqsave(&zdev->iommu_bitmap_lock, irqflags); - bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap, - zdev->lazy_bitmap, zdev->iommu_pages); - bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages); - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, irqflags); - ret = 0; - } -out: - return ret; -} - -static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa, - dma_addr_t dma_addr, size_t size, int flags) -{ - int rc; - - rc = __dma_update_trans(zdev, pa, dma_addr, size, flags); - if (rc) - return rc; - - rc = __dma_purge_tlb(zdev, dma_addr, size, flags); - if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) - __dma_update_trans(zdev, pa, dma_addr, size, ZPCI_PTE_INVALID); - - return rc; -} - -void dma_free_seg_table(unsigned long entry) -{ - unsigned long *sto = get_rt_sto(entry); - int sx; - - for (sx = 0; sx < ZPCI_TABLE_ENTRIES; sx++) - if (reg_entry_isvalid(sto[sx])) - dma_free_page_table(get_st_pto(sto[sx])); - - dma_free_cpu_table(sto); -} - -void dma_cleanup_tables(unsigned long *table) -{ - int rtx; - - if (!table) - return; - - for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++) - if (reg_entry_isvalid(table[rtx])) - dma_free_seg_table(table[rtx]); - - dma_free_cpu_table(table); -} - -static unsigned long __dma_alloc_iommu(struct device *dev, - unsigned long start, int size) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long boundary_size; - - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - return iommu_area_alloc(zdev->iommu_bitmap, zdev->iommu_pages, - start, size, zdev->start_dma >> PAGE_SHIFT, - boundary_size, 0); -} - -static dma_addr_t dma_alloc_address(struct device *dev, int size) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long offset, flags; - - spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags); - offset = __dma_alloc_iommu(dev, zdev->next_bit, size); - if (offset == -1) { - if (!s390_iommu_strict) { - /* global flush before DMA addresses are reused */ - if (zpci_refresh_global(zdev)) - goto out_error; - - bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap, - zdev->lazy_bitmap, zdev->iommu_pages); - bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages); - } - /* wrap-around */ - offset = __dma_alloc_iommu(dev, 0, size); - if (offset == -1) - goto out_error; - } - zdev->next_bit = 
offset + size; - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); - - return zdev->start_dma + offset * PAGE_SIZE; - -out_error: - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); - return DMA_MAPPING_ERROR; -} - -static void dma_free_address(struct device *dev, dma_addr_t dma_addr, int size) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long flags, offset; - - offset = (dma_addr - zdev->start_dma) >> PAGE_SHIFT; - - spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags); - if (!zdev->iommu_bitmap) - goto out; - - if (s390_iommu_strict) - bitmap_clear(zdev->iommu_bitmap, offset, size); - else - bitmap_set(zdev->lazy_bitmap, offset, size); - -out: - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); -} - -static inline void zpci_err_dma(unsigned long rc, unsigned long addr) -{ - struct { - unsigned long rc; - unsigned long addr; - } __packed data = {rc, addr}; - - zpci_err_hex(&data, sizeof(data)); -} - -static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long pa = page_to_phys(page) + offset; - int flags = ZPCI_PTE_VALID; - unsigned long nr_pages; - dma_addr_t dma_addr; - int ret; - - /* This rounds up number of pages based on size and offset */ - nr_pages = iommu_num_pages(pa, size, PAGE_SIZE); - dma_addr = dma_alloc_address(dev, nr_pages); - if (dma_addr == DMA_MAPPING_ERROR) { - ret = -ENOSPC; - goto out_err; - } - - /* Use rounded up size */ - size = nr_pages * PAGE_SIZE; - - if (direction == DMA_NONE || direction == DMA_TO_DEVICE) - flags |= ZPCI_TABLE_PROTECTED; - - ret = dma_update_trans(zdev, pa, dma_addr, size, flags); - if (ret) - goto out_free; - - atomic64_add(nr_pages, &zdev->mapped_pages); - return dma_addr + (offset & ~PAGE_MASK); - -out_free: - dma_free_address(dev, dma_addr, nr_pages); -out_err: - zpci_err("map error:\n"); - zpci_err_dma(ret, pa); - return DMA_MAPPING_ERROR; -} - -static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction direction, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - int npages, ret; - - npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); - dma_addr = dma_addr & PAGE_MASK; - ret = dma_update_trans(zdev, 0, dma_addr, npages * PAGE_SIZE, - ZPCI_PTE_INVALID); - if (ret) { - zpci_err("unmap error:\n"); - zpci_err_dma(ret, dma_addr); - return; - } - - atomic64_add(npages, &zdev->unmapped_pages); - dma_free_address(dev, dma_addr, npages); -} - -static void *s390_dma_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - struct page *page; - unsigned long pa; - dma_addr_t map; - - size = PAGE_ALIGN(size); - page = alloc_pages(flag | __GFP_ZERO, get_order(size)); - if (!page) - return NULL; - - pa = page_to_phys(page); - map = s390_dma_map_pages(dev, page, 0, size, DMA_BIDIRECTIONAL, 0); - if (dma_mapping_error(dev, map)) { - free_pages(pa, get_order(size)); - return NULL; - } - - atomic64_add(size / PAGE_SIZE, &zdev->allocated_pages); - if (dma_handle) - *dma_handle = map; - return (void *) pa; -} - -static void s390_dma_free(struct device *dev, size_t size, - void *pa, dma_addr_t dma_handle, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - - size = PAGE_ALIGN(size); - atomic64_sub(size / PAGE_SIZE, 
&zdev->allocated_pages); - s390_dma_unmap_pages(dev, dma_handle, size, DMA_BIDIRECTIONAL, 0); - free_pages((unsigned long) pa, get_order(size)); -} - -/* Map a segment into a contiguous dma address area */ -static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg, - size_t size, dma_addr_t *handle, - enum dma_data_direction dir) -{ - unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - dma_addr_t dma_addr_base, dma_addr; - int flags = ZPCI_PTE_VALID; - struct scatterlist *s; - unsigned long pa = 0; - int ret; - - dma_addr_base = dma_alloc_address(dev, nr_pages); - if (dma_addr_base == DMA_MAPPING_ERROR) - return -ENOMEM; - - dma_addr = dma_addr_base; - if (dir == DMA_NONE || dir == DMA_TO_DEVICE) - flags |= ZPCI_TABLE_PROTECTED; - - for (s = sg; dma_addr < dma_addr_base + size; s = sg_next(s)) { - pa = page_to_phys(sg_page(s)); - ret = __dma_update_trans(zdev, pa, dma_addr, - s->offset + s->length, flags); - if (ret) - goto unmap; - - dma_addr += s->offset + s->length; - } - ret = __dma_purge_tlb(zdev, dma_addr_base, size, flags); - if (ret) - goto unmap; - - *handle = dma_addr_base; - atomic64_add(nr_pages, &zdev->mapped_pages); - - return ret; - -unmap: - dma_update_trans(zdev, 0, dma_addr_base, dma_addr - dma_addr_base, - ZPCI_PTE_INVALID); - dma_free_address(dev, dma_addr_base, nr_pages); - zpci_err("map error:\n"); - zpci_err_dma(ret, pa); - return ret; -} - -static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nr_elements, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *s = sg, *start = sg, *dma = sg; - unsigned int max = dma_get_max_seg_size(dev); - unsigned int size = s->offset + s->length; - unsigned int offset = s->offset; - int count = 0, i; - - for (i = 1; i < nr_elements; i++) { - s = sg_next(s); - - s->dma_address = DMA_MAPPING_ERROR; - s->dma_length = 0; - - if (s->offset || (size & ~PAGE_MASK) || - size + s->length > max) { - if (__s390_dma_map_sg(dev, start, size, - &dma->dma_address, dir)) - goto unmap; - - dma->dma_address += offset; - dma->dma_length = size - offset; - - size = offset = s->offset; - start = s; - dma = sg_next(dma); - count++; - } - size += s->length; - } - if (__s390_dma_map_sg(dev, start, size, &dma->dma_address, dir)) - goto unmap; - - dma->dma_address += offset; - dma->dma_length = size - offset; - - return count + 1; -unmap: - for_each_sg(sg, s, count, i) - s390_dma_unmap_pages(dev, sg_dma_address(s), sg_dma_len(s), - dir, attrs); - - return 0; -} - -static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nr_elements, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *s; - int i; - - for_each_sg(sg, s, nr_elements, i) { - if (s->dma_length) - s390_dma_unmap_pages(dev, s->dma_address, s->dma_length, - dir, attrs); - s->dma_address = 0; - s->dma_length = 0; - } -} - -int zpci_dma_init_device(struct zpci_dev *zdev) -{ - int rc; - - /* - * At this point, if the device is part of an IOMMU domain, this would - * be a strong hint towards a bug in the IOMMU API (common) code and/or - * simultaneous access via IOMMU and DMA API. So let's issue a warning. 
- */ - WARN_ON(zdev->s390_domain); - - spin_lock_init(&zdev->iommu_bitmap_lock); - spin_lock_init(&zdev->dma_table_lock); - - zdev->dma_table = dma_alloc_cpu_table(); - if (!zdev->dma_table) { - rc = -ENOMEM; - goto out; - } - - /* - * Restrict the iommu bitmap size to the minimum of the following: - * - main memory size - * - 3-level pagetable address limit minus start_dma offset - * - DMA address range allowed by the hardware (clp query pci fn) - * - * Also set zdev->end_dma to the actual end address of the usable - * range, instead of the theoretical maximum as reported by hardware. - */ - zdev->start_dma = PAGE_ALIGN(zdev->start_dma); - zdev->iommu_size = min3((u64) high_memory, - ZPCI_TABLE_SIZE_RT - zdev->start_dma, - zdev->end_dma - zdev->start_dma + 1); - zdev->end_dma = zdev->start_dma + zdev->iommu_size - 1; - zdev->iommu_pages = zdev->iommu_size >> PAGE_SHIFT; - zdev->iommu_bitmap = vzalloc(zdev->iommu_pages / 8); - if (!zdev->iommu_bitmap) { - rc = -ENOMEM; - goto free_dma_table; - } - if (!s390_iommu_strict) { - zdev->lazy_bitmap = vzalloc(zdev->iommu_pages / 8); - if (!zdev->lazy_bitmap) { - rc = -ENOMEM; - goto free_bitmap; - } - - } - rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - (u64) zdev->dma_table); - if (rc) - goto free_bitmap; - - return 0; -free_bitmap: - vfree(zdev->iommu_bitmap); - zdev->iommu_bitmap = NULL; - vfree(zdev->lazy_bitmap); - zdev->lazy_bitmap = NULL; -free_dma_table: - dma_free_cpu_table(zdev->dma_table); - zdev->dma_table = NULL; -out: - return rc; -} - -void zpci_dma_exit_device(struct zpci_dev *zdev) -{ - /* - * At this point, if the device is part of an IOMMU domain, this would - * be a strong hint towards a bug in the IOMMU API (common) code and/or - * simultaneous access via IOMMU and DMA API. So let's issue a warning. 
- */ - WARN_ON(zdev->s390_domain); - - if (zpci_unregister_ioat(zdev, 0)) - return; - - dma_cleanup_tables(zdev->dma_table); - zdev->dma_table = NULL; - vfree(zdev->iommu_bitmap); - zdev->iommu_bitmap = NULL; - vfree(zdev->lazy_bitmap); - zdev->lazy_bitmap = NULL; - - zdev->next_bit = 0; -} - -static int __init dma_alloc_cpu_table_caches(void) -{ - dma_region_table_cache = kmem_cache_create("PCI_DMA_region_tables", - ZPCI_TABLE_SIZE, ZPCI_TABLE_ALIGN, - 0, NULL); - if (!dma_region_table_cache) - return -ENOMEM; - - dma_page_table_cache = kmem_cache_create("PCI_DMA_page_tables", - ZPCI_PT_SIZE, ZPCI_PT_ALIGN, - 0, NULL); - if (!dma_page_table_cache) { - kmem_cache_destroy(dma_region_table_cache); - return -ENOMEM; - } - return 0; -} - -int __init zpci_dma_init(void) -{ - return dma_alloc_cpu_table_caches(); -} - -void zpci_dma_exit(void) -{ - kmem_cache_destroy(dma_page_table_cache); - kmem_cache_destroy(dma_region_table_cache); -} - -const struct dma_map_ops s390_pci_dma_ops = { - .alloc = s390_dma_alloc, - .free = s390_dma_free, - .map_sg = s390_dma_map_sg, - .unmap_sg = s390_dma_unmap_sg, - .map_page = s390_dma_map_pages, - .unmap_page = s390_dma_unmap_pages, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, - /* dma_supported is unconditionally true without a callback */ -}; -EXPORT_SYMBOL_GPL(s390_pci_dma_ops); - -static int __init s390_iommu_setup(char *str) -{ - if (!strcmp(str, "strict")) - s390_iommu_strict = 1; - return 1; -} - -__setup("s390_iommu=", s390_iommu_setup); diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index 8d6ee4af4230..4d9773ef9e0a 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -12,8 +12,11 @@ #include <linux/kernel.h> #include <linux/pci.h> #include <asm/pci_debug.h> +#include <asm/pci_dma.h> #include <asm/sclp.h> +#include "pci_bus.h" + /* Content Code Description for PCI Function Error */ struct zpci_ccdf_err { u32 reserved1; @@ -44,25 +47,254 @@ struct zpci_ccdf_avail { u16 pec; /* PCI event code */ } __packed; +static inline bool ers_result_indicates_abort(pci_ers_result_t ers_res) +{ + switch (ers_res) { + case PCI_ERS_RESULT_CAN_RECOVER: + case PCI_ERS_RESULT_RECOVERED: + case PCI_ERS_RESULT_NEED_RESET: + return false; + default: + return true; + } +} + +static bool is_passed_through(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + bool ret; + + mutex_lock(&zdev->kzdev_lock); + ret = !!zdev->kzdev; + mutex_unlock(&zdev->kzdev_lock); + + return ret; +} + +static bool is_driver_supported(struct pci_driver *driver) +{ + if (!driver || !driver->err_handler) + return false; + if (!driver->err_handler->error_detected) + return false; + if (!driver->err_handler->slot_reset) + return false; + if (!driver->err_handler->resume) + return false; + return true; +} + +static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev, + struct pci_driver *driver) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + + ers_res = driver->err_handler->error_detected(pdev, pdev->error_state); + if (ers_result_indicates_abort(ers_res)) + pr_info("%s: Automatic recovery failed after initial reporting\n", pci_name(pdev)); + else if (ers_res == PCI_ERS_RESULT_NEED_RESET) + pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev)); + + return ers_res; +} + +static pci_ers_result_t zpci_event_do_error_state_clear(struct pci_dev *pdev, + struct pci_driver *driver) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + struct zpci_dev *zdev = to_zpci(pdev); + int rc; + + 
pr_info("%s: Unblocking device access for examination\n", pci_name(pdev)); + rc = zpci_reset_load_store_blocked(zdev); + if (rc) { + pr_err("%s: Unblocking device access failed\n", pci_name(pdev)); + /* Let's try a full reset instead */ + return PCI_ERS_RESULT_NEED_RESET; + } + + if (driver->err_handler->mmio_enabled) { + ers_res = driver->err_handler->mmio_enabled(pdev); + if (ers_result_indicates_abort(ers_res)) { + pr_info("%s: Automatic recovery failed after MMIO re-enable\n", + pci_name(pdev)); + return ers_res; + } else if (ers_res == PCI_ERS_RESULT_NEED_RESET) { + pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev)); + return ers_res; + } + } + + pr_debug("%s: Unblocking DMA\n", pci_name(pdev)); + rc = zpci_clear_error_state(zdev); + if (!rc) { + pdev->error_state = pci_channel_io_normal; + } else { + pr_err("%s: Unblocking DMA failed\n", pci_name(pdev)); + /* Let's try a full reset instead */ + return PCI_ERS_RESULT_NEED_RESET; + } + + return ers_res; +} + +static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev, + struct pci_driver *driver) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + + pr_info("%s: Initiating reset\n", pci_name(pdev)); + if (zpci_hot_reset_device(to_zpci(pdev))) { + pr_err("%s: The reset request failed\n", pci_name(pdev)); + return ers_res; + } + pdev->error_state = pci_channel_io_normal; + ers_res = driver->err_handler->slot_reset(pdev); + if (ers_result_indicates_abort(ers_res)) { + pr_info("%s: Automatic recovery failed after slot reset\n", pci_name(pdev)); + return ers_res; + } + + return ers_res; +} + +/* zpci_event_attempt_error_recovery - Try to recover the given PCI function + * @pdev: PCI function to recover currently in the error state + * + * We follow the scheme outlined in Documentation/PCI/pci-error-recovery.rst. + * With the simplification that recovery always happens per function + * and the platform determines which functions are affected for + * multi-function devices. + */ +static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + struct pci_driver *driver; + + /* + * Ensure that the PCI function is not removed concurrently, no driver + * is unbound or probed and that userspace can't access its + * configuration space while we perform recovery. 
+ */ + pci_dev_lock(pdev); + if (pdev->error_state == pci_channel_io_perm_failure) { + ers_res = PCI_ERS_RESULT_DISCONNECT; + goto out_unlock; + } + pdev->error_state = pci_channel_io_frozen; + + if (is_passed_through(pdev)) { + pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n", + pci_name(pdev)); + goto out_unlock; + } + + driver = to_pci_driver(pdev->dev.driver); + if (!is_driver_supported(driver)) { + if (!driver) + pr_info("%s: Cannot be recovered because no driver is bound to the device\n", + pci_name(pdev)); + else + pr_info("%s: The %s driver bound to the device does not support error recovery\n", + pci_name(pdev), + driver->name); + goto out_unlock; + } + + ers_res = zpci_event_notify_error_detected(pdev, driver); + if (ers_result_indicates_abort(ers_res)) + goto out_unlock; + + if (ers_res == PCI_ERS_RESULT_CAN_RECOVER) { + ers_res = zpci_event_do_error_state_clear(pdev, driver); + if (ers_result_indicates_abort(ers_res)) + goto out_unlock; + } + + if (ers_res == PCI_ERS_RESULT_NEED_RESET) + ers_res = zpci_event_do_reset(pdev, driver); + + if (ers_res != PCI_ERS_RESULT_RECOVERED) { + pr_err("%s: Automatic recovery failed; operator intervention is required\n", + pci_name(pdev)); + goto out_unlock; + } + + pr_info("%s: The device is ready to resume operations\n", pci_name(pdev)); + if (driver->err_handler->resume) + driver->err_handler->resume(pdev); +out_unlock: + pci_dev_unlock(pdev); + + return ers_res; +} + +/* zpci_event_io_failure - Report PCI channel failure state to driver + * @pdev: PCI function for which to report + * @es: PCI channel failure state to report + */ +static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es) +{ + struct pci_driver *driver; + + pci_dev_lock(pdev); + pdev->error_state = es; + /** + * While vfio-pci's error_detected callback notifies user-space QEMU + * reacts to this by freezing the guest. In an s390 environment PCI + * errors are rarely fatal so this is overkill. Instead in the future + * we will inject the error event and let the guest recover the device + * itself. + */ + if (is_passed_through(pdev)) + goto out; + driver = to_pci_driver(pdev->dev.driver); + if (driver && driver->err_handler && driver->err_handler->error_detected) + driver->err_handler->error_detected(pdev, pdev->error_state); +out: + pci_dev_unlock(pdev); +} + static void __zpci_event_error(struct zpci_ccdf_err *ccdf) { struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); struct pci_dev *pdev = NULL; + pci_ers_result_t ers_res; + zpci_dbg(3, "err fid:%x, fh:%x, pec:%x\n", + ccdf->fid, ccdf->fh, ccdf->pec); zpci_err("error CCDF:\n"); zpci_err_hex(ccdf, sizeof(*ccdf)); - if (zdev) - pdev = pci_get_slot(zdev->bus, ZPCI_DEVFN); + if (zdev) { + zpci_update_fh(zdev, ccdf->fh); + if (zdev->zbus->bus) + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn); + } pr_err("%s: Event 0x%x reports an error for PCI function 0x%x\n", pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid); if (!pdev) - return; + goto no_pdev; - pdev->error_state = pci_channel_io_perm_failure; + switch (ccdf->pec) { + case 0x003a: /* Service Action or Error Recovery Successful */ + ers_res = zpci_event_attempt_error_recovery(pdev); + if (ers_res != PCI_ERS_RESULT_RECOVERED) + zpci_event_io_failure(pdev, pci_channel_io_perm_failure); + break; + default: + /* + * Mark as frozen not permanently failed because the device + * could be subsequently recovered by the platform. 
+ */ + zpci_event_io_failure(pdev, pci_channel_io_frozen); + break; + } pci_dev_put(pdev); +no_pdev: + zpci_zdev_put(zdev); } void zpci_event_error(void *data) @@ -71,90 +303,88 @@ void zpci_event_error(void *data) __zpci_event_error(data); } +static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh) +{ + zpci_update_fh(zdev, fh); + /* Give the driver a hint that the function is + * already unusable. + */ + zpci_bus_remove_device(zdev, true); + /* Even though the device is already gone we still + * need to free zPCI resources as part of the disable. + */ + if (zdev_enabled(zdev)) + zpci_disable_device(zdev); + zdev->state = ZPCI_FN_STATE_STANDBY; +} + static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) { struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); - struct pci_dev *pdev = NULL; + bool existing_zdev = !!zdev; enum zpci_state state; - int ret; - - if (zdev) - pdev = pci_get_slot(zdev->bus, ZPCI_DEVFN); - - pr_info("%s: Event 0x%x reconfigured PCI function 0x%x\n", - pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid); - zpci_err("avail CCDF:\n"); - zpci_err_hex(ccdf, sizeof(*ccdf)); + zpci_dbg(3, "avl fid:%x, fh:%x, pec:%x\n", + ccdf->fid, ccdf->fh, ccdf->pec); switch (ccdf->pec) { case 0x0301: /* Reserved|Standby -> Configured */ if (!zdev) { - ret = clp_add_pci_device(ccdf->fid, ccdf->fh, 0); - if (ret) + zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_CONFIGURED); + if (IS_ERR(zdev)) break; - zdev = get_zdev_by_fid(ccdf->fid); + } else { + /* the configuration request may be stale */ + if (zdev->state != ZPCI_FN_STATE_STANDBY) + break; + zdev->state = ZPCI_FN_STATE_CONFIGURED; } - if (!zdev || zdev->state != ZPCI_FN_STATE_STANDBY) - break; - zdev->state = ZPCI_FN_STATE_CONFIGURED; - zdev->fh = ccdf->fh; - ret = zpci_enable_device(zdev); - if (ret) - break; - pci_lock_rescan_remove(); - pci_rescan_bus(zdev->bus); - pci_unlock_rescan_remove(); + zpci_scan_configured_device(zdev, ccdf->fh); break; case 0x0302: /* Reserved -> Standby */ if (!zdev) - clp_add_pci_device(ccdf->fid, ccdf->fh, 0); + zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY); + else + zpci_update_fh(zdev, ccdf->fh); break; case 0x0303: /* Deconfiguration requested */ - if (!zdev) - break; - if (pdev) - pci_stop_and_remove_bus_device_locked(pdev); - - ret = zpci_disable_device(zdev); - if (ret) - break; - - ret = sclp_pci_deconfigure(zdev->fid); - zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, ret); - if (!ret) - zdev->state = ZPCI_FN_STATE_STANDBY; - + if (zdev) { + /* The event may have been queued before we confirgured + * the device. + */ + if (zdev->state != ZPCI_FN_STATE_CONFIGURED) + break; + zpci_update_fh(zdev, ccdf->fh); + zpci_deconfigure_device(zdev); + } break; case 0x0304: /* Configured -> Standby|Reserved */ - if (!zdev) - break; - if (pdev) { - /* Give the driver a hint that the function is - * already unusable. 
*/ - pdev->error_state = pci_channel_io_perm_failure; - pci_stop_and_remove_bus_device_locked(pdev); - } - - zdev->fh = ccdf->fh; - zpci_disable_device(zdev); - zdev->state = ZPCI_FN_STATE_STANDBY; - if (!clp_get_state(ccdf->fid, &state) && - state == ZPCI_FN_STATE_RESERVED) { - zpci_remove_device(zdev); + if (zdev) { + /* The event may have been queued before we confirgured + * the device.: + */ + if (zdev->state == ZPCI_FN_STATE_CONFIGURED) + zpci_event_hard_deconfigured(zdev, ccdf->fh); + /* The 0x0304 event may immediately reserve the device */ + if (!clp_get_state(zdev->fid, &state) && + state == ZPCI_FN_STATE_RESERVED) { + zpci_device_reserved(zdev); + } } break; case 0x0306: /* 0x308 or 0x302 for multiple devices */ - clp_rescan_pci_devices(); + zpci_remove_reserved_devices(); + clp_scan_pci_devices(); break; case 0x0308: /* Standby -> Reserved */ if (!zdev) break; - zpci_remove_device(zdev); + zpci_device_reserved(zdev); break; default: break; } - pci_dev_put(pdev); + if (existing_zdev) + zpci_zdev_put(zdev); } void zpci_event_availability(void *data) diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index 02f9505c99a8..56480be48244 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -9,6 +9,7 @@ #include <linux/errno.h> #include <linux/delay.h> #include <linux/jump_label.h> +#include <asm/asm-extable.h> #include <asm/facility.h> #include <asm/pci_insn.h> #include <asm/pci_debug.h> @@ -17,16 +18,40 @@ #define ZPCI_INSN_BUSY_DELAY 1 /* 1 microsecond */ -static inline void zpci_err_insn(u8 cc, u8 status, u64 req, u64 offset) +struct zpci_err_insn_data { + u8 insn; + u8 cc; + u8 status; + union { + struct { + u64 req; + u64 offset; + }; + struct { + u64 addr; + u64 len; + }; + }; +} __packed; + +static inline void zpci_err_insn_req(int lvl, u8 insn, u8 cc, u8 status, + u64 req, u64 offset) { - struct { - u64 req; - u64 offset; - u8 cc; - u8 status; - } __packed data = {req, offset, cc, status}; - - zpci_err_hex(&data, sizeof(data)); + struct zpci_err_insn_data data = { + .insn = insn, .cc = cc, .status = status, + .req = req, .offset = offset}; + + zpci_err_hex_level(lvl, &data, sizeof(data)); +} + +static inline void zpci_err_insn_addr(int lvl, u8 insn, u8 cc, u8 status, + u64 addr, u64 len) +{ + struct zpci_err_insn_data data = { + .insn = insn, .cc = cc, .status = status, + .addr = addr, .len = len}; + + zpci_err_hex_level(lvl, &data, sizeof(data)); } /* Modify PCI Function Controls */ @@ -46,33 +71,41 @@ static inline u8 __mpcifc(u64 req, struct zpci_fib *fib, u8 *status) u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status) { + bool retried = false; u8 cc; do { cc = __mpcifc(req, fib, status); - if (cc == 2) + if (cc == 2) { msleep(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(1, 'M', cc, *status, req, 0); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, *status, req, 0); + zpci_err_insn_req(0, 'M', cc, *status, req, 0); + else if (retried) + zpci_err_insn_req(1, 'M', cc, *status, req, 0); return cc; } +EXPORT_SYMBOL_GPL(zpci_mod_fc); /* Refresh PCI Translations */ static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status) { - register u64 __addr asm("2") = addr; - register u64 __range asm("3") = range; + union register_pair addr_range = {.even = addr, .odd = range}; u8 cc; asm volatile ( - " .insn rre,0xb9d30000,%[fn],%[addr]\n" + " .insn rre,0xb9d30000,%[fn],%[addr_range]\n" " ipm %[cc]\n" " srl %[cc],28\n" : [cc] "=d" (cc), [fn] "+d" (fn) - : [addr] "d" (__addr), "d" (__range) + : [addr_range] 
"d" (addr_range.pair) : "cc"); *status = fn >> 24 & 0xff; return cc; @@ -80,16 +113,24 @@ static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status) int zpci_refresh_trans(u64 fn, u64 addr, u64 range) { + bool retried = false; u8 cc, status; do { cc = __rpcit(fn, addr, range, &status); - if (cc == 2) + if (cc == 2) { udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_addr(1, 'R', cc, status, addr, range); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, status, addr, range); + zpci_err_insn_addr(0, 'R', cc, status, addr, range); + else if (retried) + zpci_err_insn_addr(1, 'R', cc, status, addr, range); if (cc == 1 && (status == 4 || status == 16)) return -ENOMEM; @@ -98,7 +139,7 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range) } /* Set Interruption Controls */ -int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) +int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) { if (!test_facility(72)) return -EIO; @@ -109,25 +150,24 @@ int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) return 0; } +EXPORT_SYMBOL_GPL(zpci_set_irq_ctrl); /* PCI Load */ static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status) { - register u64 __req asm("2") = req; - register u64 __offset asm("3") = offset; + union register_pair req_off = {.even = req, .odd = offset}; int cc = -ENXIO; u64 __data; asm volatile ( - " .insn rre,0xb9d20000,%[data],%[req]\n" + " .insn rre,0xb9d20000,%[data],%[req_off]\n" "0: ipm %[cc]\n" " srl %[cc],28\n" "1:\n" EX_TABLE(0b, 1b) - : [cc] "+d" (cc), [data] "=d" (__data), [req] "+d" (__req) - : "d" (__offset) - : "cc"); - *status = __req >> 24 & 0xff; + : [cc] "+d" (cc), [data] "=d" (__data), + [req_off] "+&d" (req_off.pair) :: "cc"); + *status = req_off.even >> 24 & 0xff; *data = __data; return cc; } @@ -146,17 +186,25 @@ static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status) int __zpci_load(u64 *data, u64 req, u64 offset) { + bool retried = false; u8 status; int cc; do { cc = __pcilg(data, req, offset, &status); - if (cc == 2) + if (cc == 2) { udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(1, 'l', cc, status, req, offset); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, status, req, offset); + zpci_err_insn_req(0, 'l', cc, status, req, offset); + else if (retried) + zpci_err_insn_req(1, 'l', cc, status, req, offset); return (cc > 0) ? 
-EIO : cc; } @@ -166,28 +214,26 @@ static inline int zpci_load_fh(u64 *data, const volatile void __iomem *addr, unsigned long len) { struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; - u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + u64 req = ZPCI_CREATE_REQ(READ_ONCE(entry->fh), entry->bar, len); return __zpci_load(data, req, ZPCI_OFFSET(addr)); } static inline int __pcilg_mio(u64 *data, u64 ioaddr, u64 len, u8 *status) { - register u64 addr asm("2") = ioaddr; - register u64 r3 asm("3") = len; + union register_pair ioaddr_len = {.even = ioaddr, .odd = len}; int cc = -ENXIO; u64 __data; asm volatile ( - " .insn rre,0xb9d60000,%[data],%[ioaddr]\n" + " .insn rre,0xb9d60000,%[data],%[ioaddr_len]\n" "0: ipm %[cc]\n" " srl %[cc],28\n" "1:\n" EX_TABLE(0b, 1b) - : [cc] "+d" (cc), [data] "=d" (__data), "+d" (r3) - : [ioaddr] "d" (addr) - : "cc"); - *status = r3 >> 24 & 0xff; + : [cc] "+d" (cc), [data] "=d" (__data), + [ioaddr_len] "+&d" (ioaddr_len.pair) :: "cc"); + *status = ioaddr_len.odd >> 24 & 0xff; *data = __data; return cc; } @@ -202,7 +248,7 @@ int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len) cc = __pcilg_mio(data, (__force u64) addr, len, &status); if (cc) - zpci_err_insn(cc, status, 0, (__force u64) addr); + zpci_err_insn_addr(0, 'L', cc, status, (__force u64) addr, len); return (cc > 0) ? -EIO : cc; } @@ -211,36 +257,43 @@ EXPORT_SYMBOL_GPL(zpci_load); /* PCI Store */ static inline int __pcistg(u64 data, u64 req, u64 offset, u8 *status) { - register u64 __req asm("2") = req; - register u64 __offset asm("3") = offset; + union register_pair req_off = {.even = req, .odd = offset}; int cc = -ENXIO; asm volatile ( - " .insn rre,0xb9d00000,%[data],%[req]\n" + " .insn rre,0xb9d00000,%[data],%[req_off]\n" "0: ipm %[cc]\n" " srl %[cc],28\n" "1:\n" EX_TABLE(0b, 1b) - : [cc] "+d" (cc), [req] "+d" (__req) - : "d" (__offset), [data] "d" (data) + : [cc] "+d" (cc), [req_off] "+&d" (req_off.pair) + : [data] "d" (data) : "cc"); - *status = __req >> 24 & 0xff; + *status = req_off.even >> 24 & 0xff; return cc; } int __zpci_store(u64 data, u64 req, u64 offset) { + bool retried = false; u8 status; int cc; do { cc = __pcistg(data, req, offset, &status); - if (cc == 2) + if (cc == 2) { udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(1, 's', cc, status, req, offset); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, status, req, offset); + zpci_err_insn_req(0, 's', cc, status, req, offset); + else if (retried) + zpci_err_insn_req(1, 's', cc, status, req, offset); return (cc > 0) ? 
-EIO : cc; } @@ -250,27 +303,26 @@ static inline int zpci_store_fh(const volatile void __iomem *addr, u64 data, unsigned long len) { struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; - u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + u64 req = ZPCI_CREATE_REQ(READ_ONCE(entry->fh), entry->bar, len); return __zpci_store(data, req, ZPCI_OFFSET(addr)); } static inline int __pcistg_mio(u64 data, u64 ioaddr, u64 len, u8 *status) { - register u64 addr asm("2") = ioaddr; - register u64 r3 asm("3") = len; + union register_pair ioaddr_len = {.even = ioaddr, .odd = len}; int cc = -ENXIO; asm volatile ( - " .insn rre,0xb9d40000,%[data],%[ioaddr]\n" + " .insn rre,0xb9d40000,%[data],%[ioaddr_len]\n" "0: ipm %[cc]\n" " srl %[cc],28\n" "1:\n" EX_TABLE(0b, 1b) - : [cc] "+d" (cc), "+d" (r3) - : [data] "d" (data), [ioaddr] "d" (addr) - : "cc"); - *status = r3 >> 24 & 0xff; + : [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair) + : [data] "d" (data) + : "cc", "memory"); + *status = ioaddr_len.odd >> 24 & 0xff; return cc; } @@ -284,7 +336,7 @@ int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len) cc = __pcistg_mio(data, (__force u64) addr, len, &status); if (cc) - zpci_err_insn(cc, status, 0, (__force u64) addr); + zpci_err_insn_addr(0, 'S', cc, status, (__force u64) addr, len); return (cc > 0) ? -EIO : cc; } @@ -310,17 +362,25 @@ static inline int __pcistb(const u64 *data, u64 req, u64 offset, u8 *status) int __zpci_store_block(const u64 *data, u64 req, u64 offset) { + bool retried = false; u8 status; int cc; do { cc = __pcistb(data, req, offset, &status); - if (cc == 2) + if (cc == 2) { udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(0, 'b', cc, status, req, offset); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, status, req, offset); + zpci_err_insn_req(0, 'b', cc, status, req, offset); + else if (retried) + zpci_err_insn_req(1, 'b', cc, status, req, offset); return (cc > 0) ? -EIO : cc; } @@ -364,7 +424,7 @@ int zpci_write_block(volatile void __iomem *dst, cc = __pcistb_mio(src, (__force u64) dst, len, &status); if (cc) - zpci_err_insn(cc, status, 0, (__force u64) dst); + zpci_err_insn_addr(0, 'B', cc, status, (__force u64) dst, len); return (cc > 0) ? -EIO : cc; } @@ -372,10 +432,7 @@ EXPORT_SYMBOL_GPL(zpci_write_block); static inline void __pciwb_mio(void) { - unsigned long unused = 0; - - asm volatile (".insn rre,0xb9d50000,%[op],%[op]\n" - : [op] "+d" (unused)); + asm volatile (".insn rre,0xb9d50000,0,0\n"); } void zpci_barrier(void) diff --git a/arch/s390/pci/pci_iov.c b/arch/s390/pci/pci_iov.c new file mode 100644 index 000000000000..ead062bf2b41 --- /dev/null +++ b/arch/s390/pci/pci_iov.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 
2020 + * + * Author(s): + * Niklas Schnelle <schnelle@linux.ibm.com> + * + */ + +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/pci.h> + +#include "pci_iov.h" + +static struct resource iov_res = { + .name = "PCI IOV res", + .start = 0, + .end = -1, + .flags = IORESOURCE_MEM, +}; + +void zpci_iov_map_resources(struct pci_dev *pdev) +{ + resource_size_t len; + int i; + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + int bar = i + PCI_IOV_RESOURCES; + + len = pci_resource_len(pdev, bar); + if (!len) + continue; + pdev->resource[bar].parent = &iov_res; + } +} + +void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) +{ + pci_lock_rescan_remove(); + /* Linux' vfid's start at 0 vfn at 1 */ + pci_iov_remove_virtfn(pdev->physfn, vfn - 1); + pci_unlock_rescan_remove(); +} + +static int zpci_iov_link_virtfn(struct pci_dev *pdev, struct pci_dev *virtfn, int vfid) +{ + int rc; + + rc = pci_iov_sysfs_link(pdev, virtfn, vfid); + if (rc) + return rc; + + virtfn->is_virtfn = 1; + virtfn->multifunction = 0; + virtfn->physfn = pci_dev_get(pdev); + + return 0; +} + +int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +{ + int i, cand_devfn; + struct zpci_dev *zdev; + struct pci_dev *pdev; + int vfid = vfn - 1; /* Linux' vfid's start at 0 vfn at 1*/ + int rc = 0; + + if (!zbus->multifunction) + return 0; + + /* If the parent PF for the given VF is also configured in the + * instance, it must be on the same zbus. + * We can then identify the parent PF by checking what + * devfn the VF would have if it belonged to that PF using the PF's + * stride and offset. Only if this candidate devfn matches the + * actual devfn will we link both functions. + */ + for (i = 0; i < ZPCI_FUNCTIONS_PER_BUS; i++) { + zdev = zbus->function[i]; + if (zdev && zdev->is_physfn) { + pdev = pci_get_slot(zbus->bus, zdev->devfn); + if (!pdev) + continue; + cand_devfn = pci_iov_virtfn_devfn(pdev, vfid); + if (cand_devfn == virtfn->devfn) { + rc = zpci_iov_link_virtfn(pdev, virtfn, vfid); + /* balance pci_get_slot() */ + pci_dev_put(pdev); + break; + } + /* balance pci_get_slot() */ + pci_dev_put(pdev); + } + } + return rc; +} diff --git a/arch/s390/pci/pci_iov.h b/arch/s390/pci/pci_iov.h new file mode 100644 index 000000000000..b2c828003bad --- /dev/null +++ b/arch/s390/pci/pci_iov.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 
2020 + * + * Author(s): + * Niklas Schnelle <schnelle@linux.ibm.com> + * + */ + +#ifndef __S390_PCI_IOV_H +#define __S390_PCI_IOV_H + +#ifdef CONFIG_PCI_IOV +void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn); + +void zpci_iov_map_resources(struct pci_dev *pdev); + +int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn); + +#else /* CONFIG_PCI_IOV */ +static inline void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) {} + +static inline void zpci_iov_map_resources(struct pci_dev *pdev) {} + +static inline int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +{ + return 0; +} +#endif /* CONFIG_PCI_IOV */ +#endif /* __S390_PCI_IOV_h */ diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index fbe97ab2e228..ff8f24854c64 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -11,16 +11,10 @@ #include <asm/isc.h> #include <asm/airq.h> +#include <asm/tpi.h> static enum {FLOATING, DIRECTED} irq_delivery; -#define SIC_IRQ_MODE_ALL 0 -#define SIC_IRQ_MODE_SINGLE 1 -#define SIC_IRQ_MODE_DIRECT 4 -#define SIC_IRQ_MODE_D_ALL 16 -#define SIC_IRQ_MODE_D_SINGLE 17 -#define SIC_IRQ_MODE_SET_CPU 18 - /* * summary bit vector * FLOATING - summary bit per function @@ -35,7 +29,7 @@ static struct airq_iv *zpci_sbv; */ static struct airq_iv **zpci_ibv; -/* Modify PCI: Register adapter interruptions */ +/* Modify PCI: Register floating adapter interruptions */ static int zpci_set_airq(struct zpci_dev *zdev) { u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); @@ -45,21 +39,24 @@ static int zpci_set_airq(struct zpci_dev *zdev) fib.fmt0.isc = PCI_ISC; fib.fmt0.sum = 1; /* enable summary notifications */ fib.fmt0.noi = airq_iv_end(zdev->aibv); - fib.fmt0.aibv = (unsigned long) zdev->aibv->vector; + fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector); fib.fmt0.aibvo = 0; /* each zdev has its own interrupt vector */ - fib.fmt0.aisb = (unsigned long) zpci_sbv->vector + (zdev->aisb/64)*8; + fib.fmt0.aisb = virt_to_phys(zpci_sbv->vector) + (zdev->aisb / 64) * 8; fib.fmt0.aisbo = zdev->aisb & 63; + fib.gd = zdev->gisa; return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; } -/* Modify PCI: Unregister adapter interruptions */ +/* Modify PCI: Unregister floating adapter interruptions */ static int zpci_clear_airq(struct zpci_dev *zdev) { u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); struct zpci_fib fib = {0}; u8 cc, status; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, &status); if (cc == 3 || (cc == 1 && status == 24)) /* Function already gone or IRQs already deregistered. */ @@ -78,6 +75,7 @@ static int zpci_set_directed_irq(struct zpci_dev *zdev) fib.fmt = 1; fib.fmt1.noi = zdev->msi_nr_irqs; fib.fmt1.dibvo = zdev->msi_first_bit; + fib.gd = zdev->gisa; return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; } @@ -90,6 +88,7 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev) u8 cc, status; fib.fmt = 1; + fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, &status); if (cc == 3 || (cc == 1 && status == 24)) /* Function already gone or IRQs already deregistered. */ @@ -98,14 +97,47 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev) return cc ? 
-EIO : 0; } +/* Register adapter interruptions */ +static int zpci_set_irq(struct zpci_dev *zdev) +{ + int rc; + + if (irq_delivery == DIRECTED) + rc = zpci_set_directed_irq(zdev); + else + rc = zpci_set_airq(zdev); + + if (!rc) + zdev->irqs_registered = 1; + + return rc; +} + +/* Clear adapter interruptions */ +static int zpci_clear_irq(struct zpci_dev *zdev) +{ + int rc; + + if (irq_delivery == DIRECTED) + rc = zpci_clear_directed_irq(zdev); + else + rc = zpci_clear_airq(zdev); + + if (!rc) + zdev->irqs_registered = 0; + + return rc; +} + static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest, bool force) { - struct msi_desc *entry = irq_get_msi_desc(data->irq); + struct msi_desc *entry = irq_data_get_msi_desc(data); struct msi_msg msg = entry->msg; + int cpu_addr = smp_cpu_get_cpu_address(cpumask_first(dest)); msg.address_lo &= 0xff0000ff; - msg.address_lo |= (cpumask_first(dest) << 8); + msg.address_lo |= (cpu_addr << 8); pci_write_msi_msg(data->irq, &msg); return IRQ_SET_MASK_OK; @@ -115,12 +147,12 @@ static struct irq_chip zpci_irq_chip = { .name = "PCI-MSI", .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, - .irq_set_affinity = zpci_set_irq_affinity, }; static void zpci_handle_cpu_local_irq(bool rescan) { struct airq_iv *dibv = zpci_ibv[smp_processor_id()]; + union zpci_sic_iib iib = {{0}}; unsigned long bit; int irqs_on = 0; @@ -131,8 +163,8 @@ static void zpci_handle_cpu_local_irq(bool rescan) if (!rescan || irqs_on++) /* End of second scan with interrupts on. */ break; - /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC)) + /* First scan complete, re-enable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &iib)) break; bit = 0; continue; @@ -160,6 +192,7 @@ static void zpci_handle_remote_irq(void *data) static void zpci_handle_fallback_irq(void) { struct cpu_irq_data *cpu_data; + union zpci_sic_iib iib = {{0}}; unsigned long cpu; int irqs_on = 0; @@ -169,8 +202,8 @@ static void zpci_handle_fallback_irq(void) if (irqs_on++) /* End of second scan with interrupts on. */ break; - /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + /* First scan complete, re-enable interrupts. 
*/ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib)) break; cpu = 0; continue; @@ -179,15 +212,16 @@ static void zpci_handle_fallback_irq(void) if (atomic_inc_return(&cpu_data->scheduled) > 1) continue; - cpu_data->csd.func = zpci_handle_remote_irq; - cpu_data->csd.info = &cpu_data->scheduled; - cpu_data->csd.flags = 0; + INIT_CSD(&cpu_data->csd, zpci_handle_remote_irq, &cpu_data->scheduled); smp_call_function_single_async(cpu, &cpu_data->csd); } } -static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating) +static void zpci_directed_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) { + bool floating = !tpi_info->directed_irq; + if (floating) { inc_irq_stat(IRQIO_PCF); zpci_handle_fallback_irq(); @@ -197,8 +231,10 @@ static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating) } } -static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) +static void zpci_floating_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) { + union zpci_sic_iib iib = {{0}}; unsigned long si, ai; struct airq_iv *aibv; int irqs_on = 0; @@ -211,8 +247,8 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) if (irqs_on++) /* End of second scan with interrupts on. */ break; - /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + /* First scan complete, re-enable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib)) break; si = 0; continue; @@ -239,6 +275,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) unsigned long bit; struct msi_desc *msi; struct msi_msg msg; + int cpu_addr; int rc, irq; zdev->aisb = -1UL; @@ -260,7 +297,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) zdev->aisb = bit; /* Create adapter interrupt vector */ - zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK); + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL); if (!zdev->aibv) return -ENOMEM; @@ -272,11 +309,13 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) /* Request MSI interrupts */ hwirq = bit; - for_each_pci_msi_entry(msi, pdev) { + msi_for_each_desc(msi, &pdev->dev, MSI_DESC_NOTASSOCIATED) { rc = -EIO; if (hwirq - bit >= msi_vecs) break; - irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE, msi->affinity); + irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE, + (irq_delivery == DIRECTED) ? + msi->affinity : NULL); if (irq < 0) return -ENOMEM; rc = irq_set_msi_desc(irq, msi); @@ -286,9 +325,15 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) handle_percpu_irq); msg.data = hwirq - bit; if (irq_delivery == DIRECTED) { + if (msi->affinity) + cpu = cpumask_first(&msi->affinity->mask); + else + cpu = 0; + cpu_addr = smp_cpu_get_cpu_address(cpu); + msg.address_lo = zdev->msi_addr & 0xff0000ff; - msg.address_lo |= msi->affinity ? 
- (cpumask_first(&msi->affinity->mask) << 8) : 0; + msg.address_lo |= (cpu_addr << 8); + for_each_possible_cpu(cpu) { airq_iv_set_data(zpci_ibv[cpu], hwirq, irq); } @@ -304,10 +349,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) zdev->msi_first_bit = bit; zdev->msi_nr_irqs = msi_vecs; - if (irq_delivery == DIRECTED) - rc = zpci_set_directed_irq(zdev); - else - rc = zpci_set_airq(zdev); + rc = zpci_set_irq(zdev); if (rc) return rc; @@ -321,21 +363,12 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev) int rc; /* Disable interrupts */ - if (irq_delivery == DIRECTED) - rc = zpci_clear_directed_irq(zdev); - else - rc = zpci_clear_airq(zdev); + rc = zpci_clear_irq(zdev); if (rc) return; /* Release MSI interrupts */ - for_each_pci_msi_entry(msi, pdev) { - if (!msi->irq) - continue; - if (msi->msi_attrib.is_msix) - __pci_msix_desc_mask_irq(msi, 1); - else - __pci_msi_desc_mask_irq(msi, 1, 1); + msi_for_each_desc(msi, &pdev->dev, MSI_DESC_ASSOCIATED) { irq_set_msi_desc(msi->irq, NULL); irq_free_desc(msi->irq); msi->msg.address_lo = 0; @@ -358,6 +391,15 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev) airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->msi_nr_irqs); } +bool arch_restore_msi_irqs(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + + if (!zdev->irqs_registered) + zpci_set_irq(zdev); + return true; +} + static struct airq_struct zpci_airq = { .handler = zpci_floating_irq_handler, .isc = PCI_ISC, @@ -366,11 +408,12 @@ static struct airq_struct zpci_airq = { static void __init cpu_enable_directed_irq(void *unused) { union zpci_sic_iib iib = {{0}}; + union zpci_sic_iib ziib = {{0}}; iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector; - __zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); - zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC); + zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); + zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &ziib); } static int __init zpci_directed_irq_init(void) @@ -378,14 +421,14 @@ static int __init zpci_directed_irq_init(void) union zpci_sic_iib iib = {{0}}; unsigned int cpu; - zpci_sbv = airq_iv_create(num_possible_cpus(), 0); + zpci_sbv = airq_iv_create(num_possible_cpus(), 0, NULL); if (!zpci_sbv) return -ENOMEM; iib.diib.isc = PCI_ISC; iib.diib.nr_cpus = num_possible_cpus(); - iib.diib.disb_addr = (u64) zpci_sbv->vector; - __zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); + iib.diib.disb_addr = virt_to_phys(zpci_sbv->vector); + zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv), GFP_KERNEL); @@ -400,7 +443,7 @@ static int __init zpci_directed_irq_init(void) zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE, AIRQ_IV_DATA | AIRQ_IV_CACHELINE | - (!cpu ? AIRQ_IV_ALLOC : 0)); + (!cpu ? AIRQ_IV_ALLOC : 0), NULL); if (!zpci_ibv[cpu]) return -ENOMEM; } @@ -417,7 +460,7 @@ static int __init zpci_floating_irq_init(void) if (!zpci_ibv) return -ENOMEM; - zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); + zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL); if (!zpci_sbv) goto out_free; @@ -430,6 +473,7 @@ out_free: int __init zpci_irq_init(void) { + union zpci_sic_iib iib = {{0}}; int rc; irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING; @@ -461,7 +505,7 @@ int __init zpci_irq_init(void) * Enable floating IRQs (with suppression after one IRQ). When using * directed IRQs this enables the fallback path. 
*/ - zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC); + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib); return 0; out_airq: diff --git a/arch/s390/pci/pci_kvm_hook.c b/arch/s390/pci/pci_kvm_hook.c new file mode 100644 index 000000000000..ff34baf50a3e --- /dev/null +++ b/arch/s390/pci/pci_kvm_hook.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VFIO ZPCI devices support + * + * Copyright (C) IBM Corp. 2022. All rights reserved. + * Author(s): Pierre Morel <pmorel@linux.ibm.com> + */ +#include <linux/kvm_host.h> + +struct zpci_kvm_hook zpci_kvm_hook; +EXPORT_SYMBOL_GPL(zpci_kvm_hook); diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index 7d42a8794f10..a90499c087f0 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -11,25 +11,108 @@ #include <linux/mm.h> #include <linux/errno.h> #include <linux/pci.h> +#include <asm/asm-extable.h> +#include <asm/pci_io.h> +#include <asm/pci_debug.h> -static long get_pfn(unsigned long user_addr, unsigned long access, - unsigned long *pfn) +static inline void zpci_err_mmio(u8 cc, u8 status, u64 offset) { - struct vm_area_struct *vma; - long ret; + struct { + u64 offset; + u8 cc; + u8 status; + } data = {offset, cc, status}; - down_read(¤t->mm->mmap_sem); - ret = -EINVAL; - vma = find_vma(current->mm, user_addr); - if (!vma) - goto out; - ret = -EACCES; - if (!(vma->vm_flags & access)) - goto out; - ret = follow_pfn(vma, user_addr, pfn); -out: - up_read(¤t->mm->mmap_sem); - return ret; + zpci_err_hex(&data, sizeof(data)); +} + +static inline int __pcistb_mio_inuser( + void __iomem *ioaddr, const void __user *src, + u64 len, u8 *status) +{ + int cc = -ENXIO; + + asm volatile ( + " sacf 256\n" + "0: .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[src]\n" + "1: ipm %[cc]\n" + " srl %[cc],28\n" + "2: sacf 768\n" + EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) + : [cc] "+d" (cc), [len] "+d" (len) + : [ioaddr] "a" (ioaddr), [src] "Q" (*((u8 __force *)src)) + : "cc", "memory"); + *status = len >> 24 & 0xff; + return cc; +} + +static inline int __pcistg_mio_inuser( + void __iomem *ioaddr, const void __user *src, + u64 ulen, u8 *status) +{ + union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen}; + int cc = -ENXIO; + u64 val = 0; + u64 cnt = ulen; + u8 tmp; + + /* + * copy 0 < @len <= 8 bytes from @src into the right most bytes of + * a register, then store it to PCI at @ioaddr while in secondary + * address space. pcistg then uses the user mappings. + */ + asm volatile ( + " sacf 256\n" + "0: llgc %[tmp],0(%[src])\n" + "4: sllg %[val],%[val],8\n" + " aghi %[src],1\n" + " ogr %[val],%[tmp]\n" + " brctg %[cnt],0b\n" + "1: .insn rre,0xb9d40000,%[val],%[ioaddr_len]\n" + "2: ipm %[cc]\n" + " srl %[cc],28\n" + "3: sacf 768\n" + EX_TABLE(0b, 3b) EX_TABLE(4b, 3b) EX_TABLE(1b, 3b) EX_TABLE(2b, 3b) + : + [src] "+a" (src), [cnt] "+d" (cnt), + [val] "+d" (val), [tmp] "=d" (tmp), + [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair) + :: "cc", "memory"); + *status = ioaddr_len.odd >> 24 & 0xff; + + /* did we read everything from user memory? 
*/ + if (!cc && cnt != 0) + cc = -EFAULT; + + return cc; +} + +static inline int __memcpy_toio_inuser(void __iomem *dst, + const void __user *src, size_t n) +{ + int size, rc = 0; + u8 status = 0; + + if (!src) + return -EINVAL; + + while (n > 0) { + size = zpci_get_max_io_size((u64 __force) dst, + (u64 __force) src, n, + ZPCI_MAX_WRITE_SIZE); + if (size > 8) /* main path */ + rc = __pcistb_mio_inuser(dst, src, size, &status); + else + rc = __pcistg_mio_inuser(dst, src, size, &status); + if (rc) + break; + src += size; + dst += size; + n -= size; + } + if (rc) + zpci_err_mmio(rc, status, (__force u64) dst); + return rc; } SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, @@ -38,7 +121,9 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, u8 local_buf[64]; void __iomem *io_addr; void *buf; - unsigned long pfn; + struct vm_area_struct *vma; + pte_t *ptep; + spinlock_t *ptl; long ret; if (!zpci_is_enabled()) @@ -46,6 +131,22 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, if (length <= 0 || PAGE_SIZE - (mmio_addr & ~PAGE_MASK) < length) return -EINVAL; + + /* + * We only support write access to MIO capable devices if we are on + * a MIO enabled system. Otherwise we would have to check for every + * address if it is a special ZPCI_ADDR and would have to do + * a pfn lookup which we don't need for MIO capable devices. Currently + * ISM devices are the only devices without MIO support and there is no + * known need for accessing these from userspace. + */ + if (static_branch_likely(&have_mio)) { + ret = __memcpy_toio_inuser((void __iomem *) mmio_addr, + user_buffer, + length); + return ret; + } + if (length > 64) { buf = kmalloc(length, GFP_KERNEL); if (!buf) @@ -53,32 +154,118 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, } else buf = local_buf; - ret = get_pfn(mmio_addr, VM_WRITE, &pfn); + ret = -EFAULT; + if (copy_from_user(buf, user_buffer, length)) + goto out_free; + + mmap_read_lock(current->mm); + ret = -EINVAL; + vma = vma_lookup(current->mm, mmio_addr); + if (!vma) + goto out_unlock_mmap; + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out_unlock_mmap; + ret = -EACCES; + if (!(vma->vm_flags & VM_WRITE)) + goto out_unlock_mmap; + + ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); if (ret) - goto out; - io_addr = (void __iomem *)((pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); + goto out_unlock_mmap; - ret = -EFAULT; - if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) - goto out; + io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + (mmio_addr & ~PAGE_MASK)); - if (copy_from_user(buf, user_buffer, length)) - goto out; + if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) + goto out_unlock_pt; ret = zpci_memcpy_toio(io_addr, buf, length); -out: +out_unlock_pt: + pte_unmap_unlock(ptep, ptl); +out_unlock_mmap: + mmap_read_unlock(current->mm); +out_free: if (buf != local_buf) kfree(buf); return ret; } +static inline int __pcilg_mio_inuser( + void __user *dst, const void __iomem *ioaddr, + u64 ulen, u8 *status) +{ + union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen}; + u64 cnt = ulen; + int shift = ulen * 8; + int cc = -ENXIO; + u64 val, tmp; + + /* + * read 0 < @len <= 8 bytes from the PCI memory mapped at @ioaddr (in + * user space) into a register using pcilg then store these bytes at + * user address @dst + */ + asm volatile ( + " sacf 256\n" + "0: .insn rre,0xb9d60000,%[val],%[ioaddr_len]\n" + "1: ipm %[cc]\n" + " srl %[cc],28\n" + " ltr %[cc],%[cc]\n" + " jne 4f\n" + "2: ahi 
%[shift],-8\n" + " srlg %[tmp],%[val],0(%[shift])\n" + "3: stc %[tmp],0(%[dst])\n" + "5: aghi %[dst],1\n" + " brctg %[cnt],2b\n" + "4: sacf 768\n" + EX_TABLE(0b, 4b) EX_TABLE(1b, 4b) EX_TABLE(3b, 4b) EX_TABLE(5b, 4b) + : + [ioaddr_len] "+&d" (ioaddr_len.pair), + [cc] "+d" (cc), [val] "=d" (val), + [dst] "+a" (dst), [cnt] "+d" (cnt), [tmp] "=d" (tmp), + [shift] "+d" (shift) + :: "cc", "memory"); + + /* did we write everything to the user space buffer? */ + if (!cc && cnt != 0) + cc = -EFAULT; + + *status = ioaddr_len.odd >> 24 & 0xff; + return cc; +} + +static inline int __memcpy_fromio_inuser(void __user *dst, + const void __iomem *src, + unsigned long n) +{ + int size, rc = 0; + u8 status; + + while (n > 0) { + size = zpci_get_max_io_size((u64 __force) src, + (u64 __force) dst, n, + ZPCI_MAX_READ_SIZE); + rc = __pcilg_mio_inuser(dst, src, size, &status); + if (rc) + break; + src += size; + dst += size; + n -= size; + } + if (rc) + zpci_err_mmio(rc, status, (__force u64) dst); + return rc; +} + SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, void __user *, user_buffer, size_t, length) { u8 local_buf[64]; void __iomem *io_addr; void *buf; - unsigned long pfn; + struct vm_area_struct *vma; + pte_t *ptep; + spinlock_t *ptl; long ret; if (!zpci_is_enabled()) @@ -86,29 +273,62 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, if (length <= 0 || PAGE_SIZE - (mmio_addr & ~PAGE_MASK) < length) return -EINVAL; + + /* + * We only support read access to MIO capable devices if we are on + * a MIO enabled system. Otherwise we would have to check for every + * address if it is a special ZPCI_ADDR and would have to do + * a pfn lookup which we don't need for MIO capable devices. Currently + * ISM devices are the only devices without MIO support and there is no + * known need for accessing these from userspace. 
+ */ + if (static_branch_likely(&have_mio)) { + ret = __memcpy_fromio_inuser( + user_buffer, (const void __iomem *)mmio_addr, + length); + return ret; + } + if (length > 64) { buf = kmalloc(length, GFP_KERNEL); if (!buf) return -ENOMEM; - } else + } else { buf = local_buf; + } - ret = get_pfn(mmio_addr, VM_READ, &pfn); + mmap_read_lock(current->mm); + ret = -EINVAL; + vma = vma_lookup(current->mm, mmio_addr); + if (!vma) + goto out_unlock_mmap; + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out_unlock_mmap; + ret = -EACCES; + if (!(vma->vm_flags & VM_WRITE)) + goto out_unlock_mmap; + + ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); if (ret) - goto out; - io_addr = (void __iomem *)((pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); + goto out_unlock_mmap; + + io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + (mmio_addr & ~PAGE_MASK)); if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) { ret = -EFAULT; - goto out; + goto out_unlock_pt; } ret = zpci_memcpy_fromio(buf, io_addr, length); - if (ret) - goto out; - if (copy_to_user(user_buffer, buf, length)) + +out_unlock_pt: + pte_unmap_unlock(ptep, ptl); +out_unlock_mmap: + mmap_read_unlock(current->mm); + + if (!ret && copy_to_user(user_buffer, buf, length)) ret = -EFAULT; -out: if (buf != local_buf) kfree(buf); return ret; diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index a433ba01a317..8a7abac51816 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -13,6 +13,8 @@ #include <linux/stat.h> #include <linux/pci.h> +#include "../../../drivers/pci/pci.h" + #include <asm/sclp.h> #define zpci_attr(name, fmt, member) \ @@ -31,6 +33,7 @@ zpci_attr(pchid, "0x%04x\n", pchid); zpci_attr(pfgid, "0x%02x\n", pfgid); zpci_attr(vfn, "0x%04x\n", vfn); zpci_attr(pft, "0x%02x\n", pft); +zpci_attr(port, "%d\n", port); zpci_attr(uid, "0x%x\n", uid); zpci_attr(segment0, "0x%02x\n", pfip[0]); zpci_attr(segment1, "0x%02x\n", pfip[1]); @@ -49,31 +52,68 @@ static DEVICE_ATTR_RO(mio_enabled); static ssize_t recover_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { + struct kernfs_node *kn; struct pci_dev *pdev = to_pci_dev(dev); struct zpci_dev *zdev = to_zpci(pdev); - int ret; - - if (!device_remove_file_self(dev, attr)) - return count; - + int ret = 0; + u8 status; + + /* Can't use device_remove_self() here as that would lead us to lock + * the pci_rescan_remove_lock while holding the device' kernfs lock. + * This would create a possible deadlock with disable_slot() which is + * not directly protected by the device' kernfs lock but takes it + * during the device removal which happens under + * pci_rescan_remove_lock. + * + * This is analogous to sdev_store_delete() in + * drivers/scsi/scsi_sysfs.c + */ + kn = sysfs_break_active_protection(&dev->kobj, &attr->attr); + WARN_ON_ONCE(!kn); + /* device_remove_file() serializes concurrent calls ignoring all but + * the first + */ + device_remove_file(dev, attr); + + /* A concurrent call to recover_store() may slip between + * sysfs_break_active_protection() and the sysfs file removal. + * Once it unblocks from pci_lock_rescan_remove() the original pdev + * will already be removed. 
+ */ pci_lock_rescan_remove(); - pci_stop_and_remove_bus_device(pdev); - ret = zpci_disable_device(zdev); - if (ret) - goto error; - - ret = zpci_enable_device(zdev); - if (ret) - goto error; - - pci_rescan_bus(zdev->bus); - pci_unlock_rescan_remove(); - - return count; - -error: + if (pci_dev_is_added(pdev)) { + pci_stop_and_remove_bus_device(pdev); + if (zdev_enabled(zdev)) { + ret = zpci_disable_device(zdev); + /* + * Due to a z/VM vs LPAR inconsistency in the error + * state the FH may indicate an enabled device but + * disable says the device is already disabled don't + * treat it as an error here. + */ + if (ret == -EINVAL) + ret = 0; + if (ret) + goto out; + } + + ret = zpci_enable_device(zdev); + if (ret) + goto out; + + if (zdev->dma_table) { + ret = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status); + if (ret) + zpci_disable_device(zdev); + } + } +out: + pci_rescan_bus(zdev->zbus->bus); pci_unlock_rescan_remove(); - return ret; + if (kn) + sysfs_unbreak_active_protection(kn); + return ret ? ret : count; } static DEVICE_ATTR_WO(recover); @@ -109,6 +149,45 @@ static ssize_t report_error_write(struct file *filp, struct kobject *kobj, } static BIN_ATTR(report_error, S_IWUSR, NULL, report_error_write, PAGE_SIZE); +static ssize_t uid_is_unique_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", zpci_unique_uid ? 1 : 0); +} +static DEVICE_ATTR_RO(uid_is_unique); + +#ifndef CONFIG_DMI +/* analogous to smbios index */ +static ssize_t index_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + u32 index = ~0; + + if (zpci_unique_uid) + index = zdev->uid; + + return sysfs_emit(buf, "%u\n", index); +} +static DEVICE_ATTR_RO(index); + +static umode_t zpci_index_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + return zpci_unique_uid ? 
attr->mode : 0; +} + +static struct attribute *zpci_ident_attrs[] = { + &dev_attr_index.attr, + NULL, +}; + +static struct attribute_group zpci_ident_attr_group = { + .attrs = zpci_ident_attrs, + .is_visible = zpci_index_is_visible, +}; +#endif + static struct bin_attribute *zpci_bin_attrs[] = { &bin_attr_util_string, &bin_attr_report_error, @@ -121,12 +200,15 @@ static struct attribute *zpci_dev_attrs[] = { &dev_attr_pchid.attr, &dev_attr_pfgid.attr, &dev_attr_pft.attr, + &dev_attr_port.attr, &dev_attr_vfn.attr, &dev_attr_uid.attr, &dev_attr_recover.attr, &dev_attr_mio_enabled.attr, + &dev_attr_uid_is_unique.attr, NULL, }; + static struct attribute_group zpci_attr_group = { .attrs = zpci_dev_attrs, .bin_attrs = zpci_bin_attrs, @@ -147,5 +229,8 @@ static struct attribute_group pfip_attr_group = { const struct attribute_group *zpci_attr_groups[] = { &zpci_attr_group, &pfip_attr_group, +#ifndef CONFIG_DMI + &zpci_ident_attr_group, +#endif NULL, }; diff --git a/arch/s390/purgatory/.gitignore b/arch/s390/purgatory/.gitignore index 04a03433c720..97ca52779457 100644 --- a/arch/s390/purgatory/.gitignore +++ b/arch/s390/purgatory/.gitignore @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only purgatory +purgatory.chk purgatory.lds purgatory.ro diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index bc0d7a0d0394..4e930f566878 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -4,40 +4,51 @@ OBJECT_FILES_NON_STANDARD := y purgatory-y := head.o purgatory.o string.o sha256.o mem.o -targets += $(purgatory-y) purgatory.lds purgatory purgatory.ro +targets += $(purgatory-y) purgatory.lds purgatory purgatory.chk purgatory.ro PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE $(call if_changed_rule,cc_o_c) -CFLAGS_sha256.o := -D__DISABLE_EXPORTS +CFLAGS_sha256.o := -D__DISABLE_EXPORTS -D__NO_FORTIFY $(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE $(call if_changed_rule,as_o_S) -$(obj)/string.o: $(srctree)/arch/s390/lib/string.c FORCE - $(call if_changed_rule,cc_o_c) +KCOV_INSTRUMENT := n +GCOV_PROFILE := n +UBSAN_SANITIZE := n +KASAN_SANITIZE := n +KCSAN_SANITIZE := n KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -KBUILD_CFLAGS += -c -MD -Os -m64 -msoft-float -fno-common +KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common +KBUILD_CFLAGS += -fno-stack-protector +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING KBUILD_CFLAGS += $(CLANG_FLAGS) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) -LDFLAGS_purgatory := -r --no-undefined -nostdlib -z nodefaultlib -T +# Since we link purgatory with -r unresolved symbols are not checked, so we +# also link a purgatory.chk binary without -r to check for unresolved symbols. 
diff --git a/arch/s390/purgatory/head.S b/arch/s390/purgatory/head.S
index 5a10ce34b95d..0f93f2e72eba 100644
--- a/arch/s390/purgatory/head.S
+++ b/arch/s390/purgatory/head.S
@@ -44,11 +44,14 @@ .endm
 
 .macro MEMSWAP dst,src,buf,len
-10:	cghi	\len,bufsz
+10:	larl	%r0,purgatory_end
+	larl	%r1,stack
+	slgr	%r0,%r1
+	cgr	\len,%r0
 	jh	11f
 	lgr	%r4,\len
 	j	12f
 
-11:	lghi	%r4,bufsz
+11:	lgr	%r4,%r0
 
 12:	MEMCPY	\buf,\dst,%r4
 	MEMCPY	\dst,\src,%r4
@@ -62,19 +65,20 @@
 	jh	10b
 .endm
 
-.macro START_NEXT_KERNEL base
+.macro START_NEXT_KERNEL base subcode
 	lg	%r4,kernel_entry-\base(%r13)
 	lg	%r5,load_psw_mask-\base(%r13)
 	ogr	%r4,%r5
 	stg	%r4,0(%r0)
 
 	xgr	%r0,%r0
-	diag	%r0,%r0,0x308
+	lghi	%r1,\subcode
+	diag	%r0,%r1,0x308
 .endm
 
-.text
-.align PAGE_SIZE
-ENTRY(purgatory_start)
+	.text
+	.balign PAGE_SIZE
+SYM_CODE_START(purgatory_start)
 	/* The purgatory might be called after a diag308 so better set
 	 * architecture and addressing mode.
 	 */
@@ -96,7 +100,7 @@ ENTRY(purgatory_start)
 	 * checksum verification only (%r2 = 0 -> verification only).
 	 *
 	 * Check now and preserve over C function call by storing in
-	 * %r10 whith
+	 * %r10 with
 	 *	1 -> checksum verification only
 	 *	0 -> load new kernel
 	 */
@@ -123,7 +127,7 @@ ENTRY(purgatory_start)
 	je	.start_crash_kernel
 
 	/* start normal kernel */
-	START_NEXT_KERNEL .base_crash
+	START_NEXT_KERNEL .base_crash 0
 
 .return_old_kernel:
 	lmg	%r6,%r15,gprregs-.base_crash(%r13)
@@ -134,12 +138,18 @@ ENTRY(purgatory_start)
 
 .start_crash_kernel:
 	/* Location of purgatory_start in crash memory */
+	larl	%r0,.base_crash
+	larl	%r1,purgatory_start
+	slgr	%r0,%r1
 	lgr	%r8,%r13
-	aghi	%r8,-(.base_crash-purgatory_start)
+	sgr	%r8,%r0
 
 	/* Destination for this code i.e. end of memory to be swapped. */
+	larl	%r0,purgatory_end
+	larl	%r1,purgatory_start
+	slgr	%r0,%r1
 	lg	%r9,crash_size-.base_crash(%r13)
-	aghi	%r9,-(purgatory_end-purgatory_start)
+	sgr	%r9,%r0
 
 	/* Destination in crash memory, i.e. same as r9 but in crash memory. */
 	lg	%r10,crash_start-.base_crash(%r13)
@@ -148,15 +158,19 @@ ENTRY(purgatory_start)
 	/* Buffer location (in crash memory) and size. As the purgatory is
 	 * behind the point of no return it can re-use the stack as buffer.
 	 */
-	lghi	%r11,bufsz
+	larl	%r11,purgatory_end
 	larl	%r12,stack
+	slgr	%r11,%r12
 
 	MEMCPY	%r12,%r9,%r11	/* dst -> (crash) buf */
 	MEMCPY	%r9,%r8,%r11	/* self -> dst */
 
 	/* Jump to new location. */
 	lgr	%r7,%r9
-	aghi	%r7,.jump_to_dst-purgatory_start
+	larl	%r0,.jump_to_dst
+	larl	%r1,purgatory_start
+	slgr	%r0,%r1
+	agr	%r7,%r0
 	br	%r7
 
 .jump_to_dst:
@@ -168,7 +182,10 @@ ENTRY(purgatory_start)
 
 	/* Load new buffer location after jump */
 	larl	%r7,stack
-	aghi	%r10,stack-purgatory_start
+	lgr	%r0,%r7
+	larl	%r1,purgatory_start
+	slgr	%r0,%r1
+	agr	%r10,%r0
 	MEMCPY	%r10,%r7,%r11	/* (new) buf -> (crash) buf */
 
 	/* Now the code is set up to run from its designated location. Start
@@ -227,46 +244,22 @@ ENTRY(purgatory_start)
 	MEMCPY	%r9,%r10,%r11
 
 	/* start crash kernel */
-	START_NEXT_KERNEL .base_dst
-
-
-load_psw_mask:
-	.long	0x00080000,0x80000000
-
-	.align	8
-disabled_wait_psw:
-	.quad	0x0002000180000000
-	.quad	0x0000000000000000 + .do_checksum_verification
-
-gprregs:
-	.rept	10
-	.quad	0
-	.endr
-
-/* Macro to define a global variable with name and size (in bytes) to be
- * shared with C code.
- *
- * Add the .size and .type attribute to satisfy checks on the Elf_Sym during
- * purgatory load.
- */
-.macro GLOBAL_VARIABLE name,size
-\name:
-	.global \name
-	.size	\name,\size
-	.type	\name,object
-	.skip	\size,0
-.endm
-
-GLOBAL_VARIABLE purgatory_sha256_digest,32
-GLOBAL_VARIABLE purgatory_sha_regions,16*__KEXEC_SHA_REGION_SIZE
-GLOBAL_VARIABLE kernel_entry,8
-GLOBAL_VARIABLE kernel_type,8
-GLOBAL_VARIABLE crash_start,8
-GLOBAL_VARIABLE crash_size,8
-
-	.align	PAGE_SIZE
-stack:
+	START_NEXT_KERNEL .base_dst 1
+SYM_CODE_END(purgatory_start)
+
+SYM_DATA_LOCAL(load_psw_mask, .long 0x00080000,0x80000000)
+	.balign 8
+SYM_DATA_LOCAL(disabled_wait_psw, .quad 0x0002000180000000,.do_checksum_verification)
+SYM_DATA_LOCAL(gprregs, .fill 10,8,0)
+SYM_DATA(purgatory_sha256_digest, .skip 32)
+SYM_DATA(purgatory_sha_regions, .skip 16*__KEXEC_SHA_REGION_SIZE)
+SYM_DATA(kernel_entry, .skip 8)
+SYM_DATA(kernel_type, .skip 8)
+SYM_DATA(crash_start, .skip 8)
+SYM_DATA(crash_size, .skip 8)
+	.balign PAGE_SIZE
+SYM_DATA_START_LOCAL(stack)
 	/* The buffer to move this code must be as big as the code. */
 	.skip stack-purgatory_start
-	.align	PAGE_SIZE
-purgatory_end:
+	.balign PAGE_SIZE
+SYM_DATA_END_LABEL(stack, SYM_L_LOCAL, purgatory_end)
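Restated in C for clarity, MEMSWAP exchanges two memory ranges through a fixed-size bounce buffer; the new larl sequences in the patch merely compute the buffer size (purgatory_end - stack) at run time, so the code stays position-independent instead of baking in an assembly-time bufsz constant. A sketch only, not the kernel implementation:

#include <stddef.h>
#include <string.h>

/* Swap len bytes between dst and src using buf (bufsz bytes) as scratch. */
static void memswap(void *dst, void *src, void *buf, size_t bufsz, size_t len)
{
	while (len) {
		size_t n = len > bufsz ? bufsz : len;

		memcpy(buf, dst, n);	/* dst -> buf */
		memcpy(dst, src, n);	/* src -> dst */
		memcpy(src, buf, n);	/* buf -> src */
		dst = (char *)dst + n;
		src = (char *)src + n;
		len -= n;
	}
}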
diff --git a/arch/s390/purgatory/kexec-purgatory.S b/arch/s390/purgatory/kexec-purgatory.S
index 8293753100ae..25f512b1de12 100644
--- a/arch/s390/purgatory/kexec-purgatory.S
+++ b/arch/s390/purgatory/kexec-purgatory.S
@@ -1,14 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
 
 	.section .rodata, "a"
 
-	.align	8
-kexec_purgatory:
-	.globl	kexec_purgatory
+	.balign	8
+SYM_DATA_START(kexec_purgatory)
 	.incbin	"arch/s390/purgatory/purgatory.ro"
-.Lkexec_purgatroy_end:
+SYM_DATA_END_LABEL(kexec_purgatory, SYM_L_LOCAL, kexec_purgatory_end)
 
-	.align	8
-kexec_purgatory_size:
-	.globl	kexec_purgatory_size
-	.quad	.Lkexec_purgatroy_end - kexec_purgatory
+	.balign	8
+SYM_DATA(kexec_purgatory_size, .quad kexec_purgatory_end-kexec_purgatory)
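The .incbin blob and its size end up as ordinary linker symbols, which is how the generic kexec_file code picks up the purgatory image from C. Roughly as below; a sketch, since the kernel's own declarations live in its kexec headers:

#include <stddef.h>

extern const char kexec_purgatory[];		/* start of purgatory.ro */
extern const size_t kexec_purgatory_size;	/* the .quad emitted above */

/* typical use: size a buffer from the symbol and copy the image in,
 * e.g. memcpy(buf, kexec_purgatory, kexec_purgatory_size);
 */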
diff --git a/arch/s390/purgatory/purgatory.c b/arch/s390/purgatory/purgatory.c
index 0a423bcf6746..030efda05dbe 100644
--- a/arch/s390/purgatory/purgatory.c
+++ b/arch/s390/purgatory/purgatory.c
@@ -9,7 +9,7 @@
 
 #include <linux/kexec.h>
 #include <linux/string.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
 #include <asm/purgatory.h>
 
 int verify_sha256_digest(void)
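verify_sha256_digest(), whose include line moves to <crypto/sha2.h> here, hashes the regions the kexec loader recorded and compares the result with the digest stored at load time. In outline, lightly simplified from purgatory.c:

int verify_sha256_digest(void)
{
	struct kexec_sha_region *ptr, *end;
	u8 digest[SHA256_DIGEST_SIZE];
	struct sha256_state sctx;

	sha256_init(&sctx);
	end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions);
	for (ptr = purgatory_sha_regions; ptr < end; ptr++)
		sha256_update(&sctx, (u8 *)ptr->start, ptr->len);
	sha256_final(&sctx, digest);

	/* non-zero means the loaded kernel image was corrupted */
	return memcmp(digest, purgatory_sha256_digest, sizeof(digest)) ? 1 : 0;
}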
diff --git a/arch/s390/purgatory/string.c b/arch/s390/purgatory/string.c
new file mode 100644
index 000000000000..c98c22a72db7
--- /dev/null
+++ b/arch/s390/purgatory/string.c
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: GPL-2.0
+#define __HAVE_ARCH_MEMCMP	/* arch function */
+#include "../lib/string.c"
diff --git a/arch/s390/scripts/Makefile.chkbss b/arch/s390/scripts/Makefile.chkbss
deleted file mode 100644
index f4f4c2c6dee9..000000000000
--- a/arch/s390/scripts/Makefile.chkbss
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-chkbss-target ?= built-in.a
-$(obj)/$(chkbss-target): chkbss
-
-chkbss-files := $(addsuffix .chkbss, $(chkbss))
-clean-files += $(chkbss-files)
-
-PHONY += chkbss
-chkbss: $(addprefix $(obj)/, $(chkbss-files))
-
-quiet_cmd_chkbss = CHKBSS  $<
-      cmd_chkbss = \
-	if ! $(OBJSIZE) --common $< | $(AWK) 'END { if ($$3) exit 1 }'; then \
-		echo "error: $< .bss section is not empty" >&2; exit 1; \
-	fi; \
-	touch $@;
-
-$(obj)/%.o.chkbss: $(obj)/%.o
-	$(call cmd,chkbss)
diff --git a/arch/s390/tools/.gitignore b/arch/s390/tools/.gitignore
index 71bd6f8eebaf..ea62f37b79ef 100644
--- a/arch/s390/tools/.gitignore
+++ b/arch/s390/tools/.gitignore
@@ -1,2 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
 gen_facilities
 gen_opcode_table
diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile
index b5e35e8f999a..f9dd47ff9ac4 100644
--- a/arch/s390/tools/Makefile
+++ b/arch/s390/tools/Makefile
@@ -10,8 +10,8 @@ PHONY += kapi
 
 kapi: $(kapi-hdrs-y)
 
-hostprogs-y	+= gen_facilities
-hostprogs-y	+= gen_opcode_table
+hostprogs	+= gen_facilities
+hostprogs	+= gen_opcode_table
 
 HOSTCFLAGS_gen_facilities.o += $(LINUXINCLUDE)
diff --git a/arch/s390/tools/gcc-thunk-extern.sh b/arch/s390/tools/gcc-thunk-extern.sh
new file mode 100755
index 000000000000..20bcbf6dd7ab
--- /dev/null
+++ b/arch/s390/tools/gcc-thunk-extern.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Borrowed from gcc: gcc/testsuite/gcc.target/s390/nobp-section-type-conflict.c
+# Checks that we don't get error: section type conflict with ‘put_page’.
+
+cat << "END" | $@ -x c - -fno-PIE -march=z10 -mindirect-branch=thunk-extern -mfunction-return=thunk-extern -mindirect-branch-table -O2 -c -o /dev/null
+int a;
+int b (void);
+void c (int);
+
+static void
+put_page (void)
+{
+  if (b ())
+    c (a);
+}
+
+__attribute__ ((__section__ (".init.text"), __cold__)) void
+d (void)
+{
+  put_page ();
+  put_page ();
+}
+END
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index 61ce5b59b828..68580cbea4e6 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -27,24 +27,16 @@ static struct facility_def facility_defs[] = {
 	 */
 	.name = "FACILITIES_ALS",
 	.bits = (int[]){
-#ifdef CONFIG_HAVE_MARCH_Z900_FEATURES
 		0, /* N3 instructions */
 		1, /* z/Arch mode installed */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES
 		18, /* long displacement facility */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
 		21, /* extended-immediate facility */
 		25, /* store clock fast */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
 		27, /* mvcos */
 		32, /* compare and swap and store */
 		33, /* compare and swap and store 2 */
 		34, /* general instructions extension */
 		35, /* execute extensions */
-#endif
 #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
 		45, /* fast-BCR, etc. */
 #endif
@@ -54,6 +46,7 @@ static struct facility_def facility_defs[] = {
 #endif
 #ifdef CONFIG_HAVE_MARCH_Z13_FEATURES
 		53, /* load-and-zero-rightmost-byte, etc. */
+		129, /* vector */
 #endif
 #ifdef CONFIG_HAVE_MARCH_Z14_FEATURES
 		58, /* miscellaneous-instruction-extension 2 */
@@ -115,6 +108,11 @@ static struct facility_def facility_defs[] = {
 		12, /* AP Query Configuration Information */
 		15, /* AP Facilities Test */
 		156, /* etoken facility */
+		165, /* nnpa facility */
+		193, /* bear enhancement facility */
+		194, /* rdp enhancement facility */
+		196, /* processor activity instrumentation facility */
+		197, /* processor activity instrumentation extension 1 */
 		-1 /* END */
 	}
 },
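The numbers in these .bits arrays are positions in the stfle facility-list bit string, counted from the most significant bit of byte 0; testing one at run time is a byte index plus a mask, the same convention the kernel's test_facility() follows. A sketch:

/* Check facility bit nr in a buffer filled by stfl/stfle. */
static inline int facility_bit_set(const unsigned char *faclist, unsigned int nr)
{
	return (faclist[nr / 8] & (0x80 >> (nr % 8))) != 0;
}

/* e.g. facility_bit_set(list, 129) tests the vector facility that the
 * z13 section above now includes in FACILITIES_ALS.
 */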
diff --git a/arch/s390/tools/opcodes.txt b/arch/s390/tools/opcodes.txt
index 46d8ed96cf06..5f008e794898 100644
--- a/arch/s390/tools/opcodes.txt
+++ b/arch/s390/tools/opcodes.txt
@@ -189,6 +189,8 @@ ad stosm SI_URD
 ae sigp RS_RRRD
 af mc SI_URD
 b1 lra RX_RRRD
+b200 lbear S_RD
+b201 stbear S_RD
 b202 stidp S_RD
 b204 sck S_RD
 b205 stck S_RD
@@ -274,6 +276,7 @@ b285 lpctl S_RD
 b286 qsi S_RD
 b287 lsctl S_RD
 b28e qctri S_RD
+b28f qpaci S_RD
 b299 srnm S_RD
 b29c stfpc S_RD
 b29d lfpc S_RD
@@ -523,6 +526,7 @@ b931 clgfr RRE_RR
 b938 sortl RRE_RR
 b939 dfltcc RRF_R0RR2
 b93a kdsa RRE_RR
+b93b nnpa RRE_00
 b93c ppno RRE_RR
 b93e kimd RRE_RR
 b93f klmd RRE_RR
@@ -562,6 +566,7 @@ b987 dlgr RRE_RR
 b988 alcgr RRE_RR
 b989 slbgr RRE_RR
 b98a cspg RRE_RR
+b98b rdp RRF_RURR2
 b98d epsw RRE_RR
 b98e idte RRF_RURR2
 b98f crdte RRF_RURR2
@@ -597,7 +602,7 @@ b9b3 cu42 RRE_RR
 b9bd trtre RRF_U0RR
 b9be srstu RRE_RR
 b9bf trte RRF_U0RR
-b9c0 selhhhr RRF_RURR
+b9c0 selfhr RRF_RURR
 b9c8 ahhhr RRF_R0RR2
 b9c9 shhhr RRF_R0RR2
 b9ca alhhhr RRF_R0RR2
@@ -876,19 +881,32 @@ e63d vstrl VSI_URDV
 e63f vstrlr VRS_RRDV
 e649 vlip VRI_V0UU2
 e650 vcvb VRR_RV0UU
+e651 vclzdp VRR_VV0U2
 e652 vcvbg VRR_RV0UU
+e654 vupkzh VRR_VV0U2
+e655 vcnf VRR_VV0UU2
+e656 vclfnh VRR_VV0UU2
 e658 vcvd VRI_VR0UU
 e659 vsrp VRI_VVUUU2
 e65a vcvdg VRI_VR0UU
 e65b vpsop VRI_VVUUU2
+e65c vupkzl VRR_VV0U2
+e65d vcfn VRR_VV0UU2
+e65e vclfnl VRR_VV0UU2
 e65f vtp VRR_0V
+e670 vpkzr VRI_VVV0UU2
 e671 vap VRI_VVV0UU2
+e672 vsrpr VRI_VVV0UU2
 e673 vsp VRI_VVV0UU2
+e674 vschp VRR_VVV0U0U
+e675 vcrnf VRR_VVV0UU
 e677 vcp VRR_0VV0U
 e678 vmp VRI_VVV0UU2
 e679 vmsp VRI_VVV0UU2
 e67a vdp VRI_VVV0UU2
 e67b vrp VRI_VVV0UU2
+e67c vscshp VRR_VVV
+e67d vcsph VRR_VVV0U0
 e67e vsdp VRI_VVV0UU2
 e700 vleb VRX_VRRDU
 e701 vleh VRX_VRRDU
@@ -1081,6 +1099,7 @@ eb61 stric RSY_RDRU
 eb62 mric RSY_RDRU
 eb6a asi SIY_IRD
 eb6e alsi SIY_IRD
+eb71 lpswey SIY_RD
 eb7a agsi SIY_IRD
 eb7e algsi SIY_IRD
 eb80 icmh RSY_RURD
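Each opcodes.txt line is "<opcode> <mnemonic> <format>" (for example, the new "b200 lbear S_RD"), and gen_opcode_table compiles the file into the lookup tables used by the in-kernel disassembler. A hypothetical minimal reader of the format, not the real generator:

#include <stdio.h>

int main(void)
{
	char opcode[16], mnemonic[16], format[32];

	/* feed arch/s390/tools/opcodes.txt on stdin */
	while (scanf("%15s %15s %31s", opcode, mnemonic, format) == 3)
		printf("%-6s %-8s %s\n", opcode, mnemonic, format);
	return 0;
}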