8 files changed, 192 insertions, 570 deletions
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index 14bbfe50033c..f43f897d3fc0 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -24,6 +24,3 @@ obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o
 lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
 obj-$(CONFIG_EXPOLINE_EXTERN) += expoline.o
-
-obj-$(CONFIG_CRC32_ARCH) += crc32-s390.o
-crc32-s390-y := crc32-glue.o crc32le-vx.o crc32be-vx.o
diff --git a/arch/s390/lib/crc32-glue.c b/arch/s390/lib/crc32-glue.c
deleted file mode 100644
index 124214a27340..000000000000
--- a/arch/s390/lib/crc32-glue.c
+++ /dev/null
@@ -1,92 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * CRC-32 implemented with the z/Architecture Vector Extension Facility.
- *
- * Copyright IBM Corp. 2015
- * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
- */
-#define KMSG_COMPONENT	"crc32-vx"
-#define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt
-
-#include <linux/module.h>
-#include <linux/cpufeature.h>
-#include <linux/crc32.h>
-#include <asm/fpu.h>
-#include "crc32-vx.h"
-
-#define VX_MIN_LEN		64
-#define VX_ALIGNMENT		16L
-#define VX_ALIGN_MASK		(VX_ALIGNMENT - 1)
-
-static DEFINE_STATIC_KEY_FALSE(have_vxrs);
-
-/*
- * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension
- *
- * Creates a function to perform a particular CRC-32 computation. Depending
- * on the message buffer, the hardware-accelerated or software implementation
- * is used.   Note that the message buffer is aligned to improve fetch
- * operations of VECTOR LOAD MULTIPLE instructions.
- */
-#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw)		    \
-	u32 ___fname(u32 crc, const u8 *data, size_t datalen)		    \
-	{								    \
-		unsigned long prealign, aligned, remaining;		    \
-		DECLARE_KERNEL_FPU_ONSTACK16(vxstate);			    \
-									    \
-		if (datalen < VX_MIN_LEN + VX_ALIGN_MASK ||		    \
-		    !static_branch_likely(&have_vxrs))			    \
-			return ___crc32_sw(crc, data, datalen);		    \
-									    \
-		if ((unsigned long)data & VX_ALIGN_MASK) {		    \
-			prealign = VX_ALIGNMENT -			    \
-				  ((unsigned long)data & VX_ALIGN_MASK);    \
-			datalen -= prealign;				    \
-			crc = ___crc32_sw(crc, data, prealign);		    \
-			data = (void *)((unsigned long)data + prealign);    \
-		}							    \
-									    \
-		aligned = datalen & ~VX_ALIGN_MASK;			    \
-		remaining = datalen & VX_ALIGN_MASK;			    \
-									    \
-		kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW);		    \
-		crc = ___crc32_vx(crc, data, aligned);			    \
-		kernel_fpu_end(&vxstate, KERNEL_VXR_LOW);		    \
-									    \
-		if (remaining)						    \
-			crc = ___crc32_sw(crc, data + aligned, remaining);  \
-									    \
-		return crc;						    \
-	}								    \
-	EXPORT_SYMBOL(___fname);
-
-DEFINE_CRC32_VX(crc32_le_arch, crc32_le_vgfm_16, crc32_le_base)
-DEFINE_CRC32_VX(crc32_be_arch, crc32_be_vgfm_16, crc32_be_base)
-DEFINE_CRC32_VX(crc32c_arch, crc32c_le_vgfm_16, crc32c_base)
-
-static int __init crc32_s390_init(void)
-{
-	if (cpu_have_feature(S390_CPU_FEATURE_VXRS))
-		static_branch_enable(&have_vxrs);
-	return 0;
-}
-arch_initcall(crc32_s390_init);
-
-static void __exit crc32_s390_exit(void)
-{
-}
-module_exit(crc32_s390_exit);
-
-u32 crc32_optimizations(void)
-{
-	if (static_key_enabled(&have_vxrs))
-		return CRC32_LE_OPTIMIZATION |
-		       CRC32_BE_OPTIMIZATION |
-		       CRC32C_OPTIMIZATION;
-	return 0;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_AUTHOR("Hendrik Brueckner <brueckner@linux.vnet.ibm.com>");
-MODULE_DESCRIPTION("CRC-32 algorithms using z/Architecture Vector Extension Facility");
-MODULE_LICENSE("GPL");
diff --git a/arch/s390/lib/crc32-vx.h b/arch/s390/lib/crc32-vx.h
deleted file mode 100644
index 652c96e1a822..000000000000
--- a/arch/s390/lib/crc32-vx.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _CRC32_VX_S390_H
-#define _CRC32_VX_S390_H
-
-#include <linux/types.h>
-
-u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
-u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
-u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
-
-#endif /* _CRC32_VX_S390_H */
diff --git a/arch/s390/lib/crc32be-vx.c b/arch/s390/lib/crc32be-vx.c
deleted file mode 100644
index fed7c9c70d05..000000000000
--- a/arch/s390/lib/crc32be-vx.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Hardware-accelerated CRC-32 variants for Linux on z Systems
- *
- * Use the z/Architecture Vector Extension Facility to accelerate the
- * computing of CRC-32 checksums.
- *
- * This CRC-32 implementation algorithm processes the most-significant
- * bit first (BE).
- *
- * Copyright IBM Corp. 2015
- * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
- */
-
-#include <linux/types.h>
-#include <asm/fpu.h>
-#include "crc32-vx.h"
-
-/* Vector register range containing CRC-32 constants */
-#define CONST_R1R2		9
-#define CONST_R3R4		10
-#define CONST_R5		11
-#define CONST_R6		12
-#define CONST_RU_POLY		13
-#define CONST_CRC_POLY		14
-
-/*
- * The CRC-32 constant block contains reduction constants to fold and
- * process particular chunks of the input data stream in parallel.
- *
- * For the CRC-32 variants, the constants are precomputed according to
- * these definitions:
- *
- *	R1 = x4*128+64 mod P(x)
- *	R2 = x4*128    mod P(x)
- *	R3 = x128+64   mod P(x)
- *	R4 = x128      mod P(x)
- *	R5 = x96       mod P(x)
- *	R6 = x64       mod P(x)
- *
- *	Barret reduction constant, u, is defined as floor(x**64 / P(x)).
- *
- *	where P(x) is the polynomial in the normal domain and the P'(x) is the
- *	polynomial in the reversed (bitreflected) domain.
- *
- * Note that the constant definitions below are extended in order to compute
- * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
- * The rightmost doubleword can be 0 to prevent contribution to the result or
- * can be multiplied by 1 to perform an XOR without the need for a separate
- * VECTOR EXCLUSIVE OR instruction.
- *
- * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
- *
- *	P(x)  = 0x04C11DB7
- *	P'(x) = 0xEDB88320
- */
-
-static unsigned long constants_CRC_32_BE[] = {
-	0x08833794c, 0x0e6228b11,	/* R1, R2 */
-	0x0c5b9cd4c, 0x0e8a45605,	/* R3, R4 */
-	0x0f200aa66, 1UL << 32,		/* R5, x32 */
-	0x0490d678d, 1,			/* R6, 1 */
-	0x104d101df, 0,			/* u */
-	0x104C11DB7, 0,			/* P(x) */
-};
-
-/**
- * crc32_be_vgfm_16 - Compute CRC-32 (BE variant) with vector registers
- * @crc: Initial CRC value, typically ~0.
- * @buf: Input buffer pointer, performance might be improved if the
- *	  buffer is on a doubleword boundary.
- * @size: Size of the buffer, must be 64 bytes or greater.
- *
- * Register usage:
- *	V0:	Initial CRC value and intermediate constants and results.
- *	V1..V4:	Data for CRC computation.
- *	V5..V8:	Next data chunks that are fetched from the input buffer.
- *	V9..V14: CRC-32 constants.
- */
-u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
-{
-	/* Load CRC-32 constants */
-	fpu_vlm(CONST_R1R2, CONST_CRC_POLY, &constants_CRC_32_BE);
-	fpu_vzero(0);
-
-	/* Load the initial CRC value into the leftmost word of V0. */
-	fpu_vlvgf(0, crc, 0);
-
-	/* Load a 64-byte data chunk and XOR with CRC */
-	fpu_vlm(1, 4, buf);
-	fpu_vx(1, 0, 1);
-	buf += 64;
-	size -= 64;
-
-	while (size >= 64) {
-		/* Load the next 64-byte data chunk into V5 to V8 */
-		fpu_vlm(5, 8, buf);
-
-		/*
-		 * Perform a GF(2) multiplication of the doublewords in V1 with
-		 * the reduction constants in V0.  The intermediate result is
-		 * then folded (accumulated) with the next data chunk in V5 and
-		 * stored in V1.  Repeat this step for the register contents
-		 * in V2, V3, and V4 respectively.
-		 */
-		fpu_vgfmag(1, CONST_R1R2, 1, 5);
-		fpu_vgfmag(2, CONST_R1R2, 2, 6);
-		fpu_vgfmag(3, CONST_R1R2, 3, 7);
-		fpu_vgfmag(4, CONST_R1R2, 4, 8);
-		buf += 64;
-		size -= 64;
-	}
-
-	/* Fold V1 to V4 into a single 128-bit value in V1 */
-	fpu_vgfmag(1, CONST_R3R4, 1, 2);
-	fpu_vgfmag(1, CONST_R3R4, 1, 3);
-	fpu_vgfmag(1, CONST_R3R4, 1, 4);
-
-	while (size >= 16) {
-		fpu_vl(2, buf);
-		fpu_vgfmag(1, CONST_R3R4, 1, 2);
-		buf += 16;
-		size -= 16;
-	}
-
-	/*
-	 * The R5 constant is used to fold a 128-bit value into an 96-bit value
-	 * that is XORed with the next 96-bit input data chunk.  To use a single
-	 * VGFMG instruction, multiply the rightmost 64-bit with x^32 (1<<32) to
-	 * form an intermediate 96-bit value (with appended zeros) which is then
-	 * XORed with the intermediate reduction result.
-	 */
-	fpu_vgfmg(1, CONST_R5, 1);
-
-	/*
-	 * Further reduce the remaining 96-bit value to a 64-bit value using a
-	 * single VGFMG, the rightmost doubleword is multiplied with 0x1. The
-	 * intermediate result is then XORed with the product of the leftmost
-	 * doubleword with R6.	The result is a 64-bit value and is subject to
-	 * the Barret reduction.
-	 */
-	fpu_vgfmg(1, CONST_R6, 1);
-
-	/*
-	 * The input values to the Barret reduction are the degree-63 polynomial
-	 * in V1 (R(x)), degree-32 generator polynomial, and the reduction
-	 * constant u.	The Barret reduction result is the CRC value of R(x) mod
-	 * P(x).
-	 *
-	 * The Barret reduction algorithm is defined as:
-	 *
-	 *    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
-	 *    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
-	 *    3. C(x)  = R(x) XOR T2(x) mod x^32
-	 *
-	 * Note: To compensate the division by x^32, use the vector unpack
-	 * instruction to move the leftmost word into the leftmost doubleword
-	 * of the vector register.  The rightmost doubleword is multiplied
-	 * with zero to not contribute to the intermediate results.
-	 */
-
-	/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
-	fpu_vupllf(2, 1);
-	fpu_vgfmg(2, CONST_RU_POLY, 2);
-
-	/*
-	 * Compute the GF(2) product of the CRC polynomial in VO with T1(x) in
-	 * V2 and XOR the intermediate result, T2(x),  with the value in V1.
-	 * The final result is in the rightmost word of V2.
-	 */
-	fpu_vupllf(2, 2);
-	fpu_vgfmag(2, CONST_CRC_POLY, 2, 1);
-	return fpu_vlgvf(2, 3);
-}
diff --git a/arch/s390/lib/crc32le-vx.c b/arch/s390/lib/crc32le-vx.c
deleted file mode 100644
index 2f629f394df7..000000000000
--- a/arch/s390/lib/crc32le-vx.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Hardware-accelerated CRC-32 variants for Linux on z Systems
- *
- * Use the z/Architecture Vector Extension Facility to accelerate the
- * computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet
- * and Castagnoli.
- *
- * This CRC-32 implementation algorithm is bitreflected and processes
- * the least-significant bit first (Little-Endian).
- *
- * Copyright IBM Corp. 2015
- * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
- */
-
-#include <linux/types.h>
-#include <asm/fpu.h>
-#include "crc32-vx.h"
-
-/* Vector register range containing CRC-32 constants */
-#define CONST_PERM_LE2BE	9
-#define CONST_R2R1		10
-#define CONST_R4R3		11
-#define CONST_R5		12
-#define CONST_RU_POLY		13
-#define CONST_CRC_POLY		14
-
-/*
- * The CRC-32 constant block contains reduction constants to fold and
- * process particular chunks of the input data stream in parallel.
- *
- * For the CRC-32 variants, the constants are precomputed according to
- * these definitions:
- *
- *	R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
- *	R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
- *	R3 = [(x128+32 mod P'(x) << 32)]'   << 1
- *	R4 = [(x128-32 mod P'(x) << 32)]'   << 1
- *	R5 = [(x64 mod P'(x) << 32)]'	    << 1
- *	R6 = [(x32 mod P'(x) << 32)]'	    << 1
- *
- *	The bitreflected Barret reduction constant, u', is defined as
- *	the bit reversal of floor(x**64 / P(x)).
- *
- *	where P(x) is the polynomial in the normal domain and the P'(x) is the
- *	polynomial in the reversed (bitreflected) domain.
- *
- * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
- *
- *	P(x)  = 0x04C11DB7
- *	P'(x) = 0xEDB88320
- *
- * CRC-32C (Castagnoli) polynomials:
- *
- *	P(x)  = 0x1EDC6F41
- *	P'(x) = 0x82F63B78
- */
-
-static unsigned long constants_CRC_32_LE[] = {
-	0x0f0e0d0c0b0a0908, 0x0706050403020100,	/* BE->LE mask */
-	0x1c6e41596, 0x154442bd4,		/* R2, R1 */
-	0x0ccaa009e, 0x1751997d0,		/* R4, R3 */
-	0x0, 0x163cd6124,			/* R5 */
-	0x0, 0x1f7011641,			/* u' */
-	0x0, 0x1db710641			/* P'(x) << 1 */
-};
-
-static unsigned long constants_CRC_32C_LE[] = {
-	0x0f0e0d0c0b0a0908, 0x0706050403020100,	/* BE->LE mask */
-	0x09e4addf8, 0x740eef02,		/* R2, R1 */
-	0x14cd00bd6, 0xf20c0dfe,		/* R4, R3 */
-	0x0, 0x0dd45aab8,			/* R5 */
-	0x0, 0x0dea713f1,			/* u' */
-	0x0, 0x105ec76f0			/* P'(x) << 1 */
-};
-
-/**
- * crc32_le_vgfm_generic - Compute CRC-32 (LE variant) with vector registers
- * @crc: Initial CRC value, typically ~0.
- * @buf: Input buffer pointer, performance might be improved if the
- *	 buffer is on a doubleword boundary.
- * @size: Size of the buffer, must be 64 bytes or greater.
- * @constants: CRC-32 constant pool base pointer.
- *
- * Register usage:
- *	V0:	  Initial CRC value and intermediate constants and results.
- *	V1..V4:	  Data for CRC computation.
- *	V5..V8:	  Next data chunks that are fetched from the input buffer.
- *	V9:	  Constant for BE->LE conversion and shift operations
- *	V10..V14: CRC-32 constants.
- */
-static u32 crc32_le_vgfm_generic(u32 crc, unsigned char const *buf, size_t size, unsigned long *constants)
-{
-	/* Load CRC-32 constants */
-	fpu_vlm(CONST_PERM_LE2BE, CONST_CRC_POLY, constants);
-
-	/*
-	 * Load the initial CRC value.
-	 *
-	 * The CRC value is loaded into the rightmost word of the
-	 * vector register and is later XORed with the LSB portion
-	 * of the loaded input data.
-	 */
-	fpu_vzero(0);			/* Clear V0 */
-	fpu_vlvgf(0, crc, 3);		/* Load CRC into rightmost word */
-
-	/* Load a 64-byte data chunk and XOR with CRC */
-	fpu_vlm(1, 4, buf);
-	fpu_vperm(1, 1, 1, CONST_PERM_LE2BE);
-	fpu_vperm(2, 2, 2, CONST_PERM_LE2BE);
-	fpu_vperm(3, 3, 3, CONST_PERM_LE2BE);
-	fpu_vperm(4, 4, 4, CONST_PERM_LE2BE);
-
-	fpu_vx(1, 0, 1);		/* V1 ^= CRC */
-	buf += 64;
-	size -= 64;
-
-	while (size >= 64) {
-		fpu_vlm(5, 8, buf);
-		fpu_vperm(5, 5, 5, CONST_PERM_LE2BE);
-		fpu_vperm(6, 6, 6, CONST_PERM_LE2BE);
-		fpu_vperm(7, 7, 7, CONST_PERM_LE2BE);
-		fpu_vperm(8, 8, 8, CONST_PERM_LE2BE);
-		/*
-		 * Perform a GF(2) multiplication of the doublewords in V1 with
-		 * the R1 and R2 reduction constants in V0.  The intermediate
-		 * result is then folded (accumulated) with the next data chunk
-		 * in V5 and stored in V1. Repeat this step for the register
-		 * contents in V2, V3, and V4 respectively.
-		 */
-		fpu_vgfmag(1, CONST_R2R1, 1, 5);
-		fpu_vgfmag(2, CONST_R2R1, 2, 6);
-		fpu_vgfmag(3, CONST_R2R1, 3, 7);
-		fpu_vgfmag(4, CONST_R2R1, 4, 8);
-		buf += 64;
-		size -= 64;
-	}
-
-	/*
-	 * Fold V1 to V4 into a single 128-bit value in V1.  Multiply V1 with R3
-	 * and R4 and accumulating the next 128-bit chunk until a single 128-bit
-	 * value remains.
-	 */
-	fpu_vgfmag(1, CONST_R4R3, 1, 2);
-	fpu_vgfmag(1, CONST_R4R3, 1, 3);
-	fpu_vgfmag(1, CONST_R4R3, 1, 4);
-
-	while (size >= 16) {
-		fpu_vl(2, buf);
-		fpu_vperm(2, 2, 2, CONST_PERM_LE2BE);
-		fpu_vgfmag(1, CONST_R4R3, 1, 2);
-		buf += 16;
-		size -= 16;
-	}
-
-	/*
-	 * Set up a vector register for byte shifts.  The shift value must
-	 * be loaded in bits 1-4 in byte element 7 of a vector register.
-	 * Shift by 8 bytes: 0x40
-	 * Shift by 4 bytes: 0x20
-	 */
-	fpu_vleib(9, 0x40, 7);
-
-	/*
-	 * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
-	 * to move R4 into the rightmost doubleword and set the leftmost
-	 * doubleword to 0x1.
-	 */
-	fpu_vsrlb(0, CONST_R4R3, 9);
-	fpu_vleig(0, 1, 0);
-
-	/*
-	 * Compute GF(2) product of V1 and V0.	The rightmost doubleword
-	 * of V1 is multiplied with R4.  The leftmost doubleword of V1 is
-	 * multiplied by 0x1 and is then XORed with rightmost product.
-	 * Implicitly, the intermediate leftmost product becomes padded
-	 */
-	fpu_vgfmg(1, 0, 1);
-
-	/*
-	 * Now do the final 32-bit fold by multiplying the rightmost word
-	 * in V1 with R5 and XOR the result with the remaining bits in V1.
-	 *
-	 * To achieve this by a single VGFMAG, right shift V1 by a word
-	 * and store the result in V2 which is then accumulated.  Use the
-	 * vector unpack instruction to load the rightmost half of the
-	 * doubleword into the rightmost doubleword element of V1; the other
-	 * half is loaded in the leftmost doubleword.
-	 * The vector register with CONST_R5 contains the R5 constant in the
-	 * rightmost doubleword and the leftmost doubleword is zero to ignore
-	 * the leftmost product of V1.
-	 */
-	fpu_vleib(9, 0x20, 7);		  /* Shift by words */
-	fpu_vsrlb(2, 1, 9);		  /* Store remaining bits in V2 */
-	fpu_vupllf(1, 1);		  /* Split rightmost doubleword */
-	fpu_vgfmag(1, CONST_R5, 1, 2);	  /* V1 = (V1 * R5) XOR V2 */
-
-	/*
-	 * Apply a Barret reduction to compute the final 32-bit CRC value.
-	 *
-	 * The input values to the Barret reduction are the degree-63 polynomial
-	 * in V1 (R(x)), degree-32 generator polynomial, and the reduction
-	 * constant u.	The Barret reduction result is the CRC value of R(x) mod
-	 * P(x).
-	 *
-	 * The Barret reduction algorithm is defined as:
-	 *
-	 *    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
-	 *    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
-	 *    3. C(x)  = R(x) XOR T2(x) mod x^32
-	 *
-	 *  Note: The leftmost doubleword of vector register containing
-	 *  CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
-	 *  is zero and does not contribute to the final result.
-	 */
-
-	/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
-	fpu_vupllf(2, 1);
-	fpu_vgfmg(2, CONST_RU_POLY, 2);
-
-	/*
-	 * Compute the GF(2) product of the CRC polynomial with T1(x) in
-	 * V2 and XOR the intermediate result, T2(x), with the value in V1.
-	 * The final result is stored in word element 2 of V2.
-	 */
-	fpu_vupllf(2, 2);
-	fpu_vgfmag(2, CONST_CRC_POLY, 2, 1);
-
-	return fpu_vlgvf(2, 2);
-}
-
-u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
-{
-	return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32_LE[0]);
-}
-
-u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
-{
-	return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32C_LE[0]);
-}
diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c
index be14c58cb989..c1ea14e3c927 100644
--- a/arch/s390/lib/delay.c
+++ b/arch/s390/lib/delay.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/processor.h>
+#include <linux/export.h>
 #include <linux/delay.h>
 #include <asm/div64.h>
 #include <asm/timex.h>
diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c
index 373fa1f01937..099de76e8b1a 100644
--- a/arch/s390/lib/string.c
+++ b/arch/s390/lib/string.c
@@ -78,50 +78,6 @@ EXPORT_SYMBOL(strnlen);
 #endif
 
 /**
- * strcpy - Copy a %NUL terminated string
- * @dest: Where to copy the string to
- * @src: Where to copy the string from
- *
- * returns a pointer to @dest
- */
-#ifdef __HAVE_ARCH_STRCPY
-char *strcpy(char *dest, const char *src)
-{
-	char *ret = dest;
-
-	asm volatile(
-		"	lghi	0,0\n"
-		"0:	mvst	%[dest],%[src]\n"
-		"	jo	0b\n"
-		: [dest] "+&a" (dest), [src] "+&a" (src)
-		:
-		: "cc", "memory", "0");
-	return ret;
-}
-EXPORT_SYMBOL(strcpy);
-#endif
-
-/**
- * strncpy - Copy a length-limited, %NUL-terminated string
- * @dest: Where to copy the string to
- * @src: Where to copy the string from
- * @n: The maximum number of bytes to copy
- *
- * The result is not %NUL-terminated if the source exceeds
- * @n bytes.
- */
-#ifdef __HAVE_ARCH_STRNCPY
-char *strncpy(char *dest, const char *src, size_t n)
-{
-	size_t len = __strnend(src, n) - src;
-	memset(dest + len, 0, n - len);
-	memcpy(dest, src, len);
-	return dest;
-}
-EXPORT_SYMBOL(strncpy);
-#endif
-
-/**
  * strcat - Append one %NUL-terminated string to another
  * @dest: The string to be appended to
  * @src: The string to append to it
@@ -181,9 +137,6 @@ EXPORT_SYMBOL(strlcat);
  * @n: The maximum numbers of bytes to copy
  *
  * returns a pointer to @dest
- *
- * Note that in contrast to strncpy, strncat ensures the result is
- * terminated.
  */
 #ifdef __HAVE_ARCH_STRNCAT
 char *strncat(char *dest, const char *src, size_t n)
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index cec20db88479..1a6ba105e071 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -8,26 +8,29 @@
  *		 Gerald Schaefer (gerald.schaefer@de.ibm.com)
  */
 
+#include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/export.h>
 #include <linux/mm.h>
 #include <asm/asm-extable.h>
 #include <asm/ctlreg.h>
+#include <asm/skey.h>
 
 #ifdef CONFIG_DEBUG_ENTRY
 void debug_user_asce(int exit)
 {
+	struct lowcore *lc = get_lowcore();	/* looked up once; used by the check and the panic */
 	struct ctlreg cr1, cr7;
 
 	local_ctl_store(1, &cr1);
 	local_ctl_store(7, &cr7);
-	if (cr1.val == get_lowcore()->kernel_asce.val && cr7.val == get_lowcore()->user_asce.val)
+	if (cr1.val == lc->user_asce.val && cr7.val == lc->user_asce.val) /* want user ASCE in both */
 		return;
 	panic("incorrect ASCE on kernel %s\n"
 	      "cr1:    %016lx cr7:  %016lx\n"
 	      "kernel: %016lx user: %016lx\n",
 	      exit ? "exit" : "entry", cr1.val, cr7.val,
-	      get_lowcore()->kernel_asce.val, get_lowcore()->user_asce.val);
+	      lc->kernel_asce.val, lc->user_asce.val);	/* kernel ASCE is printed only; not checked */
 }
 #endif /*CONFIG_DEBUG_ENTRY */
 
@@ -144,3 +147,189 @@ unsigned long _copy_to_user_key(void __user *to, const void *from,
 	return raw_copy_to_user_key(to, from, n, key);
 }
 EXPORT_SYMBOL(_copy_to_user_key);
+
+#define CMPXCHG_USER_KEY_MAX_LOOPS 128	/* retry budget of the sub-word CS loop below */
+
+static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsigned int *uval,
+						    unsigned int old, unsigned int new,
+						    unsigned int mask, unsigned long key)
+{	/* Word-sized CS used by the 1- and 2-byte variants; mask = bits outside the target field */
+	unsigned long count;	/* remaining retries; 0 after the asm means the budget ran out */
+	unsigned int prev;	/* word as last seen at address */
+	bool sacf_flag;
+	int rc = 0;
+
+	skey_regions_initialize();
+	sacf_flag = enable_sacf_uaccess();	/* undone by disable_sacf_uaccess() after the asm */
+	asm_inline volatile(
+		"20:	spka	0(%[key])\n"			/* run with the caller-supplied key */
+		"	sacf	256\n"				/* secondary-space mode */
+		"	llill	%[count],%[max_loops]\n"	/* count = retry budget */
+		"0:	l	%[prev],%[address]\n"		/* prev = current word */
+		"1:	nr	%[prev],%[mask]\n"		/* keep only bits outside the field */
+		"	xilf	%[mask],0xffffffff\n"		/* invert: mask now selects the field */
+		"	or	%[new],%[prev]\n"		/* new = surrounding bits | new field */
+		"	or	%[prev],%[tmp]\n"		/* prev = surrounding bits | old field */
+		"2:	lr	%[tmp],%[prev]\n"		/* remember the value we expect */
+		"3:	cs	%[prev],%[new],%[address]\n"	/* on mismatch prev = current word */
+		"4:	jnl	5f\n"				/* swapped: done */
+		"	xr	%[tmp],%[prev]\n"		/* tmp = bits that differed */
+		"	xr	%[new],%[tmp]\n"		/* fold the changed bits into new */
+		"	nr	%[tmp],%[mask]\n"		/* did the field itself differ? */
+		"	jnz	5f\n"				/* yes: genuine compare failure */
+		"	brct	%[count],2b\n"			/* no: retry while budget remains */
+		"5:	sacf	768\n"				/* back to home-space mode */
+		"	spka	%[default_key]\n"		/* back to the default key */
+		"21:\n"						/* end of the range given to SKEY_REGION */
+		EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev])	/* presumably: fault -> 5b, rc set */
+		EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev])
+		EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev])
+		EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev])
+		SKEY_REGION(20b, 21b)
+		: [rc] "+&d" (rc),
+		[prev] "=&d" (prev),
+		[address] "+Q" (*(int *)address),
+		[tmp] "+&d" (old),	/* enters holding the old field, then reused as scratch */
+		[new] "+&d" (new),
+		[mask] "+&d" (mask),
+		[count] "=a" (count)
+		: [key] "%[count]" (key << 4),	/* NOTE(review): looks tied to count's register - confirm */
+		[default_key] "J" (PAGE_DEFAULT_KEY),
+		[max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS)
+		: "memory", "cc");
+	disable_sacf_uaccess(sacf_flag);
+	*uval = prev;	/* whole word; callers shift out their field */
+	if (!count)	/* retry budget exhausted */
+		rc = -EAGAIN;
+	return rc;
+}
+
+int __kprobes __cmpxchg_user_key1(unsigned long address, unsigned char *uval,
+				  unsigned char old, unsigned char new, unsigned long key)
+{	/* 1-byte cmpxchg at address, done on the aligned word containing the byte */
+	unsigned int prev, shift, mask, _old, _new;
+	int rc;
+
+	shift = (3 ^ (address & 3)) << 3;	/* bit offset of the byte; offset 0 is the top byte */
+	address ^= address & 3;			/* round down to the word boundary */
+	_old = (unsigned int)old << shift;
+	_new = (unsigned int)new << shift;
+	mask = ~(0xffU << shift);		/* other three bytes; unsigned: shift may be 24 */
+	rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key);
+	*uval = prev >> shift;			/* assignment truncates to the target byte */
+	return rc;				/* passed through from the word-sized helper */
+}
+EXPORT_SYMBOL(__cmpxchg_user_key1);
+
+int __kprobes __cmpxchg_user_key2(unsigned long address, unsigned short *uval,
+				  unsigned short old, unsigned short new, unsigned long key)
+{	/* 2-byte cmpxchg at address, done on the aligned word containing the halfword */
+	unsigned int prev, shift, mask, _old, _new;
+	int rc;
+
+	shift = (2 ^ (address & 2)) << 3;	/* 16 for the first halfword, 0 for the second */
+	address ^= address & 2;			/* clears bit 1 only; assumes bit 0 is 0 - TODO confirm */
+	_old = (unsigned int)old << shift;
+	_new = (unsigned int)new << shift;
+	mask = ~(0xffffU << shift);		/* other halfword; unsigned: shift may be 16 */
+	rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key);
+	*uval = prev >> shift;			/* assignment truncates to the target halfword */
+	return rc;				/* passed through from the word-sized helper */
+}
+EXPORT_SYMBOL(__cmpxchg_user_key2);
+
+int __kprobes __cmpxchg_user_key4(unsigned long address, unsigned int *uval,
+				  unsigned int old, unsigned int new, unsigned long key)
+{	/* 4-byte compare-and-swap at address under 'key'; observed value returned via *uval */
+	unsigned int prev = old;	/* CS compares with this and overwrites it on mismatch */
+	bool sacf_flag;
+	int rc = 0;
+
+	skey_regions_initialize();
+	sacf_flag = enable_sacf_uaccess();	/* undone by disable_sacf_uaccess() after the asm */
+	asm_inline volatile(
+		"20:	spka	0(%[key])\n"			/* run with the caller-supplied key */
+		"	sacf	256\n"				/* secondary-space mode */
+		"0:	cs	%[prev],%[new],%[address]\n"	/* the actual 32-bit compare-and-swap */
+		"1:	sacf	768\n"				/* back to home-space mode */
+		"	spka	%[default_key]\n"		/* back to the default key */
+		"21:\n"						/* end of the range given to SKEY_REGION */
+		EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev])	/* presumably: fault -> 1b, rc set */
+		EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev])
+		SKEY_REGION(20b, 21b)
+		: [rc] "+&d" (rc),
+		[prev] "+&d" (prev),
+		[address] "+Q" (*(int *)address),
+		: [new] "d" (new),
+		[key] "a" (key << 4),	/* spka takes the key from this address operand */
+		[default_key] "J" (PAGE_DEFAULT_KEY)
+		: "memory", "cc");
+	disable_sacf_uaccess(sacf_flag);
+	*uval = prev;	/* fault-free case: old if swapped, else the value found in memory */
+	return rc;	/* 0 unless changed by an exception-table fixup */
+}
+EXPORT_SYMBOL(__cmpxchg_user_key4);
+
+int __kprobes __cmpxchg_user_key8(unsigned long address, unsigned long *uval,
+				  unsigned long old, unsigned long new, unsigned long key)
+{	/* 8-byte compare-and-swap at address under 'key'; observed value returned via *uval */
+	unsigned long prev = old;	/* CSG compares with this and overwrites it on mismatch */
+	bool sacf_flag;
+	int rc = 0;
+
+	skey_regions_initialize();
+	sacf_flag = enable_sacf_uaccess();	/* undone by disable_sacf_uaccess() after the asm */
+	asm_inline volatile(
+		"20:	spka	0(%[key])\n"			/* run with the caller-supplied key */
+		"	sacf	256\n"				/* secondary-space mode */
+		"0:	csg	%[prev],%[new],%[address]\n"	/* the actual 64-bit compare-and-swap */
+		"1:	sacf	768\n"				/* back to home-space mode */
+		"	spka	%[default_key]\n"		/* back to the default key */
+		"21:\n"						/* end of the range given to SKEY_REGION */
+		EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev])	/* presumably: fault -> 1b, rc set */
+		EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev])
+		SKEY_REGION(20b, 21b)
+		: [rc] "+&d" (rc),
+		[prev] "+&d" (prev),
+		[address] "+QS" (*(long *)address)
+		: [new] "d" (new),
+		[key] "a" (key << 4),	/* spka takes the key from this address operand */
+		[default_key] "J" (PAGE_DEFAULT_KEY)
+		: "memory", "cc");
+	disable_sacf_uaccess(sacf_flag);
+	*uval = prev;	/* fault-free case: old if swapped, else the value found in memory */
+	return rc;	/* 0 unless changed by an exception-table fixup */
+}
+EXPORT_SYMBOL(__cmpxchg_user_key8);
+
+int __kprobes __cmpxchg_user_key16(unsigned long address, __uint128_t *uval,
+				   __uint128_t old, __uint128_t new, unsigned long key)
+{	/* 16-byte compare-and-swap at address under 'key'; observed value returned via *uval */
+	__uint128_t prev = old;	/* CDSG compares with this and overwrites it on mismatch */
+	bool sacf_flag;
+	int rc = 0;
+
+	skey_regions_initialize();
+	sacf_flag = enable_sacf_uaccess();	/* undone by disable_sacf_uaccess() after the asm */
+	asm_inline volatile(
+		"20:	spka	0(%[key])\n"			/* run with the caller-supplied key */
+		"	sacf	256\n"				/* secondary-space mode */
+		"0:	cdsg	%[prev],%[new],%[address]\n"	/* the actual 128-bit compare-and-swap */
+		"1:	sacf	768\n"				/* back to home-space mode */
+		"	spka	%[default_key]\n"		/* back to the default key */
+		"21:\n"						/* end of the range given to SKEY_REGION */
+		EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev])	/* REGPAIR: prev is 128 bits wide */
+		EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev])
+		SKEY_REGION(20b, 21b)
+		: [rc] "+&d" (rc),
+		[prev] "+&d" (prev),
+		[address] "+QS" (*(__int128_t *)address)
+		: [new] "d" (new),
+		[key] "a" (key << 4),	/* spka takes the key from this address operand */
+		[default_key] "J" (PAGE_DEFAULT_KEY)
+		: "memory", "cc");
+	disable_sacf_uaccess(sacf_flag);
+	*uval = prev;	/* fault-free case: old if swapped, else the value found in memory */
+	return rc;	/* 0 unless changed by an exception-table fixup */
+}
+EXPORT_SYMBOL(__cmpxchg_user_key16);