From d36cebe03c3ae4ea1fde20cfc797fab8729c3ab5 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 1 Dec 2024 17:08:27 -0800
Subject: lib/crc32: improve support for arch-specific overrides

Currently the CRC32 library functions are defined as weak symbols, and
the arm64 and riscv architectures override them.

This method of arch-specific overrides has the limitation that it only
works when both the base and arch code is built-in.  Also, it makes the
arch-specific code be silently not used if it is accidentally built with
lib-y instead of obj-y; unfortunately the RISC-V code does this.

This commit reorganizes the code to have explicit *_arch() functions
that are called when they are enabled, similar to how some of the crypto
library code works (e.g. chacha_crypt() calls chacha_crypt_arch()).

Make the existing kconfig choice for the CRC32 implementation also
control whether the arch-optimized implementation (if one is available)
is enabled or not.  Make it enabled by default if CRC32 is also enabled.

The result is that arch-optimized CRC32 library functions will be
included automatically when appropriate, but it is now possible to
disable them.  They can also now be built as a loadable module if the
CRC32 library functions happen to be used only by loadable modules, in
which case the arch and base CRC32 modules will be automatically loaded
via direct symbol dependency when appropriate.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20241202010844.144356-3-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/arm64/Kconfig           |   1 +
 arch/arm64/lib/Makefile      |   3 +-
 arch/arm64/lib/crc32-glue.c  |  13 +-
 arch/riscv/Kconfig           |   1 +
 arch/riscv/lib/Makefile      |   3 +-
 arch/riscv/lib/crc32-riscv.c | 301 +++++++++++++++++++++++++++++++++++++++++++
 arch/riscv/lib/crc32.c       | 294 ------------------------------------------
 crypto/crc32_generic.c       |   4 +-
 crypto/crc32c_generic.c      |   4 +-
 include/linux/crc32.h        |  35 ++++-
 lib/Kconfig                  |  70 +++++++---
 lib/crc32.c                  |  22 +---
 12 files changed, 409 insertions(+), 342 deletions(-)
 create mode 100644 arch/riscv/lib/crc32-riscv.c
 delete mode 100644 arch/riscv/lib/crc32.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 100570a048c5..71f6310c8240 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -21,6 +21,7 @@ config ARM64
 	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
 	select ARCH_HAS_CACHE_LINE_SIZE
 	select ARCH_HAS_CC_PLATFORM
+	select ARCH_HAS_CRC32
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DEBUG_VIRTUAL
 	select ARCH_HAS_DEBUG_VM_PGTABLE
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 8e882f479d98..5fbcf0d56665 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -13,7 +13,8 @@ endif
 
 lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
 
-obj-$(CONFIG_CRC32) += crc32.o crc32-glue.o
+obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o
+crc32-arm64-y := crc32.o crc32-glue.o
 
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32-glue.c
index ad015223d15d..d7f6e1cbf0d2 100644
--- a/arch/arm64/lib/crc32-glue.c
+++ b/arch/arm64/lib/crc32-glue.c
@@ -2,6 +2,7 @@
 
 #include <linux/crc32.h>
 #include <linux/linkage.h>
+#include <linux/module.h>
 
 #include <asm/alternative.h>
 #include <asm/cpufeature.h>
@@ -21,7 +22,7 @@ asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
 asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
 asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len);
 
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len)
 {
 	if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
 		return crc32_le_base(crc, p, len);
@@ -40,8 +41,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
 
 	return crc32_le_arm64(crc, p, len);
 }
+EXPORT_SYMBOL(crc32_le_arch);
 
-u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len)
 {
 	if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
 		return crc32c_le_base(crc, p, len);
@@ -60,8 +62,9 @@ u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
 
 	return crc32c_le_arm64(crc, p, len);
 }
+EXPORT_SYMBOL(crc32c_le_arch);
 
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len)
 {
 	if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
 		return crc32_be_base(crc, p, len);
@@ -80,3 +83,7 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
 
 	return crc32_be_arm64(crc, p, len);
 }
+EXPORT_SYMBOL(crc32_be_arch);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("arm64-optimized CRC32 functions");
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index d4a7ca0388c0..7d5718667e39 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -24,6 +24,7 @@ config RISCV
 	select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
 	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
 	select ARCH_HAS_BINFMT_FLAT
+	select ARCH_HAS_CRC32 if RISCV_ISA_ZBC
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DEBUG_VIRTUAL if MMU
 	select ARCH_HAS_DEBUG_VM_PGTABLE
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 8eec6b69a875..79368a895fee 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -15,8 +15,7 @@ endif
 lib-$(CONFIG_MMU)	+= uaccess.o
 lib-$(CONFIG_64BIT)	+= tishift.o
 lib-$(CONFIG_RISCV_ISA_ZICBOZ)	+= clear_page.o
-lib-$(CONFIG_RISCV_ISA_ZBC)	+= crc32.o
-
+obj-$(CONFIG_CRC32_ARCH)	+= crc32-riscv.o
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 lib-$(CONFIG_RISCV_ISA_V)	+= xor.o
 lib-$(CONFIG_RISCV_ISA_V)	+= riscv_v_helpers.o
diff --git a/arch/riscv/lib/crc32-riscv.c b/arch/riscv/lib/crc32-riscv.c
new file mode 100644
index 000000000000..a3ff7db2a1ce
--- /dev/null
+++ b/arch/riscv/lib/crc32-riscv.c
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC32 implementation with Zbc extension.
+ *
+ * Copyright (C) 2024 Intel Corporation
+ */
+
+#include <asm/hwcap.h>
+#include <asm/alternative-macros.h>
+#include <asm/byteorder.h>
+
+#include <linux/types.h>
+#include <linux/minmax.h>
+#include <linux/crc32poly.h>
+#include <linux/crc32.h>
+#include <linux/byteorder/generic.h>
+#include <linux/module.h>
+
+/*
+ * Refer to https://www.corsix.org/content/barrett-reduction-polynomials for
+ * better understanding of how this math works.
+ *
+ * let "+" denotes polynomial add (XOR)
+ * let "-" denotes polynomial sub (XOR)
+ * let "*" denotes polynomial multiplication
+ * let "/" denotes polynomial floor division
+ * let "S" denotes source data, XLEN bit wide
+ * let "P" denotes CRC32 polynomial
+ * let "T" denotes 2^(XLEN+32)
+ * let "QT" denotes quotient of T/P, with the bit for 2^XLEN being implicit
+ *
+ * crc32(S, P)
+ * => S * (2^32) - S * (2^32) / P * P
+ * => lowest 32 bits of: S * (2^32) / P * P
+ * => lowest 32 bits of: S * (2^32) * (T / P) / T * P
+ * => lowest 32 bits of: S * (2^32) * quotient / T * P
+ * => lowest 32 bits of: S * quotient / 2^XLEN * P
+ * => lowest 32 bits of: (clmul_high_part(S, QT) + S) * P
+ * => clmul_low_part(clmul_high_part(S, QT) + S, P)
+ *
+ * In terms of below implementations, the BE case is more intuitive, since the
+ * higher order bit sits at more significant position.
+ */
+
+#if __riscv_xlen == 64
+/* Slide by XLEN bits per iteration */
+# define STEP_ORDER 3
+
+/* Each below polynomial quotient has an implicit bit for 2^XLEN */
+
+/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in LE format */
+# define CRC32_POLY_QT_LE	0x5a72d812fb808b20
+
+/* Polynomial quotient of (2^(XLEN+32))/CRC32C_POLY, in LE format */
+# define CRC32C_POLY_QT_LE	0xa434f61c6f5389f8
+
+/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in BE format, it should be
+ * the same as the bit-reversed version of CRC32_POLY_QT_LE
+ */
+# define CRC32_POLY_QT_BE	0x04d101df481b4e5a
+
+static inline u64 crc32_le_prep(u32 crc, unsigned long const *ptr)
+{
+	return (u64)crc ^ (__force u64)__cpu_to_le64(*ptr);
+}
+
+static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt)
+{
+	u32 crc;
+
+	/* We don't have a "clmulrh" insn, so use clmul + slli instead. */
+	asm volatile (".option push\n"
+		      ".option arch,+zbc\n"
+		      "clmul	%0, %1, %2\n"
+		      "slli	%0, %0, 1\n"
+		      "xor	%0, %0, %1\n"
+		      "clmulr	%0, %0, %3\n"
+		      "srli	%0, %0, 32\n"
+		      ".option pop\n"
+		      : "=&r" (crc)
+		      : "r" (s),
+			"r" (poly_qt),
+			"r" ((u64)poly << 32)
+		      :);
+	return crc;
+}
+
+static inline u64 crc32_be_prep(u32 crc, unsigned long const *ptr)
+{
+	return ((u64)crc << 32) ^ (__force u64)__cpu_to_be64(*ptr);
+}
+
+#elif __riscv_xlen == 32
+# define STEP_ORDER 2
+/* Each quotient should match the upper half of its analog in RV64 */
+# define CRC32_POLY_QT_LE	0xfb808b20
+# define CRC32C_POLY_QT_LE	0x6f5389f8
+# define CRC32_POLY_QT_BE	0x04d101df
+
+static inline u32 crc32_le_prep(u32 crc, unsigned long const *ptr)
+{
+	return crc ^ (__force u32)__cpu_to_le32(*ptr);
+}
+
+static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt)
+{
+	u32 crc;
+
+	/* We don't have a "clmulrh" insn, so use clmul + slli instead. */
+	asm volatile (".option push\n"
+		      ".option arch,+zbc\n"
+		      "clmul	%0, %1, %2\n"
+		      "slli	%0, %0, 1\n"
+		      "xor	%0, %0, %1\n"
+		      "clmulr	%0, %0, %3\n"
+		      ".option pop\n"
+		      : "=&r" (crc)
+		      : "r" (s),
+			"r" (poly_qt),
+			"r" (poly)
+		      :);
+	return crc;
+}
+
+static inline u32 crc32_be_prep(u32 crc, unsigned long const *ptr)
+{
+	return crc ^ (__force u32)__cpu_to_be32(*ptr);
+}
+
+#else
+# error "Unexpected __riscv_xlen"
+#endif
+
+static inline u32 crc32_be_zbc(unsigned long s)
+{
+	u32 crc;
+
+	asm volatile (".option push\n"
+		      ".option arch,+zbc\n"
+		      "clmulh	%0, %1, %2\n"
+		      "xor	%0, %0, %1\n"
+		      "clmul	%0, %0, %3\n"
+		      ".option pop\n"
+		      : "=&r" (crc)
+		      : "r" (s),
+			"r" (CRC32_POLY_QT_BE),
+			"r" (CRC32_POLY_BE)
+		      :);
+	return crc;
+}
+
+#define STEP		(1 << STEP_ORDER)
+#define OFFSET_MASK	(STEP - 1)
+
+typedef u32 (*fallback)(u32 crc, unsigned char const *p, size_t len);
+
+static inline u32 crc32_le_unaligned(u32 crc, unsigned char const *p,
+				     size_t len, u32 poly,
+				     unsigned long poly_qt)
+{
+	size_t bits = len * 8;
+	unsigned long s = 0;
+	u32 crc_low = 0;
+
+	for (int i = 0; i < len; i++)
+		s = ((unsigned long)*p++ << (__riscv_xlen - 8)) | (s >> 8);
+
+	s ^= (unsigned long)crc << (__riscv_xlen - bits);
+	if (__riscv_xlen == 32 || len < sizeof(u32))
+		crc_low = crc >> bits;
+
+	crc = crc32_le_zbc(s, poly, poly_qt);
+	crc ^= crc_low;
+
+	return crc;
+}
+
+static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
+					  size_t len, u32 poly,
+					  unsigned long poly_qt,
+					  fallback crc_fb)
+{
+	size_t offset, head_len, tail_len;
+	unsigned long const *p_ul;
+	unsigned long s;
+
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+			     RISCV_ISA_EXT_ZBC, 1)
+		 : : : : legacy);
+
+	/* Handle the unaligned head. */
+	offset = (unsigned long)p & OFFSET_MASK;
+	if (offset && len) {
+		head_len = min(STEP - offset, len);
+		crc = crc32_le_unaligned(crc, p, head_len, poly, poly_qt);
+		p += head_len;
+		len -= head_len;
+	}
+
+	tail_len = len & OFFSET_MASK;
+	len = len >> STEP_ORDER;
+	p_ul = (unsigned long const *)p;
+
+	for (int i = 0; i < len; i++) {
+		s = crc32_le_prep(crc, p_ul);
+		crc = crc32_le_zbc(s, poly, poly_qt);
+		p_ul++;
+	}
+
+	/* Handle the tail bytes. */
+	p = (unsigned char const *)p_ul;
+	if (tail_len)
+		crc = crc32_le_unaligned(crc, p, tail_len, poly, poly_qt);
+
+	return crc;
+
+legacy:
+	return crc_fb(crc, p, len);
+}
+
+u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+	return crc32_le_generic(crc, p, len, CRC32_POLY_LE, CRC32_POLY_QT_LE,
+				crc32_le_base);
+}
+EXPORT_SYMBOL(crc32_le_arch);
+
+u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len)
+{
+	return crc32_le_generic(crc, p, len, CRC32C_POLY_LE,
+				CRC32C_POLY_QT_LE, crc32c_le_base);
+}
+EXPORT_SYMBOL(crc32c_le_arch);
+
+static inline u32 crc32_be_unaligned(u32 crc, unsigned char const *p,
+				     size_t len)
+{
+	size_t bits = len * 8;
+	unsigned long s = 0;
+	u32 crc_low = 0;
+
+	s = 0;
+	for (int i = 0; i < len; i++)
+		s = *p++ | (s << 8);
+
+	if (__riscv_xlen == 32 || len < sizeof(u32)) {
+		s ^= crc >> (32 - bits);
+		crc_low = crc << bits;
+	} else {
+		s ^= (unsigned long)crc << (bits - 32);
+	}
+
+	crc = crc32_be_zbc(s);
+	crc ^= crc_low;
+
+	return crc;
+}
+
+u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len)
+{
+	size_t offset, head_len, tail_len;
+	unsigned long const *p_ul;
+	unsigned long s;
+
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+			     RISCV_ISA_EXT_ZBC, 1)
+		 : : : : legacy);
+
+	/* Handle the unaligned head. */
+	offset = (unsigned long)p & OFFSET_MASK;
+	if (offset && len) {
+		head_len = min(STEP - offset, len);
+		crc = crc32_be_unaligned(crc, p, head_len);
+		p += head_len;
+		len -= head_len;
+	}
+
+	tail_len = len & OFFSET_MASK;
+	len = len >> STEP_ORDER;
+	p_ul = (unsigned long const *)p;
+
+	for (int i = 0; i < len; i++) {
+		s = crc32_be_prep(crc, p_ul);
+		crc = crc32_be_zbc(s);
+		p_ul++;
+	}
+
+	/* Handle the tail bytes. */
+	p = (unsigned char const *)p_ul;
+	if (tail_len)
+		crc = crc32_be_unaligned(crc, p, tail_len);
+
+	return crc;
+
+legacy:
+	return crc32_be_base(crc, p, len);
+}
+EXPORT_SYMBOL(crc32_be_arch);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Accelerated CRC32 implementation with Zbc extension");
diff --git a/arch/riscv/lib/crc32.c b/arch/riscv/lib/crc32.c
deleted file mode 100644
index 333fb7af1192..000000000000
--- a/arch/riscv/lib/crc32.c
+++ /dev/null
@@ -1,294 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated CRC32 implementation with Zbc extension.
- *
- * Copyright (C) 2024 Intel Corporation
- */
-
-#include <asm/hwcap.h>
-#include <asm/alternative-macros.h>
-#include <asm/byteorder.h>
-
-#include <linux/types.h>
-#include <linux/minmax.h>
-#include <linux/crc32poly.h>
-#include <linux/crc32.h>
-#include <linux/byteorder/generic.h>
-
-/*
- * Refer to https://www.corsix.org/content/barrett-reduction-polynomials for
- * better understanding of how this math works.
- *
- * let "+" denotes polynomial add (XOR)
- * let "-" denotes polynomial sub (XOR)
- * let "*" denotes polynomial multiplication
- * let "/" denotes polynomial floor division
- * let "S" denotes source data, XLEN bit wide
- * let "P" denotes CRC32 polynomial
- * let "T" denotes 2^(XLEN+32)
- * let "QT" denotes quotient of T/P, with the bit for 2^XLEN being implicit
- *
- * crc32(S, P)
- * => S * (2^32) - S * (2^32) / P * P
- * => lowest 32 bits of: S * (2^32) / P * P
- * => lowest 32 bits of: S * (2^32) * (T / P) / T * P
- * => lowest 32 bits of: S * (2^32) * quotient / T * P
- * => lowest 32 bits of: S * quotient / 2^XLEN * P
- * => lowest 32 bits of: (clmul_high_part(S, QT) + S) * P
- * => clmul_low_part(clmul_high_part(S, QT) + S, P)
- *
- * In terms of below implementations, the BE case is more intuitive, since the
- * higher order bit sits at more significant position.
- */
-
-#if __riscv_xlen == 64
-/* Slide by XLEN bits per iteration */
-# define STEP_ORDER 3
-
-/* Each below polynomial quotient has an implicit bit for 2^XLEN */
-
-/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in LE format */
-# define CRC32_POLY_QT_LE	0x5a72d812fb808b20
-
-/* Polynomial quotient of (2^(XLEN+32))/CRC32C_POLY, in LE format */
-# define CRC32C_POLY_QT_LE	0xa434f61c6f5389f8
-
-/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in BE format, it should be
- * the same as the bit-reversed version of CRC32_POLY_QT_LE
- */
-# define CRC32_POLY_QT_BE	0x04d101df481b4e5a
-
-static inline u64 crc32_le_prep(u32 crc, unsigned long const *ptr)
-{
-	return (u64)crc ^ (__force u64)__cpu_to_le64(*ptr);
-}
-
-static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt)
-{
-	u32 crc;
-
-	/* We don't have a "clmulrh" insn, so use clmul + slli instead. */
-	asm volatile (".option push\n"
-		      ".option arch,+zbc\n"
-		      "clmul	%0, %1, %2\n"
-		      "slli	%0, %0, 1\n"
-		      "xor	%0, %0, %1\n"
-		      "clmulr	%0, %0, %3\n"
-		      "srli	%0, %0, 32\n"
-		      ".option pop\n"
-		      : "=&r" (crc)
-		      : "r" (s),
-			"r" (poly_qt),
-			"r" ((u64)poly << 32)
-		      :);
-	return crc;
-}
-
-static inline u64 crc32_be_prep(u32 crc, unsigned long const *ptr)
-{
-	return ((u64)crc << 32) ^ (__force u64)__cpu_to_be64(*ptr);
-}
-
-#elif __riscv_xlen == 32
-# define STEP_ORDER 2
-/* Each quotient should match the upper half of its analog in RV64 */
-# define CRC32_POLY_QT_LE	0xfb808b20
-# define CRC32C_POLY_QT_LE	0x6f5389f8
-# define CRC32_POLY_QT_BE	0x04d101df
-
-static inline u32 crc32_le_prep(u32 crc, unsigned long const *ptr)
-{
-	return crc ^ (__force u32)__cpu_to_le32(*ptr);
-}
-
-static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt)
-{
-	u32 crc;
-
-	/* We don't have a "clmulrh" insn, so use clmul + slli instead. */
-	asm volatile (".option push\n"
-		      ".option arch,+zbc\n"
-		      "clmul	%0, %1, %2\n"
-		      "slli	%0, %0, 1\n"
-		      "xor	%0, %0, %1\n"
-		      "clmulr	%0, %0, %3\n"
-		      ".option pop\n"
-		      : "=&r" (crc)
-		      : "r" (s),
-			"r" (poly_qt),
-			"r" (poly)
-		      :);
-	return crc;
-}
-
-static inline u32 crc32_be_prep(u32 crc, unsigned long const *ptr)
-{
-	return crc ^ (__force u32)__cpu_to_be32(*ptr);
-}
-
-#else
-# error "Unexpected __riscv_xlen"
-#endif
-
-static inline u32 crc32_be_zbc(unsigned long s)
-{
-	u32 crc;
-
-	asm volatile (".option push\n"
-		      ".option arch,+zbc\n"
-		      "clmulh	%0, %1, %2\n"
-		      "xor	%0, %0, %1\n"
-		      "clmul	%0, %0, %3\n"
-		      ".option pop\n"
-		      : "=&r" (crc)
-		      : "r" (s),
-			"r" (CRC32_POLY_QT_BE),
-			"r" (CRC32_POLY_BE)
-		      :);
-	return crc;
-}
-
-#define STEP		(1 << STEP_ORDER)
-#define OFFSET_MASK	(STEP - 1)
-
-typedef u32 (*fallback)(u32 crc, unsigned char const *p, size_t len);
-
-static inline u32 crc32_le_unaligned(u32 crc, unsigned char const *p,
-				     size_t len, u32 poly,
-				     unsigned long poly_qt)
-{
-	size_t bits = len * 8;
-	unsigned long s = 0;
-	u32 crc_low = 0;
-
-	for (int i = 0; i < len; i++)
-		s = ((unsigned long)*p++ << (__riscv_xlen - 8)) | (s >> 8);
-
-	s ^= (unsigned long)crc << (__riscv_xlen - bits);
-	if (__riscv_xlen == 32 || len < sizeof(u32))
-		crc_low = crc >> bits;
-
-	crc = crc32_le_zbc(s, poly, poly_qt);
-	crc ^= crc_low;
-
-	return crc;
-}
-
-static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
-					  size_t len, u32 poly,
-					  unsigned long poly_qt,
-					  fallback crc_fb)
-{
-	size_t offset, head_len, tail_len;
-	unsigned long const *p_ul;
-	unsigned long s;
-
-	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
-			     RISCV_ISA_EXT_ZBC, 1)
-		 : : : : legacy);
-
-	/* Handle the unaligned head. */
-	offset = (unsigned long)p & OFFSET_MASK;
-	if (offset && len) {
-		head_len = min(STEP - offset, len);
-		crc = crc32_le_unaligned(crc, p, head_len, poly, poly_qt);
-		p += head_len;
-		len -= head_len;
-	}
-
-	tail_len = len & OFFSET_MASK;
-	len = len >> STEP_ORDER;
-	p_ul = (unsigned long const *)p;
-
-	for (int i = 0; i < len; i++) {
-		s = crc32_le_prep(crc, p_ul);
-		crc = crc32_le_zbc(s, poly, poly_qt);
-		p_ul++;
-	}
-
-	/* Handle the tail bytes. */
-	p = (unsigned char const *)p_ul;
-	if (tail_len)
-		crc = crc32_le_unaligned(crc, p, tail_len, poly, poly_qt);
-
-	return crc;
-
-legacy:
-	return crc_fb(crc, p, len);
-}
-
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
-{
-	return crc32_le_generic(crc, p, len, CRC32_POLY_LE, CRC32_POLY_QT_LE,
-				crc32_le_base);
-}
-
-u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
-{
-	return crc32_le_generic(crc, p, len, CRC32C_POLY_LE,
-				CRC32C_POLY_QT_LE, crc32c_le_base);
-}
-
-static inline u32 crc32_be_unaligned(u32 crc, unsigned char const *p,
-				     size_t len)
-{
-	size_t bits = len * 8;
-	unsigned long s = 0;
-	u32 crc_low = 0;
-
-	s = 0;
-	for (int i = 0; i < len; i++)
-		s = *p++ | (s << 8);
-
-	if (__riscv_xlen == 32 || len < sizeof(u32)) {
-		s ^= crc >> (32 - bits);
-		crc_low = crc << bits;
-	} else {
-		s ^= (unsigned long)crc << (bits - 32);
-	}
-
-	crc = crc32_be_zbc(s);
-	crc ^= crc_low;
-
-	return crc;
-}
-
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
-{
-	size_t offset, head_len, tail_len;
-	unsigned long const *p_ul;
-	unsigned long s;
-
-	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
-			     RISCV_ISA_EXT_ZBC, 1)
-		 : : : : legacy);
-
-	/* Handle the unaligned head. */
-	offset = (unsigned long)p & OFFSET_MASK;
-	if (offset && len) {
-		head_len = min(STEP - offset, len);
-		crc = crc32_be_unaligned(crc, p, head_len);
-		p += head_len;
-		len -= head_len;
-	}
-
-	tail_len = len & OFFSET_MASK;
-	len = len >> STEP_ORDER;
-	p_ul = (unsigned long const *)p;
-
-	for (int i = 0; i < len; i++) {
-		s = crc32_be_prep(crc, p_ul);
-		crc = crc32_be_zbc(s);
-		p_ul++;
-	}
-
-	/* Handle the tail bytes. */
-	p = (unsigned char const *)p_ul;
-	if (tail_len)
-		crc = crc32_be_unaligned(crc, p, tail_len);
-
-	return crc;
-
-legacy:
-	return crc32_be_base(crc, p, len);
-}
diff --git a/crypto/crc32_generic.c b/crypto/crc32_generic.c
index 6a55d206fab3..cc064ea8240e 100644
--- a/crypto/crc32_generic.c
+++ b/crypto/crc32_generic.c
@@ -160,12 +160,12 @@ static struct shash_alg algs[] = {{
 static int __init crc32_mod_init(void)
 {
 	/* register the arch flavor only if it differs from the generic one */
-	return crypto_register_shashes(algs, 1 + (&crc32_le != &crc32_le_base));
+	return crypto_register_shashes(algs, 1 + IS_ENABLED(CONFIG_CRC32_ARCH));
 }
 
 static void __exit crc32_mod_fini(void)
 {
-	crypto_unregister_shashes(algs, 1 + (&crc32_le != &crc32_le_base));
+	crypto_unregister_shashes(algs, 1 + IS_ENABLED(CONFIG_CRC32_ARCH));
 }
 
 subsys_initcall(crc32_mod_init);
diff --git a/crypto/crc32c_generic.c b/crypto/crc32c_generic.c
index 635599b255ec..04b03d825cf4 100644
--- a/crypto/crc32c_generic.c
+++ b/crypto/crc32c_generic.c
@@ -200,12 +200,12 @@ static struct shash_alg algs[] = {{
 static int __init crc32c_mod_init(void)
 {
 	/* register the arch flavor only if it differs from the generic one */
-	return crypto_register_shashes(algs, 1 + (&__crc32c_le != &crc32c_le_base));
+	return crypto_register_shashes(algs, 1 + IS_ENABLED(CONFIG_CRC32_ARCH));
 }
 
 static void __exit crc32c_mod_fini(void)
 {
-	crypto_unregister_shashes(algs, 1 + (&__crc32c_le != &crc32c_le_base));
+	crypto_unregister_shashes(algs, 1 + IS_ENABLED(CONFIG_CRC32_ARCH));
 }
 
 subsys_initcall(crc32c_mod_init);
diff --git a/include/linux/crc32.h b/include/linux/crc32.h
index 5b07fc9081c4..58c632533b08 100644
--- a/include/linux/crc32.h
+++ b/include/linux/crc32.h
@@ -8,10 +8,34 @@
 #include <linux/types.h>
 #include <linux/bitrev.h>
 
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len);
-u32 __pure crc32_le_base(u32 crc, unsigned char const *p, size_t len);
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len);
-u32 __pure crc32_be_base(u32 crc, unsigned char const *p, size_t len);
+u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len);
+u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len);
+u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len);
+u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len);
+u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len);
+u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len);
+
+static inline u32 __pure crc32_le(u32 crc, const u8 *p, size_t len)
+{
+	if (IS_ENABLED(CONFIG_CRC32_ARCH))
+		return crc32_le_arch(crc, p, len);
+	return crc32_le_base(crc, p, len);
+}
+
+static inline u32 __pure crc32_be(u32 crc, const u8 *p, size_t len)
+{
+	if (IS_ENABLED(CONFIG_CRC32_ARCH))
+		return crc32_be_arch(crc, p, len);
+	return crc32_be_base(crc, p, len);
+}
+
+/* TODO: leading underscores should be dropped once callers have been updated */
+static inline u32 __pure __crc32c_le(u32 crc, const u8 *p, size_t len)
+{
+	if (IS_ENABLED(CONFIG_CRC32_ARCH))
+		return crc32c_le_arch(crc, p, len);
+	return crc32c_le_base(crc, p, len);
+}
 
 /**
  * crc32_le_combine - Combine two crc32 check values into one. For two
@@ -38,9 +62,6 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2)
 	return crc32_le_shift(crc1, len2) ^ crc2;
 }
 
-u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len);
-u32 __pure crc32c_le_base(u32 crc, unsigned char const *p, size_t len);
-
 /**
  * __crc32c_le_combine - Combine two crc32c check values into one. For two
  * 			 sequences of bytes, seq1 and seq2 with lengths len1
diff --git a/lib/Kconfig b/lib/Kconfig
index 5a318f753b2f..8858030bebae 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -190,6 +190,9 @@ config CRC32
 	  the kernel tree does. Such modules that use library CRC32/CRC32c
 	  functions require M here.
 
+config ARCH_HAS_CRC32
+	bool
+
 config CRC32_SELFTEST
 	tristate "CRC32 perform self test on init"
 	depends on CRC32
@@ -202,24 +205,39 @@ config CRC32_SELFTEST
 choice
 	prompt "CRC32 implementation"
 	depends on CRC32
-	default CRC32_SLICEBY8
+	default CRC32_IMPL_ARCH_PLUS_SLICEBY8 if ARCH_HAS_CRC32
+	default CRC32_IMPL_SLICEBY8 if !ARCH_HAS_CRC32
 	help
-	  This option allows a kernel builder to override the default choice
-	  of CRC32 algorithm.  Choose the default ("slice by 8") unless you
-	  know that you need one of the others.
+	  This option allows you to override the default choice of CRC32
+	  implementation.  Choose the default unless you know that you need one
+	  of the others.
 
-config CRC32_SLICEBY8
+config CRC32_IMPL_ARCH_PLUS_SLICEBY8
+	bool "Arch-optimized, with fallback to slice-by-8" if ARCH_HAS_CRC32
+	help
+	  Use architecture-optimized implementation of CRC32.  Fall back to
+	  slice-by-8 in cases where the arch-optimized implementation cannot be
+	  used, e.g. if the CPU lacks support for the needed instructions.
+
+	  This is the default when an arch-optimized implementation exists.
+
+config CRC32_IMPL_ARCH_PLUS_SLICEBY1
+	bool "Arch-optimized, with fallback to slice-by-1" if ARCH_HAS_CRC32
+	help
+	  Use architecture-optimized implementation of CRC32, but fall back to
+	  slice-by-1 instead of slice-by-8 in order to reduce the binary size.
+
+config CRC32_IMPL_SLICEBY8
 	bool "Slice by 8 bytes"
 	help
 	  Calculate checksum 8 bytes at a time with a clever slicing algorithm.
-	  This is the fastest algorithm, but comes with a 8KiB lookup table.
-	  Most modern processors have enough cache to hold this table without
-	  thrashing the cache.
-
-	  This is the default implementation choice.  Choose this one unless
-	  you have a good reason not to.
+	  This is much slower than the architecture-optimized implementation of
+	  CRC32 (if the selected arch has one), but it is portable and is the
+	  fastest implementation when no arch-optimized implementation is
+	  available.  It uses an 8KiB lookup table.  Most modern processors have
+	  enough cache to hold this table without thrashing the cache.
 
-config CRC32_SLICEBY4
+config CRC32_IMPL_SLICEBY4
 	bool "Slice by 4 bytes"
 	help
 	  Calculate checksum 4 bytes at a time with a clever slicing algorithm.
@@ -228,15 +246,15 @@ config CRC32_SLICEBY4
 
 	  Only choose this option if you know what you are doing.
 
-config CRC32_SARWATE
-	bool "Sarwate's Algorithm (one byte at a time)"
+config CRC32_IMPL_SLICEBY1
+	bool "Slice by 1 byte (Sarwate's algorithm)"
 	help
 	  Calculate checksum a byte at a time using Sarwate's algorithm.  This
-	  is not particularly fast, but has a small 256 byte lookup table.
+	  is not particularly fast, but has a small 1KiB lookup table.
 
 	  Only choose this option if you know what you are doing.
 
-config CRC32_BIT
+config CRC32_IMPL_BIT
 	bool "Classic Algorithm (one bit at a time)"
 	help
 	  Calculate checksum one bit at a time.  This is VERY slow, but has
@@ -246,6 +264,26 @@ config CRC32_BIT
 
 endchoice
 
+config CRC32_ARCH
+	tristate
+	default CRC32 if CRC32_IMPL_ARCH_PLUS_SLICEBY8 || CRC32_IMPL_ARCH_PLUS_SLICEBY1
+
+config CRC32_SLICEBY8
+	bool
+	default y if CRC32_IMPL_SLICEBY8 || CRC32_IMPL_ARCH_PLUS_SLICEBY8
+
+config CRC32_SLICEBY4
+	bool
+	default y if CRC32_IMPL_SLICEBY4
+
+config CRC32_SARWATE
+	bool
+	default y if CRC32_IMPL_SLICEBY1 || CRC32_IMPL_ARCH_PLUS_SLICEBY1
+
+config CRC32_BIT
+	bool
+	default y if CRC32_IMPL_BIT
+
 config CRC64
 	tristate "CRC64 functions"
 	help
diff --git a/lib/crc32.c b/lib/crc32.c
index c67059b0082b..47151624332e 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -183,35 +183,27 @@ static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
 }
 
 #if CRC_LE_BITS == 1
-u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len)
 {
 	return crc32_le_generic(crc, p, len, NULL, CRC32_POLY_LE);
 }
-u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len)
 {
 	return crc32_le_generic(crc, p, len, NULL, CRC32C_POLY_LE);
 }
 #else
-u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len)
 {
 	return crc32_le_generic(crc, p, len, crc32table_le, CRC32_POLY_LE);
 }
-u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len)
 {
 	return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE);
 }
 #endif
-EXPORT_SYMBOL(crc32_le);
-EXPORT_SYMBOL(__crc32c_le);
-
-u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le);
 EXPORT_SYMBOL(crc32_le_base);
-
-u32 __pure crc32c_le_base(u32, unsigned char const *, size_t) __alias(__crc32c_le);
 EXPORT_SYMBOL(crc32c_le_base);
 
-u32 __pure crc32_be_base(u32, unsigned char const *, size_t) __alias(crc32_be);
-
 /*
  * This multiplies the polynomials x and y modulo the given modulus.
  * This follows the "little-endian" CRC convention that the lsbit
@@ -335,14 +327,14 @@ static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
 }
 
 #if CRC_BE_BITS == 1
-u32 __pure __weak crc32_be(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len)
 {
 	return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE);
 }
 #else
-u32 __pure __weak crc32_be(u32 crc, unsigned char const *p, size_t len)
+u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len)
 {
 	return crc32_be_generic(crc, p, len, crc32table_be, CRC32_POLY_BE);
 }
 #endif
-EXPORT_SYMBOL(crc32_be);
+EXPORT_SYMBOL(crc32_be_base);
-- 
cgit