summary | refs | log | tree | commit | diff
path: root/arch/riscv/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/riscv/lib')
-rw-r--r-- arch/riscv/lib/Makefile | 10
-rw-r--r-- arch/riscv/lib/clear_page.S | 2
-rw-r--r-- arch/riscv/lib/crc32-riscv.c | 311
-rw-r--r-- arch/riscv/lib/csum.c | 325
-rw-r--r-- arch/riscv/lib/memset.S | 2
-rw-r--r-- arch/riscv/lib/riscv_v_helpers.c | 45
-rw-r--r-- arch/riscv/lib/strcmp.S | 2
-rw-r--r-- arch/riscv/lib/strlen.S | 1
-rw-r--r-- arch/riscv/lib/strncmp.S | 2
-rw-r--r-- arch/riscv/lib/tishift.S | 2
-rw-r--r-- arch/riscv/lib/uaccess.S | 16
-rw-r--r-- arch/riscv/lib/uaccess_vector.S | 52
-rw-r--r-- arch/riscv/lib/xor.S | 81
13 files changed, 845 insertions, 6 deletions
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 26cb2502ecf8..79368a895fee 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -3,11 +3,19 @@ lib-y += delay.o
lib-y += memcpy.o
lib-y += memset.o
lib-y += memmove.o
+ifeq ($(CONFIG_KASAN_GENERIC)$(CONFIG_KASAN_SW_TAGS),)
lib-y += strcmp.o
lib-y += strlen.o
lib-y += strncmp.o
+endif
+lib-y += csum.o
+ifeq ($(CONFIG_MMU), y)
+lib-$(CONFIG_RISCV_ISA_V) += uaccess_vector.o
+endif
lib-$(CONFIG_MMU) += uaccess.o
lib-$(CONFIG_64BIT) += tishift.o
lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o
-
+obj-$(CONFIG_CRC32_ARCH) += crc32-riscv.o
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
+lib-$(CONFIG_RISCV_ISA_V) += xor.o
+lib-$(CONFIG_RISCV_ISA_V) += riscv_v_helpers.o
diff --git a/arch/riscv/lib/clear_page.S b/arch/riscv/lib/clear_page.S
index b22de1231144..20ff03f5b0f2 100644
--- a/arch/riscv/lib/clear_page.S
+++ b/arch/riscv/lib/clear_page.S
@@ -4,9 +4,9 @@
*/
#include <linux/linkage.h>
+#include <linux/export.h>
#include <asm/asm.h>
#include <asm/alternative-macros.h>
-#include <asm-generic/export.h>
#include <asm/hwcap.h>
#include <asm/insn-def.h>
#include <asm/page.h>
diff --git a/arch/riscv/lib/crc32-riscv.c b/arch/riscv/lib/crc32-riscv.c
new file mode 100644
index 000000000000..53d56ab422c7
--- /dev/null
+++ b/arch/riscv/lib/crc32-riscv.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC32 implementation with Zbc extension.
+ *
+ * Copyright (C) 2024 Intel Corporation
+ */
+
+#include <asm/hwcap.h>
+#include <asm/alternative-macros.h>
+#include <asm/byteorder.h>
+
+#include <linux/types.h>
+#include <linux/minmax.h>
+#include <linux/crc32poly.h>
+#include <linux/crc32.h>
+#include <linux/byteorder/generic.h>
+#include <linux/module.h>
+
+/*
+ * Refer to https://www.corsix.org/content/barrett-reduction-polynomials for
+ * better understanding of how this math works.
+ *
+ * let "+" denotes polynomial add (XOR)
+ * let "-" denotes polynomial sub (XOR)
+ * let "*" denotes polynomial multiplication
+ * let "/" denotes polynomial floor division
+ * let "S" denotes source data, XLEN bit wide
+ * let "P" denotes CRC32 polynomial
+ * let "T" denotes 2^(XLEN+32)
+ * let "QT" denotes quotient of T/P, with the bit for 2^XLEN being implicit
+ *
+ * crc32(S, P)
+ * => S * (2^32) - S * (2^32) / P * P
+ * => lowest 32 bits of: S * (2^32) / P * P
+ * => lowest 32 bits of: S * (2^32) * (T / P) / T * P
+ * => lowest 32 bits of: S * (2^32) * quotient / T * P
+ * => lowest 32 bits of: S * quotient / 2^XLEN * P
+ * => lowest 32 bits of: (clmul_high_part(S, QT) + S) * P
+ * => clmul_low_part(clmul_high_part(S, QT) + S, P)
+ *
+ * In terms of below implementations, the BE case is more intuitive, since the
+ * higher order bit sits at more significant position.
+ */
+
+#if __riscv_xlen == 64
+/* Slide by XLEN bits per iteration */
+# define STEP_ORDER 3
+
+/* Each below polynomial quotient has an implicit bit for 2^XLEN */
+
+/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in LE format */
+# define CRC32_POLY_QT_LE 0x5a72d812fb808b20
+
+/* Polynomial quotient of (2^(XLEN+32))/CRC32C_POLY, in LE format */
+# define CRC32C_POLY_QT_LE 0xa434f61c6f5389f8
+
+/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in BE format, it should be
+ * the same as the bit-reversed version of CRC32_POLY_QT_LE
+ */
+# define CRC32_POLY_QT_BE 0x04d101df481b4e5a
+
+/*
+ * RV64: combine the running CRC with the next 64-bit input word. The word is
+ * passed through __cpu_to_le64() first; the CRC lands in the low 32 bits.
+ */
+static inline u64 crc32_le_prep(u32 crc, unsigned long const *ptr)
+{
+	u64 word = (__force u64)__cpu_to_le64(*ptr);
+
+	return word ^ (u64)crc;
+}
+
+/*
+ * RV64: reduce one XLEN-bit value @s to a 32-bit CRC using Zbc carry-less
+ * multiplies, for the LE CRC variants.
+ *
+ * @s:       input word already combined with the running CRC by the caller
+ * @poly:    32-bit CRC polynomial
+ * @poly_qt: precomputed polynomial quotient matching @poly
+ *
+ * Returns the updated 32-bit CRC.
+ */
+static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt)
+{
+ u32 crc;
+
+ /* We don't have a "clmulrh" insn, so use clmul + slli instead. */
+ /*
+  * Sequence: %0 = clmul(s, poly_qt) << 1, XOR in s, then clmulr with the
+  * polynomial, which is passed in the upper 32 bits of %3; the final srli
+  * moves the result down into the low 32 bits. %0 is early-clobber
+  * because it is written before the last read of %1.
+  */
+ asm volatile (".option push\n"
+ ".option arch,+zbc\n"
+ "clmul %0, %1, %2\n"
+ "slli %0, %0, 1\n"
+ "xor %0, %0, %1\n"
+ "clmulr %0, %0, %3\n"
+ "srli %0, %0, 32\n"
+ ".option pop\n"
+ : "=&r" (crc)
+ : "r" (s),
+ "r" (poly_qt),
+ "r" ((u64)poly << 32)
+ :);
+ return crc;
+}
+
+/*
+ * RV64: combine the running CRC with the next 64-bit input word. The word is
+ * passed through __cpu_to_be64() first; the CRC is placed in the upper 32 bits.
+ */
+static inline u64 crc32_be_prep(u32 crc, unsigned long const *ptr)
+{
+	u64 word = (__force u64)__cpu_to_be64(*ptr);
+	u64 crc_hi = (u64)crc << 32;
+
+	return crc_hi ^ word;
+}
+
+#elif __riscv_xlen == 32
+# define STEP_ORDER 2
+/* Each quotient should match the upper half of its analog in RV64 */
+# define CRC32_POLY_QT_LE 0xfb808b20
+# define CRC32C_POLY_QT_LE 0x6f5389f8
+# define CRC32_POLY_QT_BE 0x04d101df
+
+/*
+ * RV32: combine the running CRC with the next 32-bit input word, which is
+ * passed through __cpu_to_le32() first.
+ */
+static inline u32 crc32_le_prep(u32 crc, unsigned long const *ptr)
+{
+	u32 word = (__force u32)__cpu_to_le32(*ptr);
+
+	return word ^ crc;
+}
+
+/*
+ * RV32: reduce one XLEN-bit value @s to a 32-bit CRC using Zbc carry-less
+ * multiplies, for the LE CRC variants.
+ *
+ * @s:       input word already combined with the running CRC by the caller
+ * @poly:    32-bit CRC polynomial
+ * @poly_qt: precomputed polynomial quotient matching @poly
+ *
+ * Returns the updated 32-bit CRC.
+ */
+static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt)
+{
+ u32 crc;
+
+ /* We don't have a "clmulrh" insn, so use clmul + slli instead. */
+ /*
+  * Sequence: %0 = clmul(s, poly_qt) << 1, XOR in s, then clmulr with the
+  * polynomial. Unlike the RV64 variant, the polynomial is passed
+  * unshifted and no final right shift is applied. %0 is early-clobber
+  * because it is written before the last read of %1.
+  */
+ asm volatile (".option push\n"
+ ".option arch,+zbc\n"
+ "clmul %0, %1, %2\n"
+ "slli %0, %0, 1\n"
+ "xor %0, %0, %1\n"
+ "clmulr %0, %0, %3\n"
+ ".option pop\n"
+ : "=&r" (crc)
+ : "r" (s),
+ "r" (poly_qt),
+ "r" (poly)
+ :);
+ return crc;
+}
+
+/*
+ * RV32: combine the running CRC with the next 32-bit input word, which is
+ * passed through __cpu_to_be32() first.
+ */
+static inline u32 crc32_be_prep(u32 crc, unsigned long const *ptr)
+{
+	u32 word = (__force u32)__cpu_to_be32(*ptr);
+
+	return word ^ crc;
+}
+
+#else
+# error "Unexpected __riscv_xlen"
+#endif
+
+/*
+ * Reduce one XLEN-bit value @s to a 32-bit CRC for the big-endian CRC32,
+ * using the fixed CRC32_POLY_BE polynomial and its CRC32_POLY_QT_BE quotient.
+ *
+ * The sequence is clmulh(s, QT) XOR s, followed by clmul with the polynomial,
+ * i.e. the "clmul_low_part(clmul_high_part(S, QT) + S, P)" form described in
+ * the comment at the top of this file.
+ *
+ * Returns the updated 32-bit CRC.
+ */
+static inline u32 crc32_be_zbc(unsigned long s)
+{
+ u32 crc;
+
+ /* %0 is early-clobber: it is written before the last read of %1. */
+ asm volatile (".option push\n"
+ ".option arch,+zbc\n"
+ "clmulh %0, %1, %2\n"
+ "xor %0, %0, %1\n"
+ "clmul %0, %0, %3\n"
+ ".option pop\n"
+ : "=&r" (crc)
+ : "r" (s),
+ "r" (CRC32_POLY_QT_BE),
+ "r" (CRC32_POLY_BE)
+ :);
+ return crc;
+}
+
+#define STEP (1 << STEP_ORDER)
+#define OFFSET_MASK (STEP - 1)
+
+typedef u32 (*fallback)(u32 crc, unsigned char const *p, size_t len);
+
+/*
+ * Fold a short run of @len bytes (less than one XLEN-bit word) into @crc for
+ * the LE CRC variants.
+ *
+ * The bytes are shifted in from the top of the accumulator, so after the loop
+ * they occupy its upper len * 8 bits. When the run is shorter than 32 bits
+ * (always the case on RV32), the CRC bits that do not take part in this step
+ * are kept aside and XORed into the result afterwards.
+ */
+static inline u32 crc32_le_unaligned(u32 crc, unsigned char const *p,
+				     size_t len, u32 poly,
+				     unsigned long poly_qt)
+{
+	unsigned char const *stop = p + len;
+	size_t nbits = len * 8;
+	unsigned long acc = 0;
+	u32 leftover = 0;
+
+	while (p < stop)
+		acc = (acc >> 8) | ((unsigned long)*p++ << (__riscv_xlen - 8));
+
+	acc ^= (unsigned long)crc << (__riscv_xlen - nbits);
+	if (__riscv_xlen == 32 || len < sizeof(u32))
+		leftover = crc >> nbits;
+
+	return crc32_le_zbc(acc, poly, poly_qt) ^ leftover;
+}
+
+/*
+ * Common Zbc-accelerated implementation for the little-endian CRC32 variants.
+ *
+ * @crc:     running CRC value
+ * @p:       input buffer, any alignment
+ * @len:     number of bytes at @p
+ * @poly:    CRC polynomial
+ * @poly_qt: precomputed quotient matching @poly
+ * @crc_fb:  software implementation used when Zbc is not available
+ *
+ * The buffer is handled as an optional unaligned head, a run of aligned
+ * XLEN-bit words, and an optional tail shorter than one word.
+ *
+ * Returns the updated CRC.
+ */
+static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
+					  size_t len, u32 poly,
+					  unsigned long poly_qt,
+					  fallback crc_fb)
+{
+	size_t offset, head_len, tail_len;
+	unsigned long const *p_ul;
+	unsigned long s;
+
+	/*
+	 * ALTERNATIVE: jump to the fallback unless the Zbc extension is
+	 * present, in which case the jump is replaced by a nop. Nothing may
+	 * modify crc/p/len before this point, the fallback uses them as-is.
+	 */
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+			     RISCV_ISA_EXT_ZBC, 1)
+		 : : : : legacy);
+
+	/* Handle the unaligned head. */
+	offset = (unsigned long)p & OFFSET_MASK;
+	if (offset && len) {
+		head_len = min(STEP - offset, len);
+		crc = crc32_le_unaligned(crc, p, head_len, poly, poly_qt);
+		p += head_len;
+		len -= head_len;
+	}
+
+	tail_len = len & OFFSET_MASK;
+	len = len >> STEP_ORDER;
+	p_ul = (unsigned long const *)p;
+
+	/*
+	 * len now counts whole words. The counter is a size_t so that it can
+	 * never overflow before reaching len (an int counter would).
+	 */
+	for (size_t i = 0; i < len; i++) {
+		s = crc32_le_prep(crc, p_ul);
+		crc = crc32_le_zbc(s, poly, poly_qt);
+		p_ul++;
+	}
+
+	/* Handle the tail bytes. */
+	p = (unsigned char const *)p_ul;
+	if (tail_len)
+		crc = crc32_le_unaligned(crc, p, tail_len, poly, poly_qt);
+
+	return crc;
+
+legacy:
+	return crc_fb(crc, p, len);
+}
+
+/*
+ * Arch entry point for the little-endian CRC32: runs the shared LE routine
+ * with CRC32_POLY_LE and falls back to crc32_le_base() when Zbc is missing.
+ */
+u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+	const u32 poly = CRC32_POLY_LE;
+	const unsigned long poly_qt = CRC32_POLY_QT_LE;
+
+	return crc32_le_generic(crc, p, len, poly, poly_qt, crc32_le_base);
+}
+EXPORT_SYMBOL(crc32_le_arch);
+
+/*
+ * Arch entry point for CRC32C: runs the shared LE routine with CRC32C_POLY_LE
+ * and falls back to crc32c_le_base() when Zbc is missing.
+ */
+u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len)
+{
+	const u32 poly = CRC32C_POLY_LE;
+	const unsigned long poly_qt = CRC32C_POLY_QT_LE;
+
+	return crc32_le_generic(crc, p, len, poly, poly_qt, crc32c_le_base);
+}
+EXPORT_SYMBOL(crc32c_le_arch);
+
+/*
+ * Fold a short run of @len bytes (less than one XLEN-bit word; the callers in
+ * this file pass 1 .. STEP - 1) into @crc for the big-endian CRC32.
+ *
+ * The bytes are packed first-byte-highest into the low len * 8 bits of the
+ * accumulator. When the run is shorter than 32 bits (always the case on
+ * RV32), only the top bits of @crc take part in this step and the remaining
+ * bits are XORed into the result afterwards.
+ */
+static inline u32 crc32_be_unaligned(u32 crc, unsigned char const *p,
+				     size_t len)
+{
+	size_t bits = len * 8;
+	unsigned long s = 0;
+	u32 crc_low = 0;
+
+	for (size_t i = 0; i < len; i++)
+		s = *p++ | (s << 8);
+
+	if (__riscv_xlen == 32 || len < sizeof(u32)) {
+		s ^= crc >> (32 - bits);
+		crc_low = crc << bits;
+	} else {
+		s ^= (unsigned long)crc << (bits - 32);
+	}
+
+	crc = crc32_be_zbc(s);
+	crc ^= crc_low;
+
+	return crc;
+}
+
+/*
+ * Arch entry point for the big-endian CRC32, accelerated with Zbc.
+ *
+ * @crc: running CRC value
+ * @p:   input buffer, any alignment
+ * @len: number of bytes at @p
+ *
+ * The buffer is handled as an optional unaligned head, a run of aligned
+ * XLEN-bit words, and an optional tail shorter than one word. Falls back to
+ * crc32_be_base() when Zbc is not available.
+ *
+ * Returns the updated CRC.
+ */
+u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len)
+{
+	size_t offset, head_len, tail_len;
+	unsigned long const *p_ul;
+	unsigned long s;
+
+	/*
+	 * ALTERNATIVE: jump to the fallback unless the Zbc extension is
+	 * present, in which case the jump is replaced by a nop. Nothing may
+	 * modify crc/p/len before this point, the fallback uses them as-is.
+	 */
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+			     RISCV_ISA_EXT_ZBC, 1)
+		 : : : : legacy);
+
+	/* Handle the unaligned head. */
+	offset = (unsigned long)p & OFFSET_MASK;
+	if (offset && len) {
+		head_len = min(STEP - offset, len);
+		crc = crc32_be_unaligned(crc, p, head_len);
+		p += head_len;
+		len -= head_len;
+	}
+
+	tail_len = len & OFFSET_MASK;
+	len = len >> STEP_ORDER;
+	p_ul = (unsigned long const *)p;
+
+	/*
+	 * len now counts whole words. The counter is a size_t so that it can
+	 * never overflow before reaching len (an int counter would).
+	 */
+	for (size_t i = 0; i < len; i++) {
+		s = crc32_be_prep(crc, p_ul);
+		crc = crc32_be_zbc(s);
+		p_ul++;
+	}
+
+	/* Handle the tail bytes. */
+	p = (unsigned char const *)p_ul;
+	if (tail_len)
+		crc = crc32_be_unaligned(crc, p, tail_len);
+
+	return crc;
+
+legacy:
+	return crc32_be_base(crc, p, len);
+}
+EXPORT_SYMBOL(crc32_be_arch);
+
+/*
+ * Report which CRC32 variants are accelerated: all three (LE, BE and CRC32C)
+ * when the Zbc extension is present, none otherwise.
+ */
+u32 crc32_optimizations(void)
+{
+	u32 flags = 0;
+
+	if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+		flags = CRC32_LE_OPTIMIZATION |
+			CRC32_BE_OPTIMIZATION |
+			CRC32C_OPTIMIZATION;
+
+	return flags;
+}
+EXPORT_SYMBOL(crc32_optimizations);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Accelerated CRC32 implementation with Zbc extension");
diff --git a/arch/riscv/lib/csum.c b/arch/riscv/lib/csum.c
new file mode 100644
index 000000000000..7fb12c59e571
--- /dev/null
+++ b/arch/riscv/lib/csum.c
@@ -0,0 +1,325 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Checksum library
+ *
+ * Influenced by arch/arm64/lib/csum.c
+ * Copyright (C) 2023-2024 Rivos Inc.
+ */
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/jump_label.h>
+#include <linux/kasan-checks.h>
+#include <linux/kernel.h>
+
+#include <asm/cpufeature.h>
+
+#include <net/checksum.h>
+
+/* Default version is sufficient for 32 bit */
+#ifndef CONFIG_32BIT
+/*
+ * Compute the IPv6 pseudo-header checksum (64-bit builds only).
+ *
+ * @saddr: source address, summed as four 32-bit words
+ * @daddr: destination address, summed as four 32-bit words
+ * @len:   length field, added after htonl()
+ * @proto: protocol field, added after htonl()
+ * @csum:  initial checksum value the sum starts from
+ *
+ * Everything is accumulated in a 64-bit register and then folded down to
+ * 16 bits, either with Zbb rotate instructions or with the generic
+ * ror64()/csum_fold() sequence.
+ */
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, __u8 proto, __wsum csum)
+{
+ unsigned int ulen, uproto;
+ unsigned long sum = (__force unsigned long)csum;
+
+ sum += (__force unsigned long)saddr->s6_addr32[0];
+ sum += (__force unsigned long)saddr->s6_addr32[1];
+ sum += (__force unsigned long)saddr->s6_addr32[2];
+ sum += (__force unsigned long)saddr->s6_addr32[3];
+
+ sum += (__force unsigned long)daddr->s6_addr32[0];
+ sum += (__force unsigned long)daddr->s6_addr32[1];
+ sum += (__force unsigned long)daddr->s6_addr32[2];
+ sum += (__force unsigned long)daddr->s6_addr32[3];
+
+ ulen = (__force unsigned int)htonl((unsigned int)len);
+ sum += ulen;
+
+ uproto = (__force unsigned int)htonl(proto);
+ sum += uproto;
+
+ /*
+ * Zbb support saves 4 instructions, so not worth checking without
+ * alternatives if supported
+ */
+ if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) &&
+ IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+ unsigned long fold_temp;
+
+ /*
+ * Zbb is likely available when the kernel is compiled with Zbb
+ * support, so nop when Zbb is available and jump when Zbb is
+ * not available.
+ */
+ asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+ RISCV_ISA_EXT_ZBB, 1)
+ :
+ :
+ :
+ : no_zbb);
+ /*
+  * Fold 64 -> 32 bits (rori + add + srli), then 32 -> 16 bits with the
+  * not/roriw/subw sequence; the 16-bit result ends up in bits 16..31 of
+  * sum, hence the shift in the return below.
+  */
+ asm(".option push \n\
+ .option arch,+zbb \n\
+ rori %[fold_temp], %[sum], 32 \n\
+ add %[sum], %[fold_temp], %[sum] \n\
+ srli %[sum], %[sum], 32 \n\
+ not %[fold_temp], %[sum] \n\
+ roriw %[sum], %[sum], 16 \n\
+ subw %[sum], %[fold_temp], %[sum] \n\
+ .option pop"
+ : [sum] "+r" (sum), [fold_temp] "=&r" (fold_temp));
+ return (__force __sum16)(sum >> 16);
+ }
+no_zbb:
+ /* Generic path: fold 64 -> 32 bits here, csum_fold() does the rest. */
+ sum += ror64(sum, 32);
+ sum >>= 32;
+ return csum_fold((__force __wsum)sum);
+}
+EXPORT_SYMBOL(csum_ipv6_magic);
+#endif /* !CONFIG_32BIT */
+
+/* Mask selecting the byte offset of an address within one machine word. */
+#if defined(CONFIG_32BIT)
+#define OFFSET_MASK 3
+#elif defined(CONFIG_64BIT)
+#define OFFSET_MASK 7
+#else
+#error "Neither CONFIG_32BIT nor CONFIG_64BIT is set"
+#endif
+
+/*
+ * Shared summation loop of the two do_csum variants.
+ *
+ * @ptr:  next word to load (the first word has already been loaded)
+ * @end:  one past the last byte of the buffer, cast to a word pointer; it is
+ *        not necessarily word aligned
+ * @data: first word of the buffer, loaded (and, if needed, masked) by the
+ *        caller
+ *
+ * Returns the word-sized sum with all carries added back in. The result is
+ * not yet folded to 16 bits; the callers do that.
+ */
+static inline __no_sanitize_address unsigned long
+do_csum_common(const unsigned long *ptr, const unsigned long *end,
+ unsigned long data)
+{
+ unsigned int shift;
+ unsigned long csum = 0, carry = 0;
+
+ /*
+ * Do 32-bit reads on RV32 and 64-bit reads otherwise. This should be
+ * faster than doing 32-bit reads on architectures that support larger
+ * reads.
+ */
+ while (ptr < end) {
+ /* Add the previously loaded word, count the carry, load the next. */
+ csum += data;
+ carry += csum < data;
+ data = *(ptr++);
+ }
+
+ /*
+ * Perform alignment (and over-read) bytes on the tail if any bytes
+ * leftover.
+ */
+ /* ptr - end is the number of bytes the last load read past the buffer. */
+ shift = ((long)ptr - (long)end) * 8;
+#ifdef __LITTLE_ENDIAN
+ data = (data << shift) >> shift;
+#else
+ data = (data >> shift) << shift;
+#endif
+ csum += data;
+ carry += csum < data;
+ /* Add the accumulated carries back in, including the carry of that add. */
+ csum += carry;
+ csum += csum < carry;
+
+ return csum;
+}
+
+/*
+ * Checksum @len bytes at @buff when @buff may be misaligned.
+ *
+ * The first load is taken from the aligned word at or before @buff, so it may
+ * over-read bytes in front of the buffer; those bytes are cleared before
+ * they are summed. The same happens on the tail inside do_csum_common().
+ *
+ * Returns the sum folded to 16 bits. When @buff starts at an odd address the
+ * folded result is byte-swapped before it is returned.
+ */
+static inline __no_sanitize_address unsigned int
+do_csum_with_alignment(const unsigned char *buff, int len)
+{
+	unsigned int offset, shift;
+	unsigned long csum, data;
+	const unsigned long *ptr, *end;
+
+	/*
+	 * Align address to closest word (double word on rv64) that comes before
+	 * buff. This should always be in the same page and cache line.
+	 * Directly call KASAN with the alignment we will be using.
+	 */
+	offset = (unsigned long)buff & OFFSET_MASK;
+	kasan_check_read(buff, len);
+	ptr = (const unsigned long *)(buff - offset);
+
+	/*
+	 * Clear the most significant bytes that were over-read if buff was not
+	 * aligned.
+	 */
+	shift = offset * 8;
+	data = *(ptr++);
+#ifdef __LITTLE_ENDIAN
+	data = (data >> shift) << shift;
+#else
+	data = (data << shift) >> shift;
+#endif
+	end = (const unsigned long *)(buff + len);
+	csum = do_csum_common(ptr, end, data);
+
+	/*
+	 * Kconfig symbols are only visible to C with their CONFIG_ prefix;
+	 * without it this guard is never true and the Zbb path below is dead.
+	 */
+#ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
+	/*
+	 * Zbb support saves 6 instructions, so not worth checking without
+	 * alternatives if supported
+	 */
+	if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) &&
+	    IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+		unsigned long fold_temp;
+
+		/*
+		 * Zbb is likely available when the kernel is compiled with Zbb
+		 * support, so nop when Zbb is available and jump when Zbb is
+		 * not available.
+		 */
+		asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+				     RISCV_ISA_EXT_ZBB, 1)
+			 :
+			 :
+			 :
+			 : no_zbb);
+
+		/*
+		 * The asm overwrites %[offset] (andi), so it has to be a
+		 * read-write operand; an input-only operand must not be
+		 * modified.
+		 */
+#ifdef CONFIG_32BIT
+		asm_goto_output(".option push \n\
+ .option arch,+zbb \n\
+ rori %[fold_temp], %[csum], 16 \n\
+ andi %[offset], %[offset], 1 \n\
+ add %[csum], %[fold_temp], %[csum] \n\
+ beq %[offset], zero, %l[end] \n\
+ rev8 %[csum], %[csum] \n\
+ .option pop"
+			: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp),
+			  [offset] "+r" (offset)
+			:
+			:
+			: end);
+
+		/* Odd start address: rev8 moved the swapped sum to the low half. */
+		return (unsigned short)csum;
+#else /* !CONFIG_32BIT */
+		asm_goto_output(".option push \n\
+ .option arch,+zbb \n\
+ rori %[fold_temp], %[csum], 32 \n\
+ add %[csum], %[fold_temp], %[csum] \n\
+ srli %[csum], %[csum], 32 \n\
+ roriw %[fold_temp], %[csum], 16 \n\
+ addw %[csum], %[fold_temp], %[csum] \n\
+ andi %[offset], %[offset], 1 \n\
+ beq %[offset], zero, %l[end] \n\
+ rev8 %[csum], %[csum] \n\
+ .option pop"
+			: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp),
+			  [offset] "+r" (offset)
+			:
+			:
+			: end);
+
+		/* Odd start address: rev8 moved the swapped sum to bits 32..47. */
+		return (csum << 16) >> 48;
+#endif /* !CONFIG_32BIT */
+end:
+		/* Even start address: the folded sum sits in bits 16..31. */
+		return csum >> 16;
+	}
+no_zbb:
+#endif /* CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT */
+#ifndef CONFIG_32BIT
+	csum += ror64(csum, 32);
+	csum >>= 32;
+#endif
+	csum = (u32)csum + ror32((u32)csum, 16);
+	if (offset & 1)
+		return (u16)swab32(csum);
+	return csum >> 16;
+}
+
+/*
+ * Checksum @len bytes at @buff without aligning the first load.
+ *
+ * Does not perform alignment, should only be used if machine has fast
+ * misaligned accesses, or when buff is known to be aligned.
+ *
+ * Returns the sum folded to 16 bits.
+ */
+static inline __no_sanitize_address unsigned int
+do_csum_no_alignment(const unsigned char *buff, int len)
+{
+	unsigned long csum, data;
+	const unsigned long *ptr, *end;
+
+	/*
+	 * This function is not instrumented, so check the range explicitly
+	 * and do it before the first load, like do_csum_with_alignment() does.
+	 */
+	kasan_check_read(buff, len);
+
+	ptr = (const unsigned long *)(buff);
+	data = *(ptr++);
+
+	end = (const unsigned long *)(buff + len);
+	csum = do_csum_common(ptr, end, data);
+
+	/*
+	 * Zbb support saves 6 instructions, so not worth checking without
+	 * alternatives if supported
+	 */
+	if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) &&
+	    IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+		unsigned long fold_temp;
+
+		/*
+		 * Zbb is likely available when the kernel is compiled with Zbb
+		 * support, so nop when Zbb is available and jump when Zbb is
+		 * not available.
+		 */
+		asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+				     RISCV_ISA_EXT_ZBB, 1)
+			 :
+			 :
+			 :
+			 : no_zbb);
+
+#ifdef CONFIG_32BIT
+		asm (".option push \n\
+ .option arch,+zbb \n\
+ rori %[fold_temp], %[csum], 16 \n\
+ add %[csum], %[fold_temp], %[csum] \n\
+ .option pop"
+			: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)
+			:
+			: );
+
+#else /* !CONFIG_32BIT */
+		asm (".option push \n\
+ .option arch,+zbb \n\
+ rori %[fold_temp], %[csum], 32 \n\
+ add %[csum], %[fold_temp], %[csum] \n\
+ srli %[csum], %[csum], 32 \n\
+ roriw %[fold_temp], %[csum], 16 \n\
+ addw %[csum], %[fold_temp], %[csum] \n\
+ .option pop"
+			: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)
+			:
+			: );
+#endif /* !CONFIG_32BIT */
+		/* The folded sum sits in bits 16..31. */
+		return csum >> 16;
+	}
+no_zbb:
+#ifndef CONFIG_32BIT
+	csum += ror64(csum, 32);
+	csum >>= 32;
+#endif
+	csum = (u32)csum + ror32((u32)csum, 16);
+	return csum >> 16;
+}
+
+/*
+ * Checksum @len bytes starting at an arbitrary address @buff.
+ *
+ * Returns 0 for a non-positive @len. Otherwise picks between the two
+ * implementations: the lightweight aligning one is only needed when the
+ * buffer is misaligned and the CPU has no fast misaligned accesses.
+ */
+unsigned int do_csum(const unsigned char *buff, int len)
+{
+	if (unlikely(len <= 0))
+		return 0;
+
+	/*
+	 * Skipping the alignment work gives significant performance gains on
+	 * machines with fast misaligned accesses.
+	 *
+	 * The "with_alignment" and "no_alignment" implementations share some
+	 * code, but merging them into one function would need multiple static
+	 * branches. The largest common part lives in do_csum_common().
+	 */
+	if (!has_fast_unaligned_accesses() &&
+	    ((unsigned long)buff & OFFSET_MASK) != 0)
+		return do_csum_with_alignment(buff, len);
+
+	return do_csum_no_alignment(buff, len);
+}
diff --git a/arch/riscv/lib/memset.S b/arch/riscv/lib/memset.S
index 35f358e70bdb..da23b8347e2d 100644
--- a/arch/riscv/lib/memset.S
+++ b/arch/riscv/lib/memset.S
@@ -111,3 +111,5 @@ SYM_FUNC_START(__memset)
ret
SYM_FUNC_END(__memset)
SYM_FUNC_ALIAS_WEAK(memset, __memset)
+SYM_FUNC_ALIAS(__pi_memset, __memset)
+SYM_FUNC_ALIAS(__pi___memset, __memset)
diff --git a/arch/riscv/lib/riscv_v_helpers.c b/arch/riscv/lib/riscv_v_helpers.c
new file mode 100644
index 000000000000..be38a93cedae
--- /dev/null
+++ b/arch/riscv/lib/riscv_v_helpers.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 SiFive
+ * Author: Andy Chiu <andy.chiu@sifive.com>
+ */
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+#include <asm/vector.h>
+#include <asm/simd.h>
+
+#ifdef CONFIG_MMU
+#include <asm/asm-prototypes.h>
+#endif
+
+#ifdef CONFIG_MMU
+size_t riscv_v_usercopy_threshold = CONFIG_RISCV_ISA_V_UCOPY_THRESHOLD;
+int __asm_vector_usercopy(void *dst, void *src, size_t n);
+int fallback_scalar_usercopy(void *dst, void *src, size_t n);
+/*
+ * C entry point for the vector user copy.
+ *
+ * Runs __asm_vector_usercopy() inside a kernel_vector_begin()/end() section
+ * when may_use_simd() allows it. Whatever the vector routine leaves uncopied
+ * (or the whole request when SIMD may not be used) is handed to
+ * fallback_scalar_usercopy(), whose result is then returned. Returns 0 when
+ * the vector routine copied everything.
+ */
+asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n)
+{
+	size_t left, done;
+
+	/* skip has_vector() check because it has been done by the asm */
+	if (may_use_simd()) {
+		kernel_vector_begin();
+		left = __asm_vector_usercopy(dst, src, n);
+		kernel_vector_end();
+
+		if (!left)
+			return 0;
+
+		/* Continue with the scalar copy right after the copied part. */
+		done = n - left;
+		dst += done;
+		src += done;
+		n = left;
+	}
+
+	return fallback_scalar_usercopy(dst, src, n);
+}
+#endif
diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
index 687b2bea5c43..57a5c0066231 100644
--- a/arch/riscv/lib/strcmp.S
+++ b/arch/riscv/lib/strcmp.S
@@ -120,3 +120,5 @@ strcmp_zbb:
.option pop
#endif
SYM_FUNC_END(strcmp)
+SYM_FUNC_ALIAS(__pi_strcmp, strcmp)
+EXPORT_SYMBOL(strcmp)
diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S
index 8ae3064e45ff..962983b73251 100644
--- a/arch/riscv/lib/strlen.S
+++ b/arch/riscv/lib/strlen.S
@@ -131,3 +131,4 @@ strlen_zbb:
#endif
SYM_FUNC_END(strlen)
SYM_FUNC_ALIAS(__pi_strlen, strlen)
+EXPORT_SYMBOL(strlen)
diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S
index aba5b3148621..7b2d0ff9ed6c 100644
--- a/arch/riscv/lib/strncmp.S
+++ b/arch/riscv/lib/strncmp.S
@@ -136,3 +136,5 @@ strncmp_zbb:
.option pop
#endif
SYM_FUNC_END(strncmp)
+SYM_FUNC_ALIAS(__pi_strncmp, strncmp)
+EXPORT_SYMBOL(strncmp)
diff --git a/arch/riscv/lib/tishift.S b/arch/riscv/lib/tishift.S
index ef90075c4b0a..c8294bf72c06 100644
--- a/arch/riscv/lib/tishift.S
+++ b/arch/riscv/lib/tishift.S
@@ -4,7 +4,7 @@
*/
#include <linux/linkage.h>
-#include <asm-generic/export.h>
+#include <linux/export.h>
SYM_FUNC_START(__lshrti3)
beqz a2, .L1
diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S
index 3ab438f30d13..6a9f116bb545 100644
--- a/arch/riscv/lib/uaccess.S
+++ b/arch/riscv/lib/uaccess.S
@@ -1,8 +1,10 @@
#include <linux/linkage.h>
-#include <asm-generic/export.h>
+#include <linux/export.h>
#include <asm/asm.h>
#include <asm/asm-extable.h>
#include <asm/csr.h>
+#include <asm/hwcap.h>
+#include <asm/alternative-macros.h>
.macro fixup op reg addr lbl
100:
@@ -11,6 +13,13 @@
.endm
SYM_FUNC_START(__asm_copy_to_user)
+#ifdef CONFIG_RISCV_ISA_V
+ ALTERNATIVE("j fallback_scalar_usercopy", "nop", 0, RISCV_ISA_EXT_ZVE32X, CONFIG_RISCV_ISA_V)
+ REG_L t0, riscv_v_usercopy_threshold
+ bltu a2, t0, fallback_scalar_usercopy
+ tail enter_vector_usercopy
+#endif
+SYM_FUNC_START(fallback_scalar_usercopy)
/* Enable access to user memory */
li t6, SR_SUM
@@ -35,7 +44,7 @@ SYM_FUNC_START(__asm_copy_to_user)
* Use byte copy only if too small.
* SZREG holds 4 for RV32 and 8 for RV64
*/
- li a3, 9*SZREG /* size must be larger than size in word_copy */
+ li a3, 9*SZREG-1 /* size must >= (word_copy stride + SZREG-1) */
bltu a2, a3, .Lbyte_copy_tail
/*
@@ -94,7 +103,7 @@ SYM_FUNC_START(__asm_copy_to_user)
fixup REG_S t4, 7*SZREG(a0), 10f
addi a0, a0, 8*SZREG
addi a1, a1, 8*SZREG
- bltu a0, t0, 2b
+ bleu a0, t0, 2b
addi t0, t0, 8*SZREG /* revert to original value */
j .Lbyte_copy_tail
@@ -181,6 +190,7 @@ SYM_FUNC_START(__asm_copy_to_user)
sub a0, t5, a0
ret
SYM_FUNC_END(__asm_copy_to_user)
+SYM_FUNC_END(fallback_scalar_usercopy)
EXPORT_SYMBOL(__asm_copy_to_user)
SYM_FUNC_ALIAS(__asm_copy_from_user, __asm_copy_to_user)
EXPORT_SYMBOL(__asm_copy_from_user)
diff --git a/arch/riscv/lib/uaccess_vector.S b/arch/riscv/lib/uaccess_vector.S
new file mode 100644
index 000000000000..7c45f26de4f7
--- /dev/null
+++ b/arch/riscv/lib/uaccess_vector.S
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/asm-extable.h>
+#include <asm/csr.h>
+
+#define pDst a0
+#define pSrc a1
+#define iNum a2
+
+#define iVL a3
+
+#define ELEM_LMUL_SETTING m8
+#define vData v0
+
+ .macro fixup op reg addr lbl
+100:
+ \op \reg, \addr
+ _asm_extable 100b, \lbl
+ .endm
+
+/*
+ * Byte-wise vector copy with exception fixups.
+ *
+ * In:  a0 (pDst) destination, a1 (pSrc) source, a2 (iNum) byte count
+ * Out: a0 = number of bytes that were not copied (0 when the loop completes)
+ *
+ * SR_SUM is set in CSR_STATUS on entry and cleared again on every exit path.
+ * t6, t2 and a3 (iVL) are used as scratch registers.
+ */
+SYM_FUNC_START(__asm_vector_usercopy)
+ /* Enable access to user memory */
+ li t6, SR_SUM
+ csrs CSR_STATUS, t6
+
+loop:
+ /* iVL = number of bytes handled in this iteration (e8 elements, LMUL m8) */
+ vsetvli iVL, iNum, e8, ELEM_LMUL_SETTING, ta, ma
+ fixup vle8.v vData, (pSrc), 10f
+ sub iNum, iNum, iVL
+ add pSrc, pSrc, iVL
+ fixup vse8.v vData, (pDst), 11f
+ add pDst, pDst, iVL
+ bnez iNum, loop
+
+ /* Exception fixup for vector load is shared with normal exit */
+10:
+ /* Disable access to user memory */
+ csrc CSR_STATUS, t6
+ /* Return the remaining byte count */
+ mv a0, iNum
+ ret
+
+ /* Exception fixup code for vector store. */
+11:
+ /* Undo the subtraction after vle8.v */
+ add iNum, iNum, iVL
+ /* Make sure the scalar fallback skip already processed bytes */
+ csrr t2, CSR_VSTART
+ sub iNum, iNum, t2
+ j 10b
+SYM_FUNC_END(__asm_vector_usercopy)
diff --git a/arch/riscv/lib/xor.S b/arch/riscv/lib/xor.S
new file mode 100644
index 000000000000..b28f2430e52f
--- /dev/null
+++ b/arch/riscv/lib/xor.S
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2021 SiFive
+ */
+#include <linux/linkage.h>
+#include <linux/export.h>
+#include <asm/asm.h>
+
+/*
+ * XOR two buffers with the vector unit.
+ *
+ * In: a0 = byte count, a1 = first buffer (also the destination),
+ *     a2 = second buffer
+ *
+ * Each pass handles a3 bytes (the vl chosen by vsetvli for e8/m8), stores
+ * a1 ^ a2 back to a1 and advances both pointers, until a0 reaches zero.
+ */
+SYM_FUNC_START(xor_regs_2_)
+ vsetvli a3, a0, e8, m8, ta, ma
+ vle8.v v0, (a1)
+ vle8.v v8, (a2)
+ sub a0, a0, a3
+ vxor.vv v16, v0, v8
+ add a2, a2, a3
+ vse8.v v16, (a1)
+ add a1, a1, a3
+ bnez a0, xor_regs_2_
+ ret
+SYM_FUNC_END(xor_regs_2_)
+EXPORT_SYMBOL(xor_regs_2_)
+
+/*
+ * XOR three buffers with the vector unit.
+ *
+ * In: a0 = byte count, a1 = first buffer (also the destination),
+ *     a2, a3 = further source buffers
+ *
+ * Each pass handles a4 bytes (the vl chosen by vsetvli for e8/m8), stores
+ * a1 ^ a2 ^ a3 back to a1 and advances all pointers, until a0 reaches zero.
+ */
+SYM_FUNC_START(xor_regs_3_)
+ vsetvli a4, a0, e8, m8, ta, ma
+ vle8.v v0, (a1)
+ vle8.v v8, (a2)
+ sub a0, a0, a4
+ vxor.vv v0, v0, v8
+ vle8.v v16, (a3)
+ add a2, a2, a4
+ vxor.vv v16, v0, v16
+ add a3, a3, a4
+ vse8.v v16, (a1)
+ add a1, a1, a4
+ bnez a0, xor_regs_3_
+ ret
+SYM_FUNC_END(xor_regs_3_)
+EXPORT_SYMBOL(xor_regs_3_)
+
+/*
+ * XOR four buffers with the vector unit.
+ *
+ * In: a0 = byte count, a1 = first buffer (also the destination),
+ *     a2, a3, a4 = further source buffers
+ *
+ * Each pass handles a5 bytes (the vl chosen by vsetvli for e8/m8), stores
+ * a1 ^ a2 ^ a3 ^ a4 back to a1 and advances all pointers, until a0 reaches
+ * zero.
+ */
+SYM_FUNC_START(xor_regs_4_)
+ vsetvli a5, a0, e8, m8, ta, ma
+ vle8.v v0, (a1)
+ vle8.v v8, (a2)
+ sub a0, a0, a5
+ vxor.vv v0, v0, v8
+ vle8.v v16, (a3)
+ add a2, a2, a5
+ vxor.vv v0, v0, v16
+ vle8.v v24, (a4)
+ add a3, a3, a5
+ vxor.vv v16, v0, v24
+ add a4, a4, a5
+ vse8.v v16, (a1)
+ add a1, a1, a5
+ bnez a0, xor_regs_4_
+ ret
+SYM_FUNC_END(xor_regs_4_)
+EXPORT_SYMBOL(xor_regs_4_)
+
+/*
+ * XOR five buffers with the vector unit.
+ *
+ * In: a0 = byte count, a1 = first buffer (also the destination),
+ *     a2, a3, a4, a5 = further source buffers
+ *
+ * Each pass handles a6 bytes (the vl chosen by vsetvli for e8/m8), stores
+ * a1 ^ a2 ^ a3 ^ a4 ^ a5 back to a1 and advances all pointers, until a0
+ * reaches zero. v8 is reused for the fifth buffer once the second one has
+ * been folded into v0.
+ */
+SYM_FUNC_START(xor_regs_5_)
+ vsetvli a6, a0, e8, m8, ta, ma
+ vle8.v v0, (a1)
+ vle8.v v8, (a2)
+ sub a0, a0, a6
+ vxor.vv v0, v0, v8
+ vle8.v v16, (a3)
+ add a2, a2, a6
+ vxor.vv v0, v0, v16
+ vle8.v v24, (a4)
+ add a3, a3, a6
+ vxor.vv v0, v0, v24
+ vle8.v v8, (a5)
+ add a4, a4, a6
+ vxor.vv v16, v0, v8
+ add a5, a5, a6
+ vse8.v v16, (a1)
+ add a1, a1, a6
+ bnez a0, xor_regs_5_
+ ret
+SYM_FUNC_END(xor_regs_5_)
+EXPORT_SYMBOL(xor_regs_5_)