Diffstat (limited to 'arch/riscv/lib')
-rw-r--r--   arch/riscv/lib/Makefile          |  10
-rw-r--r--   arch/riscv/lib/clear_page.S      |   2
-rw-r--r--   arch/riscv/lib/crc32-riscv.c     | 311
-rw-r--r--   arch/riscv/lib/csum.c            | 325
-rw-r--r--   arch/riscv/lib/memset.S          |   2
-rw-r--r--   arch/riscv/lib/riscv_v_helpers.c |  45
-rw-r--r--   arch/riscv/lib/strcmp.S          |   2
-rw-r--r--   arch/riscv/lib/strlen.S          |   1
-rw-r--r--   arch/riscv/lib/strncmp.S         |   2
-rw-r--r--   arch/riscv/lib/tishift.S         |   2
-rw-r--r--   arch/riscv/lib/uaccess.S         |  16
-rw-r--r--   arch/riscv/lib/uaccess_vector.S  |  52
-rw-r--r--   arch/riscv/lib/xor.S             |  81
13 files changed, 845 insertions(+), 6 deletions(-)
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 26cb2502ecf8..79368a895fee 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -3,11 +3,19 @@ lib-y += delay.o
 lib-y += memcpy.o
 lib-y += memset.o
 lib-y += memmove.o
+ifeq ($(CONFIG_KASAN_GENERIC)$(CONFIG_KASAN_SW_TAGS),)
 lib-y += strcmp.o
 lib-y += strlen.o
 lib-y += strncmp.o
+endif
+lib-y += csum.o
+ifeq ($(CONFIG_MMU), y)
+lib-$(CONFIG_RISCV_ISA_V) += uaccess_vector.o
+endif
 lib-$(CONFIG_MMU) += uaccess.o
 lib-$(CONFIG_64BIT) += tishift.o
 lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o
-
+obj-$(CONFIG_CRC32_ARCH) += crc32-riscv.o
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
+lib-$(CONFIG_RISCV_ISA_V) += xor.o
+lib-$(CONFIG_RISCV_ISA_V) += riscv_v_helpers.o
diff --git a/arch/riscv/lib/clear_page.S b/arch/riscv/lib/clear_page.S
index b22de1231144..20ff03f5b0f2 100644
--- a/arch/riscv/lib/clear_page.S
+++ b/arch/riscv/lib/clear_page.S
@@ -4,9 +4,9 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/export.h>
 #include <asm/asm.h>
 #include <asm/alternative-macros.h>
-#include <asm-generic/export.h>
 #include <asm/hwcap.h>
 #include <asm/insn-def.h>
 #include <asm/page.h>
diff --git a/arch/riscv/lib/crc32-riscv.c b/arch/riscv/lib/crc32-riscv.c
new file mode 100644
index 000000000000..53d56ab422c7
--- /dev/null
+++ b/arch/riscv/lib/crc32-riscv.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC32 implementation with Zbc extension.
+ *
+ * Copyright (C) 2024 Intel Corporation
+ */
+
+#include <asm/hwcap.h>
+#include <asm/alternative-macros.h>
+#include <asm/byteorder.h>
+
+#include <linux/types.h>
+#include <linux/minmax.h>
+#include <linux/crc32poly.h>
+#include <linux/crc32.h>
+#include <linux/byteorder/generic.h>
+#include <linux/module.h>
+
+/*
+ * Refer to https://www.corsix.org/content/barrett-reduction-polynomials for
+ * better understanding of how this math works.
+ *
+ * let "+" denotes polynomial add (XOR)
+ * let "-" denotes polynomial sub (XOR)
+ * let "*" denotes polynomial multiplication
+ * let "/" denotes polynomial floor division
+ * let "S" denotes source data, XLEN bit wide
+ * let "P" denotes CRC32 polynomial
+ * let "T" denotes 2^(XLEN+32)
+ * let "QT" denotes quotient of T/P, with the bit for 2^XLEN being implicit
+ *
+ * crc32(S, P)
+ * => S * (2^32) - S * (2^32) / P * P
+ * => lowest 32 bits of: S * (2^32) / P * P
+ * => lowest 32 bits of: S * (2^32) * (T / P) / T * P
+ * => lowest 32 bits of: S * (2^32) * quotient / T * P
+ * => lowest 32 bits of: S * quotient / 2^XLEN * P
+ * => lowest 32 bits of: (clmul_high_part(S, QT) + S) * P
+ * => clmul_low_part(clmul_high_part(S, QT) + S, P)
+ *
+ * In terms of below implementations, the BE case is more intuitive, since the
+ * higher order bit sits at more significant position.
+ */
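Note: the Barrett derivation in the comment above can be sanity-checked outside the kernel with a software carry-less multiply. The userspace sketch below follows the BE path (the one the comment calls more intuitive); clmul64(), crc32_be_ref() and the test values are illustrative stand-ins, not part of this patch:

#include <assert.h>
#include <stdint.h>

#define CRC32_POLY_BE 0x04c11db7u
#define CRC32_POLY_QT_BE 0x04d101df481b4e5aULL /* bit for 2^64 implicit */

/* Software carry-less multiply: full 128-bit product as hi/lo halves. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t h = 0, l = 0;
	int i;

	for (i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			l ^= a << i;
			h ^= i ? a >> (64 - i) : 0;
		}
	}
	*hi = h;
	*lo = l;
}

/* Mirrors crc32_be_zbc(): clmulh, add the implicit QT bit, clmul low. */
static uint32_t crc32_be_barrett(uint64_t s)
{
	uint64_t hi, lo;

	clmul64(s, CRC32_POLY_QT_BE, &hi, &lo); /* hi = clmulh(s, QT) */
	hi ^= s;				/* quotient q = hi + s */
	clmul64(hi, CRC32_POLY_BE, &hi, &lo);	/* low 32 of q*P */
	return (uint32_t)lo;	/* = remainder, since low 32 of s*x^32 are 0 */
}

/* Bit-at-a-time reference, big-endian bit order. */
static uint32_t crc32_be_ref(uint32_t crc, const uint8_t *p, int len)
{
	while (len--) {
		int i;

		crc ^= (uint32_t)*p++ << 24;
		for (i = 0; i < 8; i++)
			crc = crc & 0x80000000u ?
			      (crc << 1) ^ CRC32_POLY_BE : crc << 1;
	}
	return crc;
}

int main(void)
{
	uint64_t data = 0x0123456789abcdefULL;
	uint32_t crc = 0xdeadbeef;
	uint8_t bytes[8];
	int i;

	for (i = 0; i < 8; i++)
		bytes[i] = data >> (56 - 8 * i); /* MSB first, as be64 */

	/* s is built exactly like crc32_be_prep() below */
	assert(crc32_be_barrett(((uint64_t)crc << 32) ^ data) ==
	       crc32_be_ref(crc, bytes, 8));
	return 0;
}

The passing assert is a quick way to convince yourself that the clmulh/xor/clmul sequence in crc32_be_zbc() really computes a polynomial remainder.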
+
+#if __riscv_xlen == 64
+/* Slide by XLEN bits per iteration */
+# define STEP_ORDER 3
+
+/* Each below polynomial quotient has an implicit bit for 2^XLEN */
+
+/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in LE format */
+# define CRC32_POLY_QT_LE 0x5a72d812fb808b20
+
+/* Polynomial quotient of (2^(XLEN+32))/CRC32C_POLY, in LE format */
+# define CRC32C_POLY_QT_LE 0xa434f61c6f5389f8
+
+/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in BE format, it should be
+ * the same as the bit-reversed version of CRC32_POLY_QT_LE
+ */
+# define CRC32_POLY_QT_BE 0x04d101df481b4e5a
+
+static inline u64 crc32_le_prep(u32 crc, unsigned long const *ptr)
+{
+	return (u64)crc ^ (__force u64)__cpu_to_le64(*ptr);
+}
+
+static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt)
+{
+	u32 crc;
+
+	/* We don't have a "clmulrh" insn, so use clmul + slli instead. */
+	asm volatile (".option push\n"
+		      ".option arch,+zbc\n"
+		      "clmul	%0, %1, %2\n"
+		      "slli	%0, %0, 1\n"
+		      "xor	%0, %0, %1\n"
+		      "clmulr	%0, %0, %3\n"
+		      "srli	%0, %0, 32\n"
+		      ".option pop\n"
+		      : "=&r" (crc)
+		      : "r" (s),
+			"r" (poly_qt),
+			"r" ((u64)poly << 32)
+		      :);
+	return crc;
+}
+
+static inline u64 crc32_be_prep(u32 crc, unsigned long const *ptr)
+{
+	return ((u64)crc << 32) ^ (__force u64)__cpu_to_be64(*ptr);
+}
+
+#elif __riscv_xlen == 32
+# define STEP_ORDER 2
+/* Each quotient should match the upper half of its analog in RV64 */
+# define CRC32_POLY_QT_LE 0xfb808b20
+# define CRC32C_POLY_QT_LE 0x6f5389f8
+# define CRC32_POLY_QT_BE 0x04d101df
+
+static inline u32 crc32_le_prep(u32 crc, unsigned long const *ptr)
+{
+	return crc ^ (__force u32)__cpu_to_le32(*ptr);
+}
+
+static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt)
+{
+	u32 crc;
+
+	/* We don't have a "clmulrh" insn, so use clmul + slli instead. */
+	asm volatile (".option push\n"
+		      ".option arch,+zbc\n"
+		      "clmul	%0, %1, %2\n"
+		      "slli	%0, %0, 1\n"
+		      "xor	%0, %0, %1\n"
+		      "clmulr	%0, %0, %3\n"
+		      ".option pop\n"
+		      : "=&r" (crc)
+		      : "r" (s),
+			"r" (poly_qt),
+			"r" (poly)
+		      :);
+	return crc;
+}
+
+static inline u32 crc32_be_prep(u32 crc, unsigned long const *ptr)
+{
+	return crc ^ (__force u32)__cpu_to_be32(*ptr);
+}
+
+#else
+# error "Unexpected __riscv_xlen"
+#endif
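Note: the comment above asserts that CRC32_POLY_QT_BE should be the bit-reversed CRC32_POLY_QT_LE. That is cheap to verify on the side; bitrev64() here is a hand-rolled helper for the check, not a kernel API:

#include <assert.h>
#include <stdint.h>

/* Plain bit reversal of a 64-bit value. */
static uint64_t bitrev64(uint64_t x)
{
	uint64_t r = 0;
	int i;

	for (i = 0; i < 64; i++)
		r |= ((x >> i) & 1) << (63 - i);
	return r;
}

int main(void)
{
	/* QT_BE is indeed the bit-reversed QT_LE, as the comment claims. */
	assert(bitrev64(0x5a72d812fb808b20ULL) == 0x04d101df481b4e5aULL);
	return 0;
}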
+
+static inline u32 crc32_be_zbc(unsigned long s)
+{
+	u32 crc;
+
+	asm volatile (".option push\n"
+		      ".option arch,+zbc\n"
+		      "clmulh	%0, %1, %2\n"
+		      "xor	%0, %0, %1\n"
+		      "clmul	%0, %0, %3\n"
+		      ".option pop\n"
+		      : "=&r" (crc)
+		      : "r" (s),
+			"r" (CRC32_POLY_QT_BE),
+			"r" (CRC32_POLY_BE)
+		      :);
+	return crc;
+}
+
+#define STEP (1 << STEP_ORDER)
+#define OFFSET_MASK (STEP - 1)
+
+typedef u32 (*fallback)(u32 crc, unsigned char const *p, size_t len);
+
+static inline u32 crc32_le_unaligned(u32 crc, unsigned char const *p,
+				     size_t len, u32 poly,
+				     unsigned long poly_qt)
+{
+	size_t bits = len * 8;
+	unsigned long s = 0;
+	u32 crc_low = 0;
+
+	for (int i = 0; i < len; i++)
+		s = ((unsigned long)*p++ << (__riscv_xlen - 8)) | (s >> 8);
+
+	s ^= (unsigned long)crc << (__riscv_xlen - bits);
+	if (__riscv_xlen == 32 || len < sizeof(u32))
+		crc_low = crc >> bits;
+
+	crc = crc32_le_zbc(s, poly, poly_qt);
+	crc ^= crc_low;
+
+	return crc;
+}
+
+static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
+					  size_t len, u32 poly,
+					  unsigned long poly_qt,
+					  fallback crc_fb)
+{
+	size_t offset, head_len, tail_len;
+	unsigned long const *p_ul;
+	unsigned long s;
+
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+			     RISCV_ISA_EXT_ZBC, 1)
+		 : : : : legacy);
+
+	/* Handle the unaligned head. */
+	offset = (unsigned long)p & OFFSET_MASK;
+	if (offset && len) {
+		head_len = min(STEP - offset, len);
+		crc = crc32_le_unaligned(crc, p, head_len, poly, poly_qt);
+		p += head_len;
+		len -= head_len;
+	}
+
+	tail_len = len & OFFSET_MASK;
+	len = len >> STEP_ORDER;
+	p_ul = (unsigned long const *)p;
+
+	for (int i = 0; i < len; i++) {
+		s = crc32_le_prep(crc, p_ul);
+		crc = crc32_le_zbc(s, poly, poly_qt);
+		p_ul++;
+	}
+
+	/* Handle the tail bytes. */
+	p = (unsigned char const *)p_ul;
+	if (tail_len)
+		crc = crc32_le_unaligned(crc, p, tail_len, poly, poly_qt);
+
+	return crc;
+
+legacy:
+	return crc_fb(crc, p, len);
+}
+
+u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+	return crc32_le_generic(crc, p, len, CRC32_POLY_LE, CRC32_POLY_QT_LE,
+				crc32_le_base);
+}
+EXPORT_SYMBOL(crc32_le_arch);
+
+u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len)
+{
+	return crc32_le_generic(crc, p, len, CRC32C_POLY_LE,
+				CRC32C_POLY_QT_LE, crc32c_le_base);
+}
+EXPORT_SYMBOL(crc32c_le_arch);
+
+static inline u32 crc32_be_unaligned(u32 crc, unsigned char const *p,
+				     size_t len)
+{
+	size_t bits = len * 8;
+	unsigned long s = 0;
+	u32 crc_low = 0;
+
+	s = 0;
+	for (int i = 0; i < len; i++)
+		s = *p++ | (s << 8);
+
+	if (__riscv_xlen == 32 || len < sizeof(u32)) {
+		s ^= crc >> (32 - bits);
+		crc_low = crc << bits;
+	} else {
+		s ^= (unsigned long)crc << (bits - 32);
+	}
+
+	crc = crc32_be_zbc(s);
+	crc ^= crc_low;
+
+	return crc;
+}
+
+u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len)
+{
+	size_t offset, head_len, tail_len;
+	unsigned long const *p_ul;
+	unsigned long s;
+
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+			     RISCV_ISA_EXT_ZBC, 1)
+		 : : : : legacy);
+
+	/* Handle the unaligned head. */
+	offset = (unsigned long)p & OFFSET_MASK;
+	if (offset && len) {
+		head_len = min(STEP - offset, len);
+		crc = crc32_be_unaligned(crc, p, head_len);
+		p += head_len;
+		len -= head_len;
+	}
+
+	tail_len = len & OFFSET_MASK;
+	len = len >> STEP_ORDER;
+	p_ul = (unsigned long const *)p;
+
+	for (int i = 0; i < len; i++) {
+		s = crc32_be_prep(crc, p_ul);
+		crc = crc32_be_zbc(s);
+		p_ul++;
+	}
+
+	/* Handle the tail bytes. */
+	p = (unsigned char const *)p_ul;
+	if (tail_len)
+		crc = crc32_be_unaligned(crc, p, tail_len);
+
+	return crc;
+
+legacy:
+	return crc32_be_base(crc, p, len);
+}
+EXPORT_SYMBOL(crc32_be_arch);
+
+u32 crc32_optimizations(void)
+{
+	if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+		return CRC32_LE_OPTIMIZATION |
+		       CRC32_BE_OPTIMIZATION |
+		       CRC32C_OPTIMIZATION;
+	return 0;
+}
+EXPORT_SYMBOL(crc32_optimizations);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Accelerated CRC32 implementation with Zbc extension");
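Note: the head/body/tail split used by crc32_le_generic() and crc32_be_arch() above is a generic pattern: go byte-wise until the pointer is word-aligned, run the fast word loop, then finish the remainder byte-wise. A minimal scalar sketch of just that carving, with an XOR fold standing in for the actual CRC step (fold_buffer() is an illustrative name, not kernel code):

#include <stddef.h>
#include <stdint.h>

#define STEP sizeof(unsigned long)
#define OFFSET_MASK (STEP - 1)

/* Word-at-a-time fold with the same head/body/tail split as the CRC code. */
unsigned long fold_buffer(const unsigned char *p, size_t len)
{
	unsigned long acc = 0;
	size_t offset = (uintptr_t)p & OFFSET_MASK;
	size_t i;

	/* Unaligned head: consume bytes until p is word-aligned. */
	if (offset && len) {
		size_t head = STEP - offset < len ? STEP - offset : len;

		for (i = 0; i < head; i++)
			acc ^= (unsigned long)*p++ << (8 * i);
		len -= head;
	}

	/* Aligned body: whole words. */
	const unsigned long *w = (const unsigned long *)p;

	for (i = 0; i < len / STEP; i++)
		acc ^= *w++;

	/* Tail: the len & OFFSET_MASK leftover bytes. */
	p = (const unsigned char *)w;
	for (i = 0; i < (len & OFFSET_MASK); i++)
		acc ^= (unsigned long)*p++ << (8 * i);

	return acc;
}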
diff --git a/arch/riscv/lib/csum.c b/arch/riscv/lib/csum.c
new file mode 100644
index 000000000000..7fb12c59e571
--- /dev/null
+++ b/arch/riscv/lib/csum.c
@@ -0,0 +1,325 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Checksum library
+ *
+ * Influenced by arch/arm64/lib/csum.c
+ * Copyright (C) 2023-2024 Rivos Inc.
+ */
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/jump_label.h>
+#include <linux/kasan-checks.h>
+#include <linux/kernel.h>
+
+#include <asm/cpufeature.h>
+
+#include <net/checksum.h>
+
+/* Default version is sufficient for 32 bit */
+#ifndef CONFIG_32BIT
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum csum)
+{
+	unsigned int ulen, uproto;
+	unsigned long sum = (__force unsigned long)csum;
+
+	sum += (__force unsigned long)saddr->s6_addr32[0];
+	sum += (__force unsigned long)saddr->s6_addr32[1];
+	sum += (__force unsigned long)saddr->s6_addr32[2];
+	sum += (__force unsigned long)saddr->s6_addr32[3];
+
+	sum += (__force unsigned long)daddr->s6_addr32[0];
+	sum += (__force unsigned long)daddr->s6_addr32[1];
+	sum += (__force unsigned long)daddr->s6_addr32[2];
+	sum += (__force unsigned long)daddr->s6_addr32[3];
+
+	ulen = (__force unsigned int)htonl((unsigned int)len);
+	sum += ulen;
+
+	uproto = (__force unsigned int)htonl(proto);
+	sum += uproto;
+
+	/*
+	 * Zbb support saves 4 instructions, so not worth checking without
+	 * alternatives if supported
+	 */
+	if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) &&
+	    IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+		unsigned long fold_temp;
+
+		/*
+		 * Zbb is likely available when the kernel is compiled with Zbb
+		 * support, so nop when Zbb is available and jump when Zbb is
+		 * not available.
+		 */
+		asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+				     RISCV_ISA_EXT_ZBB, 1)
+			 :
+			 :
+			 :
+			 : no_zbb);
+		asm(".option push					\n\
+		.option arch,+zbb					\n\
+			rori	%[fold_temp], %[sum], 32		\n\
+			add	%[sum], %[fold_temp], %[sum]		\n\
+			srli	%[sum], %[sum], 32			\n\
+			not	%[fold_temp], %[sum]			\n\
+			roriw	%[sum], %[sum], 16			\n\
+			subw	%[sum], %[fold_temp], %[sum]		\n\
+		.option pop"
+		: [sum] "+r" (sum), [fold_temp] "=&r" (fold_temp));
+		return (__force __sum16)(sum >> 16);
+	}
+no_zbb:
+	sum += ror64(sum, 32);
+	sum >>= 32;
+	return csum_fold((__force __wsum)sum);
+}
+EXPORT_SYMBOL(csum_ipv6_magic);
+#endif /* !CONFIG_32BIT */
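Note: the non-Zbb tail above folds a 64-bit accumulator down to a 16-bit checksum in two steps: a rotate-add that merges the halves with end-around carry, then the classic 32-to-16 fold. A standalone sketch of that chain, assuming a 64-bit host; csum_fold32() mirrors the generic csum_fold(), not the RISC-V-specific one:

#include <assert.h>
#include <stdint.h>

static uint64_t ror64_c(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

/* Classic 32->16 ones'-complement fold, as in the generic csum_fold(). */
static uint16_t csum_fold32(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/* 64->16 fold mirroring the non-Zbb tail: the rotate-add merges the two
 * 32-bit halves with the end-around carry applied in the same step. */
static uint16_t fold64(uint64_t sum)
{
	sum += ror64_c(sum, 32);
	return csum_fold32((uint32_t)(sum >> 32));
}

int main(void)
{
	/* 0xffffffff + 1 wraps to 1 in ones' complement, checksum ~1. */
	assert(fold64(0xffffffffULL + 1) == (uint16_t)~1);
	return 0;
}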
+
+#ifdef CONFIG_32BIT
+#define OFFSET_MASK 3
+#elif CONFIG_64BIT
+#define OFFSET_MASK 7
+#endif
+
+static inline __no_sanitize_address unsigned long
+do_csum_common(const unsigned long *ptr, const unsigned long *end,
+	       unsigned long data)
+{
+	unsigned int shift;
+	unsigned long csum = 0, carry = 0;
+
+	/*
+	 * Do 32-bit reads on RV32 and 64-bit reads otherwise. This should be
+	 * faster than doing 32-bit reads on architectures that support larger
+	 * reads.
+	 */
+	while (ptr < end) {
+		csum += data;
+		carry += csum < data;
+		data = *(ptr++);
+	}
+
+	/*
+	 * Perform alignment (and over-read) bytes on the tail if any bytes
+	 * leftover.
+	 */
+	shift = ((long)ptr - (long)end) * 8;
+#ifdef __LITTLE_ENDIAN
+	data = (data << shift) >> shift;
+#else
+	data = (data >> shift) << shift;
+#endif
+	csum += data;
+	carry += csum < data;
+	csum += carry;
+	csum += csum < carry;
+
+	return csum;
+}
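Note: do_csum_common() counts the carry-out of every addition separately and folds the count back in at the end, which keeps the sum correct for ones'-complement arithmetic. A compact illustration of that carry bookkeeping (ocsum() is an illustrative name):

#include <assert.h>
#include <stdint.h>

/* Ones'-complement accumulation as in do_csum_common(): carry-outs are
 * counted on the side and folded back in after the loop. */
static uint64_t ocsum(const uint64_t *p, size_t n)
{
	uint64_t csum = 0, carry = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		csum += p[i];
		carry += csum < p[i];	/* unsigned wrap => a carry-out */
	}
	csum += carry;
	csum += csum < carry;		/* folding the carries can carry too */
	return csum;
}

int main(void)
{
	uint64_t v[2] = { ~0ULL, 2 };	/* forces a carry-out */

	assert(ocsum(v, 2) == 2);	/* ~0 + 2 = 1 mod 2^64, +1 carry */
	return 0;
}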
+
+/*
+ * Algorithm accounts for buff being misaligned.
+ * If buff is not aligned, will over-read bytes but not use the bytes that it
+ * shouldn't. The same thing will occur on the tail-end of the read.
+ */
+static inline __no_sanitize_address unsigned int
+do_csum_with_alignment(const unsigned char *buff, int len)
+{
+	unsigned int offset, shift;
+	unsigned long csum, data;
+	const unsigned long *ptr, *end;
+
+	/*
+	 * Align address to closest word (double word on rv64) that comes before
+	 * buff. This should always be in the same page and cache line.
+	 * Directly call KASAN with the alignment we will be using.
+	 */
+	offset = (unsigned long)buff & OFFSET_MASK;
+	kasan_check_read(buff, len);
+	ptr = (const unsigned long *)(buff - offset);
+
+	/*
+	 * Clear the most significant bytes that were over-read if buff was not
+	 * aligned.
+	 */
+	shift = offset * 8;
+	data = *(ptr++);
+#ifdef __LITTLE_ENDIAN
+	data = (data >> shift) << shift;
+#else
+	data = (data << shift) >> shift;
+#endif
+	end = (const unsigned long *)(buff + len);
+	csum = do_csum_common(ptr, end, data);
+
+#ifdef CC_HAS_ASM_GOTO_TIED_OUTPUT
+	/*
+	 * Zbb support saves 6 instructions, so not worth checking without
+	 * alternatives if supported
+	 */
+	if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) &&
+	    IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+		unsigned long fold_temp;
+
+		/*
+		 * Zbb is likely available when the kernel is compiled with Zbb
+		 * support, so nop when Zbb is available and jump when Zbb is
+		 * not available.
+		 */
+		asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+				     RISCV_ISA_EXT_ZBB, 1)
+			 :
+			 :
+			 :
+			 : no_zbb);
+
+#ifdef CONFIG_32BIT
+		asm_goto_output(".option push			\n\
+		.option arch,+zbb				\n\
+			rori	%[fold_temp], %[csum], 16	\n\
+			andi	%[offset], %[offset], 1		\n\
+			add	%[csum], %[fold_temp], %[csum]	\n\
+			beq	%[offset], zero, %l[end]	\n\
+			rev8	%[csum], %[csum]		\n\
+		.option pop"
+			: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)
+			: [offset] "r" (offset)
+			:
+			: end);
+
+		return (unsigned short)csum;
+#else /* !CONFIG_32BIT */
+		asm_goto_output(".option push			\n\
+		.option arch,+zbb				\n\
+			rori	%[fold_temp], %[csum], 32	\n\
+			add	%[csum], %[fold_temp], %[csum]	\n\
+			srli	%[csum], %[csum], 32		\n\
+			roriw	%[fold_temp], %[csum], 16	\n\
+			addw	%[csum], %[fold_temp], %[csum]	\n\
+			andi	%[offset], %[offset], 1		\n\
+			beq	%[offset], zero, %l[end]	\n\
+			rev8	%[csum], %[csum]		\n\
+		.option pop"
+			: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)
+			: [offset] "r" (offset)
+			:
+			: end);
+
+		return (csum << 16) >> 48;
+#endif /* !CONFIG_32BIT */
+end:
+		return csum >> 16;
+	}
+no_zbb:
+#endif /* CC_HAS_ASM_GOTO_TIED_OUTPUT */
+#ifndef CONFIG_32BIT
+	csum += ror64(csum, 32);
+	csum >>= 32;
+#endif
+	csum = (u32)csum + ror32((u32)csum, 16);
+	if (offset & 1)
+		return (u16)swab32(csum);
+	return csum >> 16;
+}
+
+/*
+ * Does not perform alignment, should only be used if machine has fast
+ * misaligned accesses, or when buff is known to be aligned.
+ */
+static inline __no_sanitize_address unsigned int
+do_csum_no_alignment(const unsigned char *buff, int len)
+{
+	unsigned long csum, data;
+	const unsigned long *ptr, *end;
+
+	ptr = (const unsigned long *)(buff);
+	data = *(ptr++);
+
+	kasan_check_read(buff, len);
+
+	end = (const unsigned long *)(buff + len);
+	csum = do_csum_common(ptr, end, data);
+
+	/*
+	 * Zbb support saves 6 instructions, so not worth checking without
+	 * alternatives if supported
+	 */
+	if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) &&
+	    IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+		unsigned long fold_temp;
+
+		/*
+		 * Zbb is likely available when the kernel is compiled with Zbb
+		 * support, so nop when Zbb is available and jump when Zbb is
+		 * not available.
+		 */
+		asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+				     RISCV_ISA_EXT_ZBB, 1)
+			 :
+			 :
+			 :
+			 : no_zbb);
+
+#ifdef CONFIG_32BIT
+		asm (".option push				\n\
+		.option arch,+zbb				\n\
+			rori	%[fold_temp], %[csum], 16	\n\
+			add	%[csum], %[fold_temp], %[csum]	\n\
+		.option pop"
+			: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)
+			:
+			: );
+
+#else /* !CONFIG_32BIT */
+		asm (".option push				\n\
+		.option arch,+zbb				\n\
+			rori	%[fold_temp], %[csum], 32	\n\
+			add	%[csum], %[fold_temp], %[csum]	\n\
+			srli	%[csum], %[csum], 32		\n\
+			roriw	%[fold_temp], %[csum], 16	\n\
+			addw	%[csum], %[fold_temp], %[csum]	\n\
+		.option pop"
+			: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)
+			:
+			: );
+#endif /* !CONFIG_32BIT */
+		return csum >> 16;
+	}
+no_zbb:
+#ifndef CONFIG_32BIT
+	csum += ror64(csum, 32);
+	csum >>= 32;
+#endif
+	csum = (u32)csum + ror32((u32)csum, 16);
+	return csum >> 16;
+}
+
+/*
+ * Perform a checksum on an arbitrary memory address.
+ * Will do a light-weight address alignment if buff is misaligned, unless
+ * cpu supports fast misaligned accesses.
+ */
+unsigned int do_csum(const unsigned char *buff, int len)
+{
+	if (unlikely(len <= 0))
+		return 0;
+
+	/*
+	 * Significant performance gains can be seen by not doing alignment
+	 * on machines with fast misaligned accesses.
+	 *
+	 * There is some duplicate code between the "with_alignment" and
+	 * "no_alignment" implementations, but the overlap is too awkward to be
+	 * able to fit in one function without introducing multiple static
+	 * branches. The largest chunk of overlap was delegated into the
+	 * do_csum_common function.
+	 */
+	if (has_fast_unaligned_accesses() || (((unsigned long)buff & OFFSET_MASK) == 0))
+		return do_csum_no_alignment(buff, len);
+
+	return do_csum_with_alignment(buff, len);
+}
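Note: the over-read masking in do_csum_with_alignment() and do_csum_common() relies on shifting to discard bytes that do not belong to the buffer. On a little-endian host, bytes before the buffer land in the low-order end of the word, and stray tail bytes land in the high-order end. A small sketch of both masks, assuming a 64-bit little-endian host and at least one valid byte per word:

#include <assert.h>
#include <stdint.h>

/* Clear the 'offset' bytes over-read before the buffer (LE: low bytes). */
static uint64_t mask_head(uint64_t word, unsigned int offset)
{
	unsigned int shift = offset * 8;

	return (word >> shift) << shift;
}

/* Keep only the 'valid' buffer bytes of a tail over-read (LE: low bytes). */
static uint64_t mask_tail(uint64_t word, unsigned int valid)
{
	unsigned int shift = (8 - valid) * 8;

	return (word << shift) >> shift;
}

int main(void)
{
	uint64_t w = 0x8877665544332211ULL; /* LE memory bytes 11 22 ... 88 */

	assert(mask_head(w, 2) == 0x8877665544330000ULL);
	assert(mask_tail(w, 3) == 0x0000000000332211ULL);
	return 0;
}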
diff --git a/arch/riscv/lib/memset.S b/arch/riscv/lib/memset.S
index 35f358e70bdb..da23b8347e2d 100644
--- a/arch/riscv/lib/memset.S
+++ b/arch/riscv/lib/memset.S
@@ -111,3 +111,5 @@ SYM_FUNC_START(__memset)
 	ret
 SYM_FUNC_END(__memset)
 SYM_FUNC_ALIAS_WEAK(memset, __memset)
+SYM_FUNC_ALIAS(__pi_memset, __memset)
+SYM_FUNC_ALIAS(__pi___memset, __memset)
diff --git a/arch/riscv/lib/riscv_v_helpers.c b/arch/riscv/lib/riscv_v_helpers.c
new file mode 100644
index 000000000000..be38a93cedae
--- /dev/null
+++ b/arch/riscv/lib/riscv_v_helpers.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 SiFive
+ * Author: Andy Chiu <andy.chiu@sifive.com>
+ */
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+#include <asm/vector.h>
+#include <asm/simd.h>
+
+#ifdef CONFIG_MMU
+#include <asm/asm-prototypes.h>
+#endif
+
+#ifdef CONFIG_MMU
+size_t riscv_v_usercopy_threshold = CONFIG_RISCV_ISA_V_UCOPY_THRESHOLD;
+int __asm_vector_usercopy(void *dst, void *src, size_t n);
+int fallback_scalar_usercopy(void *dst, void *src, size_t n);
+asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n)
+{
+	size_t remain, copied;
+
+	/* skip has_vector() check because it has been done by the asm */
+	if (!may_use_simd())
+		goto fallback;
+
+	kernel_vector_begin();
+	remain = __asm_vector_usercopy(dst, src, n);
+	kernel_vector_end();
+
+	if (remain) {
+		copied = n - remain;
+		dst += copied;
+		src += copied;
+		n = remain;
+		goto fallback;
+	}
+
+	return remain;
+
+fallback:
+	return fallback_scalar_usercopy(dst, src, n);
+}
+#endif
diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
index 687b2bea5c43..57a5c0066231 100644
--- a/arch/riscv/lib/strcmp.S
+++ b/arch/riscv/lib/strcmp.S
@@ -120,3 +120,5 @@ strcmp_zbb:
 .option pop
 #endif
 SYM_FUNC_END(strcmp)
+SYM_FUNC_ALIAS(__pi_strcmp, strcmp)
+EXPORT_SYMBOL(strcmp)
diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S
index 8ae3064e45ff..962983b73251 100644
--- a/arch/riscv/lib/strlen.S
+++ b/arch/riscv/lib/strlen.S
@@ -131,3 +131,4 @@ strlen_zbb:
 #endif
 SYM_FUNC_END(strlen)
 SYM_FUNC_ALIAS(__pi_strlen, strlen)
+EXPORT_SYMBOL(strlen)
diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S
index aba5b3148621..7b2d0ff9ed6c 100644
--- a/arch/riscv/lib/strncmp.S
+++ b/arch/riscv/lib/strncmp.S
@@ -136,3 +136,5 @@ strncmp_zbb:
 .option pop
 #endif
 SYM_FUNC_END(strncmp)
+SYM_FUNC_ALIAS(__pi_strncmp, strncmp)
+EXPORT_SYMBOL(strncmp)
diff --git a/arch/riscv/lib/tishift.S b/arch/riscv/lib/tishift.S
index ef90075c4b0a..c8294bf72c06 100644
--- a/arch/riscv/lib/tishift.S
+++ b/arch/riscv/lib/tishift.S
@@ -4,7 +4,7 @@
  */
 
 #include <linux/linkage.h>
-#include <asm-generic/export.h>
+#include <linux/export.h>
 
 SYM_FUNC_START(__lshrti3)
 	beqz	a2, .L1
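Note: enter_vector_usercopy() above implements a resume-after-partial-copy contract: the fast path returns the number of bytes it did not copy, and the caller advances both pointers by the copied amount before invoking the scalar fallback. The same contract, sketched with a hypothetical function-pointer type (copy_fn and the fake callbacks are mine, not kernel types):

#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Both callbacks return the number of bytes NOT copied. */
typedef size_t (*copy_fn)(void *dst, const void *src, size_t n);

static size_t copy_with_fallback(void *dst, const void *src, size_t n,
				 copy_fn fast, copy_fn fallback)
{
	size_t remain = fast(dst, src, n);

	if (remain) {
		size_t copied = n - remain;

		/* Resume where the fast path stopped, as the helper does. */
		return fallback((char *)dst + copied,
				(const char *)src + copied, remain);
	}
	return 0;
}

/* Fake fast path: gives up halfway through, like a faulting vector copy. */
static size_t half_copy(void *dst, const void *src, size_t n)
{
	memcpy(dst, src, n / 2);
	return n - n / 2;
}

static size_t full_copy(void *dst, const void *src, size_t n)
{
	memcpy(dst, src, n);
	return 0;
}

int main(void)
{
	char src[16] = "0123456789abcdef", dst[16];

	assert(copy_with_fallback(dst, src, 16, half_copy, full_copy) == 0);
	assert(memcmp(dst, src, 16) == 0);
	return 0;
}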
diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S
index 3ab438f30d13..6a9f116bb545 100644
--- a/arch/riscv/lib/uaccess.S
+++ b/arch/riscv/lib/uaccess.S
@@ -1,8 +1,10 @@
 #include <linux/linkage.h>
-#include <asm-generic/export.h>
+#include <linux/export.h>
 #include <asm/asm.h>
 #include <asm/asm-extable.h>
 #include <asm/csr.h>
+#include <asm/hwcap.h>
+#include <asm/alternative-macros.h>
 
 	.macro fixup op reg addr lbl
 100:
@@ -11,6 +13,13 @@
 	.endm
 
 SYM_FUNC_START(__asm_copy_to_user)
+#ifdef CONFIG_RISCV_ISA_V
+	ALTERNATIVE("j fallback_scalar_usercopy", "nop", 0, RISCV_ISA_EXT_ZVE32X, CONFIG_RISCV_ISA_V)
+	REG_L	t0, riscv_v_usercopy_threshold
+	bltu	a2, t0, fallback_scalar_usercopy
+	tail	enter_vector_usercopy
+#endif
+SYM_FUNC_START(fallback_scalar_usercopy)
 
 	/* Enable access to user memory */
 	li	t6, SR_SUM
@@ -35,7 +44,7 @@ SYM_FUNC_START(__asm_copy_to_user)
 	 * Use byte copy only if too small.
 	 * SZREG holds 4 for RV32 and 8 for RV64
 	 */
-	li	a3, 9*SZREG /* size must be larger than size in word_copy */
+	li	a3, 9*SZREG-1 /* size must >= (word_copy stride + SZREG-1) */
 	bltu	a2, a3, .Lbyte_copy_tail
 
 	/*
@@ -94,7 +103,7 @@ SYM_FUNC_START(__asm_copy_to_user)
 	fixup REG_S	t4, 7*SZREG(a0), 10f
 	addi	a0, a0, 8*SZREG
 	addi	a1, a1, 8*SZREG
-	bltu	a0, t0, 2b
+	bleu	a0, t0, 2b
 	addi	t0, t0, 8*SZREG /* revert to original value */
 	j	.Lbyte_copy_tail
 
@@ -181,6 +190,7 @@ SYM_FUNC_START(__asm_copy_to_user)
 	sub	a0, t5, a0
 	ret
 SYM_FUNC_END(__asm_copy_to_user)
+SYM_FUNC_END(fallback_scalar_usercopy)
 EXPORT_SYMBOL(__asm_copy_to_user)
 SYM_FUNC_ALIAS(__asm_copy_from_user, __asm_copy_to_user)
 EXPORT_SYMBOL(__asm_copy_from_user)
diff --git a/arch/riscv/lib/uaccess_vector.S b/arch/riscv/lib/uaccess_vector.S
new file mode 100644
index 000000000000..7c45f26de4f7
--- /dev/null
+++ b/arch/riscv/lib/uaccess_vector.S
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/asm-extable.h>
+#include <asm/csr.h>
+
+#define pDst a0
+#define pSrc a1
+#define iNum a2
+
+#define iVL a3
+
+#define ELEM_LMUL_SETTING m8
+#define vData v0
+
+	.macro fixup op reg addr lbl
+100:
+	\op \reg, \addr
+	_asm_extable	100b, \lbl
+	.endm
+
+SYM_FUNC_START(__asm_vector_usercopy)
+	/* Enable access to user memory */
+	li	t6, SR_SUM
+	csrs	CSR_STATUS, t6
+
+loop:
+	vsetvli	iVL, iNum, e8, ELEM_LMUL_SETTING, ta, ma
+	fixup vle8.v	vData, (pSrc), 10f
+	sub	iNum, iNum, iVL
+	add	pSrc, pSrc, iVL
+	fixup vse8.v	vData, (pDst), 11f
+	add	pDst, pDst, iVL
+	bnez	iNum, loop
+
+	/* Exception fixup for vector load is shared with normal exit */
+10:
+	/* Disable access to user memory */
+	csrc	CSR_STATUS, t6
+	mv	a0, iNum
+	ret
+
+	/* Exception fixup code for vector store. */
+11:
+	/* Undo the subtraction after vle8.v */
+	add	iNum, iNum, iVL
+	/* Make sure the scalar fallback skips already processed bytes */
+	csrr	t2, CSR_VSTART
+	sub	iNum, iNum, t2
+	j	10b
+SYM_FUNC_END(__asm_vector_usercopy)
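Note: __asm_vector_usercopy above is a classic strip-mining loop: each iteration asks the hardware for a vector length with vsetvli and advances by exactly that much until the byte count is exhausted. A scalar model of the control flow, where max_chunk stands in for whatever VL the hardware grants (fault handling omitted; stripmine_copy() is an illustrative name):

#include <assert.h>
#include <stddef.h>
#include <string.h>

static void stripmine_copy(unsigned char *dst, const unsigned char *src,
			   size_t n, size_t max_chunk)
{
	while (n) {
		size_t vl = n < max_chunk ? n : max_chunk; /* vsetvli */
		size_t i;

		for (i = 0; i < vl; i++)	/* vle8.v + vse8.v */
			dst[i] = src[i];
		n -= vl;			/* sub iNum, iNum, iVL */
		src += vl;
		dst += vl;
	}
}

int main(void)
{
	unsigned char src[100], dst[100];
	size_t i;

	for (i = 0; i < sizeof(src); i++)
		src[i] = (unsigned char)i;
	stripmine_copy(dst, src, sizeof(src), 16); /* pretend VL = 16 */
	assert(memcmp(dst, src, sizeof(src)) == 0);
	return 0;
}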
diff --git a/arch/riscv/lib/xor.S b/arch/riscv/lib/xor.S
new file mode 100644
index 000000000000..b28f2430e52f
--- /dev/null
+++ b/arch/riscv/lib/xor.S
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2021 SiFive
+ */
+#include <linux/linkage.h>
+#include <linux/export.h>
+#include <asm/asm.h>
+
+SYM_FUNC_START(xor_regs_2_)
+	vsetvli	a3, a0, e8, m8, ta, ma
+	vle8.v	v0, (a1)
+	vle8.v	v8, (a2)
+	sub	a0, a0, a3
+	vxor.vv	v16, v0, v8
+	add	a2, a2, a3
+	vse8.v	v16, (a1)
+	add	a1, a1, a3
+	bnez	a0, xor_regs_2_
+	ret
+SYM_FUNC_END(xor_regs_2_)
+EXPORT_SYMBOL(xor_regs_2_)
+
+SYM_FUNC_START(xor_regs_3_)
+	vsetvli	a4, a0, e8, m8, ta, ma
+	vle8.v	v0, (a1)
+	vle8.v	v8, (a2)
+	sub	a0, a0, a4
+	vxor.vv	v0, v0, v8
+	vle8.v	v16, (a3)
+	add	a2, a2, a4
+	vxor.vv	v16, v0, v16
+	add	a3, a3, a4
+	vse8.v	v16, (a1)
+	add	a1, a1, a4
+	bnez	a0, xor_regs_3_
+	ret
+SYM_FUNC_END(xor_regs_3_)
+EXPORT_SYMBOL(xor_regs_3_)
+
+SYM_FUNC_START(xor_regs_4_)
+	vsetvli	a5, a0, e8, m8, ta, ma
+	vle8.v	v0, (a1)
+	vle8.v	v8, (a2)
+	sub	a0, a0, a5
+	vxor.vv	v0, v0, v8
+	vle8.v	v16, (a3)
+	add	a2, a2, a5
+	vxor.vv	v0, v0, v16
+	vle8.v	v24, (a4)
+	add	a3, a3, a5
+	vxor.vv	v16, v0, v24
+	add	a4, a4, a5
+	vse8.v	v16, (a1)
+	add	a1, a1, a5
+	bnez	a0, xor_regs_4_
+	ret
+SYM_FUNC_END(xor_regs_4_)
+EXPORT_SYMBOL(xor_regs_4_)
+
+SYM_FUNC_START(xor_regs_5_)
+	vsetvli	a6, a0, e8, m8, ta, ma
+	vle8.v	v0, (a1)
+	vle8.v	v8, (a2)
+	sub	a0, a0, a6
+	vxor.vv	v0, v0, v8
+	vle8.v	v16, (a3)
+	add	a2, a2, a6
+	vxor.vv	v0, v0, v16
+	vle8.v	v24, (a4)
+	add	a3, a3, a6
+	vxor.vv	v0, v0, v24
+	vle8.v	v8, (a5)
+	add	a4, a4, a6
+	vxor.vv	v16, v0, v8
+	add	a5, a5, a6
+	vse8.v	v16, (a1)
+	add	a1, a1, a6
+	bnez	a0, xor_regs_5_
+	ret
+SYM_FUNC_END(xor_regs_5_)
+EXPORT_SYMBOL(xor_regs_5_)
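Note: each xor_regs_N_ routine above is the vector form of a plain byte-wise XOR of N-1 source blocks into the first block, processed VL bytes per iteration. The scalar reference for the two-block case (xor_regs_2_ref() is an illustrative name, matching the a0/a1/a2 argument order):

#include <assert.h>
#include <stddef.h>

/* Scalar equivalent of xor_regs_2_: XOR 'bytes' bytes of p2 into p1. */
static void xor_regs_2_ref(size_t bytes, unsigned char *p1,
			   const unsigned char *p2)
{
	size_t i;

	for (i = 0; i < bytes; i++)
		p1[i] ^= p2[i];
}

int main(void)
{
	unsigned char a[4] = { 0x0f, 0xf0, 0xaa, 0x55 };
	unsigned char b[4] = { 0xff, 0xff, 0x0f, 0x0f };

	xor_regs_2_ref(sizeof(a), a, b);
	assert(a[0] == 0xf0 && a[1] == 0x0f && a[2] == 0xa5 && a[3] == 0x5a);
	return 0;
}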