summaryrefslogtreecommitdiff
path: root/arch/riscv/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/riscv/lib')
-rw-r--r--arch/riscv/lib/Makefile13
-rw-r--r--arch/riscv/lib/clear_page.S74
-rw-r--r--arch/riscv/lib/csum.c325
-rw-r--r--arch/riscv/lib/delay.c4
-rw-r--r--arch/riscv/lib/error-inject.c10
-rw-r--r--arch/riscv/lib/memcpy.S7
-rw-r--r--arch/riscv/lib/memmove.S317
-rw-r--r--arch/riscv/lib/memset.S5
-rw-r--r--arch/riscv/lib/riscv_v_helpers.c45
-rw-r--r--arch/riscv/lib/strcmp.S122
-rw-r--r--arch/riscv/lib/strlen.S133
-rw-r--r--arch/riscv/lib/strncmp.S138
-rw-r--r--arch/riscv/lib/tishift.S75
-rw-r--r--arch/riscv/lib/uaccess.S238
-rw-r--r--arch/riscv/lib/uaccess_vector.S52
-rw-r--r--arch/riscv/lib/xor.S81
16 files changed, 1559 insertions, 80 deletions
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 47e7a8204460..bd6e6c1b0497 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -2,5 +2,18 @@
lib-y += delay.o
lib-y += memcpy.o
lib-y += memset.o
+lib-y += memmove.o
+lib-y += strcmp.o
+lib-y += strlen.o
+lib-y += strncmp.o
+lib-y += csum.o
+ifeq ($(CONFIG_MMU), y)
+lib-$(CONFIG_RISCV_ISA_V) += uaccess_vector.o
+endif
lib-$(CONFIG_MMU) += uaccess.o
lib-$(CONFIG_64BIT) += tishift.o
+lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o
+
+obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
+lib-$(CONFIG_RISCV_ISA_V) += xor.o
+lib-$(CONFIG_RISCV_ISA_V) += riscv_v_helpers.o
diff --git a/arch/riscv/lib/clear_page.S b/arch/riscv/lib/clear_page.S
new file mode 100644
index 000000000000..20ff03f5b0f2
--- /dev/null
+++ b/arch/riscv/lib/clear_page.S
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2023 Ventana Micro Systems Inc.
+ */
+
+#include <linux/linkage.h>
+#include <linux/export.h>
+#include <asm/asm.h>
+#include <asm/alternative-macros.h>
+#include <asm/hwcap.h>
+#include <asm/insn-def.h>
+#include <asm/page.h>
+
+#define CBOZ_ALT(order, old, new) \
+ ALTERNATIVE(old, new, 0, \
+ ((order) << 16) | RISCV_ISA_EXT_ZICBOZ, \
+ CONFIG_RISCV_ISA_ZICBOZ)
+
+/* void clear_page(void *page) */
+SYM_FUNC_START(clear_page)
+ li a2, PAGE_SIZE
+
+ /*
+ * If Zicboz isn't present, or somehow has a block
+ * size larger than 4K, then fallback to memset.
+ */
+ CBOZ_ALT(12, "j .Lno_zicboz", "nop")
+
+ lw a1, riscv_cboz_block_size
+ add a2, a0, a2
+.Lzero_loop:
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBOZ_ALT(11, "bltu a0, a2, .Lzero_loop; ret", "nop; nop")
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBOZ_ALT(10, "bltu a0, a2, .Lzero_loop; ret", "nop; nop")
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBOZ_ALT(9, "bltu a0, a2, .Lzero_loop; ret", "nop; nop")
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBOZ_ALT(8, "bltu a0, a2, .Lzero_loop; ret", "nop; nop")
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ CBO_ZERO(a0)
+ add a0, a0, a1
+ bltu a0, a2, .Lzero_loop
+ ret
+.Lno_zicboz:
+ li a1, 0
+ tail __memset
+SYM_FUNC_END(clear_page)
+EXPORT_SYMBOL(clear_page)
diff --git a/arch/riscv/lib/csum.c b/arch/riscv/lib/csum.c
new file mode 100644
index 000000000000..7fb12c59e571
--- /dev/null
+++ b/arch/riscv/lib/csum.c
@@ -0,0 +1,325 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Checksum library
+ *
+ * Influenced by arch/arm64/lib/csum.c
+ * Copyright (C) 2023-2024 Rivos Inc.
+ */
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/jump_label.h>
+#include <linux/kasan-checks.h>
+#include <linux/kernel.h>
+
+#include <asm/cpufeature.h>
+
+#include <net/checksum.h>
+
+/* Default version is sufficient for 32 bit */
+#ifndef CONFIG_32BIT
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, __u8 proto, __wsum csum)
+{
+ unsigned int ulen, uproto;
+ unsigned long sum = (__force unsigned long)csum;
+
+ sum += (__force unsigned long)saddr->s6_addr32[0];
+ sum += (__force unsigned long)saddr->s6_addr32[1];
+ sum += (__force unsigned long)saddr->s6_addr32[2];
+ sum += (__force unsigned long)saddr->s6_addr32[3];
+
+ sum += (__force unsigned long)daddr->s6_addr32[0];
+ sum += (__force unsigned long)daddr->s6_addr32[1];
+ sum += (__force unsigned long)daddr->s6_addr32[2];
+ sum += (__force unsigned long)daddr->s6_addr32[3];
+
+ ulen = (__force unsigned int)htonl((unsigned int)len);
+ sum += ulen;
+
+ uproto = (__force unsigned int)htonl(proto);
+ sum += uproto;
+
+ /*
+ * Zbb support saves 4 instructions, so not worth checking without
+ * alternatives if supported
+ */
+ if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) &&
+ IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+ unsigned long fold_temp;
+
+ /*
+ * Zbb is likely available when the kernel is compiled with Zbb
+ * support, so nop when Zbb is available and jump when Zbb is
+ * not available.
+ */
+ asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+ RISCV_ISA_EXT_ZBB, 1)
+ :
+ :
+ :
+ : no_zbb);
+ asm(".option push \n\
+ .option arch,+zbb \n\
+ rori %[fold_temp], %[sum], 32 \n\
+ add %[sum], %[fold_temp], %[sum] \n\
+ srli %[sum], %[sum], 32 \n\
+ not %[fold_temp], %[sum] \n\
+ roriw %[sum], %[sum], 16 \n\
+ subw %[sum], %[fold_temp], %[sum] \n\
+ .option pop"
+ : [sum] "+r" (sum), [fold_temp] "=&r" (fold_temp));
+ return (__force __sum16)(sum >> 16);
+ }
+no_zbb:
+ sum += ror64(sum, 32);
+ sum >>= 32;
+ return csum_fold((__force __wsum)sum);
+}
+EXPORT_SYMBOL(csum_ipv6_magic);
+#endif /* !CONFIG_32BIT */
+
+#ifdef CONFIG_32BIT
+#define OFFSET_MASK 3
+#elif CONFIG_64BIT
+#define OFFSET_MASK 7
+#endif
+
+static inline __no_sanitize_address unsigned long
+do_csum_common(const unsigned long *ptr, const unsigned long *end,
+ unsigned long data)
+{
+ unsigned int shift;
+ unsigned long csum = 0, carry = 0;
+
+ /*
+ * Do 32-bit reads on RV32 and 64-bit reads otherwise. This should be
+ * faster than doing 32-bit reads on architectures that support larger
+ * reads.
+ */
+ while (ptr < end) {
+ csum += data;
+ carry += csum < data;
+ data = *(ptr++);
+ }
+
+ /*
+ * Perform alignment (and over-read) bytes on the tail if any bytes
+ * leftover.
+ */
+ shift = ((long)ptr - (long)end) * 8;
+#ifdef __LITTLE_ENDIAN
+ data = (data << shift) >> shift;
+#else
+ data = (data >> shift) << shift;
+#endif
+ csum += data;
+ carry += csum < data;
+ csum += carry;
+ csum += csum < carry;
+
+ return csum;
+}
+
+/*
+ * Algorithm accounts for buff being misaligned.
+ * If buff is not aligned, will over-read bytes but not use the bytes that it
+ * shouldn't. The same thing will occur on the tail-end of the read.
+ */
+static inline __no_sanitize_address unsigned int
+do_csum_with_alignment(const unsigned char *buff, int len)
+{
+ unsigned int offset, shift;
+ unsigned long csum, data;
+ const unsigned long *ptr, *end;
+
+ /*
+ * Align address to closest word (double word on rv64) that comes before
+ * buff. This should always be in the same page and cache line.
+ * Directly call KASAN with the alignment we will be using.
+ */
+ offset = (unsigned long)buff & OFFSET_MASK;
+ kasan_check_read(buff, len);
+ ptr = (const unsigned long *)(buff - offset);
+
+ /*
+ * Clear the most significant bytes that were over-read if buff was not
+ * aligned.
+ */
+ shift = offset * 8;
+ data = *(ptr++);
+#ifdef __LITTLE_ENDIAN
+ data = (data >> shift) << shift;
+#else
+ data = (data << shift) >> shift;
+#endif
+ end = (const unsigned long *)(buff + len);
+ csum = do_csum_common(ptr, end, data);
+
+#ifdef CC_HAS_ASM_GOTO_TIED_OUTPUT
+ /*
+ * Zbb support saves 6 instructions, so not worth checking without
+ * alternatives if supported
+ */
+ if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) &&
+ IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+ unsigned long fold_temp;
+
+ /*
+ * Zbb is likely available when the kernel is compiled with Zbb
+ * support, so nop when Zbb is available and jump when Zbb is
+ * not available.
+ */
+ asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+ RISCV_ISA_EXT_ZBB, 1)
+ :
+ :
+ :
+ : no_zbb);
+
+#ifdef CONFIG_32BIT
+ asm_goto_output(".option push \n\
+ .option arch,+zbb \n\
+ rori %[fold_temp], %[csum], 16 \n\
+ andi %[offset], %[offset], 1 \n\
+ add %[csum], %[fold_temp], %[csum] \n\
+ beq %[offset], zero, %l[end] \n\
+ rev8 %[csum], %[csum] \n\
+ .option pop"
+ : [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)
+ : [offset] "r" (offset)
+ :
+ : end);
+
+ return (unsigned short)csum;
+#else /* !CONFIG_32BIT */
+ asm_goto_output(".option push \n\
+ .option arch,+zbb \n\
+ rori %[fold_temp], %[csum], 32 \n\
+ add %[csum], %[fold_temp], %[csum] \n\
+ srli %[csum], %[csum], 32 \n\
+ roriw %[fold_temp], %[csum], 16 \n\
+ addw %[csum], %[fold_temp], %[csum] \n\
+ andi %[offset], %[offset], 1 \n\
+ beq %[offset], zero, %l[end] \n\
+ rev8 %[csum], %[csum] \n\
+ .option pop"
+ : [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)
+ : [offset] "r" (offset)
+ :
+ : end);
+
+ return (csum << 16) >> 48;
+#endif /* !CONFIG_32BIT */
+end:
+ return csum >> 16;
+ }
+no_zbb:
+#endif /* CC_HAS_ASM_GOTO_TIED_OUTPUT */
+#ifndef CONFIG_32BIT
+ csum += ror64(csum, 32);
+ csum >>= 32;
+#endif
+ csum = (u32)csum + ror32((u32)csum, 16);
+ if (offset & 1)
+ return (u16)swab32(csum);
+ return csum >> 16;
+}
+
+/*
+ * Does not perform alignment, should only be used if machine has fast
+ * misaligned accesses, or when buff is known to be aligned.
+ */
+static inline __no_sanitize_address unsigned int
+do_csum_no_alignment(const unsigned char *buff, int len)
+{
+ unsigned long csum, data;
+ const unsigned long *ptr, *end;
+
+ ptr = (const unsigned long *)(buff);
+ data = *(ptr++);
+
+ kasan_check_read(buff, len);
+
+ end = (const unsigned long *)(buff + len);
+ csum = do_csum_common(ptr, end, data);
+
+ /*
+ * Zbb support saves 6 instructions, so not worth checking without
+ * alternatives if supported
+ */
+ if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) &&
+ IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+ unsigned long fold_temp;
+
+ /*
+ * Zbb is likely available when the kernel is compiled with Zbb
+ * support, so nop when Zbb is available and jump when Zbb is
+ * not available.
+ */
+ asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+ RISCV_ISA_EXT_ZBB, 1)
+ :
+ :
+ :
+ : no_zbb);
+
+#ifdef CONFIG_32BIT
+ asm (".option push \n\
+ .option arch,+zbb \n\
+ rori %[fold_temp], %[csum], 16 \n\
+ add %[csum], %[fold_temp], %[csum] \n\
+ .option pop"
+ : [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)
+ :
+ : );
+
+#else /* !CONFIG_32BIT */
+ asm (".option push \n\
+ .option arch,+zbb \n\
+ rori %[fold_temp], %[csum], 32 \n\
+ add %[csum], %[fold_temp], %[csum] \n\
+ srli %[csum], %[csum], 32 \n\
+ roriw %[fold_temp], %[csum], 16 \n\
+ addw %[csum], %[fold_temp], %[csum] \n\
+ .option pop"
+ : [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)
+ :
+ : );
+#endif /* !CONFIG_32BIT */
+ return csum >> 16;
+ }
+no_zbb:
+#ifndef CONFIG_32BIT
+ csum += ror64(csum, 32);
+ csum >>= 32;
+#endif
+ csum = (u32)csum + ror32((u32)csum, 16);
+ return csum >> 16;
+}
+
+/*
+ * Perform a checksum on an arbitrary memory address.
+ * Will do a light-weight address alignment if buff is misaligned, unless
+ * cpu supports fast misaligned accesses.
+ */
+unsigned int do_csum(const unsigned char *buff, int len)
+{
+ if (unlikely(len <= 0))
+ return 0;
+
+ /*
+ * Significant performance gains can be seen by not doing alignment
+ * on machines with fast misaligned accesses.
+ *
+ * There is some duplicate code between the "with_alignment" and
+ * "no_alignment" implmentations, but the overlap is too awkward to be
+ * able to fit in one function without introducing multiple static
+ * branches. The largest chunk of overlap was delegated into the
+ * do_csum_common function.
+ */
+ if (has_fast_unaligned_accesses() || (((unsigned long)buff & OFFSET_MASK) == 0))
+ return do_csum_no_alignment(buff, len);
+
+ return do_csum_with_alignment(buff, len);
+}
diff --git a/arch/riscv/lib/delay.c b/arch/riscv/lib/delay.c
index f51c9a03bca1..49d510ba75fd 100644
--- a/arch/riscv/lib/delay.c
+++ b/arch/riscv/lib/delay.c
@@ -4,10 +4,14 @@
*/
#include <linux/delay.h>
+#include <linux/math.h>
#include <linux/param.h>
#include <linux/timex.h>
+#include <linux/types.h>
#include <linux/export.h>
+#include <asm/processor.h>
+
/*
* This is copies from arch/arm/include/asm/delay.h
*
diff --git a/arch/riscv/lib/error-inject.c b/arch/riscv/lib/error-inject.c
new file mode 100644
index 000000000000..d667ade2bc41
--- /dev/null
+++ b/arch/riscv/lib/error-inject.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/error-injection.h>
+#include <linux/kprobes.h>
+
+void override_function_with_return(struct pt_regs *regs)
+{
+ instruction_pointer_set(regs, regs->ra);
+}
+NOKPROBE_SYMBOL(override_function_with_return);
diff --git a/arch/riscv/lib/memcpy.S b/arch/riscv/lib/memcpy.S
index b4c477846e91..44e009ec5fef 100644
--- a/arch/riscv/lib/memcpy.S
+++ b/arch/riscv/lib/memcpy.S
@@ -7,7 +7,7 @@
#include <asm/asm.h>
/* void *memcpy(void *, const void *, size_t) */
-ENTRY(memcpy)
+SYM_FUNC_START(__memcpy)
move t6, a0 /* Preserve return value */
/* Defer to byte-oriented copy for small sizes */
@@ -104,4 +104,7 @@ ENTRY(memcpy)
bltu a1, a3, 5b
6:
ret
-END(memcpy)
+SYM_FUNC_END(__memcpy)
+SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
+SYM_FUNC_ALIAS(__pi_memcpy, __memcpy)
+SYM_FUNC_ALIAS(__pi___memcpy, __memcpy)
diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S
new file mode 100644
index 000000000000..cb3e2e7ef0ba
--- /dev/null
+++ b/arch/riscv/lib/memmove.S
@@ -0,0 +1,317 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+SYM_FUNC_START(__memmove)
+ /*
+ * Returns
+ * a0 - dest
+ *
+ * Parameters
+ * a0 - Inclusive first byte of dest
+ * a1 - Inclusive first byte of src
+ * a2 - Length of copy n
+ *
+ * Because the return matches the parameter register a0,
+ * we will not clobber or modify that register.
+ *
+ * Note: This currently only works on little-endian.
+ * To port to big-endian, reverse the direction of shifts
+ * in the 2 misaligned fixup copy loops.
+ */
+
+ /* Return if nothing to do */
+ beq a0, a1, .Lreturn_from_memmove
+ beqz a2, .Lreturn_from_memmove
+
+ /*
+ * Register Uses
+ * Forward Copy: a1 - Index counter of src
+ * Reverse Copy: a4 - Index counter of src
+ * Forward Copy: t3 - Index counter of dest
+ * Reverse Copy: t4 - Index counter of dest
+ * Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
+ * Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
+ * Both Copy Modes: t0 - Link / Temporary for load-store
+ * Both Copy Modes: t1 - Temporary for load-store
+ * Both Copy Modes: t2 - Temporary for load-store
+ * Both Copy Modes: a5 - dest to src alignment offset
+ * Both Copy Modes: a6 - Shift ammount
+ * Both Copy Modes: a7 - Inverse Shift ammount
+ * Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
+ */
+
+ /*
+ * Solve for some register values now.
+ * Byte copy does not need t5 or t6.
+ */
+ mv t3, a0
+ add t4, a0, a2
+ add a4, a1, a2
+
+ /*
+ * Byte copy if copying less than (2 * SZREG) bytes. This can
+ * cause problems with the bulk copy implementation and is
+ * small enough not to bother.
+ */
+ andi t0, a2, -(2 * SZREG)
+ beqz t0, .Lbyte_copy
+
+ /*
+ * Now solve for t5 and t6.
+ */
+ andi t5, t3, -SZREG
+ andi t6, t4, -SZREG
+ /*
+ * If dest(Register t3) rounded down to the nearest naturally
+ * aligned SZREG address, does not equal dest, then add SZREG
+ * to find the low-bound of SZREG alignment in the dest memory
+ * region. Note that this could overshoot the dest memory
+ * region if n is less than SZREG. This is one reason why
+ * we always byte copy if n is less than SZREG.
+ * Otherwise, dest is already naturally aligned to SZREG.
+ */
+ beq t5, t3, 1f
+ addi t5, t5, SZREG
+ 1:
+
+ /*
+ * If the dest and src are co-aligned to SZREG, then there is
+ * no need for the full rigmarole of a full misaligned fixup copy.
+ * Instead, do a simpler co-aligned copy.
+ */
+ xor t0, a0, a1
+ andi t1, t0, (SZREG - 1)
+ beqz t1, .Lcoaligned_copy
+ /* Fall through to misaligned fixup copy */
+
+.Lmisaligned_fixup_copy:
+ bltu a1, a0, .Lmisaligned_fixup_copy_reverse
+
+.Lmisaligned_fixup_copy_forward:
+ jal t0, .Lbyte_copy_until_aligned_forward
+
+ andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
+ slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
+ sub a5, a1, t3 /* Find the difference between src and dest */
+ andi a1, a1, -SZREG /* Align the src pointer */
+ addi a2, t6, SZREG /* The other breakpoint for the unrolled loop*/
+
+ /*
+ * Compute The Inverse Shift
+ * a7 = XLEN - a6 = XLEN + -a6
+ * 2s complement negation to find the negative: -a6 = ~a6 + 1
+ * Add that to XLEN. XLEN = SZREG * 8.
+ */
+ not a7, a6
+ addi a7, a7, (SZREG * 8 + 1)
+
+ /*
+ * Fix Misalignment Copy Loop - Forward
+ * load_val0 = load_ptr[0];
+ * do {
+ * load_val1 = load_ptr[1];
+ * store_ptr += 2;
+ * store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
+ *
+ * if (store_ptr == {a2})
+ * break;
+ *
+ * load_val0 = load_ptr[2];
+ * load_ptr += 2;
+ * store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
+ *
+ * } while (store_ptr != store_ptr_end);
+ * store_ptr = store_ptr_end;
+ */
+
+ REG_L t0, (0 * SZREG)(a1)
+ 1:
+ REG_L t1, (1 * SZREG)(a1)
+ addi t3, t3, (2 * SZREG)
+ srl t0, t0, a6
+ sll t2, t1, a7
+ or t2, t0, t2
+ REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)
+
+ beq t3, a2, 2f
+
+ REG_L t0, (2 * SZREG)(a1)
+ addi a1, a1, (2 * SZREG)
+ srl t1, t1, a6
+ sll t2, t0, a7
+ or t2, t1, t2
+ REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)
+
+ bne t3, t6, 1b
+ 2:
+ mv t3, t6 /* Fix the dest pointer in case the loop was broken */
+
+ add a1, t3, a5 /* Restore the src pointer */
+ j .Lbyte_copy_forward /* Copy any remaining bytes */
+
+.Lmisaligned_fixup_copy_reverse:
+ jal t0, .Lbyte_copy_until_aligned_reverse
+
+ andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
+ slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
+ sub a5, a4, t4 /* Find the difference between src and dest */
+ andi a4, a4, -SZREG /* Align the src pointer */
+ addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop*/
+
+ /*
+ * Compute The Inverse Shift
+ * a7 = XLEN - a6 = XLEN + -a6
+ * 2s complement negation to find the negative: -a6 = ~a6 + 1
+ * Add that to XLEN. XLEN = SZREG * 8.
+ */
+ not a7, a6
+ addi a7, a7, (SZREG * 8 + 1)
+
+ /*
+ * Fix Misalignment Copy Loop - Reverse
+ * load_val1 = load_ptr[0];
+ * do {
+ * load_val0 = load_ptr[-1];
+ * store_ptr -= 2;
+ * store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
+ *
+ * if (store_ptr == {a2})
+ * break;
+ *
+ * load_val1 = load_ptr[-2];
+ * load_ptr -= 2;
+ * store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
+ *
+ * } while (store_ptr != store_ptr_end);
+ * store_ptr = store_ptr_end;
+ */
+
+ REG_L t1, ( 0 * SZREG)(a4)
+ 1:
+ REG_L t0, (-1 * SZREG)(a4)
+ addi t4, t4, (-2 * SZREG)
+ sll t1, t1, a7
+ srl t2, t0, a6
+ or t2, t1, t2
+ REG_S t2, ( 1 * SZREG)(t4)
+
+ beq t4, a2, 2f
+
+ REG_L t1, (-2 * SZREG)(a4)
+ addi a4, a4, (-2 * SZREG)
+ sll t0, t0, a7
+ srl t2, t1, a6
+ or t2, t0, t2
+ REG_S t2, ( 0 * SZREG)(t4)
+
+ bne t4, t5, 1b
+ 2:
+ mv t4, t5 /* Fix the dest pointer in case the loop was broken */
+
+ add a4, t4, a5 /* Restore the src pointer */
+ j .Lbyte_copy_reverse /* Copy any remaining bytes */
+
+/*
+ * Simple copy loops for SZREG co-aligned memory locations.
+ * These also make calls to do byte copies for any unaligned
+ * data at their terminations.
+ */
+.Lcoaligned_copy:
+ bltu a1, a0, .Lcoaligned_copy_reverse
+
+.Lcoaligned_copy_forward:
+ jal t0, .Lbyte_copy_until_aligned_forward
+
+ 1:
+ REG_L t1, ( 0 * SZREG)(a1)
+ addi a1, a1, SZREG
+ addi t3, t3, SZREG
+ REG_S t1, (-1 * SZREG)(t3)
+ bne t3, t6, 1b
+
+ j .Lbyte_copy_forward /* Copy any remaining bytes */
+
+.Lcoaligned_copy_reverse:
+ jal t0, .Lbyte_copy_until_aligned_reverse
+
+ 1:
+ REG_L t1, (-1 * SZREG)(a4)
+ addi a4, a4, -SZREG
+ addi t4, t4, -SZREG
+ REG_S t1, ( 0 * SZREG)(t4)
+ bne t4, t5, 1b
+
+ j .Lbyte_copy_reverse /* Copy any remaining bytes */
+
+/*
+ * These are basically sub-functions within the function. They
+ * are used to byte copy until the dest pointer is in alignment.
+ * At which point, a bulk copy method can be used by the
+ * calling code. These work on the same registers as the bulk
+ * copy loops. Therefore, the register values can be picked
+ * up from where they were left and we avoid code duplication
+ * without any overhead except the call in and return jumps.
+ */
+.Lbyte_copy_until_aligned_forward:
+ beq t3, t5, 2f
+ 1:
+ lb t1, 0(a1)
+ addi a1, a1, 1
+ addi t3, t3, 1
+ sb t1, -1(t3)
+ bne t3, t5, 1b
+ 2:
+ jalr zero, 0x0(t0) /* Return to multibyte copy loop */
+
+.Lbyte_copy_until_aligned_reverse:
+ beq t4, t6, 2f
+ 1:
+ lb t1, -1(a4)
+ addi a4, a4, -1
+ addi t4, t4, -1
+ sb t1, 0(t4)
+ bne t4, t6, 1b
+ 2:
+ jalr zero, 0x0(t0) /* Return to multibyte copy loop */
+
+/*
+ * Simple byte copy loops.
+ * These will byte copy until they reach the end of data to copy.
+ * At that point, they will call to return from memmove.
+ */
+.Lbyte_copy:
+ bltu a1, a0, .Lbyte_copy_reverse
+
+.Lbyte_copy_forward:
+ beq t3, t4, 2f
+ 1:
+ lb t1, 0(a1)
+ addi a1, a1, 1
+ addi t3, t3, 1
+ sb t1, -1(t3)
+ bne t3, t4, 1b
+ 2:
+ ret
+
+.Lbyte_copy_reverse:
+ beq t4, t3, 2f
+ 1:
+ lb t1, -1(a4)
+ addi a4, a4, -1
+ addi t4, t4, -1
+ sb t1, 0(t4)
+ bne t4, t3, 1b
+ 2:
+
+.Lreturn_from_memmove:
+ ret
+
+SYM_FUNC_END(__memmove)
+SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
+SYM_FUNC_ALIAS(__pi_memmove, __memmove)
+SYM_FUNC_ALIAS(__pi___memmove, __memmove)
diff --git a/arch/riscv/lib/memset.S b/arch/riscv/lib/memset.S
index 5a7386b47175..35f358e70bdb 100644
--- a/arch/riscv/lib/memset.S
+++ b/arch/riscv/lib/memset.S
@@ -8,7 +8,7 @@
#include <asm/asm.h>
/* void *memset(void *, int, size_t) */
-ENTRY(memset)
+SYM_FUNC_START(__memset)
move t0, a0 /* Preserve return value */
/* Defer to byte-oriented fill for small sizes */
@@ -109,4 +109,5 @@ ENTRY(memset)
bltu t0, a3, 5b
6:
ret
-END(memset)
+SYM_FUNC_END(__memset)
+SYM_FUNC_ALIAS_WEAK(memset, __memset)
diff --git a/arch/riscv/lib/riscv_v_helpers.c b/arch/riscv/lib/riscv_v_helpers.c
new file mode 100644
index 000000000000..be38a93cedae
--- /dev/null
+++ b/arch/riscv/lib/riscv_v_helpers.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 SiFive
+ * Author: Andy Chiu <andy.chiu@sifive.com>
+ */
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+#include <asm/vector.h>
+#include <asm/simd.h>
+
+#ifdef CONFIG_MMU
+#include <asm/asm-prototypes.h>
+#endif
+
+#ifdef CONFIG_MMU
+size_t riscv_v_usercopy_threshold = CONFIG_RISCV_ISA_V_UCOPY_THRESHOLD;
+int __asm_vector_usercopy(void *dst, void *src, size_t n);
+int fallback_scalar_usercopy(void *dst, void *src, size_t n);
+asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n)
+{
+ size_t remain, copied;
+
+ /* skip has_vector() check because it has been done by the asm */
+ if (!may_use_simd())
+ goto fallback;
+
+ kernel_vector_begin();
+ remain = __asm_vector_usercopy(dst, src, n);
+ kernel_vector_end();
+
+ if (remain) {
+ copied = n - remain;
+ dst += copied;
+ src += copied;
+ n = remain;
+ goto fallback;
+ }
+
+ return remain;
+
+fallback:
+ return fallback_scalar_usercopy(dst, src, n);
+}
+#endif
diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
new file mode 100644
index 000000000000..687b2bea5c43
--- /dev/null
+++ b/arch/riscv/lib/strcmp.S
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/alternative-macros.h>
+#include <asm/hwcap.h>
+
+/* int strcmp(const char *cs, const char *ct) */
+SYM_FUNC_START(strcmp)
+
+ ALTERNATIVE("nop", "j strcmp_zbb", 0, RISCV_ISA_EXT_ZBB, CONFIG_RISCV_ISA_ZBB)
+
+ /*
+ * Returns
+ * a0 - comparison result, value like strcmp
+ *
+ * Parameters
+ * a0 - string1
+ * a1 - string2
+ *
+ * Clobbers
+ * t0, t1
+ */
+1:
+ lbu t0, 0(a0)
+ lbu t1, 0(a1)
+ addi a0, a0, 1
+ addi a1, a1, 1
+ bne t0, t1, 2f
+ bnez t0, 1b
+ li a0, 0
+ ret
+2:
+ /*
+ * strcmp only needs to return (< 0, 0, > 0) values
+ * not necessarily -1, 0, +1
+ */
+ sub a0, t0, t1
+ ret
+
+/*
+ * Variant of strcmp using the ZBB extension if available.
+ * The code was published as part of the bitmanip manual
+ * in Appendix A.
+ */
+#ifdef CONFIG_RISCV_ISA_ZBB
+strcmp_zbb:
+
+.option push
+.option arch,+zbb
+
+ /*
+ * Returns
+ * a0 - comparison result, value like strcmp
+ *
+ * Parameters
+ * a0 - string1
+ * a1 - string2
+ *
+ * Clobbers
+ * t0, t1, t2, t3, t4
+ */
+
+ or t2, a0, a1
+ li t4, -1
+ and t2, t2, SZREG-1
+ bnez t2, 3f
+
+ /* Main loop for aligned string. */
+ .p2align 3
+1:
+ REG_L t0, 0(a0)
+ REG_L t1, 0(a1)
+ orc.b t3, t0
+ bne t3, t4, 2f
+ addi a0, a0, SZREG
+ addi a1, a1, SZREG
+ beq t0, t1, 1b
+
+ /*
+ * Words don't match, and no null byte in the first
+ * word. Get bytes in big-endian order and compare.
+ */
+#ifndef CONFIG_CPU_BIG_ENDIAN
+ rev8 t0, t0
+ rev8 t1, t1
+#endif
+
+ /* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */
+ sltu a0, t0, t1
+ neg a0, a0
+ ori a0, a0, 1
+ ret
+
+2:
+ /*
+ * Found a null byte.
+ * If words don't match, fall back to simple loop.
+ */
+ bne t0, t1, 3f
+
+ /* Otherwise, strings are equal. */
+ li a0, 0
+ ret
+
+ /* Simple loop for misaligned strings. */
+ .p2align 3
+3:
+ lbu t0, 0(a0)
+ lbu t1, 0(a1)
+ addi a0, a0, 1
+ addi a1, a1, 1
+ bne t0, t1, 4f
+ bnez t0, 3b
+
+4:
+ sub a0, t0, t1
+ ret
+
+.option pop
+#endif
+SYM_FUNC_END(strcmp)
diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S
new file mode 100644
index 000000000000..8ae3064e45ff
--- /dev/null
+++ b/arch/riscv/lib/strlen.S
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/alternative-macros.h>
+#include <asm/hwcap.h>
+
+/* int strlen(const char *s) */
+SYM_FUNC_START(strlen)
+
+ ALTERNATIVE("nop", "j strlen_zbb", 0, RISCV_ISA_EXT_ZBB, CONFIG_RISCV_ISA_ZBB)
+
+ /*
+ * Returns
+ * a0 - string length
+ *
+ * Parameters
+ * a0 - String to measure
+ *
+ * Clobbers:
+ * t0, t1
+ */
+ mv t1, a0
+1:
+ lbu t0, 0(t1)
+ beqz t0, 2f
+ addi t1, t1, 1
+ j 1b
+2:
+ sub a0, t1, a0
+ ret
+
+/*
+ * Variant of strlen using the ZBB extension if available
+ */
+#ifdef CONFIG_RISCV_ISA_ZBB
+strlen_zbb:
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+# define CZ clz
+# define SHIFT sll
+#else
+# define CZ ctz
+# define SHIFT srl
+#endif
+
+.option push
+.option arch,+zbb
+
+ /*
+ * Returns
+ * a0 - string length
+ *
+ * Parameters
+ * a0 - String to measure
+ *
+ * Clobbers
+ * t0, t1, t2, t3
+ */
+
+ /* Number of irrelevant bytes in the first word. */
+ andi t2, a0, SZREG-1
+
+ /* Align pointer. */
+ andi t0, a0, -SZREG
+
+ li t3, SZREG
+ sub t3, t3, t2
+ slli t2, t2, 3
+
+ /* Get the first word. */
+ REG_L t1, 0(t0)
+
+ /*
+ * Shift away the partial data we loaded to remove the irrelevant bytes
+ * preceding the string with the effect of adding NUL bytes at the
+ * end of the string's first word.
+ */
+ SHIFT t1, t1, t2
+
+ /* Convert non-NUL into 0xff and NUL into 0x00. */
+ orc.b t1, t1
+
+ /* Convert non-NUL into 0x00 and NUL into 0xff. */
+ not t1, t1
+
+ /*
+ * Search for the first set bit (corresponding to a NUL byte in the
+ * original chunk).
+ */
+ CZ t1, t1
+
+ /*
+ * The first chunk is special: compare against the number
+ * of valid bytes in this chunk.
+ */
+ srli a0, t1, 3
+ bgtu t3, a0, 2f
+
+ /* Prepare for the word comparison loop. */
+ addi t2, t0, SZREG
+ li t3, -1
+
+ /*
+ * Our critical loop is 4 instructions and processes data in
+ * 4 byte or 8 byte chunks.
+ */
+ .p2align 3
+1:
+ REG_L t1, SZREG(t0)
+ addi t0, t0, SZREG
+ orc.b t1, t1
+ beq t1, t3, 1b
+
+ not t1, t1
+ CZ t1, t1
+ srli t1, t1, 3
+
+ /* Get number of processed bytes. */
+ sub t2, t0, t2
+
+ /* Add number of characters in the first word. */
+ add a0, a0, t2
+
+ /* Add number of characters in the last word. */
+ add a0, a0, t1
+2:
+ ret
+
+.option pop
+#endif
+SYM_FUNC_END(strlen)
+SYM_FUNC_ALIAS(__pi_strlen, strlen)
diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S
new file mode 100644
index 000000000000..aba5b3148621
--- /dev/null
+++ b/arch/riscv/lib/strncmp.S
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/alternative-macros.h>
+#include <asm/hwcap.h>
+
+/* int strncmp(const char *cs, const char *ct, size_t count) */
+SYM_FUNC_START(strncmp)
+
+ ALTERNATIVE("nop", "j strncmp_zbb", 0, RISCV_ISA_EXT_ZBB, CONFIG_RISCV_ISA_ZBB)
+
+ /*
+ * Returns
+ * a0 - comparison result, value like strncmp
+ *
+ * Parameters
+ * a0 - string1
+ * a1 - string2
+ * a2 - number of characters to compare
+ *
+ * Clobbers
+ * t0, t1, t2
+ */
+ li t2, 0
+1:
+ beq a2, t2, 2f
+ lbu t0, 0(a0)
+ lbu t1, 0(a1)
+ addi a0, a0, 1
+ addi a1, a1, 1
+ bne t0, t1, 3f
+ addi t2, t2, 1
+ bnez t0, 1b
+2:
+ li a0, 0
+ ret
+3:
+ /*
+ * strncmp only needs to return (< 0, 0, > 0) values
+ * not necessarily -1, 0, +1
+ */
+ sub a0, t0, t1
+ ret
+
+/*
+ * Variant of strncmp using the ZBB extension if available
+ */
+#ifdef CONFIG_RISCV_ISA_ZBB
+strncmp_zbb:
+
+.option push
+.option arch,+zbb
+
+ /*
+ * Returns
+ * a0 - comparison result, like strncmp
+ *
+ * Parameters
+ * a0 - string1
+ * a1 - string2
+ * a2 - number of characters to compare
+ *
+ * Clobbers
+ * t0, t1, t2, t3, t4, t5, t6
+ */
+
+ or t2, a0, a1
+ li t5, -1
+ and t2, t2, SZREG-1
+ add t4, a0, a2
+ bnez t2, 3f
+
+ /* Adjust limit for fast-path. */
+ andi t6, t4, -SZREG
+
+ /* Main loop for aligned string. */
+ .p2align 3
+1:
+ bge a0, t6, 3f
+ REG_L t0, 0(a0)
+ REG_L t1, 0(a1)
+ orc.b t3, t0
+ bne t3, t5, 2f
+ orc.b t3, t1
+ bne t3, t5, 2f
+ addi a0, a0, SZREG
+ addi a1, a1, SZREG
+ beq t0, t1, 1b
+
+ /*
+ * Words don't match, and no null byte in the first
+ * word. Get bytes in big-endian order and compare.
+ */
+#ifndef CONFIG_CPU_BIG_ENDIAN
+ rev8 t0, t0
+ rev8 t1, t1
+#endif
+
+ /* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */
+ sltu a0, t0, t1
+ neg a0, a0
+ ori a0, a0, 1
+ ret
+
+2:
+ /*
+ * Found a null byte.
+ * If words don't match, fall back to simple loop.
+ */
+ bne t0, t1, 3f
+
+ /* Otherwise, strings are equal. */
+ li a0, 0
+ ret
+
+ /* Simple loop for misaligned strings. */
+ .p2align 3
+3:
+ bge a0, t4, 5f
+ lbu t0, 0(a0)
+ lbu t1, 0(a1)
+ addi a0, a0, 1
+ addi a1, a1, 1
+ bne t0, t1, 4f
+ bnez t0, 3b
+
+4:
+ sub a0, t0, t1
+ ret
+
+5:
+ li a0, 0
+ ret
+
+.option pop
+#endif
+SYM_FUNC_END(strncmp)
diff --git a/arch/riscv/lib/tishift.S b/arch/riscv/lib/tishift.S
index 15f9d54c7db6..c8294bf72c06 100644
--- a/arch/riscv/lib/tishift.S
+++ b/arch/riscv/lib/tishift.S
@@ -4,34 +4,73 @@
*/
#include <linux/linkage.h>
+#include <linux/export.h>
-ENTRY(__lshrti3)
+SYM_FUNC_START(__lshrti3)
beqz a2, .L1
li a5,64
sub a5,a5,a2
- addi sp,sp,-16
sext.w a4,a5
blez a5, .L2
sext.w a2,a2
- sll a4,a1,a4
srl a0,a0,a2
- srl a1,a1,a2
+ sll a4,a1,a4
+ srl a2,a1,a2
or a0,a0,a4
- sd a1,8(sp)
- sd a0,0(sp)
- ld a0,0(sp)
- ld a1,8(sp)
- addi sp,sp,16
- ret
+ mv a1,a2
.L1:
ret
.L2:
- negw a4,a4
- srl a1,a1,a4
- sd a1,0(sp)
- sd zero,8(sp)
- ld a0,0(sp)
- ld a1,8(sp)
- addi sp,sp,16
+ negw a0,a4
+ li a2,0
+ srl a0,a1,a0
+ mv a1,a2
+ ret
+SYM_FUNC_END(__lshrti3)
+EXPORT_SYMBOL(__lshrti3)
+
+SYM_FUNC_START(__ashrti3)
+ beqz a2, .L3
+ li a5,64
+ sub a5,a5,a2
+ sext.w a4,a5
+ blez a5, .L4
+ sext.w a2,a2
+ srl a0,a0,a2
+ sll a4,a1,a4
+ sra a2,a1,a2
+ or a0,a0,a4
+ mv a1,a2
+.L3:
+ ret
+.L4:
+ negw a0,a4
+ srai a2,a1,0x3f
+ sra a0,a1,a0
+ mv a1,a2
+ ret
+SYM_FUNC_END(__ashrti3)
+EXPORT_SYMBOL(__ashrti3)
+
+SYM_FUNC_START(__ashlti3)
+ beqz a2, .L5
+ li a5,64
+ sub a5,a5,a2
+ sext.w a4,a5
+ blez a5, .L6
+ sext.w a2,a2
+ sll a1,a1,a2
+ srl a4,a0,a4
+ sll a2,a0,a2
+ or a1,a1,a4
+ mv a0,a2
+.L5:
+ ret
+.L6:
+ negw a1,a4
+ li a2,0
+ sll a1,a0,a1
+ mv a0,a2
ret
-ENDPROC(__lshrti3)
+SYM_FUNC_END(__ashlti3)
+EXPORT_SYMBOL(__ashlti3)
diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S
index fecd65657a6f..bc22c078aba8 100644
--- a/arch/riscv/lib/uaccess.S
+++ b/arch/riscv/lib/uaccess.S
@@ -1,74 +1,202 @@
#include <linux/linkage.h>
+#include <linux/export.h>
#include <asm/asm.h>
+#include <asm/asm-extable.h>
#include <asm/csr.h>
+#include <asm/hwcap.h>
+#include <asm/alternative-macros.h>
- .altmacro
.macro fixup op reg addr lbl
- LOCAL _epc
-_epc:
+100:
\op \reg, \addr
- .section __ex_table,"a"
- .balign RISCV_SZPTR
- RISCV_PTR _epc, \lbl
- .previous
+ _asm_extable 100b, \lbl
.endm
-ENTRY(__asm_copy_to_user)
-ENTRY(__asm_copy_from_user)
+SYM_FUNC_START(__asm_copy_to_user)
+#ifdef CONFIG_RISCV_ISA_V
+ ALTERNATIVE("j fallback_scalar_usercopy", "nop", 0, RISCV_ISA_EXT_v, CONFIG_RISCV_ISA_V)
+ REG_L t0, riscv_v_usercopy_threshold
+ bltu a2, t0, fallback_scalar_usercopy
+ tail enter_vector_usercopy
+#endif
+SYM_FUNC_START(fallback_scalar_usercopy)
/* Enable access to user memory */
li t6, SR_SUM
csrs CSR_STATUS, t6
- add a3, a1, a2
- /* Use word-oriented copy only if low-order bits match */
- andi t0, a0, SZREG-1
- andi t1, a1, SZREG-1
- bne t0, t1, 2f
+ /*
+ * Save the terminal address which will be used to compute the number
+ * of bytes copied in case of a fixup exception.
+ */
+ add t5, a0, a2
- addi t0, a1, SZREG-1
- andi t1, a3, ~(SZREG-1)
- andi t0, t0, ~(SZREG-1)
/*
- * a3: terminal address of source region
- * t0: lowest XLEN-aligned address in source
- * t1: highest XLEN-aligned address in source
+ * Register allocation for code below:
+ * a0 - start of uncopied dst
+ * a1 - start of uncopied src
+ * a2 - size
+ * t0 - end of uncopied dst
*/
- bgeu t0, t1, 2f
- bltu a1, t0, 4f
+ add t0, a0, a2
+
+ /*
+ * Use byte copy only if too small.
+ * SZREG holds 4 for RV32 and 8 for RV64
+ */
+ li a3, 9*SZREG /* size must be larger than size in word_copy */
+ bltu a2, a3, .Lbyte_copy_tail
+
+ /*
+ * Copy first bytes until dst is aligned to word boundary.
+ * a0 - start of dst
+ * t1 - start of aligned dst
+ */
+ addi t1, a0, SZREG-1
+ andi t1, t1, ~(SZREG-1)
+ /* dst is already aligned, skip */
+ beq a0, t1, .Lskip_align_dst
1:
- fixup REG_L, t2, (a1), 10f
- fixup REG_S, t2, (a0), 10f
- addi a1, a1, SZREG
- addi a0, a0, SZREG
- bltu a1, t1, 1b
+ /* a5 - one byte for copying data */
+ fixup lb a5, 0(a1), 10f
+ addi a1, a1, 1 /* src */
+ fixup sb a5, 0(a0), 10f
+ addi a0, a0, 1 /* dst */
+ bltu a0, t1, 1b /* t1 - start of aligned dst */
+
+.Lskip_align_dst:
+ /*
+ * Now dst is aligned.
+ * Use shift-copy if src is misaligned.
+ * Use word-copy if both src and dst are aligned because
+ * can not use shift-copy which do not require shifting
+ */
+ /* a1 - start of src */
+ andi a3, a1, SZREG-1
+ bnez a3, .Lshift_copy
+
+.Lword_copy:
+ /*
+ * Both src and dst are aligned, unrolled word copy
+ *
+ * a0 - start of aligned dst
+ * a1 - start of aligned src
+ * t0 - end of aligned dst
+ */
+ addi t0, t0, -(8*SZREG) /* not to over run */
2:
- bltu a1, a3, 5f
+ fixup REG_L a4, 0(a1), 10f
+ fixup REG_L a5, SZREG(a1), 10f
+ fixup REG_L a6, 2*SZREG(a1), 10f
+ fixup REG_L a7, 3*SZREG(a1), 10f
+ fixup REG_L t1, 4*SZREG(a1), 10f
+ fixup REG_L t2, 5*SZREG(a1), 10f
+ fixup REG_L t3, 6*SZREG(a1), 10f
+ fixup REG_L t4, 7*SZREG(a1), 10f
+ fixup REG_S a4, 0(a0), 10f
+ fixup REG_S a5, SZREG(a0), 10f
+ fixup REG_S a6, 2*SZREG(a0), 10f
+ fixup REG_S a7, 3*SZREG(a0), 10f
+ fixup REG_S t1, 4*SZREG(a0), 10f
+ fixup REG_S t2, 5*SZREG(a0), 10f
+ fixup REG_S t3, 6*SZREG(a0), 10f
+ fixup REG_S t4, 7*SZREG(a0), 10f
+ addi a0, a0, 8*SZREG
+ addi a1, a1, 8*SZREG
+ bltu a0, t0, 2b
+
+ addi t0, t0, 8*SZREG /* revert to original value */
+ j .Lbyte_copy_tail
+
+.Lshift_copy:
+
+ /*
+ * Word copy with shifting.
+ * For misaligned copy we still perform aligned word copy, but
+ * we need to use the value fetched from the previous iteration and
+ * do some shifts.
+ * This is safe because reading is less than a word size.
+ *
+ * a0 - start of aligned dst
+ * a1 - start of src
+ * a3 - a1 & mask:(SZREG-1)
+ * t0 - end of uncopied dst
+ * t1 - end of aligned dst
+ */
+ /* calculating aligned word boundary for dst */
+ andi t1, t0, ~(SZREG-1)
+ /* Converting unaligned src to aligned src */
+ andi a1, a1, ~(SZREG-1)
+
+ /*
+ * Calculate shifts
+ * t3 - prev shift
+ * t4 - current shift
+ */
+ slli t3, a3, 3 /* converting bytes in a3 to bits */
+ li a5, SZREG*8
+ sub t4, a5, t3
+
+ /* Load the first word to combine with second word */
+ fixup REG_L a5, 0(a1), 10f
3:
+ /* Main shifting copy
+ *
+ * a0 - start of aligned dst
+ * a1 - start of aligned src
+ * t1 - end of aligned dst
+ */
+
+ /* At least one iteration will be executed */
+ srl a4, a5, t3
+ fixup REG_L a5, SZREG(a1), 10f
+ addi a1, a1, SZREG
+ sll a2, a5, t4
+ or a2, a2, a4
+ fixup REG_S a2, 0(a0), 10f
+ addi a0, a0, SZREG
+ bltu a0, t1, 3b
+
+ /* Revert src to original unaligned value */
+ add a1, a1, a3
+
+.Lbyte_copy_tail:
+ /*
+ * Byte copy anything left.
+ *
+ * a0 - start of remaining dst
+ * a1 - start of remaining src
+ * t0 - end of remaining dst
+ */
+ bgeu a0, t0, .Lout_copy_user /* check if end of copy */
+4:
+ fixup lb a5, 0(a1), 10f
+ addi a1, a1, 1 /* src */
+ fixup sb a5, 0(a0), 10f
+ addi a0, a0, 1 /* dst */
+ bltu a0, t0, 4b /* t0 - end of dst */
+
+.Lout_copy_user:
/* Disable access to user memory */
csrc CSR_STATUS, t6
- li a0, 0
+ li a0, 0
ret
-4: /* Edge case: unalignment */
- fixup lbu, t2, (a1), 10f
- fixup sb, t2, (a0), 10f
- addi a1, a1, 1
- addi a0, a0, 1
- bltu a1, t0, 4b
- j 1b
-5: /* Edge case: remainder */
- fixup lbu, t2, (a1), 10f
- fixup sb, t2, (a0), 10f
- addi a1, a1, 1
- addi a0, a0, 1
- bltu a1, a3, 5b
- j 3b
-ENDPROC(__asm_copy_to_user)
-ENDPROC(__asm_copy_from_user)
+
+ /* Exception fixup code */
+10:
+ /* Disable access to user memory */
+ csrc CSR_STATUS, t6
+ sub a0, t5, a0
+ ret
+SYM_FUNC_END(__asm_copy_to_user)
+SYM_FUNC_END(fallback_scalar_usercopy)
+EXPORT_SYMBOL(__asm_copy_to_user)
+SYM_FUNC_ALIAS(__asm_copy_from_user, __asm_copy_to_user)
+EXPORT_SYMBOL(__asm_copy_from_user)
-ENTRY(__clear_user)
+SYM_FUNC_START(__clear_user)
/* Enable access to user memory */
li t6, SR_SUM
@@ -107,18 +235,12 @@ ENTRY(__clear_user)
addi a0, a0, 1
bltu a0, a3, 5b
j 3b
-ENDPROC(__clear_user)
- .section .fixup,"ax"
- .balign 4
- /* Fixup code for __copy_user(10) and __clear_user(11) */
-10:
- /* Disable access to user memory */
- csrs CSR_STATUS, t6
- mv a0, a2
- ret
+ /* Exception fixup code */
11:
- csrs CSR_STATUS, t6
- mv a0, a1
+ /* Disable access to user memory */
+ csrc CSR_STATUS, t6
+ sub a0, a3, a0
ret
- .previous
+SYM_FUNC_END(__clear_user)
+EXPORT_SYMBOL(__clear_user)
diff --git a/arch/riscv/lib/uaccess_vector.S b/arch/riscv/lib/uaccess_vector.S
new file mode 100644
index 000000000000..7c45f26de4f7
--- /dev/null
+++ b/arch/riscv/lib/uaccess_vector.S
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/asm-extable.h>
+#include <asm/csr.h>
+
+#define pDst a0
+#define pSrc a1
+#define iNum a2
+
+#define iVL a3
+
+#define ELEM_LMUL_SETTING m8
+#define vData v0
+
+ .macro fixup op reg addr lbl
+100:
+ \op \reg, \addr
+ _asm_extable 100b, \lbl
+ .endm
+
+SYM_FUNC_START(__asm_vector_usercopy)
+ /* Enable access to user memory */
+ li t6, SR_SUM
+ csrs CSR_STATUS, t6
+
+loop:
+ vsetvli iVL, iNum, e8, ELEM_LMUL_SETTING, ta, ma
+ fixup vle8.v vData, (pSrc), 10f
+ sub iNum, iNum, iVL
+ add pSrc, pSrc, iVL
+ fixup vse8.v vData, (pDst), 11f
+ add pDst, pDst, iVL
+ bnez iNum, loop
+
+ /* Exception fixup for vector load is shared with normal exit */
+10:
+ /* Disable access to user memory */
+ csrc CSR_STATUS, t6
+ mv a0, iNum
+ ret
+
+ /* Exception fixup code for vector store. */
+11:
+ /* Undo the subtraction after vle8.v */
+ add iNum, iNum, iVL
+ /* Make sure the scalar fallback skip already processed bytes */
+ csrr t2, CSR_VSTART
+ sub iNum, iNum, t2
+ j 10b
+SYM_FUNC_END(__asm_vector_usercopy)
diff --git a/arch/riscv/lib/xor.S b/arch/riscv/lib/xor.S
new file mode 100644
index 000000000000..b28f2430e52f
--- /dev/null
+++ b/arch/riscv/lib/xor.S
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2021 SiFive
+ */
+#include <linux/linkage.h>
+#include <linux/export.h>
+#include <asm/asm.h>
+
+SYM_FUNC_START(xor_regs_2_)
+ vsetvli a3, a0, e8, m8, ta, ma
+ vle8.v v0, (a1)
+ vle8.v v8, (a2)
+ sub a0, a0, a3
+ vxor.vv v16, v0, v8
+ add a2, a2, a3
+ vse8.v v16, (a1)
+ add a1, a1, a3
+ bnez a0, xor_regs_2_
+ ret
+SYM_FUNC_END(xor_regs_2_)
+EXPORT_SYMBOL(xor_regs_2_)
+
+SYM_FUNC_START(xor_regs_3_)
+ vsetvli a4, a0, e8, m8, ta, ma
+ vle8.v v0, (a1)
+ vle8.v v8, (a2)
+ sub a0, a0, a4
+ vxor.vv v0, v0, v8
+ vle8.v v16, (a3)
+ add a2, a2, a4
+ vxor.vv v16, v0, v16
+ add a3, a3, a4
+ vse8.v v16, (a1)
+ add a1, a1, a4
+ bnez a0, xor_regs_3_
+ ret
+SYM_FUNC_END(xor_regs_3_)
+EXPORT_SYMBOL(xor_regs_3_)
+
+SYM_FUNC_START(xor_regs_4_)
+ vsetvli a5, a0, e8, m8, ta, ma
+ vle8.v v0, (a1)
+ vle8.v v8, (a2)
+ sub a0, a0, a5
+ vxor.vv v0, v0, v8
+ vle8.v v16, (a3)
+ add a2, a2, a5
+ vxor.vv v0, v0, v16
+ vle8.v v24, (a4)
+ add a3, a3, a5
+ vxor.vv v16, v0, v24
+ add a4, a4, a5
+ vse8.v v16, (a1)
+ add a1, a1, a5
+ bnez a0, xor_regs_4_
+ ret
+SYM_FUNC_END(xor_regs_4_)
+EXPORT_SYMBOL(xor_regs_4_)
+
+SYM_FUNC_START(xor_regs_5_)
+ vsetvli a6, a0, e8, m8, ta, ma
+ vle8.v v0, (a1)
+ vle8.v v8, (a2)
+ sub a0, a0, a6
+ vxor.vv v0, v0, v8
+ vle8.v v16, (a3)
+ add a2, a2, a6
+ vxor.vv v0, v0, v16
+ vle8.v v24, (a4)
+ add a3, a3, a6
+ vxor.vv v0, v0, v24
+ vle8.v v8, (a5)
+ add a4, a4, a6
+ vxor.vv v16, v0, v8
+ add a5, a5, a6
+ vse8.v v16, (a1)
+ add a1, a1, a6
+ bnez a0, xor_regs_5_
+ ret
+SYM_FUNC_END(xor_regs_5_)
+EXPORT_SYMBOL(xor_regs_5_)