Diffstat (limited to 'arch/loongarch/lib/memset.S')
-rw-r--r--  arch/loongarch/lib/memset.S | 130
1 file changed, 105 insertions(+), 25 deletions(-)
diff --git a/arch/loongarch/lib/memset.S b/arch/loongarch/lib/memset.S
index e7cb4ea3747d..df3846620553 100644
--- a/arch/loongarch/lib/memset.S
+++ b/arch/loongarch/lib/memset.S
@@ -3,12 +3,13 @@
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
+#include <linux/export.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/cpu.h>
-#include <asm/export.h>
#include <asm/regdef.h>
+#include <asm/unwind_hints.h>
.macro fill_to_64 r0
bstrins.d \r0, \r0, 15, 8
@@ -16,6 +17,8 @@
bstrins.d \r0, \r0, 63, 32
.endm
+.section .noinstr.text, "ax"
+
SYM_FUNC_START(memset)
/*
* Some CPUs support hardware unaligned access
@@ -23,8 +26,13 @@ SYM_FUNC_START(memset)
ALTERNATIVE "b __memset_generic", \
"b __memset_fast", CPU_FEATURE_UAL
SYM_FUNC_END(memset)
+SYM_FUNC_ALIAS(__memset, memset)
EXPORT_SYMBOL(memset)
+EXPORT_SYMBOL(__memset)
+
+_ASM_NOKPROBE(memset)
+_ASM_NOKPROBE(__memset)
/*
* void *__memset_generic(void *s, int c, size_t n)
@@ -45,6 +53,7 @@ SYM_FUNC_START(__memset_generic)
2: move a0, a3
jr ra
SYM_FUNC_END(__memset_generic)
+_ASM_NOKPROBE(__memset_generic)
/*
* void *__memset_fast(void *s, int c, size_t n)
@@ -54,38 +63,109 @@ SYM_FUNC_END(__memset_generic)
* a2: n
*/
SYM_FUNC_START(__memset_fast)
- move a3, a0
- beqz a2, 3f
-
- ori a4, zero, 64
- blt a2, a4, 2f
-
/* fill a1 to 64 bits */
fill_to_64 a1
- /* set 64 bytes at a time */
-1: st.d a1, a0, 0
- st.d a1, a0, 8
- st.d a1, a0, 16
- st.d a1, a0, 24
- st.d a1, a0, 32
- st.d a1, a0, 40
- st.d a1, a0, 48
- st.d a1, a0, 56
+ sltui t0, a2, 9
+ bnez t0, .Lsmall
+
+ add.d a2, a0, a2
+ st.d a1, a0, 0
- addi.d a0, a0, 64
- addi.d a2, a2, -64
- bge a2, a4, 1b
+ /* align up address */
+ addi.d a3, a0, 8
+ bstrins.d a3, zero, 2, 0
- beqz a2, 3f
+ addi.d a4, a2, -64
+ bgeu a3, a4, .Llt64
+
+ /* set 64 bytes at a time */
+.Lloop64:
+ st.d a1, a3, 0
+ st.d a1, a3, 8
+ st.d a1, a3, 16
+ st.d a1, a3, 24
+ st.d a1, a3, 32
+ st.d a1, a3, 40
+ st.d a1, a3, 48
+ st.d a1, a3, 56
+ addi.d a3, a3, 64
+ bltu a3, a4, .Lloop64
/* set the remaining bytes */
-2: st.b a1, a0, 0
- addi.d a0, a0, 1
- addi.d a2, a2, -1
- bgt a2, zero, 2b
+.Llt64:
+ addi.d a4, a2, -32
+ bgeu a3, a4, .Llt32
+ st.d a1, a3, 0
+ st.d a1, a3, 8
+ st.d a1, a3, 16
+ st.d a1, a3, 24
+ addi.d a3, a3, 32
+
+.Llt32:
+ addi.d a4, a2, -16
+ bgeu a3, a4, .Llt16
+ st.d a1, a3, 0
+ st.d a1, a3, 8
+ addi.d a3, a3, 16
+
+.Llt16:
+ addi.d a4, a2, -8
+ bgeu a3, a4, .Llt8
+ st.d a1, a3, 0
+
+.Llt8:
+ st.d a1, a2, -8
/* return */
-3: move a0, a3
+ jr ra
+
+ .align 4
+.Lsmall:
+ pcaddi t0, 4
+ slli.d a2, a2, 4
+ add.d t0, t0, a2
+ jr t0
+
+ .align 4
+0: jr ra
+
+ .align 4
+1: st.b a1, a0, 0
+ jr ra
+
+ .align 4
+2: st.h a1, a0, 0
+ jr ra
+
+ .align 4
+3: st.h a1, a0, 0
+ st.b a1, a0, 2
+ jr ra
+
+ .align 4
+4: st.w a1, a0, 0
+ jr ra
+
+ .align 4
+5: st.w a1, a0, 0
+ st.b a1, a0, 4
+ jr ra
+
+ .align 4
+6: st.w a1, a0, 0
+ st.h a1, a0, 4
+ jr ra
+
+ .align 4
+7: st.w a1, a0, 0
+ st.w a1, a0, 3
+ jr ra
+
+ .align 4
+8: st.d a1, a0, 0
jr ra
SYM_FUNC_END(__memset_fast)
+_ASM_NOKPROBE(__memset_fast)
+
+STACK_FRAME_NON_STANDARD __memset_fast
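
For reference, below is a rough C sketch of the strategy the new __memset_fast takes: broadcast the fill byte to 64 bits (fill_to_64), dispatch sizes of 8 bytes or less through the .Lsmall jump table, and otherwise do one head store, align up, run the 64-byte .Lloop64, and finish with overlapping 32/16/8-byte tails plus a final store ending exactly at s + n. The function name memset_fast_sketch is made up for illustration, the small-size handling is collapsed into if/else rather than a real jump table, and memcpy() stands in for the unaligned st.d stores that CPU_FEATURE_UAL makes safe on the real hardware:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Illustrative sketch only: mirrors the structure of the new
 * __memset_fast, not the exact instruction sequences.
 */
static void *memset_fast_sketch(void *s, int c, size_t n)
{
	uint8_t *p = s;
	uint8_t *end = p + n;
	uint64_t v = (uint8_t)c;

	/* fill_to_64: replicate the fill byte across all 64 bits */
	v |= v << 8;
	v |= v << 16;
	v |= v << 32;

	if (n <= 8) {
		/*
		 * Collapsed form of the .Lsmall path; the asm instead
		 * jumps to one of nine fixed store sequences.
		 */
		if (n >= 4) {
			memcpy(p, &v, 4);
			memcpy(end - 4, &v, 4);	/* overlaps when n < 8 */
		} else if (n >= 2) {
			memcpy(p, &v, 2);
			memcpy(end - 2, &v, 2);
		} else if (n == 1) {
			*p = (uint8_t)c;
		}
		return s;
	}

	/* one unaligned 8-byte store at the head, then align up to 8 */
	memcpy(p, &v, 8);
	uint8_t *a = (uint8_t *)(((uintptr_t)p + 8) & ~(uintptr_t)7);

	/* .Lloop64: 64 bytes per iteration while that much remains */
	while ((size_t)(end - a) >= 64) {
		for (int i = 0; i < 8; i++)
			memcpy(a + 8 * i, &v, 8);
		a += 64;
	}

	/* .Llt64 / .Llt32 / .Llt16: shrinking 32/16/8-byte tails */
	if ((size_t)(end - a) >= 32) {
		for (int i = 0; i < 4; i++)
			memcpy(a + 8 * i, &v, 8);
		a += 32;
	}
	if ((size_t)(end - a) >= 16) {
		memcpy(a, &v, 8);
		memcpy(a + 8, &v, 8);
		a += 16;
	}
	if ((size_t)(end - a) >= 8)
		memcpy(a, &v, 8);

	/* .Llt8: final overlapping store ending exactly at s + n */
	memcpy(end - 8, &v, 8);
	return s;
}

The overlapping tail stores avoid any byte-granular loop, which is why this path is only selected by the ALTERNATIVE in memset when CPU_FEATURE_UAL reports hardware unaligned access; the computed pcaddi/jr t0 dispatch in .Lsmall is presumably also why __memset_fast carries the STACK_FRAME_NON_STANDARD annotation, since the unwinder tooling cannot follow an indirect branch like that.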