summaryrefslogtreecommitdiff
path: root/arch/arm64/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/lib')
-rw-r--r--arch/arm64/lib/Makefile14
-rw-r--r--arch/arm64/lib/clear_page.S15
-rw-r--r--arch/arm64/lib/clear_user.S57
-rw-r--r--arch/arm64/lib/copy_from_user.S49
-rw-r--r--arch/arm64/lib/copy_in_user.S70
-rw-r--r--arch/arm64/lib/copy_page.S54
-rw-r--r--arch/arm64/lib/copy_to_user.S50
-rw-r--r--arch/arm64/lib/crc32.S97
-rw-r--r--arch/arm64/lib/csum.c157
-rw-r--r--arch/arm64/lib/delay.c12
-rw-r--r--arch/arm64/lib/insn.c1517
-rw-r--r--arch/arm64/lib/kasan_sw_tags.S74
-rw-r--r--arch/arm64/lib/memchr.S70
-rw-r--r--arch/arm64/lib/memcmp.S354
-rw-r--r--arch/arm64/lib/memcpy.S296
-rw-r--r--arch/arm64/lib/memmove.S190
-rw-r--r--arch/arm64/lib/memset.S13
-rw-r--r--arch/arm64/lib/mte.S177
-rw-r--r--arch/arm64/lib/strchr.S6
-rw-r--r--arch/arm64/lib/strcmp.S375
-rw-r--r--arch/arm64/lib/strlen.S274
-rw-r--r--arch/arm64/lib/strncmp.S483
-rw-r--r--arch/arm64/lib/strnlen.S6
-rw-r--r--arch/arm64/lib/strrchr.S5
-rw-r--r--arch/arm64/lib/tishift.S12
-rw-r--r--arch/arm64/lib/uaccess_flushcache.c14
-rw-r--r--arch/arm64/lib/xor-neon.c177
27 files changed, 3363 insertions, 1255 deletions
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index c21b936dc01d..29490be2546b 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,14 +1,16 @@
# SPDX-License-Identifier: GPL-2.0
lib-y := clear_user.o delay.o copy_from_user.o \
- copy_to_user.o copy_in_user.o copy_page.o \
- clear_page.o memchr.o memcpy.o memmove.o memset.o \
- memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
- strchr.o strrchr.o tishift.o
+ copy_to_user.o copy_page.o \
+ clear_page.o csum.o insn.o memchr.o memcpy.o \
+ memset.o memcmp.o strcmp.o strncmp.o strlen.o \
+ strnlen.o strchr.o strrchr.o tishift.o
ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
CFLAGS_REMOVE_xor-neon.o += -mgeneral-regs-only
CFLAGS_xor-neon.o += -ffreestanding
+# Enable <arm_neon.h>
+CFLAGS_xor-neon.o += -isystem $(shell $(CC) -print-file-name=include)
endif
lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
@@ -16,3 +18,7 @@ lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
+
+obj-$(CONFIG_ARM64_MTE) += mte.o
+
+obj-$(CONFIG_KASAN_SW_TAGS) += kasan_sw_tags.o
diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S
index 78a9ef66288a..ebde40e7fa2b 100644
--- a/arch/arm64/lib/clear_page.S
+++ b/arch/arm64/lib/clear_page.S
@@ -14,8 +14,9 @@
* Parameters:
* x0 - dest
*/
-ENTRY(clear_page)
+SYM_FUNC_START(__pi_clear_page)
mrs x1, dczid_el0
+ tbnz x1, #4, 2f /* Branch if DC ZVA is prohibited */
and w1, w1, #0xf
mov x2, #4
lsl x1, x2, x1
@@ -25,5 +26,15 @@ ENTRY(clear_page)
tst x0, #(PAGE_SIZE - 1)
b.ne 1b
ret
-ENDPROC(clear_page)
+
+2: stnp xzr, xzr, [x0]
+ stnp xzr, xzr, [x0, #16]
+ stnp xzr, xzr, [x0, #32]
+ stnp xzr, xzr, [x0, #48]
+ add x0, x0, #64
+ tst x0, #(PAGE_SIZE - 1)
+ b.ne 2b
+ ret
+SYM_FUNC_END(__pi_clear_page)
+SYM_FUNC_ALIAS(clear_page, __pi_clear_page)
EXPORT_SYMBOL(clear_page)
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index aeafc03e961a..a5a5f5b97b17 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -1,13 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Based on arch/arm/lib/clear_user.S
- *
- * Copyright (C) 2012 ARM Ltd.
+ * Copyright (C) 2021 Arm Ltd.
*/
-#include <linux/linkage.h>
+#include <linux/linkage.h>
#include <asm/asm-uaccess.h>
-#include <asm/assembler.h>
.text
@@ -19,32 +16,40 @@
*
* Alignment fixed up by hardware.
*/
-ENTRY(__arch_clear_user)
- mov x2, x1 // save the size for fixup return
+
+ .p2align 4
+ // Alignment is for the loop, but since the prologue (including BTI)
+ // is also 16 bytes we can keep any padding outside the function
+SYM_FUNC_START(__arch_clear_user)
+ add x2, x0, x1
subs x1, x1, #8
b.mi 2f
1:
-uao_user_alternative 9f, str, sttr, xzr, x0, 8
+USER(9f, sttr xzr, [x0])
+ add x0, x0, #8
subs x1, x1, #8
- b.pl 1b
-2: adds x1, x1, #4
- b.mi 3f
-uao_user_alternative 9f, str, sttr, wzr, x0, 4
- sub x1, x1, #4
-3: adds x1, x1, #2
- b.mi 4f
-uao_user_alternative 9f, strh, sttrh, wzr, x0, 2
- sub x1, x1, #2
-4: adds x1, x1, #1
- b.mi 5f
-uao_user_alternative 9f, strb, sttrb, wzr, x0, 0
+ b.hi 1b
+USER(9f, sttr xzr, [x2, #-8])
+ mov x0, #0
+ ret
+
+2: tbz x1, #2, 3f
+USER(9f, sttr wzr, [x0])
+USER(8f, sttr wzr, [x2, #-4])
+ mov x0, #0
+ ret
+
+3: tbz x1, #1, 4f
+USER(9f, sttrh wzr, [x0])
+4: tbz x1, #0, 5f
+USER(7f, sttrb wzr, [x2, #-1])
5: mov x0, #0
ret
-ENDPROC(__arch_clear_user)
-EXPORT_SYMBOL(__arch_clear_user)
- .section .fixup,"ax"
- .align 2
-9: mov x0, x2 // return the original size
+ // Exception fixups
+7: sub x0, x2, #5 // Adjust for faulting on the final byte...
+8: add x0, x0, #4 // ...or the second word of the 4-7 byte case
+9: sub x0, x2, x0
ret
- .previous
+SYM_FUNC_END(__arch_clear_user)
+EXPORT_SYMBOL(__arch_clear_user)
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index ebb3c06cbb5d..34e317907524 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -20,49 +20,54 @@
* x0 - bytes not copied
*/
- .macro ldrb1 ptr, regB, val
- uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val
+ .macro ldrb1 reg, ptr, val
+ user_ldst 9998f, ldtrb, \reg, \ptr, \val
.endm
- .macro strb1 ptr, regB, val
- strb \ptr, [\regB], \val
+ .macro strb1 reg, ptr, val
+ strb \reg, [\ptr], \val
.endm
- .macro ldrh1 ptr, regB, val
- uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val
+ .macro ldrh1 reg, ptr, val
+ user_ldst 9997f, ldtrh, \reg, \ptr, \val
.endm
- .macro strh1 ptr, regB, val
- strh \ptr, [\regB], \val
+ .macro strh1 reg, ptr, val
+ strh \reg, [\ptr], \val
.endm
- .macro ldr1 ptr, regB, val
- uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val
+ .macro ldr1 reg, ptr, val
+ user_ldst 9997f, ldtr, \reg, \ptr, \val
.endm
- .macro str1 ptr, regB, val
- str \ptr, [\regB], \val
+ .macro str1 reg, ptr, val
+ str \reg, [\ptr], \val
.endm
- .macro ldp1 ptr, regB, regC, val
- uao_ldp 9998f, \ptr, \regB, \regC, \val
+ .macro ldp1 reg1, reg2, ptr, val
+ user_ldp 9997f, \reg1, \reg2, \ptr, \val
.endm
- .macro stp1 ptr, regB, regC, val
- stp \ptr, \regB, [\regC], \val
+ .macro stp1 reg1, reg2, ptr, val
+ stp \reg1, \reg2, [\ptr], \val
.endm
end .req x5
-ENTRY(__arch_copy_from_user)
+srcin .req x15
+SYM_FUNC_START(__arch_copy_from_user)
add end, x0, x2
+ mov srcin, x1
#include "copy_template.S"
mov x0, #0 // Nothing to copy
ret
-ENDPROC(__arch_copy_from_user)
-EXPORT_SYMBOL(__arch_copy_from_user)
- .section .fixup,"ax"
- .align 2
+ // Exception fixups
+9997: cmp dst, dstin
+ b.ne 9998f
+ // Before being absolutely sure we couldn't copy anything, try harder
+USER(9998f, ldtrb tmp1w, [srcin])
+ strb tmp1w, [dst], #1
9998: sub x0, end, dst // bytes not copied
ret
- .previous
+SYM_FUNC_END(__arch_copy_from_user)
+EXPORT_SYMBOL(__arch_copy_from_user)
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
deleted file mode 100644
index 3d8153a1ebce..000000000000
--- a/arch/arm64/lib/copy_in_user.S
+++ /dev/null
@@ -1,70 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copy from user space to user space
- *
- * Copyright (C) 2012 ARM Ltd.
- */
-
-#include <linux/linkage.h>
-
-#include <asm/asm-uaccess.h>
-#include <asm/assembler.h>
-#include <asm/cache.h>
-
-/*
- * Copy from user space to user space (alignment handled by the hardware)
- *
- * Parameters:
- * x0 - to
- * x1 - from
- * x2 - n
- * Returns:
- * x0 - bytes not copied
- */
- .macro ldrb1 ptr, regB, val
- uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val
- .endm
-
- .macro strb1 ptr, regB, val
- uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val
- .endm
-
- .macro ldrh1 ptr, regB, val
- uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val
- .endm
-
- .macro strh1 ptr, regB, val
- uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val
- .endm
-
- .macro ldr1 ptr, regB, val
- uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val
- .endm
-
- .macro str1 ptr, regB, val
- uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val
- .endm
-
- .macro ldp1 ptr, regB, regC, val
- uao_ldp 9998f, \ptr, \regB, \regC, \val
- .endm
-
- .macro stp1 ptr, regB, regC, val
- uao_stp 9998f, \ptr, \regB, \regC, \val
- .endm
-
-end .req x5
-
-ENTRY(__arch_copy_in_user)
- add end, x0, x2
-#include "copy_template.S"
- mov x0, #0
- ret
-ENDPROC(__arch_copy_in_user)
-EXPORT_SYMBOL(__arch_copy_in_user)
-
- .section .fixup,"ax"
- .align 2
-9998: sub x0, end, dst // bytes not copied
- ret
- .previous
diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S
index bbb8562396af..6a56d7cf309d 100644
--- a/arch/arm64/lib/copy_page.S
+++ b/arch/arm64/lib/copy_page.S
@@ -17,14 +17,7 @@
* x0 - dest
* x1 - src
*/
-ENTRY(copy_page)
-alternative_if ARM64_HAS_NO_HW_PREFETCH
- // Prefetch three cache lines ahead.
- prfm pldl1strm, [x1, #128]
- prfm pldl1strm, [x1, #256]
- prfm pldl1strm, [x1, #384]
-alternative_else_nop_endif
-
+SYM_FUNC_START(__pi_copy_page)
ldp x2, x3, [x1]
ldp x4, x5, [x1, #16]
ldp x6, x7, [x1, #32]
@@ -34,46 +27,43 @@ alternative_else_nop_endif
ldp x14, x15, [x1, #96]
ldp x16, x17, [x1, #112]
- mov x18, #(PAGE_SIZE - 128)
+ add x0, x0, #256
add x1, x1, #128
1:
- subs x18, x18, #128
-
-alternative_if ARM64_HAS_NO_HW_PREFETCH
- prfm pldl1strm, [x1, #384]
-alternative_else_nop_endif
+ tst x0, #(PAGE_SIZE - 1)
- stnp x2, x3, [x0]
+ stnp x2, x3, [x0, #-256]
ldp x2, x3, [x1]
- stnp x4, x5, [x0, #16]
+ stnp x4, x5, [x0, #16 - 256]
ldp x4, x5, [x1, #16]
- stnp x6, x7, [x0, #32]
+ stnp x6, x7, [x0, #32 - 256]
ldp x6, x7, [x1, #32]
- stnp x8, x9, [x0, #48]
+ stnp x8, x9, [x0, #48 - 256]
ldp x8, x9, [x1, #48]
- stnp x10, x11, [x0, #64]
+ stnp x10, x11, [x0, #64 - 256]
ldp x10, x11, [x1, #64]
- stnp x12, x13, [x0, #80]
+ stnp x12, x13, [x0, #80 - 256]
ldp x12, x13, [x1, #80]
- stnp x14, x15, [x0, #96]
+ stnp x14, x15, [x0, #96 - 256]
ldp x14, x15, [x1, #96]
- stnp x16, x17, [x0, #112]
+ stnp x16, x17, [x0, #112 - 256]
ldp x16, x17, [x1, #112]
add x0, x0, #128
add x1, x1, #128
- b.gt 1b
+ b.ne 1b
- stnp x2, x3, [x0]
- stnp x4, x5, [x0, #16]
- stnp x6, x7, [x0, #32]
- stnp x8, x9, [x0, #48]
- stnp x10, x11, [x0, #64]
- stnp x12, x13, [x0, #80]
- stnp x14, x15, [x0, #96]
- stnp x16, x17, [x0, #112]
+ stnp x2, x3, [x0, #-256]
+ stnp x4, x5, [x0, #16 - 256]
+ stnp x6, x7, [x0, #32 - 256]
+ stnp x8, x9, [x0, #48 - 256]
+ stnp x10, x11, [x0, #64 - 256]
+ stnp x12, x13, [x0, #80 - 256]
+ stnp x14, x15, [x0, #96 - 256]
+ stnp x16, x17, [x0, #112 - 256]
ret
-ENDPROC(copy_page)
+SYM_FUNC_END(__pi_copy_page)
+SYM_FUNC_ALIAS(copy_page, __pi_copy_page)
EXPORT_SYMBOL(copy_page)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 357eae2c18eb..802231772608 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -19,49 +19,55 @@
* Returns:
* x0 - bytes not copied
*/
- .macro ldrb1 ptr, regB, val
- ldrb \ptr, [\regB], \val
+ .macro ldrb1 reg, ptr, val
+ ldrb \reg, [\ptr], \val
.endm
- .macro strb1 ptr, regB, val
- uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val
+ .macro strb1 reg, ptr, val
+ user_ldst 9998f, sttrb, \reg, \ptr, \val
.endm
- .macro ldrh1 ptr, regB, val
- ldrh \ptr, [\regB], \val
+ .macro ldrh1 reg, ptr, val
+ ldrh \reg, [\ptr], \val
.endm
- .macro strh1 ptr, regB, val
- uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val
+ .macro strh1 reg, ptr, val
+ user_ldst 9997f, sttrh, \reg, \ptr, \val
.endm
- .macro ldr1 ptr, regB, val
- ldr \ptr, [\regB], \val
+ .macro ldr1 reg, ptr, val
+ ldr \reg, [\ptr], \val
.endm
- .macro str1 ptr, regB, val
- uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val
+ .macro str1 reg, ptr, val
+ user_ldst 9997f, sttr, \reg, \ptr, \val
.endm
- .macro ldp1 ptr, regB, regC, val
- ldp \ptr, \regB, [\regC], \val
+ .macro ldp1 reg1, reg2, ptr, val
+ ldp \reg1, \reg2, [\ptr], \val
.endm
- .macro stp1 ptr, regB, regC, val
- uao_stp 9998f, \ptr, \regB, \regC, \val
+ .macro stp1 reg1, reg2, ptr, val
+ user_stp 9997f, \reg1, \reg2, \ptr, \val
.endm
end .req x5
-ENTRY(__arch_copy_to_user)
+srcin .req x15
+SYM_FUNC_START(__arch_copy_to_user)
add end, x0, x2
+ mov srcin, x1
#include "copy_template.S"
mov x0, #0
ret
-ENDPROC(__arch_copy_to_user)
-EXPORT_SYMBOL(__arch_copy_to_user)
- .section .fixup,"ax"
- .align 2
+ // Exception fixups
+9997: cmp dst, dstin
+ b.ne 9998f
+ // Before being absolutely sure we couldn't copy anything, try harder
+ ldrb tmp1w, [srcin]
+USER(9998f, sttrb tmp1w, [dst])
+ add dst, dst, #1
9998: sub x0, end, dst // bytes not copied
ret
- .previous
+SYM_FUNC_END(__arch_copy_to_user)
+EXPORT_SYMBOL(__arch_copy_to_user)
diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S
index e6135f16649b..8340dccff46f 100644
--- a/arch/arm64/lib/crc32.S
+++ b/arch/arm64/lib/crc32.S
@@ -9,9 +9,46 @@
#include <asm/alternative.h>
#include <asm/assembler.h>
- .cpu generic+crc
+ .arch armv8-a+crc
- .macro __crc32, c
+ .macro byteorder, reg, be
+ .if \be
+CPU_LE( rev \reg, \reg )
+ .else
+CPU_BE( rev \reg, \reg )
+ .endif
+ .endm
+
+ .macro byteorder16, reg, be
+ .if \be
+CPU_LE( rev16 \reg, \reg )
+ .else
+CPU_BE( rev16 \reg, \reg )
+ .endif
+ .endm
+
+ .macro bitorder, reg, be
+ .if \be
+ rbit \reg, \reg
+ .endif
+ .endm
+
+ .macro bitorder16, reg, be
+ .if \be
+ rbit \reg, \reg
+ lsr \reg, \reg, #16
+ .endif
+ .endm
+
+ .macro bitorder8, reg, be
+ .if \be
+ rbit \reg, \reg
+ lsr \reg, \reg, #24
+ .endif
+ .endm
+
+ .macro __crc32, c, be=0
+ bitorder w0, \be
cmp x2, #16
b.lt 8f // less than 16 bytes
@@ -24,10 +61,14 @@
add x8, x8, x1
add x1, x1, x7
ldp x5, x6, [x8]
-CPU_BE( rev x3, x3 )
-CPU_BE( rev x4, x4 )
-CPU_BE( rev x5, x5 )
-CPU_BE( rev x6, x6 )
+ byteorder x3, \be
+ byteorder x4, \be
+ byteorder x5, \be
+ byteorder x6, \be
+ bitorder x3, \be
+ bitorder x4, \be
+ bitorder x5, \be
+ bitorder x6, \be
tst x7, #8
crc32\c\()x w8, w0, x3
@@ -55,47 +96,65 @@ CPU_BE( rev x6, x6 )
32: ldp x3, x4, [x1], #32
sub x2, x2, #32
ldp x5, x6, [x1, #-16]
-CPU_BE( rev x3, x3 )
-CPU_BE( rev x4, x4 )
-CPU_BE( rev x5, x5 )
-CPU_BE( rev x6, x6 )
+ byteorder x3, \be
+ byteorder x4, \be
+ byteorder x5, \be
+ byteorder x6, \be
+ bitorder x3, \be
+ bitorder x4, \be
+ bitorder x5, \be
+ bitorder x6, \be
crc32\c\()x w0, w0, x3
crc32\c\()x w0, w0, x4
crc32\c\()x w0, w0, x5
crc32\c\()x w0, w0, x6
cbnz x2, 32b
-0: ret
+0: bitorder w0, \be
+ ret
8: tbz x2, #3, 4f
ldr x3, [x1], #8
-CPU_BE( rev x3, x3 )
+ byteorder x3, \be
+ bitorder x3, \be
crc32\c\()x w0, w0, x3
4: tbz x2, #2, 2f
ldr w3, [x1], #4
-CPU_BE( rev w3, w3 )
+ byteorder w3, \be
+ bitorder w3, \be
crc32\c\()w w0, w0, w3
2: tbz x2, #1, 1f
ldrh w3, [x1], #2
-CPU_BE( rev16 w3, w3 )
+ byteorder16 w3, \be
+ bitorder16 w3, \be
crc32\c\()h w0, w0, w3
1: tbz x2, #0, 0f
ldrb w3, [x1]
+ bitorder8 w3, \be
crc32\c\()b w0, w0, w3
-0: ret
+0: bitorder w0, \be
+ ret
.endm
.align 5
-ENTRY(crc32_le)
+SYM_FUNC_START(crc32_le)
alternative_if_not ARM64_HAS_CRC32
b crc32_le_base
alternative_else_nop_endif
__crc32
-ENDPROC(crc32_le)
+SYM_FUNC_END(crc32_le)
.align 5
-ENTRY(__crc32c_le)
+SYM_FUNC_START(__crc32c_le)
alternative_if_not ARM64_HAS_CRC32
b __crc32c_le_base
alternative_else_nop_endif
__crc32 c
-ENDPROC(__crc32c_le)
+SYM_FUNC_END(__crc32c_le)
+
+ .align 5
+SYM_FUNC_START(crc32_be)
+alternative_if_not ARM64_HAS_CRC32
+ b crc32_be_base
+alternative_else_nop_endif
+ __crc32 be=1
+SYM_FUNC_END(crc32_be)
diff --git a/arch/arm64/lib/csum.c b/arch/arm64/lib/csum.c
new file mode 100644
index 000000000000..2432683e48a6
--- /dev/null
+++ b/arch/arm64/lib/csum.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2019-2020 Arm Ltd.
+
+#include <linux/compiler.h>
+#include <linux/kasan-checks.h>
+#include <linux/kernel.h>
+
+#include <net/checksum.h>
+
+/* Looks dumb, but generates nice-ish code */
+static u64 accumulate(u64 sum, u64 data)
+{
+ __uint128_t tmp = (__uint128_t)sum + data;
+ return tmp + (tmp >> 64);
+}
+
+/*
+ * We over-read the buffer and this makes KASAN unhappy. Instead, disable
+ * instrumentation and call kasan explicitly.
+ */
+unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len)
+{
+ unsigned int offset, shift, sum;
+ const u64 *ptr;
+ u64 data, sum64 = 0;
+
+ if (unlikely(len <= 0))
+ return 0;
+
+ offset = (unsigned long)buff & 7;
+ /*
+ * This is to all intents and purposes safe, since rounding down cannot
+ * result in a different page or cache line being accessed, and @buff
+ * should absolutely not be pointing to anything read-sensitive. We do,
+ * however, have to be careful not to piss off KASAN, which means using
+ * unchecked reads to accommodate the head and tail, for which we'll
+ * compensate with an explicit check up-front.
+ */
+ kasan_check_read(buff, len);
+ ptr = (u64 *)(buff - offset);
+ len = len + offset - 8;
+
+ /*
+ * Head: zero out any excess leading bytes. Shifting back by the same
+ * amount should be at least as fast as any other way of handling the
+ * odd/even alignment, and means we can ignore it until the very end.
+ */
+ shift = offset * 8;
+ data = *ptr++;
+#ifdef __LITTLE_ENDIAN
+ data = (data >> shift) << shift;
+#else
+ data = (data << shift) >> shift;
+#endif
+
+ /*
+ * Body: straightforward aligned loads from here on (the paired loads
+ * underlying the quadword type still only need dword alignment). The
+ * main loop strictly excludes the tail, so the second loop will always
+ * run at least once.
+ */
+ while (unlikely(len > 64)) {
+ __uint128_t tmp1, tmp2, tmp3, tmp4;
+
+ tmp1 = *(__uint128_t *)ptr;
+ tmp2 = *(__uint128_t *)(ptr + 2);
+ tmp3 = *(__uint128_t *)(ptr + 4);
+ tmp4 = *(__uint128_t *)(ptr + 6);
+
+ len -= 64;
+ ptr += 8;
+
+ /* This is the "don't dump the carry flag into a GPR" idiom */
+ tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+ tmp2 += (tmp2 >> 64) | (tmp2 << 64);
+ tmp3 += (tmp3 >> 64) | (tmp3 << 64);
+ tmp4 += (tmp4 >> 64) | (tmp4 << 64);
+ tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64);
+ tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+ tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64);
+ tmp3 += (tmp3 >> 64) | (tmp3 << 64);
+ tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64);
+ tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+ tmp1 = ((tmp1 >> 64) << 64) | sum64;
+ tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+ sum64 = tmp1 >> 64;
+ }
+ while (len > 8) {
+ __uint128_t tmp;
+
+ sum64 = accumulate(sum64, data);
+ tmp = *(__uint128_t *)ptr;
+
+ len -= 16;
+ ptr += 2;
+
+#ifdef __LITTLE_ENDIAN
+ data = tmp >> 64;
+ sum64 = accumulate(sum64, tmp);
+#else
+ data = tmp;
+ sum64 = accumulate(sum64, tmp >> 64);
+#endif
+ }
+ if (len > 0) {
+ sum64 = accumulate(sum64, data);
+ data = *ptr;
+ len -= 8;
+ }
+ /*
+ * Tail: zero any over-read bytes similarly to the head, again
+ * preserving odd/even alignment.
+ */
+ shift = len * -8;
+#ifdef __LITTLE_ENDIAN
+ data = (data << shift) >> shift;
+#else
+ data = (data >> shift) << shift;
+#endif
+ sum64 = accumulate(sum64, data);
+
+ /* Finally, folding */
+ sum64 += (sum64 >> 32) | (sum64 << 32);
+ sum = sum64 >> 32;
+ sum += (sum >> 16) | (sum << 16);
+ if (offset & 1)
+ return (u16)swab32(sum);
+
+ return sum >> 16;
+}
+
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, __u8 proto, __wsum csum)
+{
+ __uint128_t src, dst;
+ u64 sum = (__force u64)csum;
+
+ src = *(const __uint128_t *)saddr->s6_addr;
+ dst = *(const __uint128_t *)daddr->s6_addr;
+
+ sum += (__force u32)htonl(len);
+#ifdef __LITTLE_ENDIAN
+ sum += (u32)proto << 24;
+#else
+ sum += proto;
+#endif
+ src += (src >> 64) | (src << 64);
+ dst += (dst >> 64) | (dst << 64);
+
+ sum = accumulate(sum, src >> 64);
+ sum = accumulate(sum, dst >> 64);
+
+ sum += ((sum >> 32) | (sum << 32));
+ return csum_fold((__force __wsum)(sum >> 32));
+}
+EXPORT_SYMBOL(csum_ipv6_magic);
diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c
index 1688af0a4c97..cb2062e7e234 100644
--- a/arch/arm64/lib/delay.c
+++ b/arch/arm64/lib/delay.c
@@ -27,7 +27,17 @@ void __delay(unsigned long cycles)
{
cycles_t start = get_cycles();
- if (arch_timer_evtstrm_available()) {
+ if (alternative_has_cap_unlikely(ARM64_HAS_WFXT)) {
+ u64 end = start + cycles;
+
+ /*
+ * Start with WFIT. If an interrupt makes us resume
+ * early, use a WFET loop to complete the delay.
+ */
+ wfit(end);
+ while ((get_cycles() - start) < cycles)
+ wfet(end);
+ } else if (arch_timer_evtstrm_available()) {
const cycles_t timer_evt_period =
USECS_TO_CYCLES(ARCH_TIMER_EVT_STREAM_PERIOD_US);
diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c
new file mode 100644
index 000000000000..a635ab83fee3
--- /dev/null
+++ b/arch/arm64/lib/insn.c
@@ -0,0 +1,1517 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2013 Huawei Ltd.
+ * Author: Jiang Liu <liuj97@gmail.com>
+ *
+ * Copyright (C) 2014-2016 Zi Shen Lim <zlim.lnx@gmail.com>
+ */
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/printk.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+
+#include <asm/debug-monitors.h>
+#include <asm/errno.h>
+#include <asm/insn.h>
+#include <asm/kprobes.h>
+
+#define AARCH64_INSN_SF_BIT BIT(31)
+#define AARCH64_INSN_N_BIT BIT(22)
+#define AARCH64_INSN_LSL_12 BIT(22)
+
+static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type,
+ u32 *maskp, int *shiftp)
+{
+ u32 mask;
+ int shift;
+
+ switch (type) {
+ case AARCH64_INSN_IMM_26:
+ mask = BIT(26) - 1;
+ shift = 0;
+ break;
+ case AARCH64_INSN_IMM_19:
+ mask = BIT(19) - 1;
+ shift = 5;
+ break;
+ case AARCH64_INSN_IMM_16:
+ mask = BIT(16) - 1;
+ shift = 5;
+ break;
+ case AARCH64_INSN_IMM_14:
+ mask = BIT(14) - 1;
+ shift = 5;
+ break;
+ case AARCH64_INSN_IMM_12:
+ mask = BIT(12) - 1;
+ shift = 10;
+ break;
+ case AARCH64_INSN_IMM_9:
+ mask = BIT(9) - 1;
+ shift = 12;
+ break;
+ case AARCH64_INSN_IMM_7:
+ mask = BIT(7) - 1;
+ shift = 15;
+ break;
+ case AARCH64_INSN_IMM_6:
+ case AARCH64_INSN_IMM_S:
+ mask = BIT(6) - 1;
+ shift = 10;
+ break;
+ case AARCH64_INSN_IMM_R:
+ mask = BIT(6) - 1;
+ shift = 16;
+ break;
+ case AARCH64_INSN_IMM_N:
+ mask = 1;
+ shift = 22;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ *maskp = mask;
+ *shiftp = shift;
+
+ return 0;
+}
+
+#define ADR_IMM_HILOSPLIT 2
+#define ADR_IMM_SIZE SZ_2M
+#define ADR_IMM_LOMASK ((1 << ADR_IMM_HILOSPLIT) - 1)
+#define ADR_IMM_HIMASK ((ADR_IMM_SIZE >> ADR_IMM_HILOSPLIT) - 1)
+#define ADR_IMM_LOSHIFT 29
+#define ADR_IMM_HISHIFT 5
+
+u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn)
+{
+ u32 immlo, immhi, mask;
+ int shift;
+
+ switch (type) {
+ case AARCH64_INSN_IMM_ADR:
+ shift = 0;
+ immlo = (insn >> ADR_IMM_LOSHIFT) & ADR_IMM_LOMASK;
+ immhi = (insn >> ADR_IMM_HISHIFT) & ADR_IMM_HIMASK;
+ insn = (immhi << ADR_IMM_HILOSPLIT) | immlo;
+ mask = ADR_IMM_SIZE - 1;
+ break;
+ default:
+ if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) {
+ pr_err("%s: unknown immediate encoding %d\n", __func__,
+ type);
+ return 0;
+ }
+ }
+
+ return (insn >> shift) & mask;
+}
+
+u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type,
+ u32 insn, u64 imm)
+{
+ u32 immlo, immhi, mask;
+ int shift;
+
+ if (insn == AARCH64_BREAK_FAULT)
+ return AARCH64_BREAK_FAULT;
+
+ switch (type) {
+ case AARCH64_INSN_IMM_ADR:
+ shift = 0;
+ immlo = (imm & ADR_IMM_LOMASK) << ADR_IMM_LOSHIFT;
+ imm >>= ADR_IMM_HILOSPLIT;
+ immhi = (imm & ADR_IMM_HIMASK) << ADR_IMM_HISHIFT;
+ imm = immlo | immhi;
+ mask = ((ADR_IMM_LOMASK << ADR_IMM_LOSHIFT) |
+ (ADR_IMM_HIMASK << ADR_IMM_HISHIFT));
+ break;
+ default:
+ if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) {
+ pr_err("%s: unknown immediate encoding %d\n", __func__,
+ type);
+ return AARCH64_BREAK_FAULT;
+ }
+ }
+
+ /* Update the immediate field. */
+ insn &= ~(mask << shift);
+ insn |= (imm & mask) << shift;
+
+ return insn;
+}
+
+u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type,
+ u32 insn)
+{
+ int shift;
+
+ switch (type) {
+ case AARCH64_INSN_REGTYPE_RT:
+ case AARCH64_INSN_REGTYPE_RD:
+ shift = 0;
+ break;
+ case AARCH64_INSN_REGTYPE_RN:
+ shift = 5;
+ break;
+ case AARCH64_INSN_REGTYPE_RT2:
+ case AARCH64_INSN_REGTYPE_RA:
+ shift = 10;
+ break;
+ case AARCH64_INSN_REGTYPE_RM:
+ shift = 16;
+ break;
+ default:
+ pr_err("%s: unknown register type encoding %d\n", __func__,
+ type);
+ return 0;
+ }
+
+ return (insn >> shift) & GENMASK(4, 0);
+}
+
+static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type,
+ u32 insn,
+ enum aarch64_insn_register reg)
+{
+ int shift;
+
+ if (insn == AARCH64_BREAK_FAULT)
+ return AARCH64_BREAK_FAULT;
+
+ if (reg < AARCH64_INSN_REG_0 || reg > AARCH64_INSN_REG_SP) {
+ pr_err("%s: unknown register encoding %d\n", __func__, reg);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (type) {
+ case AARCH64_INSN_REGTYPE_RT:
+ case AARCH64_INSN_REGTYPE_RD:
+ shift = 0;
+ break;
+ case AARCH64_INSN_REGTYPE_RN:
+ shift = 5;
+ break;
+ case AARCH64_INSN_REGTYPE_RT2:
+ case AARCH64_INSN_REGTYPE_RA:
+ shift = 10;
+ break;
+ case AARCH64_INSN_REGTYPE_RM:
+ case AARCH64_INSN_REGTYPE_RS:
+ shift = 16;
+ break;
+ default:
+ pr_err("%s: unknown register type encoding %d\n", __func__,
+ type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn &= ~(GENMASK(4, 0) << shift);
+ insn |= reg << shift;
+
+ return insn;
+}
+
+static const u32 aarch64_insn_ldst_size[] = {
+ [AARCH64_INSN_SIZE_8] = 0,
+ [AARCH64_INSN_SIZE_16] = 1,
+ [AARCH64_INSN_SIZE_32] = 2,
+ [AARCH64_INSN_SIZE_64] = 3,
+};
+
+static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type,
+ u32 insn)
+{
+ u32 size;
+
+ if (type < AARCH64_INSN_SIZE_8 || type > AARCH64_INSN_SIZE_64) {
+ pr_err("%s: unknown size encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ size = aarch64_insn_ldst_size[type];
+ insn &= ~GENMASK(31, 30);
+ insn |= size << 30;
+
+ return insn;
+}
+
+static inline long label_imm_common(unsigned long pc, unsigned long addr,
+ long range)
+{
+ long offset;
+
+ if ((pc & 0x3) || (addr & 0x3)) {
+ pr_err("%s: A64 instructions must be word aligned\n", __func__);
+ return range;
+ }
+
+ offset = ((long)addr - (long)pc);
+
+ if (offset < -range || offset >= range) {
+ pr_err("%s: offset out of range\n", __func__);
+ return range;
+ }
+
+ return offset;
+}
+
+u32 __kprobes aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr,
+ enum aarch64_insn_branch_type type)
+{
+ u32 insn;
+ long offset;
+
+ /*
+ * B/BL support [-128M, 128M) offset
+ * ARM64 virtual address arrangement guarantees all kernel and module
+ * texts are within +/-128M.
+ */
+ offset = label_imm_common(pc, addr, SZ_128M);
+ if (offset >= SZ_128M)
+ return AARCH64_BREAK_FAULT;
+
+ switch (type) {
+ case AARCH64_INSN_BRANCH_LINK:
+ insn = aarch64_insn_get_bl_value();
+ break;
+ case AARCH64_INSN_BRANCH_NOLINK:
+ insn = aarch64_insn_get_b_value();
+ break;
+ default:
+ pr_err("%s: unknown branch encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn,
+ offset >> 2);
+}
+
+u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr,
+ enum aarch64_insn_register reg,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_branch_type type)
+{
+ u32 insn;
+ long offset;
+
+ offset = label_imm_common(pc, addr, SZ_1M);
+ if (offset >= SZ_1M)
+ return AARCH64_BREAK_FAULT;
+
+ switch (type) {
+ case AARCH64_INSN_BRANCH_COMP_ZERO:
+ insn = aarch64_insn_get_cbz_value();
+ break;
+ case AARCH64_INSN_BRANCH_COMP_NONZERO:
+ insn = aarch64_insn_get_cbnz_value();
+ break;
+ default:
+ pr_err("%s: unknown branch encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn,
+ offset >> 2);
+}
+
+u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr,
+ enum aarch64_insn_condition cond)
+{
+ u32 insn;
+ long offset;
+
+ offset = label_imm_common(pc, addr, SZ_1M);
+
+ insn = aarch64_insn_get_bcond_value();
+
+ if (cond < AARCH64_INSN_COND_EQ || cond > AARCH64_INSN_COND_AL) {
+ pr_err("%s: unknown condition encoding %d\n", __func__, cond);
+ return AARCH64_BREAK_FAULT;
+ }
+ insn |= cond;
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn,
+ offset >> 2);
+}
+
+u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg,
+ enum aarch64_insn_branch_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_BRANCH_NOLINK:
+ insn = aarch64_insn_get_br_value();
+ break;
+ case AARCH64_INSN_BRANCH_LINK:
+ insn = aarch64_insn_get_blr_value();
+ break;
+ case AARCH64_INSN_BRANCH_RETURN:
+ insn = aarch64_insn_get_ret_value();
+ break;
+ default:
+ pr_err("%s: unknown branch encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, reg);
+}
+
+u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg,
+ enum aarch64_insn_register base,
+ enum aarch64_insn_register offset,
+ enum aarch64_insn_size_type size,
+ enum aarch64_insn_ldst_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_LDST_LOAD_REG_OFFSET:
+ insn = aarch64_insn_get_ldr_reg_value();
+ break;
+ case AARCH64_INSN_LDST_SIGNED_LOAD_REG_OFFSET:
+ insn = aarch64_insn_get_signed_ldr_reg_value();
+ break;
+ case AARCH64_INSN_LDST_STORE_REG_OFFSET:
+ insn = aarch64_insn_get_str_reg_value();
+ break;
+ default:
+ pr_err("%s: unknown load/store encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_ldst_size(size, insn);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ base);
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn,
+ offset);
+}
+
+u32 aarch64_insn_gen_load_store_imm(enum aarch64_insn_register reg,
+ enum aarch64_insn_register base,
+ unsigned int imm,
+ enum aarch64_insn_size_type size,
+ enum aarch64_insn_ldst_type type)
+{
+ u32 insn;
+ u32 shift;
+
+ if (size < AARCH64_INSN_SIZE_8 || size > AARCH64_INSN_SIZE_64) {
+ pr_err("%s: unknown size encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ shift = aarch64_insn_ldst_size[size];
+ if (imm & ~(BIT(12 + shift) - BIT(shift))) {
+ pr_err("%s: invalid imm: %d\n", __func__, imm);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ imm >>= shift;
+
+ switch (type) {
+ case AARCH64_INSN_LDST_LOAD_IMM_OFFSET:
+ insn = aarch64_insn_get_ldr_imm_value();
+ break;
+ case AARCH64_INSN_LDST_SIGNED_LOAD_IMM_OFFSET:
+ insn = aarch64_insn_get_signed_load_imm_value();
+ break;
+ case AARCH64_INSN_LDST_STORE_IMM_OFFSET:
+ insn = aarch64_insn_get_str_imm_value();
+ break;
+ default:
+ pr_err("%s: unknown load/store encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_ldst_size(size, insn);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ base);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm);
+}
+
+u32 aarch64_insn_gen_load_literal(unsigned long pc, unsigned long addr,
+ enum aarch64_insn_register reg,
+ bool is64bit)
+{
+ u32 insn;
+ long offset;
+
+ offset = label_imm_common(pc, addr, SZ_1M);
+ if (offset >= SZ_1M)
+ return AARCH64_BREAK_FAULT;
+
+ insn = aarch64_insn_get_ldr_lit_value();
+
+ if (is64bit)
+ insn |= BIT(30);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn,
+ offset >> 2);
+}
+
+u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1,
+ enum aarch64_insn_register reg2,
+ enum aarch64_insn_register base,
+ int offset,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_ldst_type type)
+{
+ u32 insn;
+ int shift;
+
+ switch (type) {
+ case AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX:
+ insn = aarch64_insn_get_ldp_pre_value();
+ break;
+ case AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX:
+ insn = aarch64_insn_get_stp_pre_value();
+ break;
+ case AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX:
+ insn = aarch64_insn_get_ldp_post_value();
+ break;
+ case AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX:
+ insn = aarch64_insn_get_stp_post_value();
+ break;
+ default:
+ pr_err("%s: unknown load/store encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ if ((offset & 0x3) || (offset < -256) || (offset > 252)) {
+ pr_err("%s: offset must be multiples of 4 in the range of [-256, 252] %d\n",
+ __func__, offset);
+ return AARCH64_BREAK_FAULT;
+ }
+ shift = 2;
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ if ((offset & 0x7) || (offset < -512) || (offset > 504)) {
+ pr_err("%s: offset must be multiples of 8 in the range of [-512, 504] %d\n",
+ __func__, offset);
+ return AARCH64_BREAK_FAULT;
+ }
+ shift = 3;
+ insn |= AARCH64_INSN_SF_BIT;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn,
+ reg1);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn,
+ reg2);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ base);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_7, insn,
+ offset >> shift);
+}
+
+u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg,
+ enum aarch64_insn_register base,
+ enum aarch64_insn_register state,
+ enum aarch64_insn_size_type size,
+ enum aarch64_insn_ldst_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_LDST_LOAD_EX:
+ case AARCH64_INSN_LDST_LOAD_ACQ_EX:
+ insn = aarch64_insn_get_load_ex_value();
+ if (type == AARCH64_INSN_LDST_LOAD_ACQ_EX)
+ insn |= BIT(15);
+ break;
+ case AARCH64_INSN_LDST_STORE_EX:
+ case AARCH64_INSN_LDST_STORE_REL_EX:
+ insn = aarch64_insn_get_store_ex_value();
+ if (type == AARCH64_INSN_LDST_STORE_REL_EX)
+ insn |= BIT(15);
+ break;
+ default:
+ pr_err("%s: unknown load/store exclusive encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_ldst_size(size, insn);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn,
+ reg);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ base);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn,
+ AARCH64_INSN_REG_ZR);
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn,
+ state);
+}
+
+#ifdef CONFIG_ARM64_LSE_ATOMICS
+static u32 aarch64_insn_encode_ldst_order(enum aarch64_insn_mem_order_type type,
+ u32 insn)
+{
+ u32 order;
+
+ switch (type) {
+ case AARCH64_INSN_MEM_ORDER_NONE:
+ order = 0;
+ break;
+ case AARCH64_INSN_MEM_ORDER_ACQ:
+ order = 2;
+ break;
+ case AARCH64_INSN_MEM_ORDER_REL:
+ order = 1;
+ break;
+ case AARCH64_INSN_MEM_ORDER_ACQREL:
+ order = 3;
+ break;
+ default:
+ pr_err("%s: unknown mem order %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn &= ~GENMASK(23, 22);
+ insn |= order << 22;
+
+ return insn;
+}
+
+u32 aarch64_insn_gen_atomic_ld_op(enum aarch64_insn_register result,
+ enum aarch64_insn_register address,
+ enum aarch64_insn_register value,
+ enum aarch64_insn_size_type size,
+ enum aarch64_insn_mem_atomic_op op,
+ enum aarch64_insn_mem_order_type order)
+{
+ u32 insn;
+
+ switch (op) {
+ case AARCH64_INSN_MEM_ATOMIC_ADD:
+ insn = aarch64_insn_get_ldadd_value();
+ break;
+ case AARCH64_INSN_MEM_ATOMIC_CLR:
+ insn = aarch64_insn_get_ldclr_value();
+ break;
+ case AARCH64_INSN_MEM_ATOMIC_EOR:
+ insn = aarch64_insn_get_ldeor_value();
+ break;
+ case AARCH64_INSN_MEM_ATOMIC_SET:
+ insn = aarch64_insn_get_ldset_value();
+ break;
+ case AARCH64_INSN_MEM_ATOMIC_SWP:
+ insn = aarch64_insn_get_swp_value();
+ break;
+ default:
+ pr_err("%s: unimplemented mem atomic op %d\n", __func__, op);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (size) {
+ case AARCH64_INSN_SIZE_32:
+ case AARCH64_INSN_SIZE_64:
+ break;
+ default:
+ pr_err("%s: unimplemented size encoding %d\n", __func__, size);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_ldst_size(size, insn);
+
+ insn = aarch64_insn_encode_ldst_order(order, insn);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn,
+ result);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ address);
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn,
+ value);
+}
+
+static u32 aarch64_insn_encode_cas_order(enum aarch64_insn_mem_order_type type,
+ u32 insn)
+{
+ u32 order;
+
+ switch (type) {
+ case AARCH64_INSN_MEM_ORDER_NONE:
+ order = 0;
+ break;
+ case AARCH64_INSN_MEM_ORDER_ACQ:
+ order = BIT(22);
+ break;
+ case AARCH64_INSN_MEM_ORDER_REL:
+ order = BIT(15);
+ break;
+ case AARCH64_INSN_MEM_ORDER_ACQREL:
+ order = BIT(15) | BIT(22);
+ break;
+ default:
+ pr_err("%s: unknown mem order %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn &= ~(BIT(15) | BIT(22));
+ insn |= order;
+
+ return insn;
+}
+
+u32 aarch64_insn_gen_cas(enum aarch64_insn_register result,
+ enum aarch64_insn_register address,
+ enum aarch64_insn_register value,
+ enum aarch64_insn_size_type size,
+ enum aarch64_insn_mem_order_type order)
+{
+ u32 insn;
+
+ switch (size) {
+ case AARCH64_INSN_SIZE_32:
+ case AARCH64_INSN_SIZE_64:
+ break;
+ default:
+ pr_err("%s: unimplemented size encoding %d\n", __func__, size);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_get_cas_value();
+
+ insn = aarch64_insn_encode_ldst_size(size, insn);
+
+ insn = aarch64_insn_encode_cas_order(order, insn);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn,
+ result);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ address);
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn,
+ value);
+}
+#endif
+
+u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ int imm, enum aarch64_insn_variant variant,
+ enum aarch64_insn_adsb_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_ADSB_ADD:
+ insn = aarch64_insn_get_add_imm_value();
+ break;
+ case AARCH64_INSN_ADSB_SUB:
+ insn = aarch64_insn_get_sub_imm_value();
+ break;
+ case AARCH64_INSN_ADSB_ADD_SETFLAGS:
+ insn = aarch64_insn_get_adds_imm_value();
+ break;
+ case AARCH64_INSN_ADSB_SUB_SETFLAGS:
+ insn = aarch64_insn_get_subs_imm_value();
+ break;
+ default:
+ pr_err("%s: unknown add/sub encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ /* We can't encode more than a 24bit value (12bit + 12bit shift) */
+ if (imm & ~(BIT(24) - 1))
+ goto out;
+
+ /* If we have something in the top 12 bits... */
+ if (imm & ~(SZ_4K - 1)) {
+ /* ... and in the low 12 bits -> error */
+ if (imm & (SZ_4K - 1))
+ goto out;
+
+ imm >>= 12;
+ insn |= AARCH64_INSN_LSL_12;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm);
+
+out:
+ pr_err("%s: invalid immediate encoding %d\n", __func__, imm);
+ return AARCH64_BREAK_FAULT;
+}
+
+u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ int immr, int imms,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_bitfield_type type)
+{
+ u32 insn;
+ u32 mask;
+
+ switch (type) {
+ case AARCH64_INSN_BITFIELD_MOVE:
+ insn = aarch64_insn_get_bfm_value();
+ break;
+ case AARCH64_INSN_BITFIELD_MOVE_UNSIGNED:
+ insn = aarch64_insn_get_ubfm_value();
+ break;
+ case AARCH64_INSN_BITFIELD_MOVE_SIGNED:
+ insn = aarch64_insn_get_sbfm_value();
+ break;
+ default:
+ pr_err("%s: unknown bitfield encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ mask = GENMASK(4, 0);
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT | AARCH64_INSN_N_BIT;
+ mask = GENMASK(5, 0);
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ if (immr & ~mask) {
+ pr_err("%s: invalid immr encoding %d\n", __func__, immr);
+ return AARCH64_BREAK_FAULT;
+ }
+ if (imms & ~mask) {
+ pr_err("%s: invalid imms encoding %d\n", __func__, imms);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
+
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms);
+}
+
+u32 aarch64_insn_gen_movewide(enum aarch64_insn_register dst,
+ int imm, int shift,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_movewide_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_MOVEWIDE_ZERO:
+ insn = aarch64_insn_get_movz_value();
+ break;
+ case AARCH64_INSN_MOVEWIDE_KEEP:
+ insn = aarch64_insn_get_movk_value();
+ break;
+ case AARCH64_INSN_MOVEWIDE_INVERSE:
+ insn = aarch64_insn_get_movn_value();
+ break;
+ default:
+ pr_err("%s: unknown movewide encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ if (imm & ~(SZ_64K - 1)) {
+ pr_err("%s: invalid immediate encoding %d\n", __func__, imm);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ if (shift != 0 && shift != 16) {
+ pr_err("%s: invalid shift encoding %d\n", __func__,
+ shift);
+ return AARCH64_BREAK_FAULT;
+ }
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ if (shift != 0 && shift != 16 && shift != 32 && shift != 48) {
+ pr_err("%s: invalid shift encoding %d\n", __func__,
+ shift);
+ return AARCH64_BREAK_FAULT;
+ }
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn |= (shift >> 4) << 21;
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm);
+}
+
+u32 aarch64_insn_gen_add_sub_shifted_reg(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ enum aarch64_insn_register reg,
+ int shift,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_adsb_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_ADSB_ADD:
+ insn = aarch64_insn_get_add_value();
+ break;
+ case AARCH64_INSN_ADSB_SUB:
+ insn = aarch64_insn_get_sub_value();
+ break;
+ case AARCH64_INSN_ADSB_ADD_SETFLAGS:
+ insn = aarch64_insn_get_adds_value();
+ break;
+ case AARCH64_INSN_ADSB_SUB_SETFLAGS:
+ insn = aarch64_insn_get_subs_value();
+ break;
+ default:
+ pr_err("%s: unknown add/sub encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ if (shift & ~(SZ_32 - 1)) {
+ pr_err("%s: invalid shift encoding %d\n", __func__,
+ shift);
+ return AARCH64_BREAK_FAULT;
+ }
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ if (shift & ~(SZ_64 - 1)) {
+ pr_err("%s: invalid shift encoding %d\n", __func__,
+ shift);
+ return AARCH64_BREAK_FAULT;
+ }
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift);
+}
+
+u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_data1_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_DATA1_REVERSE_16:
+ insn = aarch64_insn_get_rev16_value();
+ break;
+ case AARCH64_INSN_DATA1_REVERSE_32:
+ insn = aarch64_insn_get_rev32_value();
+ break;
+ case AARCH64_INSN_DATA1_REVERSE_64:
+ if (variant != AARCH64_INSN_VARIANT_64BIT) {
+ pr_err("%s: invalid variant for reverse64 %d\n",
+ __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+ insn = aarch64_insn_get_rev64_value();
+ break;
+ default:
+ pr_err("%s: unknown data1 encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
+}
+
+u32 aarch64_insn_gen_data2(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ enum aarch64_insn_register reg,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_data2_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_DATA2_UDIV:
+ insn = aarch64_insn_get_udiv_value();
+ break;
+ case AARCH64_INSN_DATA2_SDIV:
+ insn = aarch64_insn_get_sdiv_value();
+ break;
+ case AARCH64_INSN_DATA2_LSLV:
+ insn = aarch64_insn_get_lslv_value();
+ break;
+ case AARCH64_INSN_DATA2_LSRV:
+ insn = aarch64_insn_get_lsrv_value();
+ break;
+ case AARCH64_INSN_DATA2_ASRV:
+ insn = aarch64_insn_get_asrv_value();
+ break;
+ case AARCH64_INSN_DATA2_RORV:
+ insn = aarch64_insn_get_rorv_value();
+ break;
+ default:
+ pr_err("%s: unknown data2 encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg);
+}
+
+u32 aarch64_insn_gen_data3(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ enum aarch64_insn_register reg1,
+ enum aarch64_insn_register reg2,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_data3_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_DATA3_MADD:
+ insn = aarch64_insn_get_madd_value();
+ break;
+ case AARCH64_INSN_DATA3_MSUB:
+ insn = aarch64_insn_get_msub_value();
+ break;
+ default:
+ pr_err("%s: unknown data3 encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RA, insn, src);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ reg1);
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn,
+ reg2);
+}
+
+u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ enum aarch64_insn_register reg,
+ int shift,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_logic_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_LOGIC_AND:
+ insn = aarch64_insn_get_and_value();
+ break;
+ case AARCH64_INSN_LOGIC_BIC:
+ insn = aarch64_insn_get_bic_value();
+ break;
+ case AARCH64_INSN_LOGIC_ORR:
+ insn = aarch64_insn_get_orr_value();
+ break;
+ case AARCH64_INSN_LOGIC_ORN:
+ insn = aarch64_insn_get_orn_value();
+ break;
+ case AARCH64_INSN_LOGIC_EOR:
+ insn = aarch64_insn_get_eor_value();
+ break;
+ case AARCH64_INSN_LOGIC_EON:
+ insn = aarch64_insn_get_eon_value();
+ break;
+ case AARCH64_INSN_LOGIC_AND_SETFLAGS:
+ insn = aarch64_insn_get_ands_value();
+ break;
+ case AARCH64_INSN_LOGIC_BIC_SETFLAGS:
+ insn = aarch64_insn_get_bics_value();
+ break;
+ default:
+ pr_err("%s: unknown logical encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ if (shift & ~(SZ_32 - 1)) {
+ pr_err("%s: invalid shift encoding %d\n", __func__,
+ shift);
+ return AARCH64_BREAK_FAULT;
+ }
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ if (shift & ~(SZ_64 - 1)) {
+ pr_err("%s: invalid shift encoding %d\n", __func__,
+ shift);
+ return AARCH64_BREAK_FAULT;
+ }
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift);
+}
+
+/*
+ * MOV (register) is architecturally an alias of ORR (shifted register) where
+ * MOV <*d>, <*m> is equivalent to ORR <*d>, <*ZR>, <*m>
+ */
+u32 aarch64_insn_gen_move_reg(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ enum aarch64_insn_variant variant)
+{
+ return aarch64_insn_gen_logical_shifted_reg(dst, AARCH64_INSN_REG_ZR,
+ src, 0, variant,
+ AARCH64_INSN_LOGIC_ORR);
+}
+
+u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr,
+ enum aarch64_insn_register reg,
+ enum aarch64_insn_adr_type type)
+{
+ u32 insn;
+ s32 offset;
+
+ switch (type) {
+ case AARCH64_INSN_ADR_TYPE_ADR:
+ insn = aarch64_insn_get_adr_value();
+ offset = addr - pc;
+ break;
+ case AARCH64_INSN_ADR_TYPE_ADRP:
+ insn = aarch64_insn_get_adrp_value();
+ offset = (addr - ALIGN_DOWN(pc, SZ_4K)) >> 12;
+ break;
+ default:
+ pr_err("%s: unknown adr encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ if (offset < -SZ_1M || offset >= SZ_1M)
+ return AARCH64_BREAK_FAULT;
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, reg);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, offset);
+}
+
+/*
+ * Decode the imm field of a branch, and return the byte offset as a
+ * signed value (so it can be used when computing a new branch
+ * target).
+ */
+s32 aarch64_get_branch_offset(u32 insn)
+{
+ s32 imm;
+
+ if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) {
+ imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn);
+ return (imm << 6) >> 4;
+ }
+
+ if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) ||
+ aarch64_insn_is_bcond(insn)) {
+ imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn);
+ return (imm << 13) >> 11;
+ }
+
+ if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) {
+ imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn);
+ return (imm << 18) >> 16;
+ }
+
+ /* Unhandled instruction */
+ BUG();
+}
+
+/*
+ * Encode the displacement of a branch in the imm field and return the
+ * updated instruction.
+ */
+u32 aarch64_set_branch_offset(u32 insn, s32 offset)
+{
+ if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn))
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn,
+ offset >> 2);
+
+ if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) ||
+ aarch64_insn_is_bcond(insn))
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn,
+ offset >> 2);
+
+ if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn))
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_14, insn,
+ offset >> 2);
+
+ /* Unhandled instruction */
+ BUG();
+}
+
+s32 aarch64_insn_adrp_get_offset(u32 insn)
+{
+ BUG_ON(!aarch64_insn_is_adrp(insn));
+ return aarch64_insn_decode_immediate(AARCH64_INSN_IMM_ADR, insn) << 12;
+}
+
+u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset)
+{
+ BUG_ON(!aarch64_insn_is_adrp(insn));
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn,
+ offset >> 12);
+}
+
+/*
+ * Extract the Op/CR data from a msr/mrs instruction.
+ */
+u32 aarch64_insn_extract_system_reg(u32 insn)
+{
+ return (insn & 0x1FFFE0) >> 5;
+}
+
+bool aarch32_insn_is_wide(u32 insn)
+{
+ return insn >= 0xe800;
+}
+
+/*
+ * Macros/defines for extracting register numbers from instruction.
+ */
+u32 aarch32_insn_extract_reg_num(u32 insn, int offset)
+{
+ return (insn & (0xf << offset)) >> offset;
+}
+
+#define OPC2_MASK 0x7
+#define OPC2_OFFSET 5
+u32 aarch32_insn_mcr_extract_opc2(u32 insn)
+{
+ return (insn & (OPC2_MASK << OPC2_OFFSET)) >> OPC2_OFFSET;
+}
+
+#define CRM_MASK 0xf
+u32 aarch32_insn_mcr_extract_crm(u32 insn)
+{
+ return insn & CRM_MASK;
+}
+
+static bool range_of_ones(u64 val)
+{
+ /* Doesn't handle full ones or full zeroes */
+ u64 sval = val >> __ffs64(val);
+
+ /* One of Sean Eron Anderson's bithack tricks */
+ return ((sval + 1) & (sval)) == 0;
+}
+
+static u32 aarch64_encode_immediate(u64 imm,
+ enum aarch64_insn_variant variant,
+ u32 insn)
+{
+ unsigned int immr, imms, n, ones, ror, esz, tmp;
+ u64 mask;
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ esz = 32;
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ esz = 64;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ mask = GENMASK(esz - 1, 0);
+
+ /* Can't encode full zeroes, full ones, or value wider than the mask */
+ if (!imm || imm == mask || imm & ~mask)
+ return AARCH64_BREAK_FAULT;
+
+ /*
+ * Inverse of Replicate(). Try to spot a repeating pattern
+ * with a pow2 stride.
+ */
+ for (tmp = esz / 2; tmp >= 2; tmp /= 2) {
+ u64 emask = BIT(tmp) - 1;
+
+ if ((imm & emask) != ((imm >> tmp) & emask))
+ break;
+
+ esz = tmp;
+ mask = emask;
+ }
+
+ /* N is only set if we're encoding a 64bit value */
+ n = esz == 64;
+
+ /* Trim imm to the element size */
+ imm &= mask;
+
+ /* That's how many ones we need to encode */
+ ones = hweight64(imm);
+
+ /*
+ * imms is set to (ones - 1), prefixed with a string of ones
+ * and a zero if they fit. Cap it to 6 bits.
+ */
+ imms = ones - 1;
+ imms |= 0xf << ffs(esz);
+ imms &= BIT(6) - 1;
+
+ /* Compute the rotation */
+ if (range_of_ones(imm)) {
+ /*
+ * Pattern: 0..01..10..0
+ *
+ * Compute how many rotate we need to align it right
+ */
+ ror = __ffs64(imm);
+ } else {
+ /*
+ * Pattern: 0..01..10..01..1
+ *
+ * Fill the unused top bits with ones, and check if
+ * the result is a valid immediate (all ones with a
+ * contiguous ranges of zeroes).
+ */
+ imm |= ~mask;
+ if (!range_of_ones(~imm))
+ return AARCH64_BREAK_FAULT;
+
+ /*
+ * Compute the rotation to get a continuous set of
+ * ones, with the first bit set at position 0
+ */
+ ror = fls64(~imm);
+ }
+
+ /*
+ * immr is the number of bits we need to rotate back to the
+ * original set of ones. Note that this is relative to the
+ * element size...
+ */
+ immr = (esz - ror) % esz;
+
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, n);
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr);
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms);
+}
+
+u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_register Rn,
+ enum aarch64_insn_register Rd,
+ u64 imm)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_LOGIC_AND:
+ insn = aarch64_insn_get_and_imm_value();
+ break;
+ case AARCH64_INSN_LOGIC_ORR:
+ insn = aarch64_insn_get_orr_imm_value();
+ break;
+ case AARCH64_INSN_LOGIC_EOR:
+ insn = aarch64_insn_get_eor_imm_value();
+ break;
+ case AARCH64_INSN_LOGIC_AND_SETFLAGS:
+ insn = aarch64_insn_get_ands_imm_value();
+ break;
+ default:
+ pr_err("%s: unknown logical encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd);
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn);
+ return aarch64_encode_immediate(imm, variant, insn);
+}
+
+u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant,
+ enum aarch64_insn_register Rm,
+ enum aarch64_insn_register Rn,
+ enum aarch64_insn_register Rd,
+ u8 lsb)
+{
+ u32 insn;
+
+ insn = aarch64_insn_get_extr_value();
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ if (lsb > 31)
+ return AARCH64_BREAK_FAULT;
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ if (lsb > 63)
+ return AARCH64_BREAK_FAULT;
+ insn |= AARCH64_INSN_SF_BIT;
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, 1);
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, lsb);
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd);
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn);
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm);
+}
+
+u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type)
+{
+ u32 opt;
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_MB_SY:
+ opt = 0xf;
+ break;
+ case AARCH64_INSN_MB_ST:
+ opt = 0xe;
+ break;
+ case AARCH64_INSN_MB_LD:
+ opt = 0xd;
+ break;
+ case AARCH64_INSN_MB_ISH:
+ opt = 0xb;
+ break;
+ case AARCH64_INSN_MB_ISHST:
+ opt = 0xa;
+ break;
+ case AARCH64_INSN_MB_ISHLD:
+ opt = 0x9;
+ break;
+ case AARCH64_INSN_MB_NSH:
+ opt = 0x7;
+ break;
+ case AARCH64_INSN_MB_NSHST:
+ opt = 0x6;
+ break;
+ case AARCH64_INSN_MB_NSHLD:
+ opt = 0x5;
+ break;
+ default:
+ pr_err("%s: unknown dmb type %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_get_dmb_value();
+ insn &= ~GENMASK(11, 8);
+ insn |= (opt << 8);
+
+ return insn;
+}
diff --git a/arch/arm64/lib/kasan_sw_tags.S b/arch/arm64/lib/kasan_sw_tags.S
new file mode 100644
index 000000000000..20784ce75def
--- /dev/null
+++ b/arch/arm64/lib/kasan_sw_tags.S
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 Google LLC
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Report a tag mismatch detected by tag-based KASAN.
+ *
+ * A compiler-generated thunk calls this with a non-AAPCS calling
+ * convention. Upon entry to this function, registers are as follows:
+ *
+ * x0: fault address (see below for restore)
+ * x1: fault description (see below for restore)
+ * x2 to x15: callee-saved
+ * x16 to x17: safe to clobber
+ * x18 to x30: callee-saved
+ * sp: pre-decremented by 256 bytes (see below for restore)
+ *
+ * The caller has decremented the SP by 256 bytes, and created a
+ * structure on the stack as follows:
+ *
+ * sp + 0..15: x0 and x1 to be restored
+ * sp + 16..231: free for use
+ * sp + 232..247: x29 and x30 (same as in GPRs)
+ * sp + 248..255: free for use
+ *
+ * Note that this is not a struct pt_regs.
+ *
+ * To call a regular AAPCS function we must save x2 to x15 (which we can
+ * store in the gaps), and create a frame record (for which we can use
+ * x29 and x30 spilled by the caller as those match the GPRs).
+ *
+ * The caller expects x0 and x1 to be restored from the structure, and
+ * for the structure to be removed from the stack (i.e. the SP must be
+ * incremented by 256 prior to return).
+ */
+SYM_CODE_START(__hwasan_tag_mismatch)
+ bti c
+ add x29, sp, #232
+ stp x2, x3, [sp, #8 * 2]
+ stp x4, x5, [sp, #8 * 4]
+ stp x6, x7, [sp, #8 * 6]
+ stp x8, x9, [sp, #8 * 8]
+ stp x10, x11, [sp, #8 * 10]
+ stp x12, x13, [sp, #8 * 12]
+ stp x14, x15, [sp, #8 * 14]
+#ifndef CONFIG_SHADOW_CALL_STACK
+ str x18, [sp, #8 * 18]
+#endif
+
+ mov x2, x30
+ bl kasan_tag_mismatch
+
+ ldp x0, x1, [sp]
+ ldp x2, x3, [sp, #8 * 2]
+ ldp x4, x5, [sp, #8 * 4]
+ ldp x6, x7, [sp, #8 * 6]
+ ldp x8, x9, [sp, #8 * 8]
+ ldp x10, x11, [sp, #8 * 10]
+ ldp x12, x13, [sp, #8 * 12]
+ ldp x14, x15, [sp, #8 * 14]
+#ifndef CONFIG_SHADOW_CALL_STACK
+ ldr x18, [sp, #8 * 18]
+#endif
+ ldp x29, x30, [sp, #8 * 29]
+
+ /* remove the structure from the stack */
+ add sp, sp, #256
+ ret
+SYM_CODE_END(__hwasan_tag_mismatch)
+EXPORT_SYMBOL(__hwasan_tag_mismatch)
diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S
index 48a3ab636e4f..37a9f2a4f7f4 100644
--- a/arch/arm64/lib/memchr.S
+++ b/arch/arm64/lib/memchr.S
@@ -1,9 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Based on arch/arm/lib/memchr.S
- *
- * Copyright (C) 1995-2000 Russell King
- * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2021 Arm Ltd.
*/
#include <linux/linkage.h>
@@ -19,16 +16,61 @@
* Returns:
* x0 - address of first occurrence of 'c' or 0
*/
-WEAK(memchr)
- and w1, w1, #0xff
-1: subs x2, x2, #1
- b.mi 2f
- ldrb w3, [x0], #1
- cmp w3, w1
- b.ne 1b
- sub x0, x0, #1
+
+#define L(label) .L ## label
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+#define srcin x0
+#define chrin w1
+#define cntin x2
+
+#define result x0
+
+#define wordcnt x3
+#define rep01 x4
+#define repchr x5
+#define cur_word x6
+#define cur_byte w6
+#define tmp x7
+#define tmp2 x8
+
+ .p2align 4
+ nop
+SYM_FUNC_START(__pi_memchr)
+ and chrin, chrin, #0xff
+ lsr wordcnt, cntin, #3
+ cbz wordcnt, L(byte_loop)
+ mov rep01, #REP8_01
+ mul repchr, x1, rep01
+ and cntin, cntin, #7
+L(word_loop):
+ ldr cur_word, [srcin], #8
+ sub wordcnt, wordcnt, #1
+ eor cur_word, cur_word, repchr
+ sub tmp, cur_word, rep01
+ orr tmp2, cur_word, #REP8_7f
+ bics tmp, tmp, tmp2
+ b.ne L(found_word)
+ cbnz wordcnt, L(word_loop)
+L(byte_loop):
+ cbz cntin, L(not_found)
+ ldrb cur_byte, [srcin], #1
+ sub cntin, cntin, #1
+ cmp cur_byte, chrin
+ b.ne L(byte_loop)
+ sub srcin, srcin, #1
+ ret
+L(found_word):
+CPU_LE( rev tmp, tmp)
+ clz tmp, tmp
+ sub tmp, tmp, #64
+ add result, srcin, tmp, asr #3
ret
-2: mov x0, #0
+L(not_found):
+ mov result, #0
ret
-ENDPIPROC(memchr)
+SYM_FUNC_END(__pi_memchr)
+SYM_FUNC_ALIAS_WEAK(memchr, __pi_memchr)
EXPORT_SYMBOL_NOKASAN(memchr)
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
index b297bdaaf549..a5ccf2c55f91 100644
--- a/arch/arm64/lib/memcmp.S
+++ b/arch/arm64/lib/memcmp.S
@@ -1,247 +1,139 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2013-2021, Arm Limited.
*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/memcmp.S
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
-/*
-* compare memory areas(when two memory areas' offset are different,
-* alignment handled by the hardware)
-*
-* Parameters:
-* x0 - const memory area 1 pointer
-* x1 - const memory area 2 pointer
-* x2 - the maximal compare byte length
-* Returns:
-* x0 - a compare result, maybe less than, equal to, or greater than ZERO
-*/
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ */
+
+#define L(label) .L ## label
/* Parameters and result. */
-src1 .req x0
-src2 .req x1
-limit .req x2
-result .req x0
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result w0
/* Internal variables. */
-data1 .req x3
-data1w .req w3
-data2 .req x4
-data2w .req w4
-has_nul .req x5
-diff .req x6
-endloop .req x7
-tmp1 .req x8
-tmp2 .req x9
-tmp3 .req x10
-pos .req x11
-limit_wd .req x12
-mask .req x13
-
-WEAK(memcmp)
- cbz limit, .Lret0
- eor tmp1, src1, src2
- tst tmp1, #7
- b.ne .Lmisaligned8
- ands tmp1, src1, #7
- b.ne .Lmutual_align
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
- lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
- /*
- * The input source addresses are at alignment boundary.
- * Directly compare eight bytes each time.
- */
-.Lloop_aligned:
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-.Lstart_realigned:
- subs limit_wd, limit_wd, #1
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, cs /* Last Dword or differences. */
- cbz endloop, .Lloop_aligned
-
- /* Not reached the limit, must have found a diff. */
- tbz limit_wd, #63, .Lnot_limit
-
- /* Limit % 8 == 0 => the diff is in the last 8 bytes. */
- ands limit, limit, #7
- b.eq .Lnot_limit
- /*
- * The remained bytes less than 8. It is needed to extract valid data
- * from last eight bytes of the intended memory range.
- */
- lsl limit, limit, #3 /* bytes-> bits. */
- mov mask, #~0
-CPU_BE( lsr mask, mask, limit )
-CPU_LE( lsl mask, mask, limit )
- bic data1, data1, mask
- bic data2, data2, mask
-
- orr diff, diff, mask
- b .Lnot_limit
-
-.Lmutual_align:
- /*
- * Sources are mutually aligned, but are not currently at an
- * alignment boundary. Round down the addresses and then mask off
- * the bytes that precede the start point.
- */
- bic src1, src1, #7
- bic src2, src2, #7
- ldr data1, [src1], #8
- ldr data2, [src2], #8
- /*
- * We can not add limit with alignment offset(tmp1) here. Since the
- * addition probably make the limit overflown.
- */
- sub limit_wd, limit, #1/*limit != 0, so no underflow.*/
- and tmp3, limit_wd, #7
- lsr limit_wd, limit_wd, #3
- add tmp3, tmp3, tmp1
- add limit_wd, limit_wd, tmp3, lsr #3
- add limit, limit, tmp1/* Adjust the limit for the extra. */
-
- lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/
- neg tmp1, tmp1/* Bits to alignment -64. */
- mov tmp2, #~0
- /*mask off the non-intended bytes before the start address.*/
-CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/
- /* Little-endian. Early bytes are at LSB. */
-CPU_LE( lsr tmp2, tmp2, tmp1 )
-
- orr data1, data1, tmp2
- orr data2, data2, tmp2
- b .Lstart_realigned
-
- /*src1 and src2 have different alignment offset.*/
-.Lmisaligned8:
- cmp limit, #8
- b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/
-
- and tmp1, src1, #7
- neg tmp1, tmp1
- add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/
- and tmp2, src2, #7
- neg tmp2, tmp2
- add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/
- subs tmp3, tmp1, tmp2
- csel pos, tmp1, tmp2, hi /*Choose the maximum.*/
-
- sub limit, limit, pos
- /*compare the proceeding bytes in the first 8 byte segment.*/
-.Ltinycmp:
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- subs pos, pos, #1
- ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */
- b.eq .Ltinycmp
- cbnz pos, 1f /*diff occurred before the last byte.*/
- cmp data1w, data2w
- b.eq .Lstart_align
-1:
- sub result, data1, data2
+#define data1 x3
+#define data1w w3
+#define data1h x4
+#define data2 x5
+#define data2w w5
+#define data2h x6
+#define tmp1 x7
+#define tmp2 x8
+
+SYM_FUNC_START(__pi_memcmp)
+ subs limit, limit, 8
+ b.lo L(less8)
+
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ b.ne L(return)
+
+ subs limit, limit, 8
+ b.gt L(more16)
+
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ b L(return)
+
+L(more16):
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ bne L(return)
+
+ /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+ strings. */
+ subs limit, limit, 16
+ b.ls L(last_bytes)
+
+ /* We overlap loads between 0-32 bytes at either side of SRC1 when we
+ try to align, so limit it only to strings larger than 128 bytes. */
+ cmp limit, 96
+ b.ls L(loop16)
+
+ /* Align src1 and adjust src2 with bytes not yet done. */
+ and tmp1, src1, 15
+ add limit, limit, tmp1
+ sub src1, src1, tmp1
+ sub src2, src2, tmp1
+
+ /* Loop performing 16 bytes per iteration using aligned src1.
+ Limit is pre-decremented by 16 and must be larger than zero.
+ Exit if <= 16 bytes left to do or if the data is not equal. */
+ .p2align 4
+L(loop16):
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ subs limit, limit, 16
+ ccmp data1, data2, 0, hi
+ ccmp data1h, data2h, 0, eq
+ b.eq L(loop16)
+
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+ bne L(return)
+
+ /* Compare last 1-16 bytes using unaligned access. */
+L(last_bytes):
+ add src1, src1, limit
+ add src2, src2, limit
+ ldp data1, data1h, [src1]
+ ldp data2, data2h, [src2]
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+
+ /* Compare data bytes and set return value to 0, -1 or 1. */
+L(return):
+#ifndef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ cmp data1, data2
+L(ret_eq):
+ cset result, ne
+ cneg result, result, lo
ret
-.Lstart_align:
- lsr limit_wd, limit, #3
- cbz limit_wd, .Lremain8
-
- ands xzr, src1, #7
- b.eq .Lrecal_offset
- /*process more leading bytes to make src1 aligned...*/
- add src1, src1, tmp3 /*backwards src1 to alignment boundary*/
- add src2, src2, tmp3
- sub limit, limit, tmp3
- lsr limit_wd, limit, #3
- cbz limit_wd, .Lremain8
- /*load 8 bytes from aligned SRC1..*/
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-
- subs limit_wd, limit_wd, #1
- eor diff, data1, data2 /*Non-zero if differences found.*/
- csinv endloop, diff, xzr, ne
- cbnz endloop, .Lunequal_proc
- /*How far is the current SRC2 from the alignment boundary...*/
- and tmp3, tmp3, #7
-
-.Lrecal_offset:/*src1 is aligned now..*/
- neg pos, tmp3
-.Lloopcmp_proc:
- /*
- * Divide the eight bytes into two parts. First,backwards the src2
- * to an alignment boundary,load eight bytes and compare from
- * the SRC2 alignment boundary. If all 8 bytes are equal,then start
- * the second part's comparison. Otherwise finish the comparison.
- * This special handle can garantee all the accesses are in the
- * thread/task space in avoid to overrange access.
- */
- ldr data1, [src1,pos]
- ldr data2, [src2,pos]
- eor diff, data1, data2 /* Non-zero if differences found. */
- cbnz diff, .Lnot_limit
-
- /*The second part process*/
- ldr data1, [src1], #8
- ldr data2, [src2], #8
- eor diff, data1, data2 /* Non-zero if differences found. */
- subs limit_wd, limit_wd, #1
- csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
- cbz endloop, .Lloopcmp_proc
-.Lunequal_proc:
- cbz diff, .Lremain8
-
-/* There is difference occurred in the latest comparison. */
-.Lnot_limit:
-/*
-* For little endian,reverse the low significant equal bits into MSB,then
-* following CLZ can find how many equal bits exist.
-*/
-CPU_LE( rev diff, diff )
-CPU_LE( rev data1, data1 )
-CPU_LE( rev data2, data2 )
-
- /*
- * The MS-non-zero bit of DIFF marks either the first bit
- * that is different, or the end of the significant data.
- * Shifting left now will bring the critical information into the
- * top bits.
- */
- clz pos, diff
- lsl data1, data1, pos
- lsl data2, data2, pos
- /*
- * We need to zero-extend (char is unsigned) the value and then
- * perform a signed subtraction.
- */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- ret
-
-.Lremain8:
- /* Limit % 8 == 0 =>. all data are equal.*/
- ands limit, limit, #7
- b.eq .Lret0
-
-.Ltiny8proc:
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- subs limit, limit, #1
-
- ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */
- b.eq .Ltiny8proc
- sub result, data1, data2
- ret
-.Lret0:
- mov result, #0
+ .p2align 4
+ /* Compare up to 8 bytes. Limit is [-8..-1]. */
+L(less8):
+ adds limit, limit, 4
+ b.lo L(less4)
+ ldr data1w, [src1], 4
+ ldr data2w, [src2], 4
+ cmp data1w, data2w
+ b.ne L(return)
+ sub limit, limit, 4
+L(less4):
+ adds limit, limit, 4
+ beq L(ret_eq)
+L(byte_loop):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ subs limit, limit, 1
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+ sub result, data1w, data2w
ret
-ENDPIPROC(memcmp)
+SYM_FUNC_END(__pi_memcmp)
+SYM_FUNC_ALIAS_WEAK(memcmp, __pi_memcmp)
EXPORT_SYMBOL_NOKASAN(memcmp)
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index d79f48994dbb..4ab48d49c451 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -1,67 +1,253 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2012-2021, Arm Limited.
*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
-#include <asm/cache.h>
-/*
- * Copy a buffer from src to dest (alignment handled by the hardware)
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
*
- * Parameters:
- * x0 - dest
- * x1 - src
- * x2 - n
- * Returns:
- * x0 - dest
*/
- .macro ldrb1 ptr, regB, val
- ldrb \ptr, [\regB], \val
- .endm
-
- .macro strb1 ptr, regB, val
- strb \ptr, [\regB], \val
- .endm
-
- .macro ldrh1 ptr, regB, val
- ldrh \ptr, [\regB], \val
- .endm
-
- .macro strh1 ptr, regB, val
- strh \ptr, [\regB], \val
- .endm
-
- .macro ldr1 ptr, regB, val
- ldr \ptr, [\regB], \val
- .endm
-
- .macro str1 ptr, regB, val
- str \ptr, [\regB], \val
- .endm
-
- .macro ldp1 ptr, regB, regC, val
- ldp \ptr, \regB, [\regC], \val
- .endm
-
- .macro stp1 ptr, regB, regC, val
- stp \ptr, \regB, [\regC], \val
- .endm
-
- .weak memcpy
-ENTRY(__memcpy)
-ENTRY(memcpy)
-#include "copy_template.S"
+
+#define L(label) .L ## label
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_lw w10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define G_l count
+#define G_h dst
+#define H_l src
+#define H_h srcend
+#define tmp1 x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+SYM_FUNC_START(__pi_memcpy)
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
ret
-ENDPIPROC(memcpy)
-EXPORT_SYMBOL(memcpy)
-ENDPROC(__memcpy)
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr A_lw, [src]
+ ldr B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+ ldp D_l, D_h, [src]
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .p2align 4
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+ ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+ ret
+SYM_FUNC_END(__pi_memcpy)
+
+SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
+SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
+EXPORT_SYMBOL(memcpy)
+
+SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)
+
+SYM_FUNC_ALIAS(__memmove, __pi_memmove)
+EXPORT_SYMBOL(__memmove)
+SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
+EXPORT_SYMBOL(memmove)
diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S
deleted file mode 100644
index 784775136480..000000000000
--- a/arch/arm64/lib/memmove.S
+++ /dev/null
@@ -1,190 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
- *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/cache.h>
-
-/*
- * Move a buffer from src to test (alignment handled by the hardware).
- * If dest <= src, call memcpy, otherwise copy in reverse order.
- *
- * Parameters:
- * x0 - dest
- * x1 - src
- * x2 - n
- * Returns:
- * x0 - dest
- */
-dstin .req x0
-src .req x1
-count .req x2
-tmp1 .req x3
-tmp1w .req w3
-tmp2 .req x4
-tmp2w .req w4
-tmp3 .req x5
-tmp3w .req w5
-dst .req x6
-
-A_l .req x7
-A_h .req x8
-B_l .req x9
-B_h .req x10
-C_l .req x11
-C_h .req x12
-D_l .req x13
-D_h .req x14
-
- .weak memmove
-ENTRY(__memmove)
-ENTRY(memmove)
- cmp dstin, src
- b.lo __memcpy
- add tmp1, src, count
- cmp dstin, tmp1
- b.hs __memcpy /* No overlap. */
-
- add dst, dstin, count
- add src, src, count
- cmp count, #16
- b.lo .Ltail15 /*probably non-alignment accesses.*/
-
- ands tmp2, src, #15 /* Bytes to reach alignment. */
- b.eq .LSrcAligned
- sub count, count, tmp2
- /*
- * process the aligned offset length to make the src aligned firstly.
- * those extra instructions' cost is acceptable. It also make the
- * coming accesses are based on aligned address.
- */
- tbz tmp2, #0, 1f
- ldrb tmp1w, [src, #-1]!
- strb tmp1w, [dst, #-1]!
-1:
- tbz tmp2, #1, 2f
- ldrh tmp1w, [src, #-2]!
- strh tmp1w, [dst, #-2]!
-2:
- tbz tmp2, #2, 3f
- ldr tmp1w, [src, #-4]!
- str tmp1w, [dst, #-4]!
-3:
- tbz tmp2, #3, .LSrcAligned
- ldr tmp1, [src, #-8]!
- str tmp1, [dst, #-8]!
-
-.LSrcAligned:
- cmp count, #64
- b.ge .Lcpy_over64
-
- /*
- * Deal with small copies quickly by dropping straight into the
- * exit block.
- */
-.Ltail63:
- /*
- * Copy up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate.
- */
- ands tmp1, count, #0x30
- b.eq .Ltail15
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src, #-16]!
- stp A_l, A_h, [dst, #-16]!
-1:
- ldp A_l, A_h, [src, #-16]!
- stp A_l, A_h, [dst, #-16]!
-2:
- ldp A_l, A_h, [src, #-16]!
- stp A_l, A_h, [dst, #-16]!
-
-.Ltail15:
- tbz count, #3, 1f
- ldr tmp1, [src, #-8]!
- str tmp1, [dst, #-8]!
-1:
- tbz count, #2, 2f
- ldr tmp1w, [src, #-4]!
- str tmp1w, [dst, #-4]!
-2:
- tbz count, #1, 3f
- ldrh tmp1w, [src, #-2]!
- strh tmp1w, [dst, #-2]!
-3:
- tbz count, #0, .Lexitfunc
- ldrb tmp1w, [src, #-1]
- strb tmp1w, [dst, #-1]
-
-.Lexitfunc:
- ret
-
-.Lcpy_over64:
- subs count, count, #128
- b.ge .Lcpy_body_large
- /*
- * Less than 128 bytes to copy, so handle 64 bytes here and then jump
- * to the tail.
- */
- ldp A_l, A_h, [src, #-16]
- stp A_l, A_h, [dst, #-16]
- ldp B_l, B_h, [src, #-32]
- ldp C_l, C_h, [src, #-48]
- stp B_l, B_h, [dst, #-32]
- stp C_l, C_h, [dst, #-48]
- ldp D_l, D_h, [src, #-64]!
- stp D_l, D_h, [dst, #-64]!
-
- tst count, #0x3f
- b.ne .Ltail63
- ret
-
- /*
- * Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line.
- */
- .p2align L1_CACHE_SHIFT
-.Lcpy_body_large:
- /* pre-load 64 bytes data. */
- ldp A_l, A_h, [src, #-16]
- ldp B_l, B_h, [src, #-32]
- ldp C_l, C_h, [src, #-48]
- ldp D_l, D_h, [src, #-64]!
-1:
- /*
- * interlace the load of next 64 bytes data block with store of the last
- * loaded 64 bytes data.
- */
- stp A_l, A_h, [dst, #-16]
- ldp A_l, A_h, [src, #-16]
- stp B_l, B_h, [dst, #-32]
- ldp B_l, B_h, [src, #-32]
- stp C_l, C_h, [dst, #-48]
- ldp C_l, C_h, [src, #-48]
- stp D_l, D_h, [dst, #-64]!
- ldp D_l, D_h, [src, #-64]!
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst, #-16]
- stp B_l, B_h, [dst, #-32]
- stp C_l, C_h, [dst, #-48]
- stp D_l, D_h, [dst, #-64]!
-
- tst count, #0x3f
- b.ne .Ltail63
- ret
-ENDPIPROC(memmove)
-EXPORT_SYMBOL(memmove)
-ENDPROC(__memmove)
-EXPORT_SYMBOL(__memmove)
diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index 9fb97e6bc560..a5aebe82ad73 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -42,9 +42,7 @@ dst .req x8
tmp3w .req w9
tmp3 .req x9
- .weak memset
-ENTRY(__memset)
-ENTRY(memset)
+SYM_FUNC_START(__pi_memset)
mov dst, dstin /* Preserve return value. */
and A_lw, val, #255
orr A_lw, A_lw, A_lw, lsl #8
@@ -203,7 +201,10 @@ ENTRY(memset)
ands count, count, zva_bits_x
b.ne .Ltail_maybe_long
ret
-ENDPIPROC(memset)
-EXPORT_SYMBOL(memset)
-ENDPROC(__memset)
+SYM_FUNC_END(__pi_memset)
+
+SYM_FUNC_ALIAS(__memset, __pi_memset)
EXPORT_SYMBOL(__memset)
+
+SYM_FUNC_ALIAS_WEAK(memset, __pi_memset)
+EXPORT_SYMBOL(memset)
diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S
new file mode 100644
index 000000000000..5018ac03b6bf
--- /dev/null
+++ b/arch/arm64/lib/mte.S
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 ARM Ltd.
+ */
+#include <linux/linkage.h>
+
+#include <asm/asm-uaccess.h>
+#include <asm/assembler.h>
+#include <asm/mte.h>
+#include <asm/page.h>
+#include <asm/sysreg.h>
+
+ .arch armv8.5-a+memtag
+
+/*
+ * multitag_transfer_size - set \reg to the block size that is accessed by the
+ * LDGM/STGM instructions.
+ */
+ .macro multitag_transfer_size, reg, tmp
+ mrs_s \reg, SYS_GMID_EL1
+ ubfx \reg, \reg, #GMID_EL1_BS_SHIFT, #GMID_EL1_BS_WIDTH
+ mov \tmp, #4
+ lsl \reg, \tmp, \reg
+ .endm
+
+/*
+ * Clear the tags in a page
+ * x0 - address of the page to be cleared
+ */
+SYM_FUNC_START(mte_clear_page_tags)
+ multitag_transfer_size x1, x2
+1: stgm xzr, [x0]
+ add x0, x0, x1
+ tst x0, #(PAGE_SIZE - 1)
+ b.ne 1b
+ ret
+SYM_FUNC_END(mte_clear_page_tags)
+
+/*
+ * Zero the page and tags at the same time
+ *
+ * Parameters:
+ * x0 - address to the beginning of the page
+ */
+SYM_FUNC_START(mte_zero_clear_page_tags)
+ and x0, x0, #(1 << MTE_TAG_SHIFT) - 1 // clear the tag
+ mrs x1, dczid_el0
+ tbnz x1, #4, 2f // Branch if DC GZVA is prohibited
+ and w1, w1, #0xf
+ mov x2, #4
+ lsl x1, x2, x1
+
+1: dc gzva, x0
+ add x0, x0, x1
+ tst x0, #(PAGE_SIZE - 1)
+ b.ne 1b
+ ret
+
+2: stz2g x0, [x0], #(MTE_GRANULE_SIZE * 2)
+ tst x0, #(PAGE_SIZE - 1)
+ b.ne 2b
+ ret
+SYM_FUNC_END(mte_zero_clear_page_tags)
+
+/*
+ * Copy the tags from the source page to the destination one
+ * x0 - address of the destination page
+ * x1 - address of the source page
+ */
+SYM_FUNC_START(mte_copy_page_tags)
+ mov x2, x0
+ mov x3, x1
+ multitag_transfer_size x5, x6
+1: ldgm x4, [x3]
+ stgm x4, [x2]
+ add x2, x2, x5
+ add x3, x3, x5
+ tst x2, #(PAGE_SIZE - 1)
+ b.ne 1b
+ ret
+SYM_FUNC_END(mte_copy_page_tags)
+
+/*
+ * Read tags from a user buffer (one tag per byte) and set the corresponding
+ * tags at the given kernel address. Used by PTRACE_POKEMTETAGS.
+ * x0 - kernel address (to)
+ * x1 - user buffer (from)
+ * x2 - number of tags/bytes (n)
+ * Returns:
+ * x0 - number of tags read/set
+ */
+SYM_FUNC_START(mte_copy_tags_from_user)
+ mov x3, x1
+ cbz x2, 2f
+1:
+USER(2f, ldtrb w4, [x1])
+ lsl x4, x4, #MTE_TAG_SHIFT
+ stg x4, [x0], #MTE_GRANULE_SIZE
+ add x1, x1, #1
+ subs x2, x2, #1
+ b.ne 1b
+
+ // exception handling and function return
+2: sub x0, x1, x3 // update the number of tags set
+ ret
+SYM_FUNC_END(mte_copy_tags_from_user)
+
+/*
+ * Get the tags from a kernel address range and write the tag values to the
+ * given user buffer (one tag per byte). Used by PTRACE_PEEKMTETAGS.
+ * x0 - user buffer (to)
+ * x1 - kernel address (from)
+ * x2 - number of tags/bytes (n)
+ * Returns:
+ * x0 - number of tags read/set
+ */
+SYM_FUNC_START(mte_copy_tags_to_user)
+ mov x3, x0
+ cbz x2, 2f
+1:
+ ldg x4, [x1]
+ ubfx x4, x4, #MTE_TAG_SHIFT, #MTE_TAG_SIZE
+USER(2f, sttrb w4, [x0])
+ add x0, x0, #1
+ add x1, x1, #MTE_GRANULE_SIZE
+ subs x2, x2, #1
+ b.ne 1b
+
+ // exception handling and function return
+2: sub x0, x0, x3 // update the number of tags copied
+ ret
+SYM_FUNC_END(mte_copy_tags_to_user)
+
+/*
+ * Save the tags in a page
+ * x0 - page address
+ * x1 - tag storage, MTE_PAGE_TAG_STORAGE bytes
+ */
+SYM_FUNC_START(mte_save_page_tags)
+ multitag_transfer_size x7, x5
+1:
+ mov x2, #0
+2:
+ ldgm x5, [x0]
+ orr x2, x2, x5
+ add x0, x0, x7
+ tst x0, #0xFF // 16 tag values fit in a register,
+ b.ne 2b // which is 16*16=256 bytes
+
+ str x2, [x1], #8
+
+ tst x0, #(PAGE_SIZE - 1)
+ b.ne 1b
+
+ ret
+SYM_FUNC_END(mte_save_page_tags)
+
+/*
+ * Restore the tags in a page
+ * x0 - page address
+ * x1 - tag storage, MTE_PAGE_TAG_STORAGE bytes
+ */
+SYM_FUNC_START(mte_restore_page_tags)
+ multitag_transfer_size x7, x5
+1:
+ ldr x2, [x1], #8
+2:
+ stgm x2, [x0]
+ add x0, x0, x7
+ tst x0, #0xFF
+ b.ne 2b
+
+ tst x0, #(PAGE_SIZE - 1)
+ b.ne 1b
+
+ ret
+SYM_FUNC_END(mte_restore_page_tags)
diff --git a/arch/arm64/lib/strchr.S b/arch/arm64/lib/strchr.S
index ca3ec18171a4..94ee67a6b212 100644
--- a/arch/arm64/lib/strchr.S
+++ b/arch/arm64/lib/strchr.S
@@ -18,7 +18,7 @@
* Returns:
* x0 - address of first occurrence of 'c' or 0
*/
-WEAK(strchr)
+SYM_FUNC_START(__pi_strchr)
and w1, w1, #0xff
1: ldrb w2, [x0], #1
cmp w2, w1
@@ -28,5 +28,7 @@ WEAK(strchr)
cmp w2, w1
csel x0, x0, xzr, eq
ret
-ENDPROC(strchr)
+SYM_FUNC_END(__pi_strchr)
+
+SYM_FUNC_ALIAS_WEAK(strchr, __pi_strchr)
EXPORT_SYMBOL_NOKASAN(strchr)
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index e9aefbe0b740..9b89b4533607 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -1,223 +1,190 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2012-2022, Arm Limited.
*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strcmp.S
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
-/*
- * compare two strings
+/* Assumptions:
*
- * Parameters:
- * x0 - const string 1 pointer
- * x1 - const string 2 pointer
- * Returns:
- * x0 - an integer less than, equal to, or greater than zero
- * if s1 is found, respectively, to be less than, to match,
- * or be greater than s2.
+ * ARMv8-a, AArch64.
+ * MTE compatible.
*/
+#define L(label) .L ## label
+
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
-/* Parameters and result. */
-src1 .req x0
-src2 .req x1
-result .req x0
-
-/* Internal variables. */
-data1 .req x2
-data1w .req w2
-data2 .req x3
-data2w .req w3
-has_nul .req x4
-diff .req x5
-syndrome .req x6
-tmp1 .req x7
-tmp2 .req x8
-tmp3 .req x9
-zeroones .req x10
-pos .req x11
-
-WEAK(strcmp)
- eor tmp1, src1, src2
- mov zeroones, #REP8_01
- tst tmp1, #7
- b.ne .Lmisaligned8
- ands tmp1, src1, #7
- b.ne .Lmutual_align
-
- /*
- * NUL detection works on the principle that (X - 1) & (~X) & 0x80
- * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- * can be done in parallel across the entire word.
- */
-.Lloop_aligned:
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-.Lstart_realigned:
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+
+#define src1 x0
+#define src2 x1
+#define result x0
+
+#define data1 x2
+#define data1w w2
+#define data2 x3
+#define data2w w3
+#define has_nul x4
+#define diff x5
+#define off1 x5
+#define syndrome x6
+#define tmp x6
+#define data3 x7
+#define zeroones x8
+#define shift x9
+#define off2 x10
+
+/* On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes. */
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+#else
+# define LS_FW lsr
+#endif
+
+/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word.
+ Since carry propagation makes 0x1 bytes before a NUL byte appear
+ NUL too in big-endian, byte-reverse the data before the NUL check. */
+
+
+SYM_FUNC_START(__pi_strcmp)
+ sub off2, src2, src1
+ mov zeroones, REP8_01
+ and tmp, src1, 7
+ tst off2, 7
+ b.ne L(misaligned8)
+ cbnz tmp, L(mutual_align)
+
+ .p2align 4
+
+L(loop_aligned):
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+L(start_realigned):
+#ifdef __AARCH64EB__
+ rev tmp, data1
+ sub has_nul, tmp, zeroones
+ orr tmp, tmp, REP8_7f
+#else
+ sub has_nul, data1, zeroones
+ orr tmp, data1, REP8_7f
+#endif
+ bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_aligned)
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
orr syndrome, diff, has_nul
- cbz syndrome, .Lloop_aligned
- b .Lcal_cmpresult
-
-.Lmutual_align:
- /*
- * Sources are mutually aligned, but are not currently at an
- * alignment boundary. Round down the addresses and then mask off
- * the bytes that preceed the start point.
- */
- bic src1, src1, #7
- bic src2, src2, #7
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- ldr data1, [src1], #8
- neg tmp1, tmp1 /* Bits to alignment -64. */
- ldr data2, [src2], #8
- mov tmp2, #~0
- /* Big-endian. Early bytes are at MSB. */
-CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
- /* Little-endian. Early bytes are at LSB. */
-CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
-
- orr data1, data1, tmp2
- orr data2, data2, tmp2
- b .Lstart_realigned
-
-.Lmisaligned8:
- /*
- * Get the align offset length to compare per byte first.
- * After this process, one string's address will be aligned.
- */
- and tmp1, src1, #7
- neg tmp1, tmp1
- add tmp1, tmp1, #8
- and tmp2, src2, #7
- neg tmp2, tmp2
- add tmp2, tmp2, #8
- subs tmp3, tmp1, tmp2
- csel pos, tmp1, tmp2, hi /*Choose the maximum. */
-.Ltinycmp:
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- subs pos, pos, #1
- ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq .Ltinycmp
- cbnz pos, 1f /*find the null or unequal...*/
- cmp data1w, #1
- ccmp data1w, data2w, #0, cs
- b.eq .Lstart_align /*the last bytes are equal....*/
-1:
- sub result, data1, data2
+L(end):
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ rev data2, data2
+#endif
+ clz shift, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, shift
+ lsl data2, data2, shift
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, 56
+ sub result, data1, data2, lsr 56
ret
-.Lstart_align:
- ands xzr, src1, #7
- b.eq .Lrecal_offset
- /*process more leading bytes to make str1 aligned...*/
- add src1, src1, tmp3
- add src2, src2, tmp3
- /*load 8 bytes from aligned str1 and non-aligned str2..*/
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bic has_nul, tmp1, tmp2
- eor diff, data1, data2 /* Non-zero if differences found. */
+ .p2align 4
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point. */
+ bic src1, src1, 7
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+ neg shift, src2, lsl 3 /* Bits to alignment -64. */
+ mov tmp, -1
+ LS_FW tmp, tmp, shift
+ orr data1, data1, tmp
+ orr data2, data2, tmp
+ b L(start_realigned)
+
+L(misaligned8):
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+ checking to make sure that we don't access beyond the end of SRC2. */
+ cbz tmp, L(src1_aligned)
+L(do_misaligned):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ cmp data1w, 0
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ b.ne L(done)
+ tst src1, 7
+ b.ne L(do_misaligned)
+
+L(src1_aligned):
+ neg shift, src2, lsl 3
+ bic src2, src2, 7
+ ldr data3, [src2], 8
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ lsr tmp, zeroones, shift
+ orr data3, data3, tmp
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ bics has_nul, has_nul, tmp
+ b.ne L(tail)
+
+ sub off1, src2, src1
+
+ .p2align 4
+
+L(loop_unaligned):
+ ldr data3, [src1, off1]
+ ldr data2, [src1, off2]
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ ldr data1, [src1], 8
+ bics has_nul, has_nul, tmp
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_unaligned)
+
+ lsl tmp, has_nul, shift
+#ifdef __AARCH64EB__
+ rev tmp, tmp
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, tmp
+ cbnz syndrome, L(end)
+L(tail):
+ ldr data1, [src1]
+ neg shift, shift
+ lsr data2, data3, shift
+ lsr has_nul, has_nul, shift
+#ifdef __AARCH64EB__
+ rev data2, data2
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
orr syndrome, diff, has_nul
- cbnz syndrome, .Lcal_cmpresult
- /*How far is the current str2 from the alignment boundary...*/
- and tmp3, tmp3, #7
-.Lrecal_offset:
- neg pos, tmp3
-.Lloopcmp_proc:
- /*
- * Divide the eight bytes into two parts. First,backwards the src2
- * to an alignment boundary,load eight bytes from the SRC2 alignment
- * boundary,then compare with the relative bytes from SRC1.
- * If all 8 bytes are equal,then start the second part's comparison.
- * Otherwise finish the comparison.
- * This special handle can garantee all the accesses are in the
- * thread/task space in avoid to overrange access.
- */
- ldr data1, [src1,pos]
- ldr data2, [src2,pos]
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bic has_nul, tmp1, tmp2
- eor diff, data1, data2 /* Non-zero if differences found. */
- orr syndrome, diff, has_nul
- cbnz syndrome, .Lcal_cmpresult
-
- /*The second part process*/
- ldr data1, [src1], #8
- ldr data2, [src2], #8
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bic has_nul, tmp1, tmp2
- eor diff, data1, data2 /* Non-zero if differences found. */
- orr syndrome, diff, has_nul
- cbz syndrome, .Lloopcmp_proc
-
-.Lcal_cmpresult:
- /*
- * reversed the byte-order as big-endian,then CLZ can find the most
- * significant zero bits.
- */
-CPU_LE( rev syndrome, syndrome )
-CPU_LE( rev data1, data1 )
-CPU_LE( rev data2, data2 )
-
- /*
- * For big-endian we cannot use the trick with the syndrome value
- * as carry-propagation can corrupt the upper bits if the trailing
- * bytes in the string contain 0x01.
- * However, if there is no NUL byte in the dword, we can generate
- * the result directly. We ca not just subtract the bytes as the
- * MSB might be significant.
- */
-CPU_BE( cbnz has_nul, 1f )
-CPU_BE( cmp data1, data2 )
-CPU_BE( cset result, ne )
-CPU_BE( cneg result, result, lo )
-CPU_BE( ret )
-CPU_BE( 1: )
- /*Re-compute the NUL-byte detection, using a byte-reversed value. */
-CPU_BE( rev tmp3, data1 )
-CPU_BE( sub tmp1, tmp3, zeroones )
-CPU_BE( orr tmp2, tmp3, #REP8_7f )
-CPU_BE( bic has_nul, tmp1, tmp2 )
-CPU_BE( rev has_nul, has_nul )
-CPU_BE( orr syndrome, diff, has_nul )
-
- clz pos, syndrome
- /*
- * The MS-non-zero bit of the syndrome marks either the first bit
- * that is different, or the top bit of the first zero byte.
- * Shifting left now will bring the critical information into the
- * top bits.
- */
- lsl data1, data1, pos
- lsl data2, data2, pos
- /*
- * But we need to zero-extend (char is unsigned) the value and then
- * perform a signed 32-bit subtraction.
- */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
+ b L(end)
+
+L(done):
+ sub result, data1, data2
ret
-ENDPIPROC(strcmp)
+SYM_FUNC_END(__pi_strcmp)
+SYM_FUNC_ALIAS_WEAK(strcmp, __pi_strcmp)
EXPORT_SYMBOL_NOKASAN(strcmp)
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S
index 87b0cb066915..4919fe81ae54 100644
--- a/arch/arm64/lib/strlen.S
+++ b/arch/arm64/lib/strlen.S
@@ -1,115 +1,213 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2013-2021, Arm Limited.
*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/98e4d6a5c13c8e54/string/aarch64/strlen.S
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
+#include <asm/mte-def.h>
-/*
- * calculate the length of a string
+/* Assumptions:
*
- * Parameters:
- * x0 - const string pointer
- * Returns:
- * x0 - the return length of specific string
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
*/
+#define L(label) .L ## label
+
/* Arguments and results. */
-srcin .req x0
-len .req x0
+#define srcin x0
+#define len x0
/* Locals and temporaries. */
-src .req x1
-data1 .req x2
-data2 .req x3
-data2a .req x4
-has_nul1 .req x5
-has_nul2 .req x6
-tmp1 .req x7
-tmp2 .req x8
-tmp3 .req x9
-tmp4 .req x10
-zeroones .req x11
-pos .req x12
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. A faster check
+ (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+ false hits for characters 129..255. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
-WEAK(strlen)
- mov zeroones, #REP8_01
- bic src, srcin, #15
- ands tmp1, srcin, #15
- b.ne .Lmisaligned
- /*
- * NUL detection works on the principle that (X - 1) & (~X) & 0x80
- * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- * can be done in parallel across the entire word.
- */
- /*
- * The inner loop deals with two Dwords at a time. This has a
- * slightly higher start-up cost, but we should win quite quickly,
- * especially on cores with a high number of issue slots per
- * cycle, as we get much better parallelism out of the operations.
- */
-.Lloop:
- ldp data1, data2, [src], #16
-.Lrealigned:
+/*
+ * When KASAN_HW_TAGS is in use, memory is checked at MTE_GRANULE_SIZE
+ * (16-byte) granularity, and we must ensure that no access straddles this
+ * alignment boundary.
+ */
+#ifdef CONFIG_KASAN_HW_TAGS
+#define MIN_PAGE_SIZE MTE_GRANULE_SIZE
+#else
+#define MIN_PAGE_SIZE 4096
+#endif
+
+ /* Since strings are short on average, we check the first 16 bytes
+ of the string for a NUL character. In order to do an unaligned ldp
+ safely we have to do a page cross check first. If there is a NUL
+ byte we calculate the length from the 2 8-byte words using
+ conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+ If the string is longer than 16 bytes, we align src so don't need
+ further page cross checks, and process 32 bytes per iteration
+ using the fast NUL check. If we encounter non-ASCII characters,
+ fallback to a second loop using the full NUL check.
+
+ If the page cross check fails, we read 16 bytes from an aligned
+ address, remove any characters before the string, and continue
+ in the main loop using aligned loads. Since strings crossing a
+ page in the first 16 bytes are rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+ AArch64 systems have a minimum page size of 4k. We don't bother
+ checking for larger page sizes - the cost of setting up the correct
+ page size is just not worth the extra gain from a small reduction in
+ the cases taking the slow path. Note that we only care about
+ whether the first fetch, which may be misaligned, crosses a page
+ boundary. */
+
+SYM_FUNC_START(__pi_strlen)
+ and tmp1, srcin, MIN_PAGE_SIZE - 1
+ mov zeroones, REP8_01
+ cmp tmp1, MIN_PAGE_SIZE - 16
+ b.gt L(page_cross)
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly.
+ Since we expect strings to be small and early-exit,
+ byte-swap the data now so has_null1/2 will be correct. */
+ rev data1, data1
+ rev data2, data2
+#endif
sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
+ orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bics has_nul2, tmp3, tmp4
- ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq .Lloop
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(main_loop_entry)
+
+ /* Enter with C = has_nul1 == 0. */
+ csel has_nul1, has_nul1, has_nul2, cc
+ mov len, 8
+ rev has_nul1, has_nul1
+ clz tmp1, has_nul1
+ csel len, xzr, len, cc
+ add len, len, tmp1, lsr 3
+ ret
+ /* The inner loop processes 32 bytes per iteration and uses the fast
+ NUL check. If we encounter non-ASCII characters, use a second
+ loop with the accurate NUL check. */
+ .p2align 4
+L(main_loop_entry):
+ bic src, srcin, 15
+ sub src, src, 16
+L(main_loop):
+ ldp data1, data2, [src, 32]!
+L(page_cross_entry):
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ bne 1f
+ ldp data1, data2, [src, 16]
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ beq L(main_loop)
+ add src, src, 16
+1:
+ /* The fast check failed, so do the slower, accurate NUL check. */
+ orr tmp2, data1, REP8_7f
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+
+ /* Enter with C = has_nul1 == 0. */
+L(tail):
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ csel data1, data1, data2, cc
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, cc
+#endif
sub len, src, srcin
- cbz has_nul1, .Lnul_in_data2
-CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/
- sub len, len, #8
- mov has_nul2, has_nul1
-.Lnul_in_data2:
- /*
- * For big-endian, carry propagation (if the final byte in the
- * string is 0x01) means we cannot use has_nul directly. The
- * easiest way to get the correct byte is to byte-swap the data
- * and calculate the syndrome a second time.
- */
-CPU_BE( rev data2, data2 )
-CPU_BE( sub tmp1, data2, zeroones )
-CPU_BE( orr tmp2, data2, #REP8_7f )
-CPU_BE( bic has_nul2, tmp1, tmp2 )
-
- sub len, len, #8
- rev has_nul2, has_nul2
- clz pos, has_nul2
- add len, len, pos, lsr #3 /* Bits to bytes. */
+ rev has_nul1, has_nul1
+ add tmp2, len, 8
+ clz tmp1, has_nul1
+ csel len, len, tmp2, cc
+ add len, len, tmp1, lsr 3
ret
-.Lmisaligned:
- cmp tmp1, #8
- neg tmp1, tmp1
- ldp data1, data2, [src], #16
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- mov tmp2, #~0
- /* Big-endian. Early bytes are at MSB. */
-CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
+L(nonascii_loop):
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ bne L(tail)
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+ b L(tail)
+
+ /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+ srcin to 0x7f, so we ignore any NUL bytes before the string.
+ Then continue in the aligned loop. */
+L(page_cross):
+ bic src, srcin, 15
+ ldp data1, data2, [src]
+ lsl tmp1, srcin, 3
+ mov tmp4, -1
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+#else
/* Little-endian. Early bytes are at LSB. */
-CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
-
- orr data1, data1, tmp2
- orr data2a, data2, tmp2
- csinv data1, data1, xzr, le
- csel data2, data2, data2a, le
- b .Lrealigned
-ENDPIPROC(strlen)
+ lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr tmp1, tmp1, REP8_80
+ orn data1, data1, tmp1
+ orn tmp2, data2, tmp1
+ tst srcin, 8
+ csel data1, data1, tmp4, eq
+ csel data2, data2, tmp2, eq
+ b L(page_cross_entry)
+SYM_FUNC_END(__pi_strlen)
+SYM_FUNC_ALIAS_WEAK(strlen, __pi_strlen)
EXPORT_SYMBOL_NOKASAN(strlen)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index f571581888fa..fe7bbc0b42a7 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -1,299 +1,310 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2013-2022, Arm Limited.
*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
-/*
- * compare two strings
+/* Assumptions:
*
- * Parameters:
- * x0 - const string 1 pointer
- * x1 - const string 2 pointer
- * x2 - the maximal length to be compared
- * Returns:
- * x0 - an integer less than, equal to, or greater than zero if s1 is found,
- * respectively, to be less than, to match, or be greater than s2.
+ * ARMv8-a, AArch64.
+ * MTE compatible.
*/
+#define L(label) .L ## label
+
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
/* Parameters and result. */
-src1 .req x0
-src2 .req x1
-limit .req x2
-result .req x0
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
/* Internal variables. */
-data1 .req x3
-data1w .req w3
-data2 .req x4
-data2w .req w4
-has_nul .req x5
-diff .req x6
-syndrome .req x7
-tmp1 .req x8
-tmp2 .req x9
-tmp3 .req x10
-zeroones .req x11
-pos .req x12
-limit_wd .req x13
-mask .req x14
-endloop .req x15
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define has_nul x5
+#define diff x6
+#define syndrome x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define zeroones x11
+#define pos x12
+#define mask x13
+#define endloop x14
+#define count mask
+#define offset pos
+#define neg_offset x15
+
+/* Define endian dependent shift operations.
+ On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes.
+ LS_BK means shifting towards later bytes.
+ */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
-WEAK(strncmp)
- cbz limit, .Lret0
+SYM_FUNC_START(__pi_strncmp)
+ cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
- b.ne .Lmisaligned8
- ands tmp1, src1, #7
- b.ne .Lmutual_align
- /* Calculate the number of full and partial words -1. */
- /*
- * when limit is mulitply of 8, if not sub 1,
- * the judgement of last dword will wrong.
- */
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
- lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
+ and count, src1, #7
+ b.ne L(misaligned8)
+ cbnz count, L(mutual_align)
- /*
- * NUL detection works on the principle that (X - 1) & (~X) & 0x80
- * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- * can be done in parallel across the entire word.
- */
-.Lloop_aligned:
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ .p2align 4
+L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
-.Lstart_realigned:
- subs limit_wd, limit_wd, #1
+L(start_realigned):
+ subs limit, limit, #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, pl /* Last Dword or differences.*/
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ csinv endloop, diff, xzr, hi /* Last Dword or differences. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
- b.eq .Lloop_aligned
+ b.eq L(loop_aligned)
+ /* End of main loop */
- /*Not reached the limit, must have found the end or a diff. */
- tbz limit_wd, #63, .Lnot_limit
-
- /* Limit % 8 == 0 => all bytes significant. */
- ands limit, limit, #7
- b.eq .Lnot_limit
+L(full_check):
+#ifndef __AARCH64EB__
+ orr syndrome, diff, has_nul
+ add limit, limit, 8 /* Rewind limit to before last subs. */
+L(syndrome_check):
+ /* Limit was reached. Check if the NUL byte or the difference
+ is before the limit. */
+ rev syndrome, syndrome
+ rev data1, data1
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ cmp limit, pos, lsr #3
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ csel result, result, xzr, hi
+ ret
+#else
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit, #63, L(not_limit)
+ add tmp1, limit, 8
+ cbz limit, L(not_limit)
- lsl limit, limit, #3 /* Bits -> bytes. */
+ lsl limit, tmp1, #3 /* Bits -> bytes. */
mov mask, #~0
-CPU_BE( lsr mask, mask, limit )
-CPU_LE( lsl mask, mask, limit )
+ lsr mask, mask, limit
bic data1, data1, mask
bic data2, data2, mask
/* Make sure that the NUL byte is marked in the syndrome. */
orr has_nul, has_nul, mask
-.Lnot_limit:
+L(not_limit):
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
orr syndrome, diff, has_nul
- b .Lcal_cmpresult
+ clz pos, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+L(end_quick):
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
-.Lmutual_align:
- /*
- * Sources are mutually aligned, but are not currently at an
- * alignment boundary. Round down the addresses and then mask off
- * the bytes that precede the start point.
- * We also need to adjust the limit calculations, but without
- * overflowing if the limit is near ULONG_MAX.
- */
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point.
+ We also need to adjust the limit calculations, but without
+ overflowing if the limit is near ULONG_MAX. */
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
- neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
- /* Big-endian. Early bytes are at MSB. */
-CPU_BE( lsl tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */
- /* Little-endian. Early bytes are at LSB. */
-CPU_LE( lsr tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */
-
- and tmp3, limit_wd, #7
- lsr limit_wd, limit_wd, #3
- /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/
- add limit, limit, tmp1
- add tmp3, tmp3, tmp1
+ LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
+ /* Adjust the limit and ensure it doesn't overflow. */
+ adds limit, limit, count
+ csinv limit, limit, xzr, lo
orr data1, data1, tmp2
orr data2, data2, tmp2
- add limit_wd, limit_wd, tmp3, lsr #3
- b .Lstart_realigned
+ b L(start_realigned)
-/*when src1 offset is not equal to src2 offset...*/
-.Lmisaligned8:
- cmp limit, #8
- b.lo .Ltiny8proc /*limit < 8... */
- /*
- * Get the align offset length to compare per byte first.
- * After this process, one string's address will be aligned.*/
- and tmp1, src1, #7
- neg tmp1, tmp1
- add tmp1, tmp1, #8
- and tmp2, src2, #7
- neg tmp2, tmp2
- add tmp2, tmp2, #8
- subs tmp3, tmp1, tmp2
- csel pos, tmp1, tmp2, hi /*Choose the maximum. */
- /*
- * Here, limit is not less than 8, so directly run .Ltinycmp
- * without checking the limit.*/
- sub limit, limit, pos
-.Ltinycmp:
+ .p2align 4
+ /* Don't bother with dwords for up to 16 bytes. */
+L(misaligned8):
+ cmp limit, #16
+ b.hs L(try_misaligned_words)
+
+L(byte_loop):
+ /* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
- subs pos, pos, #1
- ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq .Ltinycmp
- cbnz pos, 1f /*find the null or unequal...*/
- cmp data1w, #1
- ccmp data1w, data2w, #0, cs
- b.eq .Lstart_align /*the last bytes are equal....*/
-1:
+ subs limit, limit, #1
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+L(done):
sub result, data1, data2
ret
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+L(try_misaligned_words):
+ cbz count, L(src1_aligned)
-.Lstart_align:
- lsr limit_wd, limit, #3
- cbz limit_wd, .Lremain8
- /*process more leading bytes to make str1 aligned...*/
- ands xzr, src1, #7
- b.eq .Lrecal_offset
- add src1, src1, tmp3 /*tmp3 is positive in this branch.*/
- add src2, src2, tmp3
- ldr data1, [src1], #8
- ldr data2, [src2], #8
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+
+L(page_end_loop):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ subs count, count, #1
+ b.hi L(page_end_loop)
- sub limit, limit, tmp3
- lsr limit_wd, limit, #3
- subs limit_wd, limit_wd, #1
+ /* The following diagram explains the comparison of misaligned strings.
+ The bytes are shown in natural order. For little-endian, it is
+ reversed in the registers. The "x" bytes are before the string.
+ The "|" separates data that is loaded at one time.
+ src1 | a a a a a a a a | b b b c c c c c | . . .
+ src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
- bics has_nul, tmp1, tmp2
- ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/
- b.ne .Lunequal_proc
- /*How far is the current str2 from the alignment boundary...*/
- and tmp3, tmp3, #7
-.Lrecal_offset:
- neg pos, tmp3
-.Lloopcmp_proc:
- /*
- * Divide the eight bytes into two parts. First,backwards the src2
- * to an alignment boundary,load eight bytes from the SRC2 alignment
- * boundary,then compare with the relative bytes from SRC1.
- * If all 8 bytes are equal,then start the second part's comparison.
- * Otherwise finish the comparison.
- * This special handle can garantee all the accesses are in the
- * thread/task space in avoid to overrange access.
- */
- ldr data1, [src1,pos]
- ldr data2, [src2,pos]
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, eq
- cbnz endloop, .Lunequal_proc
+ After shifting in each step, the data looks like this:
+ STEP_A STEP_B STEP_C
+ data1 a a a a a a a a b b b c c c c c b b b c c c c c
+ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
+
+ The bytes with "0" are eliminated from the syndrome via mask.
- /*The second part process*/
+ Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+ time from SRC2. The comparison happens in 3 steps. After each step
+ the loop can exit, or read from SRC1 or SRC2. */
+L(src1_aligned):
+ /* Calculate offset from 8 byte alignment to string start in bits. No
+ need to mask offset since shifts are ignoring upper bits. */
+ lsl offset, src2, #3
+ bic src2, src2, #0xf
+ mov mask, -1
+ neg neg_offset, offset
ldr data1, [src1], #8
- ldr data2, [src2], #8
- subs limit_wd, limit_wd, #1
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
- bics has_nul, tmp1, tmp2
- ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/
- b.eq .Lloopcmp_proc
+ ldp tmp1, tmp2, [src2], #16
+ LS_BK mask, mask, neg_offset
+ and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
+ /* Skip the first compare if data in tmp1 is irrelevant. */
+ tbnz offset, 6, L(misaligned_mid_loop)
+
+L(loop_misaligned):
+ /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+ LS_FW data2, tmp1, offset
+ LS_BK tmp1, tmp2, neg_offset
+ subs limit, limit, #8
+ orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
+ sub has_nul, data1, zeroones
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ orr tmp3, data1, #REP8_7f
+ csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
+ orr tmp3, endloop, has_nul
+ cbnz tmp3, L(full_check)
-.Lunequal_proc:
+ ldr data1, [src1], #8
+L(misaligned_mid_loop):
+ /* STEP_B: Compare first part of data1 to second part of tmp2. */
+ LS_FW data2, tmp2, offset
+#ifdef __AARCH64EB__
+ /* For big-endian we do a byte reverse to avoid carry-propagation
+ problem described above. This way we can reuse the has_nul in the
+ next step and also use syndrome value trick at the end. */
+ rev tmp3, data1
+ #define data1_fixed tmp3
+#else
+ #define data1_fixed data1
+#endif
+ sub has_nul, data1_fixed, zeroones
+ orr tmp3, data1_fixed, #REP8_7f
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ cmp limit, neg_offset, lsr #3
orr syndrome, diff, has_nul
- cbz syndrome, .Lremain8
-.Lcal_cmpresult:
- /*
- * reversed the byte-order as big-endian,then CLZ can find the most
- * significant zero bits.
- */
-CPU_LE( rev syndrome, syndrome )
-CPU_LE( rev data1, data1 )
-CPU_LE( rev data2, data2 )
- /*
- * For big-endian we cannot use the trick with the syndrome value
- * as carry-propagation can corrupt the upper bits if the trailing
- * bytes in the string contain 0x01.
- * However, if there is no NUL byte in the dword, we can generate
- * the result directly. We can't just subtract the bytes as the
- * MSB might be significant.
- */
-CPU_BE( cbnz has_nul, 1f )
-CPU_BE( cmp data1, data2 )
-CPU_BE( cset result, ne )
-CPU_BE( cneg result, result, lo )
-CPU_BE( ret )
-CPU_BE( 1: )
- /* Re-compute the NUL-byte detection, using a byte-reversed value.*/
-CPU_BE( rev tmp3, data1 )
-CPU_BE( sub tmp1, tmp3, zeroones )
-CPU_BE( orr tmp2, tmp3, #REP8_7f )
-CPU_BE( bic has_nul, tmp1, tmp2 )
-CPU_BE( rev has_nul, has_nul )
-CPU_BE( orr syndrome, diff, has_nul )
- /*
- * The MS-non-zero bit of the syndrome marks either the first bit
- * that is different, or the top bit of the first zero byte.
- * Shifting left now will bring the critical information into the
- * top bits.
- */
- clz pos, syndrome
- lsl data1, data1, pos
- lsl data2, data2, pos
- /*
- * But we need to zero-extend (char is unsigned) the value and then
- * perform a signed 32-bit subtraction.
- */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- ret
+ bic syndrome, syndrome, mask /* Ignore later bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
-.Lremain8:
- /* Limit % 8 == 0 => all bytes significant. */
- ands limit, limit, #7
- b.eq .Lret0
-.Ltiny8proc:
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- subs limit, limit, #1
+ /* STEP_C: Compare second part of data1 to first part of tmp1. */
+ ldp tmp1, tmp2, [src2], #16
+ cmp limit, #8
+ LS_BK data2, tmp1, neg_offset
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ orr syndrome, diff, has_nul
+ and syndrome, syndrome, mask /* Ignore earlier bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
- ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq .Ltiny8proc
- sub result, data1, data2
- ret
+ ldr data1, [src1], #8
+ sub limit, limit, #8
+ b L(loop_misaligned)
+
+#ifdef __AARCH64EB__
+L(syndrome_check):
+ clz pos, syndrome
+ cmp pos, limit, lsl #3
+ b.lo L(end_quick)
+#endif
-.Lret0:
+L(ret0):
mov result, #0
ret
-ENDPIPROC(strncmp)
+SYM_FUNC_END(__pi_strncmp)
+SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp)
EXPORT_SYMBOL_NOKASAN(strncmp)
diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S
index c0bac9493c68..d5ac0e10a01d 100644
--- a/arch/arm64/lib/strnlen.S
+++ b/arch/arm64/lib/strnlen.S
@@ -47,7 +47,7 @@ limit_wd .req x14
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
-WEAK(strnlen)
+SYM_FUNC_START(__pi_strnlen)
cbz limit, .Lhit_limit
mov zeroones, #REP8_01
bic src, srcin, #15
@@ -156,5 +156,7 @@ CPU_LE( lsr tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */
.Lhit_limit:
mov len, limit
ret
-ENDPIPROC(strnlen)
+SYM_FUNC_END(__pi_strnlen)
+
+SYM_FUNC_ALIAS_WEAK(strnlen, __pi_strnlen)
EXPORT_SYMBOL_NOKASAN(strnlen)
diff --git a/arch/arm64/lib/strrchr.S b/arch/arm64/lib/strrchr.S
index 794ac49ea433..a5123cf0ce12 100644
--- a/arch/arm64/lib/strrchr.S
+++ b/arch/arm64/lib/strrchr.S
@@ -18,7 +18,7 @@
* Returns:
* x0 - address of last occurrence of 'c' or 0
*/
-WEAK(strrchr)
+SYM_FUNC_START(__pi_strrchr)
mov x3, #0
and w1, w1, #0xff
1: ldrb w2, [x0], #1
@@ -29,5 +29,6 @@ WEAK(strrchr)
b 1b
2: mov x0, x3
ret
-ENDPIPROC(strrchr)
+SYM_FUNC_END(__pi_strrchr)
+SYM_FUNC_ALIAS_WEAK(strrchr, __pi_strrchr)
EXPORT_SYMBOL_NOKASAN(strrchr)
diff --git a/arch/arm64/lib/tishift.S b/arch/arm64/lib/tishift.S
index 047622536535..a88613834fb0 100644
--- a/arch/arm64/lib/tishift.S
+++ b/arch/arm64/lib/tishift.S
@@ -7,7 +7,7 @@
#include <asm/assembler.h>
-ENTRY(__ashlti3)
+SYM_FUNC_START(__ashlti3)
cbz x2, 1f
mov x3, #64
sub x3, x3, x2
@@ -26,10 +26,10 @@ ENTRY(__ashlti3)
lsl x1, x0, x1
mov x0, x2
ret
-ENDPROC(__ashlti3)
+SYM_FUNC_END(__ashlti3)
EXPORT_SYMBOL(__ashlti3)
-ENTRY(__ashrti3)
+SYM_FUNC_START(__ashrti3)
cbz x2, 1f
mov x3, #64
sub x3, x3, x2
@@ -48,10 +48,10 @@ ENTRY(__ashrti3)
asr x0, x1, x0
mov x1, x2
ret
-ENDPROC(__ashrti3)
+SYM_FUNC_END(__ashrti3)
EXPORT_SYMBOL(__ashrti3)
-ENTRY(__lshrti3)
+SYM_FUNC_START(__lshrti3)
cbz x2, 1f
mov x3, #64
sub x3, x3, x2
@@ -70,5 +70,5 @@ ENTRY(__lshrti3)
lsr x0, x1, x0
mov x1, x2
ret
-ENDPROC(__lshrti3)
+SYM_FUNC_END(__lshrti3)
EXPORT_SYMBOL(__lshrti3)
diff --git a/arch/arm64/lib/uaccess_flushcache.c b/arch/arm64/lib/uaccess_flushcache.c
index bfa30b75b2b8..7510d1a23124 100644
--- a/arch/arm64/lib/uaccess_flushcache.c
+++ b/arch/arm64/lib/uaccess_flushcache.c
@@ -15,26 +15,18 @@ void memcpy_flushcache(void *dst, const void *src, size_t cnt)
* barrier to order the cache maintenance against the memcpy.
*/
memcpy(dst, src, cnt);
- __clean_dcache_area_pop(dst, cnt);
+ dcache_clean_pop((unsigned long)dst, (unsigned long)dst + cnt);
}
EXPORT_SYMBOL_GPL(memcpy_flushcache);
-void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
- size_t len)
-{
- memcpy_flushcache(to, page_address(page) + offset, len);
-}
-
unsigned long __copy_user_flushcache(void *to, const void __user *from,
unsigned long n)
{
unsigned long rc;
- uaccess_enable_not_uao();
- rc = __arch_copy_from_user(to, from, n);
- uaccess_disable_not_uao();
+ rc = raw_copy_from_user(to, from, n);
/* See above */
- __clean_dcache_area_pop(to, n - rc);
+ dcache_clean_pop((unsigned long)to, (unsigned long)to + n - rc);
return rc;
}
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index 11bf4f8aca68..f9a53b7f9842 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -10,8 +10,8 @@
#include <linux/module.h>
#include <asm/neon-intrinsics.h>
-void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
- unsigned long *p2)
+static void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
@@ -37,8 +37,9 @@ void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
} while (--lines > 0);
}
-void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
- unsigned long *p2, unsigned long *p3)
+static void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
@@ -72,8 +73,10 @@ void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
} while (--lines > 0);
}
-void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
- unsigned long *p2, unsigned long *p3, unsigned long *p4)
+static void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
@@ -115,9 +118,11 @@ void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
} while (--lines > 0);
}
-void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
- unsigned long *p2, unsigned long *p3,
- unsigned long *p4, unsigned long *p5)
+static void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
@@ -167,7 +172,7 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
} while (--lines > 0);
}
-struct xor_block_template const xor_block_inner_neon = {
+struct xor_block_template xor_block_inner_neon __ro_after_init = {
.name = "__inner_neon__",
.do_2 = xor_arm64_neon_2,
.do_3 = xor_arm64_neon_3,
@@ -176,6 +181,158 @@ struct xor_block_template const xor_block_inner_neon = {
};
EXPORT_SYMBOL(xor_block_inner_neon);
+static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
+{
+ uint64x2_t res;
+
+ asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
+ "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
+ : "=w"(res) : "w"(p), "w"(q), "w"(r));
+ return res;
+}
+
+static void xor_arm64_eor3_3(unsigned long bytes,
+ unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
+{
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+ uint64_t *dp3 = (uint64_t *)p3;
+
+ register uint64x2_t v0, v1, v2, v3;
+ long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+ do {
+ /* p1 ^= p2 ^ p3 */
+ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+ vld1q_u64(dp3 + 0));
+ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+ vld1q_u64(dp3 + 2));
+ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+ vld1q_u64(dp3 + 4));
+ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+ vld1q_u64(dp3 + 6));
+
+ /* store */
+ vst1q_u64(dp1 + 0, v0);
+ vst1q_u64(dp1 + 2, v1);
+ vst1q_u64(dp1 + 4, v2);
+ vst1q_u64(dp1 + 6, v3);
+
+ dp1 += 8;
+ dp2 += 8;
+ dp3 += 8;
+ } while (--lines > 0);
+}
+
+static void xor_arm64_eor3_4(unsigned long bytes,
+ unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
+{
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+ uint64_t *dp3 = (uint64_t *)p3;
+ uint64_t *dp4 = (uint64_t *)p4;
+
+ register uint64x2_t v0, v1, v2, v3;
+ long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+ do {
+ /* p1 ^= p2 ^ p3 */
+ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+ vld1q_u64(dp3 + 0));
+ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+ vld1q_u64(dp3 + 2));
+ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+ vld1q_u64(dp3 + 4));
+ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+ vld1q_u64(dp3 + 6));
+
+ /* p1 ^= p4 */
+ v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+ v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+ v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+ v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+ /* store */
+ vst1q_u64(dp1 + 0, v0);
+ vst1q_u64(dp1 + 2, v1);
+ vst1q_u64(dp1 + 4, v2);
+ vst1q_u64(dp1 + 6, v3);
+
+ dp1 += 8;
+ dp2 += 8;
+ dp3 += 8;
+ dp4 += 8;
+ } while (--lines > 0);
+}
+
+static void xor_arm64_eor3_5(unsigned long bytes,
+ unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
+{
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+ uint64_t *dp3 = (uint64_t *)p3;
+ uint64_t *dp4 = (uint64_t *)p4;
+ uint64_t *dp5 = (uint64_t *)p5;
+
+ register uint64x2_t v0, v1, v2, v3;
+ long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+ do {
+ /* p1 ^= p2 ^ p3 */
+ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+ vld1q_u64(dp3 + 0));
+ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+ vld1q_u64(dp3 + 2));
+ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+ vld1q_u64(dp3 + 4));
+ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+ vld1q_u64(dp3 + 6));
+
+ /* p1 ^= p4 ^ p5 */
+ v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
+ v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
+ v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
+ v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
+
+ /* store */
+ vst1q_u64(dp1 + 0, v0);
+ vst1q_u64(dp1 + 2, v1);
+ vst1q_u64(dp1 + 4, v2);
+ vst1q_u64(dp1 + 6, v3);
+
+ dp1 += 8;
+ dp2 += 8;
+ dp3 += 8;
+ dp4 += 8;
+ dp5 += 8;
+ } while (--lines > 0);
+}
+
+static int __init xor_neon_init(void)
+{
+ if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
+ xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
+ xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
+ xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
+ }
+ return 0;
+}
+module_init(xor_neon_init);
+
+static void __exit xor_neon_exit(void)
+{
+}
+module_exit(xor_neon_exit);
+
MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");