summaryrefslogtreecommitdiff
path: root/arch/arm64/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/lib')
-rw-r--r--arch/arm64/lib/Makefile4
-rw-r--r--arch/arm64/lib/clear_user.S47
-rw-r--r--arch/arm64/lib/insn.c1458
-rw-r--r--arch/arm64/lib/kasan_sw_tags.S76
-rw-r--r--arch/arm64/lib/memchr.S65
-rw-r--r--arch/arm64/lib/memcmp.S346
-rw-r--r--arch/arm64/lib/memcpy.S272
-rw-r--r--arch/arm64/lib/memmove.S189
-rw-r--r--arch/arm64/lib/mte.S20
-rw-r--r--arch/arm64/lib/strcmp.S289
-rw-r--r--arch/arm64/lib/strlen.S258
-rw-r--r--arch/arm64/lib/strncmp.S406
-rw-r--r--arch/arm64/lib/uaccess_flushcache.c4
13 files changed, 2465 insertions, 969 deletions
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index d31e1169d9b8..6dd56a49790a 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
lib-y := clear_user.o delay.o copy_from_user.o \
copy_to_user.o copy_in_user.o copy_page.o \
- clear_page.o csum.o memchr.o memcpy.o memmove.o \
+ clear_page.o csum.o insn.o memchr.o memcpy.o \
memset.o memcmp.o strcmp.o strncmp.o strlen.o \
strnlen.o strchr.o strrchr.o tishift.o
@@ -18,3 +18,5 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
obj-$(CONFIG_ARM64_MTE) += mte.o
+
+obj-$(CONFIG_KASAN_SW_TAGS) += kasan_sw_tags.o
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index af9afcbec92c..a7efb2ad2a1c 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -1,12 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Based on arch/arm/lib/clear_user.S
- *
- * Copyright (C) 2012 ARM Ltd.
+ * Copyright (C) 2021 Arm Ltd.
*/
-#include <linux/linkage.h>
-#include <asm/asm-uaccess.h>
+#include <linux/linkage.h>
#include <asm/assembler.h>
.text
@@ -19,25 +16,33 @@
*
* Alignment fixed up by hardware.
*/
+
+ .p2align 4
+ // Alignment is for the loop, but since the prologue (including BTI)
+ // is also 16 bytes we can keep any padding outside the function
SYM_FUNC_START(__arch_clear_user)
- mov x2, x1 // save the size for fixup return
+ add x2, x0, x1
subs x1, x1, #8
b.mi 2f
1:
-user_ldst 9f, sttr, xzr, x0, 8
+USER(9f, sttr xzr, [x0])
+ add x0, x0, #8
subs x1, x1, #8
- b.pl 1b
-2: adds x1, x1, #4
- b.mi 3f
-user_ldst 9f, sttr, wzr, x0, 4
- sub x1, x1, #4
-3: adds x1, x1, #2
- b.mi 4f
-user_ldst 9f, sttrh, wzr, x0, 2
- sub x1, x1, #2
-4: adds x1, x1, #1
- b.mi 5f
-user_ldst 9f, sttrb, wzr, x0, 0
+ b.hi 1b
+USER(9f, sttr xzr, [x2, #-8])
+ mov x0, #0
+ ret
+
+2: tbz x1, #2, 3f
+USER(9f, sttr wzr, [x0])
+USER(8f, sttr wzr, [x2, #-4])
+ mov x0, #0
+ ret
+
+3: tbz x1, #1, 4f
+USER(9f, sttrh wzr, [x0])
+4: tbz x1, #0, 5f
+USER(7f, sttrb wzr, [x2, #-1])
5: mov x0, #0
ret
SYM_FUNC_END(__arch_clear_user)
@@ -45,6 +50,8 @@ EXPORT_SYMBOL(__arch_clear_user)
.section .fixup,"ax"
.align 2
-9: mov x0, x2 // return the original size
+7: sub x0, x2, #5 // Adjust for faulting on the final byte...
+8: add x0, x0, #4 // ...or the second word of the 4-7 byte case
+9: sub x0, x2, x0
ret
.previous
diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c
new file mode 100644
index 000000000000..b506a4b1e38c
--- /dev/null
+++ b/arch/arm64/lib/insn.c
@@ -0,0 +1,1458 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2013 Huawei Ltd.
+ * Author: Jiang Liu <liuj97@gmail.com>
+ *
+ * Copyright (C) 2014-2016 Zi Shen Lim <zlim.lnx@gmail.com>
+ */
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/printk.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+
+#include <asm/debug-monitors.h>
+#include <asm/errno.h>
+#include <asm/insn.h>
+#include <asm/kprobes.h>
+
+#define AARCH64_INSN_SF_BIT BIT(31)
+#define AARCH64_INSN_N_BIT BIT(22)
+#define AARCH64_INSN_LSL_12 BIT(22)
+
+static const int aarch64_insn_encoding_class[] = {
+ AARCH64_INSN_CLS_UNKNOWN,
+ AARCH64_INSN_CLS_UNKNOWN,
+ AARCH64_INSN_CLS_SVE,
+ AARCH64_INSN_CLS_UNKNOWN,
+ AARCH64_INSN_CLS_LDST,
+ AARCH64_INSN_CLS_DP_REG,
+ AARCH64_INSN_CLS_LDST,
+ AARCH64_INSN_CLS_DP_FPSIMD,
+ AARCH64_INSN_CLS_DP_IMM,
+ AARCH64_INSN_CLS_DP_IMM,
+ AARCH64_INSN_CLS_BR_SYS,
+ AARCH64_INSN_CLS_BR_SYS,
+ AARCH64_INSN_CLS_LDST,
+ AARCH64_INSN_CLS_DP_REG,
+ AARCH64_INSN_CLS_LDST,
+ AARCH64_INSN_CLS_DP_FPSIMD,
+};
+
+enum aarch64_insn_encoding_class __kprobes aarch64_get_insn_class(u32 insn)
+{
+ return aarch64_insn_encoding_class[(insn >> 25) & 0xf];
+}
+
+bool __kprobes aarch64_insn_is_steppable_hint(u32 insn)
+{
+ if (!aarch64_insn_is_hint(insn))
+ return false;
+
+ switch (insn & 0xFE0) {
+ case AARCH64_INSN_HINT_XPACLRI:
+ case AARCH64_INSN_HINT_PACIA_1716:
+ case AARCH64_INSN_HINT_PACIB_1716:
+ case AARCH64_INSN_HINT_PACIAZ:
+ case AARCH64_INSN_HINT_PACIASP:
+ case AARCH64_INSN_HINT_PACIBZ:
+ case AARCH64_INSN_HINT_PACIBSP:
+ case AARCH64_INSN_HINT_BTI:
+ case AARCH64_INSN_HINT_BTIC:
+ case AARCH64_INSN_HINT_BTIJ:
+ case AARCH64_INSN_HINT_BTIJC:
+ case AARCH64_INSN_HINT_NOP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool aarch64_insn_is_branch_imm(u32 insn)
+{
+ return (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn) ||
+ aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn) ||
+ aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) ||
+ aarch64_insn_is_bcond(insn));
+}
+
+bool __kprobes aarch64_insn_uses_literal(u32 insn)
+{
+ /* ldr/ldrsw (literal), prfm */
+
+ return aarch64_insn_is_ldr_lit(insn) ||
+ aarch64_insn_is_ldrsw_lit(insn) ||
+ aarch64_insn_is_adr_adrp(insn) ||
+ aarch64_insn_is_prfm_lit(insn);
+}
+
+bool __kprobes aarch64_insn_is_branch(u32 insn)
+{
+ /* b, bl, cb*, tb*, ret*, b.cond, br*, blr* */
+
+ return aarch64_insn_is_b(insn) ||
+ aarch64_insn_is_bl(insn) ||
+ aarch64_insn_is_cbz(insn) ||
+ aarch64_insn_is_cbnz(insn) ||
+ aarch64_insn_is_tbz(insn) ||
+ aarch64_insn_is_tbnz(insn) ||
+ aarch64_insn_is_ret(insn) ||
+ aarch64_insn_is_ret_auth(insn) ||
+ aarch64_insn_is_br(insn) ||
+ aarch64_insn_is_br_auth(insn) ||
+ aarch64_insn_is_blr(insn) ||
+ aarch64_insn_is_blr_auth(insn) ||
+ aarch64_insn_is_bcond(insn);
+}
+
+static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type,
+ u32 *maskp, int *shiftp)
+{
+ u32 mask;
+ int shift;
+
+ switch (type) {
+ case AARCH64_INSN_IMM_26:
+ mask = BIT(26) - 1;
+ shift = 0;
+ break;
+ case AARCH64_INSN_IMM_19:
+ mask = BIT(19) - 1;
+ shift = 5;
+ break;
+ case AARCH64_INSN_IMM_16:
+ mask = BIT(16) - 1;
+ shift = 5;
+ break;
+ case AARCH64_INSN_IMM_14:
+ mask = BIT(14) - 1;
+ shift = 5;
+ break;
+ case AARCH64_INSN_IMM_12:
+ mask = BIT(12) - 1;
+ shift = 10;
+ break;
+ case AARCH64_INSN_IMM_9:
+ mask = BIT(9) - 1;
+ shift = 12;
+ break;
+ case AARCH64_INSN_IMM_7:
+ mask = BIT(7) - 1;
+ shift = 15;
+ break;
+ case AARCH64_INSN_IMM_6:
+ case AARCH64_INSN_IMM_S:
+ mask = BIT(6) - 1;
+ shift = 10;
+ break;
+ case AARCH64_INSN_IMM_R:
+ mask = BIT(6) - 1;
+ shift = 16;
+ break;
+ case AARCH64_INSN_IMM_N:
+ mask = 1;
+ shift = 22;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ *maskp = mask;
+ *shiftp = shift;
+
+ return 0;
+}
+
+#define ADR_IMM_HILOSPLIT 2
+#define ADR_IMM_SIZE SZ_2M
+#define ADR_IMM_LOMASK ((1 << ADR_IMM_HILOSPLIT) - 1)
+#define ADR_IMM_HIMASK ((ADR_IMM_SIZE >> ADR_IMM_HILOSPLIT) - 1)
+#define ADR_IMM_LOSHIFT 29
+#define ADR_IMM_HISHIFT 5
+
+u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn)
+{
+ u32 immlo, immhi, mask;
+ int shift;
+
+ switch (type) {
+ case AARCH64_INSN_IMM_ADR:
+ shift = 0;
+ immlo = (insn >> ADR_IMM_LOSHIFT) & ADR_IMM_LOMASK;
+ immhi = (insn >> ADR_IMM_HISHIFT) & ADR_IMM_HIMASK;
+ insn = (immhi << ADR_IMM_HILOSPLIT) | immlo;
+ mask = ADR_IMM_SIZE - 1;
+ break;
+ default:
+ if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) {
+ pr_err("aarch64_insn_decode_immediate: unknown immediate encoding %d\n",
+ type);
+ return 0;
+ }
+ }
+
+ return (insn >> shift) & mask;
+}
+
+u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type,
+ u32 insn, u64 imm)
+{
+ u32 immlo, immhi, mask;
+ int shift;
+
+ if (insn == AARCH64_BREAK_FAULT)
+ return AARCH64_BREAK_FAULT;
+
+ switch (type) {
+ case AARCH64_INSN_IMM_ADR:
+ shift = 0;
+ immlo = (imm & ADR_IMM_LOMASK) << ADR_IMM_LOSHIFT;
+ imm >>= ADR_IMM_HILOSPLIT;
+ immhi = (imm & ADR_IMM_HIMASK) << ADR_IMM_HISHIFT;
+ imm = immlo | immhi;
+ mask = ((ADR_IMM_LOMASK << ADR_IMM_LOSHIFT) |
+ (ADR_IMM_HIMASK << ADR_IMM_HISHIFT));
+ break;
+ default:
+ if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) {
+ pr_err("aarch64_insn_encode_immediate: unknown immediate encoding %d\n",
+ type);
+ return AARCH64_BREAK_FAULT;
+ }
+ }
+
+ /* Update the immediate field. */
+ insn &= ~(mask << shift);
+ insn |= (imm & mask) << shift;
+
+ return insn;
+}
+
+u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type,
+ u32 insn)
+{
+ int shift;
+
+ switch (type) {
+ case AARCH64_INSN_REGTYPE_RT:
+ case AARCH64_INSN_REGTYPE_RD:
+ shift = 0;
+ break;
+ case AARCH64_INSN_REGTYPE_RN:
+ shift = 5;
+ break;
+ case AARCH64_INSN_REGTYPE_RT2:
+ case AARCH64_INSN_REGTYPE_RA:
+ shift = 10;
+ break;
+ case AARCH64_INSN_REGTYPE_RM:
+ shift = 16;
+ break;
+ default:
+ pr_err("%s: unknown register type encoding %d\n", __func__,
+ type);
+ return 0;
+ }
+
+ return (insn >> shift) & GENMASK(4, 0);
+}
+
+static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type,
+ u32 insn,
+ enum aarch64_insn_register reg)
+{
+ int shift;
+
+ if (insn == AARCH64_BREAK_FAULT)
+ return AARCH64_BREAK_FAULT;
+
+ if (reg < AARCH64_INSN_REG_0 || reg > AARCH64_INSN_REG_SP) {
+ pr_err("%s: unknown register encoding %d\n", __func__, reg);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (type) {
+ case AARCH64_INSN_REGTYPE_RT:
+ case AARCH64_INSN_REGTYPE_RD:
+ shift = 0;
+ break;
+ case AARCH64_INSN_REGTYPE_RN:
+ shift = 5;
+ break;
+ case AARCH64_INSN_REGTYPE_RT2:
+ case AARCH64_INSN_REGTYPE_RA:
+ shift = 10;
+ break;
+ case AARCH64_INSN_REGTYPE_RM:
+ case AARCH64_INSN_REGTYPE_RS:
+ shift = 16;
+ break;
+ default:
+ pr_err("%s: unknown register type encoding %d\n", __func__,
+ type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn &= ~(GENMASK(4, 0) << shift);
+ insn |= reg << shift;
+
+ return insn;
+}
+
+static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type,
+ u32 insn)
+{
+ u32 size;
+
+ switch (type) {
+ case AARCH64_INSN_SIZE_8:
+ size = 0;
+ break;
+ case AARCH64_INSN_SIZE_16:
+ size = 1;
+ break;
+ case AARCH64_INSN_SIZE_32:
+ size = 2;
+ break;
+ case AARCH64_INSN_SIZE_64:
+ size = 3;
+ break;
+ default:
+ pr_err("%s: unknown size encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn &= ~GENMASK(31, 30);
+ insn |= size << 30;
+
+ return insn;
+}
+
+static inline long branch_imm_common(unsigned long pc, unsigned long addr,
+ long range)
+{
+ long offset;
+
+ if ((pc & 0x3) || (addr & 0x3)) {
+ pr_err("%s: A64 instructions must be word aligned\n", __func__);
+ return range;
+ }
+
+ offset = ((long)addr - (long)pc);
+
+ if (offset < -range || offset >= range) {
+ pr_err("%s: offset out of range\n", __func__);
+ return range;
+ }
+
+ return offset;
+}
+
+u32 __kprobes aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr,
+ enum aarch64_insn_branch_type type)
+{
+ u32 insn;
+ long offset;
+
+ /*
+ * B/BL support [-128M, 128M) offset
+ * ARM64 virtual address arrangement guarantees all kernel and module
+ * texts are within +/-128M.
+ */
+ offset = branch_imm_common(pc, addr, SZ_128M);
+ if (offset >= SZ_128M)
+ return AARCH64_BREAK_FAULT;
+
+ switch (type) {
+ case AARCH64_INSN_BRANCH_LINK:
+ insn = aarch64_insn_get_bl_value();
+ break;
+ case AARCH64_INSN_BRANCH_NOLINK:
+ insn = aarch64_insn_get_b_value();
+ break;
+ default:
+ pr_err("%s: unknown branch encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn,
+ offset >> 2);
+}
+
+u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr,
+ enum aarch64_insn_register reg,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_branch_type type)
+{
+ u32 insn;
+ long offset;
+
+ offset = branch_imm_common(pc, addr, SZ_1M);
+ if (offset >= SZ_1M)
+ return AARCH64_BREAK_FAULT;
+
+ switch (type) {
+ case AARCH64_INSN_BRANCH_COMP_ZERO:
+ insn = aarch64_insn_get_cbz_value();
+ break;
+ case AARCH64_INSN_BRANCH_COMP_NONZERO:
+ insn = aarch64_insn_get_cbnz_value();
+ break;
+ default:
+ pr_err("%s: unknown branch encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn,
+ offset >> 2);
+}
+
+u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr,
+ enum aarch64_insn_condition cond)
+{
+ u32 insn;
+ long offset;
+
+ offset = branch_imm_common(pc, addr, SZ_1M);
+
+ insn = aarch64_insn_get_bcond_value();
+
+ if (cond < AARCH64_INSN_COND_EQ || cond > AARCH64_INSN_COND_AL) {
+ pr_err("%s: unknown condition encoding %d\n", __func__, cond);
+ return AARCH64_BREAK_FAULT;
+ }
+ insn |= cond;
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn,
+ offset >> 2);
+}
+
+u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op)
+{
+ return aarch64_insn_get_hint_value() | op;
+}
+
+u32 __kprobes aarch64_insn_gen_nop(void)
+{
+ return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP);
+}
+
+u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg,
+ enum aarch64_insn_branch_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_BRANCH_NOLINK:
+ insn = aarch64_insn_get_br_value();
+ break;
+ case AARCH64_INSN_BRANCH_LINK:
+ insn = aarch64_insn_get_blr_value();
+ break;
+ case AARCH64_INSN_BRANCH_RETURN:
+ insn = aarch64_insn_get_ret_value();
+ break;
+ default:
+ pr_err("%s: unknown branch encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, reg);
+}
+
+u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg,
+ enum aarch64_insn_register base,
+ enum aarch64_insn_register offset,
+ enum aarch64_insn_size_type size,
+ enum aarch64_insn_ldst_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_LDST_LOAD_REG_OFFSET:
+ insn = aarch64_insn_get_ldr_reg_value();
+ break;
+ case AARCH64_INSN_LDST_STORE_REG_OFFSET:
+ insn = aarch64_insn_get_str_reg_value();
+ break;
+ default:
+ pr_err("%s: unknown load/store encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_ldst_size(size, insn);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ base);
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn,
+ offset);
+}
+
+u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1,
+ enum aarch64_insn_register reg2,
+ enum aarch64_insn_register base,
+ int offset,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_ldst_type type)
+{
+ u32 insn;
+ int shift;
+
+ switch (type) {
+ case AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX:
+ insn = aarch64_insn_get_ldp_pre_value();
+ break;
+ case AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX:
+ insn = aarch64_insn_get_stp_pre_value();
+ break;
+ case AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX:
+ insn = aarch64_insn_get_ldp_post_value();
+ break;
+ case AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX:
+ insn = aarch64_insn_get_stp_post_value();
+ break;
+ default:
+ pr_err("%s: unknown load/store encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ if ((offset & 0x3) || (offset < -256) || (offset > 252)) {
+ pr_err("%s: offset must be multiples of 4 in the range of [-256, 252] %d\n",
+ __func__, offset);
+ return AARCH64_BREAK_FAULT;
+ }
+ shift = 2;
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ if ((offset & 0x7) || (offset < -512) || (offset > 504)) {
+ pr_err("%s: offset must be multiples of 8 in the range of [-512, 504] %d\n",
+ __func__, offset);
+ return AARCH64_BREAK_FAULT;
+ }
+ shift = 3;
+ insn |= AARCH64_INSN_SF_BIT;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn,
+ reg1);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn,
+ reg2);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ base);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_7, insn,
+ offset >> shift);
+}
+
+u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg,
+ enum aarch64_insn_register base,
+ enum aarch64_insn_register state,
+ enum aarch64_insn_size_type size,
+ enum aarch64_insn_ldst_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_LDST_LOAD_EX:
+ insn = aarch64_insn_get_load_ex_value();
+ break;
+ case AARCH64_INSN_LDST_STORE_EX:
+ insn = aarch64_insn_get_store_ex_value();
+ break;
+ default:
+ pr_err("%s: unknown load/store exclusive encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_ldst_size(size, insn);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn,
+ reg);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ base);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn,
+ AARCH64_INSN_REG_ZR);
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn,
+ state);
+}
+
+u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result,
+ enum aarch64_insn_register address,
+ enum aarch64_insn_register value,
+ enum aarch64_insn_size_type size)
+{
+ u32 insn = aarch64_insn_get_ldadd_value();
+
+ switch (size) {
+ case AARCH64_INSN_SIZE_32:
+ case AARCH64_INSN_SIZE_64:
+ break;
+ default:
+ pr_err("%s: unimplemented size encoding %d\n", __func__, size);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_ldst_size(size, insn);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn,
+ result);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ address);
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn,
+ value);
+}
+
+u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address,
+ enum aarch64_insn_register value,
+ enum aarch64_insn_size_type size)
+{
+ /*
+ * STADD is simply encoded as an alias for LDADD with XZR as
+ * the destination register.
+ */
+ return aarch64_insn_gen_ldadd(AARCH64_INSN_REG_ZR, address,
+ value, size);
+}
+
+static u32 aarch64_insn_encode_prfm_imm(enum aarch64_insn_prfm_type type,
+ enum aarch64_insn_prfm_target target,
+ enum aarch64_insn_prfm_policy policy,
+ u32 insn)
+{
+ u32 imm_type = 0, imm_target = 0, imm_policy = 0;
+
+ switch (type) {
+ case AARCH64_INSN_PRFM_TYPE_PLD:
+ break;
+ case AARCH64_INSN_PRFM_TYPE_PLI:
+ imm_type = BIT(0);
+ break;
+ case AARCH64_INSN_PRFM_TYPE_PST:
+ imm_type = BIT(1);
+ break;
+ default:
+ pr_err("%s: unknown prfm type encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (target) {
+ case AARCH64_INSN_PRFM_TARGET_L1:
+ break;
+ case AARCH64_INSN_PRFM_TARGET_L2:
+ imm_target = BIT(0);
+ break;
+ case AARCH64_INSN_PRFM_TARGET_L3:
+ imm_target = BIT(1);
+ break;
+ default:
+ pr_err("%s: unknown prfm target encoding %d\n", __func__, target);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (policy) {
+ case AARCH64_INSN_PRFM_POLICY_KEEP:
+ break;
+ case AARCH64_INSN_PRFM_POLICY_STRM:
+ imm_policy = BIT(0);
+ break;
+ default:
+ pr_err("%s: unknown prfm policy encoding %d\n", __func__, policy);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ /* In this case, imm5 is encoded into Rt field. */
+ insn &= ~GENMASK(4, 0);
+ insn |= imm_policy | (imm_target << 1) | (imm_type << 3);
+
+ return insn;
+}
+
+u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base,
+ enum aarch64_insn_prfm_type type,
+ enum aarch64_insn_prfm_target target,
+ enum aarch64_insn_prfm_policy policy)
+{
+ u32 insn = aarch64_insn_get_prfm_value();
+
+ insn = aarch64_insn_encode_ldst_size(AARCH64_INSN_SIZE_64, insn);
+
+ insn = aarch64_insn_encode_prfm_imm(type, target, policy, insn);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ base);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, 0);
+}
+
+u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ int imm, enum aarch64_insn_variant variant,
+ enum aarch64_insn_adsb_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_ADSB_ADD:
+ insn = aarch64_insn_get_add_imm_value();
+ break;
+ case AARCH64_INSN_ADSB_SUB:
+ insn = aarch64_insn_get_sub_imm_value();
+ break;
+ case AARCH64_INSN_ADSB_ADD_SETFLAGS:
+ insn = aarch64_insn_get_adds_imm_value();
+ break;
+ case AARCH64_INSN_ADSB_SUB_SETFLAGS:
+ insn = aarch64_insn_get_subs_imm_value();
+ break;
+ default:
+ pr_err("%s: unknown add/sub encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ /* We can't encode more than a 24bit value (12bit + 12bit shift) */
+ if (imm & ~(BIT(24) - 1))
+ goto out;
+
+ /* If we have something in the top 12 bits... */
+ if (imm & ~(SZ_4K - 1)) {
+ /* ... and in the low 12 bits -> error */
+ if (imm & (SZ_4K - 1))
+ goto out;
+
+ imm >>= 12;
+ insn |= AARCH64_INSN_LSL_12;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm);
+
+out:
+ pr_err("%s: invalid immediate encoding %d\n", __func__, imm);
+ return AARCH64_BREAK_FAULT;
+}
+
+u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ int immr, int imms,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_bitfield_type type)
+{
+ u32 insn;
+ u32 mask;
+
+ switch (type) {
+ case AARCH64_INSN_BITFIELD_MOVE:
+ insn = aarch64_insn_get_bfm_value();
+ break;
+ case AARCH64_INSN_BITFIELD_MOVE_UNSIGNED:
+ insn = aarch64_insn_get_ubfm_value();
+ break;
+ case AARCH64_INSN_BITFIELD_MOVE_SIGNED:
+ insn = aarch64_insn_get_sbfm_value();
+ break;
+ default:
+ pr_err("%s: unknown bitfield encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ mask = GENMASK(4, 0);
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT | AARCH64_INSN_N_BIT;
+ mask = GENMASK(5, 0);
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ if (immr & ~mask) {
+ pr_err("%s: invalid immr encoding %d\n", __func__, immr);
+ return AARCH64_BREAK_FAULT;
+ }
+ if (imms & ~mask) {
+ pr_err("%s: invalid imms encoding %d\n", __func__, imms);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
+
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms);
+}
+
+u32 aarch64_insn_gen_movewide(enum aarch64_insn_register dst,
+ int imm, int shift,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_movewide_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_MOVEWIDE_ZERO:
+ insn = aarch64_insn_get_movz_value();
+ break;
+ case AARCH64_INSN_MOVEWIDE_KEEP:
+ insn = aarch64_insn_get_movk_value();
+ break;
+ case AARCH64_INSN_MOVEWIDE_INVERSE:
+ insn = aarch64_insn_get_movn_value();
+ break;
+ default:
+ pr_err("%s: unknown movewide encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ if (imm & ~(SZ_64K - 1)) {
+ pr_err("%s: invalid immediate encoding %d\n", __func__, imm);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ if (shift != 0 && shift != 16) {
+ pr_err("%s: invalid shift encoding %d\n", __func__,
+ shift);
+ return AARCH64_BREAK_FAULT;
+ }
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ if (shift != 0 && shift != 16 && shift != 32 && shift != 48) {
+ pr_err("%s: invalid shift encoding %d\n", __func__,
+ shift);
+ return AARCH64_BREAK_FAULT;
+ }
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn |= (shift >> 4) << 21;
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm);
+}
+
+u32 aarch64_insn_gen_add_sub_shifted_reg(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ enum aarch64_insn_register reg,
+ int shift,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_adsb_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_ADSB_ADD:
+ insn = aarch64_insn_get_add_value();
+ break;
+ case AARCH64_INSN_ADSB_SUB:
+ insn = aarch64_insn_get_sub_value();
+ break;
+ case AARCH64_INSN_ADSB_ADD_SETFLAGS:
+ insn = aarch64_insn_get_adds_value();
+ break;
+ case AARCH64_INSN_ADSB_SUB_SETFLAGS:
+ insn = aarch64_insn_get_subs_value();
+ break;
+ default:
+ pr_err("%s: unknown add/sub encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ if (shift & ~(SZ_32 - 1)) {
+ pr_err("%s: invalid shift encoding %d\n", __func__,
+ shift);
+ return AARCH64_BREAK_FAULT;
+ }
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ if (shift & ~(SZ_64 - 1)) {
+ pr_err("%s: invalid shift encoding %d\n", __func__,
+ shift);
+ return AARCH64_BREAK_FAULT;
+ }
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift);
+}
+
+u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_data1_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_DATA1_REVERSE_16:
+ insn = aarch64_insn_get_rev16_value();
+ break;
+ case AARCH64_INSN_DATA1_REVERSE_32:
+ insn = aarch64_insn_get_rev32_value();
+ break;
+ case AARCH64_INSN_DATA1_REVERSE_64:
+ if (variant != AARCH64_INSN_VARIANT_64BIT) {
+ pr_err("%s: invalid variant for reverse64 %d\n",
+ __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+ insn = aarch64_insn_get_rev64_value();
+ break;
+ default:
+ pr_err("%s: unknown data1 encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
+}
+
+u32 aarch64_insn_gen_data2(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ enum aarch64_insn_register reg,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_data2_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_DATA2_UDIV:
+ insn = aarch64_insn_get_udiv_value();
+ break;
+ case AARCH64_INSN_DATA2_SDIV:
+ insn = aarch64_insn_get_sdiv_value();
+ break;
+ case AARCH64_INSN_DATA2_LSLV:
+ insn = aarch64_insn_get_lslv_value();
+ break;
+ case AARCH64_INSN_DATA2_LSRV:
+ insn = aarch64_insn_get_lsrv_value();
+ break;
+ case AARCH64_INSN_DATA2_ASRV:
+ insn = aarch64_insn_get_asrv_value();
+ break;
+ case AARCH64_INSN_DATA2_RORV:
+ insn = aarch64_insn_get_rorv_value();
+ break;
+ default:
+ pr_err("%s: unknown data2 encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg);
+}
+
+u32 aarch64_insn_gen_data3(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ enum aarch64_insn_register reg1,
+ enum aarch64_insn_register reg2,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_data3_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_DATA3_MADD:
+ insn = aarch64_insn_get_madd_value();
+ break;
+ case AARCH64_INSN_DATA3_MSUB:
+ insn = aarch64_insn_get_msub_value();
+ break;
+ default:
+ pr_err("%s: unknown data3 encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RA, insn, src);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ reg1);
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn,
+ reg2);
+}
+
+u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ enum aarch64_insn_register reg,
+ int shift,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_logic_type type)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_LOGIC_AND:
+ insn = aarch64_insn_get_and_value();
+ break;
+ case AARCH64_INSN_LOGIC_BIC:
+ insn = aarch64_insn_get_bic_value();
+ break;
+ case AARCH64_INSN_LOGIC_ORR:
+ insn = aarch64_insn_get_orr_value();
+ break;
+ case AARCH64_INSN_LOGIC_ORN:
+ insn = aarch64_insn_get_orn_value();
+ break;
+ case AARCH64_INSN_LOGIC_EOR:
+ insn = aarch64_insn_get_eor_value();
+ break;
+ case AARCH64_INSN_LOGIC_EON:
+ insn = aarch64_insn_get_eon_value();
+ break;
+ case AARCH64_INSN_LOGIC_AND_SETFLAGS:
+ insn = aarch64_insn_get_ands_value();
+ break;
+ case AARCH64_INSN_LOGIC_BIC_SETFLAGS:
+ insn = aarch64_insn_get_bics_value();
+ break;
+ default:
+ pr_err("%s: unknown logical encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ if (shift & ~(SZ_32 - 1)) {
+ pr_err("%s: invalid shift encoding %d\n", __func__,
+ shift);
+ return AARCH64_BREAK_FAULT;
+ }
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ if (shift & ~(SZ_64 - 1)) {
+ pr_err("%s: invalid shift encoding %d\n", __func__,
+ shift);
+ return AARCH64_BREAK_FAULT;
+ }
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift);
+}
+
+/*
+ * MOV (register) is architecturally an alias of ORR (shifted register) where
+ * MOV <*d>, <*m> is equivalent to ORR <*d>, <*ZR>, <*m>
+ */
+u32 aarch64_insn_gen_move_reg(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ enum aarch64_insn_variant variant)
+{
+ return aarch64_insn_gen_logical_shifted_reg(dst, AARCH64_INSN_REG_ZR,
+ src, 0, variant,
+ AARCH64_INSN_LOGIC_ORR);
+}
+
+u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr,
+ enum aarch64_insn_register reg,
+ enum aarch64_insn_adr_type type)
+{
+ u32 insn;
+ s32 offset;
+
+ switch (type) {
+ case AARCH64_INSN_ADR_TYPE_ADR:
+ insn = aarch64_insn_get_adr_value();
+ offset = addr - pc;
+ break;
+ case AARCH64_INSN_ADR_TYPE_ADRP:
+ insn = aarch64_insn_get_adrp_value();
+ offset = (addr - ALIGN_DOWN(pc, SZ_4K)) >> 12;
+ break;
+ default:
+ pr_err("%s: unknown adr encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ if (offset < -SZ_1M || offset >= SZ_1M)
+ return AARCH64_BREAK_FAULT;
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, reg);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, offset);
+}
+
+/*
+ * Decode the imm field of a branch, and return the byte offset as a
+ * signed value (so it can be used when computing a new branch
+ * target).
+ */
+s32 aarch64_get_branch_offset(u32 insn)
+{
+ s32 imm;
+
+ if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) {
+ imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn);
+ return (imm << 6) >> 4;
+ }
+
+ if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) ||
+ aarch64_insn_is_bcond(insn)) {
+ imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn);
+ return (imm << 13) >> 11;
+ }
+
+ if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) {
+ imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn);
+ return (imm << 18) >> 16;
+ }
+
+ /* Unhandled instruction */
+ BUG();
+}
+
+/*
+ * Encode the displacement of a branch in the imm field and return the
+ * updated instruction.
+ */
+u32 aarch64_set_branch_offset(u32 insn, s32 offset)
+{
+ if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn))
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn,
+ offset >> 2);
+
+ if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) ||
+ aarch64_insn_is_bcond(insn))
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn,
+ offset >> 2);
+
+ if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn))
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_14, insn,
+ offset >> 2);
+
+ /* Unhandled instruction */
+ BUG();
+}
+
+s32 aarch64_insn_adrp_get_offset(u32 insn)
+{
+ BUG_ON(!aarch64_insn_is_adrp(insn));
+ return aarch64_insn_decode_immediate(AARCH64_INSN_IMM_ADR, insn) << 12;
+}
+
+u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset)
+{
+ BUG_ON(!aarch64_insn_is_adrp(insn));
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn,
+ offset >> 12);
+}
+
+/*
+ * Extract the Op/CR data from a msr/mrs instruction.
+ */
+u32 aarch64_insn_extract_system_reg(u32 insn)
+{
+ return (insn & 0x1FFFE0) >> 5;
+}
+
+bool aarch32_insn_is_wide(u32 insn)
+{
+ return insn >= 0xe800;
+}
+
+/*
+ * Macros/defines for extracting register numbers from instruction.
+ */
+u32 aarch32_insn_extract_reg_num(u32 insn, int offset)
+{
+ return (insn & (0xf << offset)) >> offset;
+}
+
+#define OPC2_MASK 0x7
+#define OPC2_OFFSET 5
+u32 aarch32_insn_mcr_extract_opc2(u32 insn)
+{
+ return (insn & (OPC2_MASK << OPC2_OFFSET)) >> OPC2_OFFSET;
+}
+
+#define CRM_MASK 0xf
+u32 aarch32_insn_mcr_extract_crm(u32 insn)
+{
+ return insn & CRM_MASK;
+}
+
+static bool range_of_ones(u64 val)
+{
+ /* Doesn't handle full ones or full zeroes */
+ u64 sval = val >> __ffs64(val);
+
+ /* One of Sean Eron Anderson's bithack tricks */
+ return ((sval + 1) & (sval)) == 0;
+}
+
+static u32 aarch64_encode_immediate(u64 imm,
+ enum aarch64_insn_variant variant,
+ u32 insn)
+{
+ unsigned int immr, imms, n, ones, ror, esz, tmp;
+ u64 mask;
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ esz = 32;
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ esz = 64;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ mask = GENMASK(esz - 1, 0);
+
+ /* Can't encode full zeroes, full ones, or value wider than the mask */
+ if (!imm || imm == mask || imm & ~mask)
+ return AARCH64_BREAK_FAULT;
+
+ /*
+ * Inverse of Replicate(). Try to spot a repeating pattern
+ * with a pow2 stride.
+ */
+ for (tmp = esz / 2; tmp >= 2; tmp /= 2) {
+ u64 emask = BIT(tmp) - 1;
+
+ if ((imm & emask) != ((imm >> tmp) & emask))
+ break;
+
+ esz = tmp;
+ mask = emask;
+ }
+
+ /* N is only set if we're encoding a 64bit value */
+ n = esz == 64;
+
+ /* Trim imm to the element size */
+ imm &= mask;
+
+ /* That's how many ones we need to encode */
+ ones = hweight64(imm);
+
+ /*
+ * imms is set to (ones - 1), prefixed with a string of ones
+ * and a zero if they fit. Cap it to 6 bits.
+ */
+ imms = ones - 1;
+ imms |= 0xf << ffs(esz);
+ imms &= BIT(6) - 1;
+
+ /* Compute the rotation */
+ if (range_of_ones(imm)) {
+ /*
+ * Pattern: 0..01..10..0
+ *
+ * Compute how many rotate we need to align it right
+ */
+ ror = __ffs64(imm);
+ } else {
+ /*
+ * Pattern: 0..01..10..01..1
+ *
+ * Fill the unused top bits with ones, and check if
+ * the result is a valid immediate (all ones with a
+ * contiguous ranges of zeroes).
+ */
+ imm |= ~mask;
+ if (!range_of_ones(~imm))
+ return AARCH64_BREAK_FAULT;
+
+ /*
+ * Compute the rotation to get a continuous set of
+ * ones, with the first bit set at position 0
+ */
+ ror = fls(~imm);
+ }
+
+ /*
+ * immr is the number of bits we need to rotate back to the
+ * original set of ones. Note that this is relative to the
+ * element size...
+ */
+ immr = (esz - ror) % esz;
+
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, n);
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr);
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms);
+}
+
+u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_register Rn,
+ enum aarch64_insn_register Rd,
+ u64 imm)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_LOGIC_AND:
+ insn = aarch64_insn_get_and_imm_value();
+ break;
+ case AARCH64_INSN_LOGIC_ORR:
+ insn = aarch64_insn_get_orr_imm_value();
+ break;
+ case AARCH64_INSN_LOGIC_EOR:
+ insn = aarch64_insn_get_eor_imm_value();
+ break;
+ case AARCH64_INSN_LOGIC_AND_SETFLAGS:
+ insn = aarch64_insn_get_ands_imm_value();
+ break;
+ default:
+ pr_err("%s: unknown logical encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd);
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn);
+ return aarch64_encode_immediate(imm, variant, insn);
+}
+
+u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant,
+ enum aarch64_insn_register Rm,
+ enum aarch64_insn_register Rn,
+ enum aarch64_insn_register Rd,
+ u8 lsb)
+{
+ u32 insn;
+
+ insn = aarch64_insn_get_extr_value();
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ if (lsb > 31)
+ return AARCH64_BREAK_FAULT;
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ if (lsb > 63)
+ return AARCH64_BREAK_FAULT;
+ insn |= AARCH64_INSN_SF_BIT;
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, 1);
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, lsb);
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd);
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn);
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm);
+}
diff --git a/arch/arm64/lib/kasan_sw_tags.S b/arch/arm64/lib/kasan_sw_tags.S
new file mode 100644
index 000000000000..5b04464c045e
--- /dev/null
+++ b/arch/arm64/lib/kasan_sw_tags.S
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 Google LLC
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Report a tag mismatch detected by tag-based KASAN.
+ *
+ * A compiler-generated thunk calls this with a non-AAPCS calling
+ * convention. Upon entry to this function, registers are as follows:
+ *
+ * x0: fault address (see below for restore)
+ * x1: fault description (see below for restore)
+ * x2 to x15: callee-saved
+ * x16 to x17: safe to clobber
+ * x18 to x30: callee-saved
+ * sp: pre-decremented by 256 bytes (see below for restore)
+ *
+ * The caller has decremented the SP by 256 bytes, and created a
+ * structure on the stack as follows:
+ *
+ * sp + 0..15: x0 and x1 to be restored
+ * sp + 16..231: free for use
+ * sp + 232..247: x29 and x30 (same as in GPRs)
+ * sp + 248..255: free for use
+ *
+ * Note that this is not a struct pt_regs.
+ *
+ * To call a regular AAPCS function we must save x2 to x15 (which we can
+ * store in the gaps), and create a frame record (for which we can use
+ * x29 and x30 spilled by the caller as those match the GPRs).
+ *
+ * The caller expects x0 and x1 to be restored from the structure, and
+ * for the structure to be removed from the stack (i.e. the SP must be
+ * incremented by 256 prior to return).
+ */
+SYM_CODE_START(__hwasan_tag_mismatch)
+#ifdef BTI_C
+ BTI_C
+#endif
+ add x29, sp, #232
+ stp x2, x3, [sp, #8 * 2]
+ stp x4, x5, [sp, #8 * 4]
+ stp x6, x7, [sp, #8 * 6]
+ stp x8, x9, [sp, #8 * 8]
+ stp x10, x11, [sp, #8 * 10]
+ stp x12, x13, [sp, #8 * 12]
+ stp x14, x15, [sp, #8 * 14]
+#ifndef CONFIG_SHADOW_CALL_STACK
+ str x18, [sp, #8 * 18]
+#endif
+
+ mov x2, x30
+ bl kasan_tag_mismatch
+
+ ldp x0, x1, [sp]
+ ldp x2, x3, [sp, #8 * 2]
+ ldp x4, x5, [sp, #8 * 4]
+ ldp x6, x7, [sp, #8 * 6]
+ ldp x8, x9, [sp, #8 * 8]
+ ldp x10, x11, [sp, #8 * 10]
+ ldp x12, x13, [sp, #8 * 12]
+ ldp x14, x15, [sp, #8 * 14]
+#ifndef CONFIG_SHADOW_CALL_STACK
+ ldr x18, [sp, #8 * 18]
+#endif
+ ldp x29, x30, [sp, #8 * 29]
+
+ /* remove the structure from the stack */
+ add sp, sp, #256
+ ret
+SYM_CODE_END(__hwasan_tag_mismatch)
+EXPORT_SYMBOL(__hwasan_tag_mismatch)
diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S
index edf6b970a277..7c2276fdab54 100644
--- a/arch/arm64/lib/memchr.S
+++ b/arch/arm64/lib/memchr.S
@@ -1,9 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Based on arch/arm/lib/memchr.S
- *
- * Copyright (C) 1995-2000 Russell King
- * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2021 Arm Ltd.
*/
#include <linux/linkage.h>
@@ -19,16 +16,60 @@
* Returns:
* x0 - address of first occurrence of 'c' or 0
*/
+
+#define L(label) .L ## label
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+#define srcin x0
+#define chrin w1
+#define cntin x2
+
+#define result x0
+
+#define wordcnt x3
+#define rep01 x4
+#define repchr x5
+#define cur_word x6
+#define cur_byte w6
+#define tmp x7
+#define tmp2 x8
+
+ .p2align 4
+ nop
SYM_FUNC_START_WEAK_PI(memchr)
- and w1, w1, #0xff
-1: subs x2, x2, #1
- b.mi 2f
- ldrb w3, [x0], #1
- cmp w3, w1
- b.ne 1b
- sub x0, x0, #1
+ and chrin, chrin, #0xff
+ lsr wordcnt, cntin, #3
+ cbz wordcnt, L(byte_loop)
+ mov rep01, #REP8_01
+ mul repchr, x1, rep01
+ and cntin, cntin, #7
+L(word_loop):
+ ldr cur_word, [srcin], #8
+ sub wordcnt, wordcnt, #1
+ eor cur_word, cur_word, repchr
+ sub tmp, cur_word, rep01
+ orr tmp2, cur_word, #REP8_7f
+ bics tmp, tmp, tmp2
+ b.ne L(found_word)
+ cbnz wordcnt, L(word_loop)
+L(byte_loop):
+ cbz cntin, L(not_found)
+ ldrb cur_byte, [srcin], #1
+ sub cntin, cntin, #1
+ cmp cur_byte, chrin
+ b.ne L(byte_loop)
+ sub srcin, srcin, #1
+ ret
+L(found_word):
+CPU_LE( rev tmp, tmp)
+ clz tmp, tmp
+ sub tmp, tmp, #64
+ add result, srcin, tmp, asr #3
ret
-2: mov x0, #0
+L(not_found):
+ mov result, #0
ret
SYM_FUNC_END_PI(memchr)
EXPORT_SYMBOL_NOKASAN(memchr)
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
index c0671e793ea9..7d956384222f 100644
--- a/arch/arm64/lib/memcmp.S
+++ b/arch/arm64/lib/memcmp.S
@@ -1,247 +1,139 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2013-2021, Arm Limited.
*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/memcmp.S
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
-/*
-* compare memory areas(when two memory areas' offset are different,
-* alignment handled by the hardware)
-*
-* Parameters:
-* x0 - const memory area 1 pointer
-* x1 - const memory area 2 pointer
-* x2 - the maximal compare byte length
-* Returns:
-* x0 - a compare result, maybe less than, equal to, or greater than ZERO
-*/
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ */
+
+#define L(label) .L ## label
/* Parameters and result. */
-src1 .req x0
-src2 .req x1
-limit .req x2
-result .req x0
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result w0
/* Internal variables. */
-data1 .req x3
-data1w .req w3
-data2 .req x4
-data2w .req w4
-has_nul .req x5
-diff .req x6
-endloop .req x7
-tmp1 .req x8
-tmp2 .req x9
-tmp3 .req x10
-pos .req x11
-limit_wd .req x12
-mask .req x13
+#define data1 x3
+#define data1w w3
+#define data1h x4
+#define data2 x5
+#define data2w w5
+#define data2h x6
+#define tmp1 x7
+#define tmp2 x8
SYM_FUNC_START_WEAK_PI(memcmp)
- cbz limit, .Lret0
- eor tmp1, src1, src2
- tst tmp1, #7
- b.ne .Lmisaligned8
- ands tmp1, src1, #7
- b.ne .Lmutual_align
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
- lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
- /*
- * The input source addresses are at alignment boundary.
- * Directly compare eight bytes each time.
- */
-.Lloop_aligned:
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-.Lstart_realigned:
- subs limit_wd, limit_wd, #1
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, cs /* Last Dword or differences. */
- cbz endloop, .Lloop_aligned
-
- /* Not reached the limit, must have found a diff. */
- tbz limit_wd, #63, .Lnot_limit
-
- /* Limit % 8 == 0 => the diff is in the last 8 bytes. */
- ands limit, limit, #7
- b.eq .Lnot_limit
- /*
- * The remained bytes less than 8. It is needed to extract valid data
- * from last eight bytes of the intended memory range.
- */
- lsl limit, limit, #3 /* bytes-> bits. */
- mov mask, #~0
-CPU_BE( lsr mask, mask, limit )
-CPU_LE( lsl mask, mask, limit )
- bic data1, data1, mask
- bic data2, data2, mask
-
- orr diff, diff, mask
- b .Lnot_limit
-
-.Lmutual_align:
- /*
- * Sources are mutually aligned, but are not currently at an
- * alignment boundary. Round down the addresses and then mask off
- * the bytes that precede the start point.
- */
- bic src1, src1, #7
- bic src2, src2, #7
- ldr data1, [src1], #8
- ldr data2, [src2], #8
- /*
- * We can not add limit with alignment offset(tmp1) here. Since the
- * addition probably make the limit overflown.
- */
- sub limit_wd, limit, #1/*limit != 0, so no underflow.*/
- and tmp3, limit_wd, #7
- lsr limit_wd, limit_wd, #3
- add tmp3, tmp3, tmp1
- add limit_wd, limit_wd, tmp3, lsr #3
- add limit, limit, tmp1/* Adjust the limit for the extra. */
-
- lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/
- neg tmp1, tmp1/* Bits to alignment -64. */
- mov tmp2, #~0
- /*mask off the non-intended bytes before the start address.*/
-CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/
- /* Little-endian. Early bytes are at LSB. */
-CPU_LE( lsr tmp2, tmp2, tmp1 )
-
- orr data1, data1, tmp2
- orr data2, data2, tmp2
- b .Lstart_realigned
-
- /*src1 and src2 have different alignment offset.*/
-.Lmisaligned8:
- cmp limit, #8
- b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/
-
- and tmp1, src1, #7
- neg tmp1, tmp1
- add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/
- and tmp2, src2, #7
- neg tmp2, tmp2
- add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/
- subs tmp3, tmp1, tmp2
- csel pos, tmp1, tmp2, hi /*Choose the maximum.*/
-
- sub limit, limit, pos
- /*compare the proceeding bytes in the first 8 byte segment.*/
-.Ltinycmp:
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- subs pos, pos, #1
- ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */
- b.eq .Ltinycmp
- cbnz pos, 1f /*diff occurred before the last byte.*/
- cmp data1w, data2w
- b.eq .Lstart_align
-1:
- sub result, data1, data2
+ subs limit, limit, 8
+ b.lo L(less8)
+
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ b.ne L(return)
+
+ subs limit, limit, 8
+ b.gt L(more16)
+
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ b L(return)
+
+L(more16):
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ bne L(return)
+
+ /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+ strings. */
+ subs limit, limit, 16
+ b.ls L(last_bytes)
+
+ /* We overlap loads between 0-32 bytes at either side of SRC1 when we
+ try to align, so limit it only to strings larger than 128 bytes. */
+ cmp limit, 96
+ b.ls L(loop16)
+
+ /* Align src1 and adjust src2 with bytes not yet done. */
+ and tmp1, src1, 15
+ add limit, limit, tmp1
+ sub src1, src1, tmp1
+ sub src2, src2, tmp1
+
+ /* Loop performing 16 bytes per iteration using aligned src1.
+ Limit is pre-decremented by 16 and must be larger than zero.
+ Exit if <= 16 bytes left to do or if the data is not equal. */
+ .p2align 4
+L(loop16):
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ subs limit, limit, 16
+ ccmp data1, data2, 0, hi
+ ccmp data1h, data2h, 0, eq
+ b.eq L(loop16)
+
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+ bne L(return)
+
+ /* Compare last 1-16 bytes using unaligned access. */
+L(last_bytes):
+ add src1, src1, limit
+ add src2, src2, limit
+ ldp data1, data1h, [src1]
+ ldp data2, data2h, [src2]
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+
+ /* Compare data bytes and set return value to 0, -1 or 1. */
+L(return):
+#ifndef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ cmp data1, data2
+L(ret_eq):
+ cset result, ne
+ cneg result, result, lo
ret
-.Lstart_align:
- lsr limit_wd, limit, #3
- cbz limit_wd, .Lremain8
-
- ands xzr, src1, #7
- b.eq .Lrecal_offset
- /*process more leading bytes to make src1 aligned...*/
- add src1, src1, tmp3 /*backwards src1 to alignment boundary*/
- add src2, src2, tmp3
- sub limit, limit, tmp3
- lsr limit_wd, limit, #3
- cbz limit_wd, .Lremain8
- /*load 8 bytes from aligned SRC1..*/
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-
- subs limit_wd, limit_wd, #1
- eor diff, data1, data2 /*Non-zero if differences found.*/
- csinv endloop, diff, xzr, ne
- cbnz endloop, .Lunequal_proc
- /*How far is the current SRC2 from the alignment boundary...*/
- and tmp3, tmp3, #7
-
-.Lrecal_offset:/*src1 is aligned now..*/
- neg pos, tmp3
-.Lloopcmp_proc:
- /*
- * Divide the eight bytes into two parts. First,backwards the src2
- * to an alignment boundary,load eight bytes and compare from
- * the SRC2 alignment boundary. If all 8 bytes are equal,then start
- * the second part's comparison. Otherwise finish the comparison.
- * This special handle can garantee all the accesses are in the
- * thread/task space in avoid to overrange access.
- */
- ldr data1, [src1,pos]
- ldr data2, [src2,pos]
- eor diff, data1, data2 /* Non-zero if differences found. */
- cbnz diff, .Lnot_limit
-
- /*The second part process*/
- ldr data1, [src1], #8
- ldr data2, [src2], #8
- eor diff, data1, data2 /* Non-zero if differences found. */
- subs limit_wd, limit_wd, #1
- csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
- cbz endloop, .Lloopcmp_proc
-.Lunequal_proc:
- cbz diff, .Lremain8
-
-/* There is difference occurred in the latest comparison. */
-.Lnot_limit:
-/*
-* For little endian,reverse the low significant equal bits into MSB,then
-* following CLZ can find how many equal bits exist.
-*/
-CPU_LE( rev diff, diff )
-CPU_LE( rev data1, data1 )
-CPU_LE( rev data2, data2 )
-
- /*
- * The MS-non-zero bit of DIFF marks either the first bit
- * that is different, or the end of the significant data.
- * Shifting left now will bring the critical information into the
- * top bits.
- */
- clz pos, diff
- lsl data1, data1, pos
- lsl data2, data2, pos
- /*
- * We need to zero-extend (char is unsigned) the value and then
- * perform a signed subtraction.
- */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
+ .p2align 4
+ /* Compare up to 8 bytes. Limit is [-8..-1]. */
+L(less8):
+ adds limit, limit, 4
+ b.lo L(less4)
+ ldr data1w, [src1], 4
+ ldr data2w, [src2], 4
+ cmp data1w, data2w
+ b.ne L(return)
+ sub limit, limit, 4
+L(less4):
+ adds limit, limit, 4
+ beq L(ret_eq)
+L(byte_loop):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ subs limit, limit, 1
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+ sub result, data1w, data2w
ret
-.Lremain8:
- /* Limit % 8 == 0 =>. all data are equal.*/
- ands limit, limit, #7
- b.eq .Lret0
-
-.Ltiny8proc:
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- subs limit, limit, #1
-
- ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */
- b.eq .Ltiny8proc
- sub result, data1, data2
- ret
-.Lret0:
- mov result, #0
- ret
SYM_FUNC_END_PI(memcmp)
EXPORT_SYMBOL_NOKASAN(memcmp)
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index dc8d2a216a6e..b82fd64ee1e1 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -1,66 +1,252 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2012-2021, Arm Limited.
*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
-#include <asm/cache.h>
-/*
- * Copy a buffer from src to dest (alignment handled by the hardware)
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
*
- * Parameters:
- * x0 - dest
- * x1 - src
- * x2 - n
- * Returns:
- * x0 - dest
*/
- .macro ldrb1 reg, ptr, val
- ldrb \reg, [\ptr], \val
- .endm
-
- .macro strb1 reg, ptr, val
- strb \reg, [\ptr], \val
- .endm
- .macro ldrh1 reg, ptr, val
- ldrh \reg, [\ptr], \val
- .endm
+#define L(label) .L ## label
- .macro strh1 reg, ptr, val
- strh \reg, [\ptr], \val
- .endm
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_lw w10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define G_l count
+#define G_h dst
+#define H_l src
+#define H_h srcend
+#define tmp1 x14
- .macro ldr1 reg, ptr, val
- ldr \reg, [\ptr], \val
- .endm
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
- .macro str1 reg, ptr, val
- str \reg, [\ptr], \val
- .endm
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
- .macro ldp1 reg1, reg2, ptr, val
- ldp \reg1, \reg2, [\ptr], \val
- .endm
-
- .macro stp1 reg1, reg2, ptr, val
- stp \reg1, \reg2, [\ptr], \val
- .endm
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+SYM_FUNC_START_ALIAS(__memmove)
+SYM_FUNC_START_WEAK_ALIAS_PI(memmove)
SYM_FUNC_START_ALIAS(__memcpy)
SYM_FUNC_START_WEAK_PI(memcpy)
-#include "copy_template.S"
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr A_lw, [src]
+ ldr B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+ ldp D_l, D_h, [src]
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .p2align 4
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+ ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+ ret
+
SYM_FUNC_END_PI(memcpy)
EXPORT_SYMBOL(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
EXPORT_SYMBOL(__memcpy)
+SYM_FUNC_END_ALIAS_PI(memmove)
+EXPORT_SYMBOL(memmove)
+SYM_FUNC_END_ALIAS(__memmove)
+EXPORT_SYMBOL(__memmove)
diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S
deleted file mode 100644
index 1035dce4bdaf..000000000000
--- a/arch/arm64/lib/memmove.S
+++ /dev/null
@@ -1,189 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
- *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/cache.h>
-
-/*
- * Move a buffer from src to test (alignment handled by the hardware).
- * If dest <= src, call memcpy, otherwise copy in reverse order.
- *
- * Parameters:
- * x0 - dest
- * x1 - src
- * x2 - n
- * Returns:
- * x0 - dest
- */
-dstin .req x0
-src .req x1
-count .req x2
-tmp1 .req x3
-tmp1w .req w3
-tmp2 .req x4
-tmp2w .req w4
-tmp3 .req x5
-tmp3w .req w5
-dst .req x6
-
-A_l .req x7
-A_h .req x8
-B_l .req x9
-B_h .req x10
-C_l .req x11
-C_h .req x12
-D_l .req x13
-D_h .req x14
-
-SYM_FUNC_START_ALIAS(__memmove)
-SYM_FUNC_START_WEAK_PI(memmove)
- cmp dstin, src
- b.lo __memcpy
- add tmp1, src, count
- cmp dstin, tmp1
- b.hs __memcpy /* No overlap. */
-
- add dst, dstin, count
- add src, src, count
- cmp count, #16
- b.lo .Ltail15 /*probably non-alignment accesses.*/
-
- ands tmp2, src, #15 /* Bytes to reach alignment. */
- b.eq .LSrcAligned
- sub count, count, tmp2
- /*
- * process the aligned offset length to make the src aligned firstly.
- * those extra instructions' cost is acceptable. It also make the
- * coming accesses are based on aligned address.
- */
- tbz tmp2, #0, 1f
- ldrb tmp1w, [src, #-1]!
- strb tmp1w, [dst, #-1]!
-1:
- tbz tmp2, #1, 2f
- ldrh tmp1w, [src, #-2]!
- strh tmp1w, [dst, #-2]!
-2:
- tbz tmp2, #2, 3f
- ldr tmp1w, [src, #-4]!
- str tmp1w, [dst, #-4]!
-3:
- tbz tmp2, #3, .LSrcAligned
- ldr tmp1, [src, #-8]!
- str tmp1, [dst, #-8]!
-
-.LSrcAligned:
- cmp count, #64
- b.ge .Lcpy_over64
-
- /*
- * Deal with small copies quickly by dropping straight into the
- * exit block.
- */
-.Ltail63:
- /*
- * Copy up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate.
- */
- ands tmp1, count, #0x30
- b.eq .Ltail15
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src, #-16]!
- stp A_l, A_h, [dst, #-16]!
-1:
- ldp A_l, A_h, [src, #-16]!
- stp A_l, A_h, [dst, #-16]!
-2:
- ldp A_l, A_h, [src, #-16]!
- stp A_l, A_h, [dst, #-16]!
-
-.Ltail15:
- tbz count, #3, 1f
- ldr tmp1, [src, #-8]!
- str tmp1, [dst, #-8]!
-1:
- tbz count, #2, 2f
- ldr tmp1w, [src, #-4]!
- str tmp1w, [dst, #-4]!
-2:
- tbz count, #1, 3f
- ldrh tmp1w, [src, #-2]!
- strh tmp1w, [dst, #-2]!
-3:
- tbz count, #0, .Lexitfunc
- ldrb tmp1w, [src, #-1]
- strb tmp1w, [dst, #-1]
-
-.Lexitfunc:
- ret
-
-.Lcpy_over64:
- subs count, count, #128
- b.ge .Lcpy_body_large
- /*
- * Less than 128 bytes to copy, so handle 64 bytes here and then jump
- * to the tail.
- */
- ldp A_l, A_h, [src, #-16]
- stp A_l, A_h, [dst, #-16]
- ldp B_l, B_h, [src, #-32]
- ldp C_l, C_h, [src, #-48]
- stp B_l, B_h, [dst, #-32]
- stp C_l, C_h, [dst, #-48]
- ldp D_l, D_h, [src, #-64]!
- stp D_l, D_h, [dst, #-64]!
-
- tst count, #0x3f
- b.ne .Ltail63
- ret
-
- /*
- * Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line.
- */
- .p2align L1_CACHE_SHIFT
-.Lcpy_body_large:
- /* pre-load 64 bytes data. */
- ldp A_l, A_h, [src, #-16]
- ldp B_l, B_h, [src, #-32]
- ldp C_l, C_h, [src, #-48]
- ldp D_l, D_h, [src, #-64]!
-1:
- /*
- * interlace the load of next 64 bytes data block with store of the last
- * loaded 64 bytes data.
- */
- stp A_l, A_h, [dst, #-16]
- ldp A_l, A_h, [src, #-16]
- stp B_l, B_h, [dst, #-32]
- ldp B_l, B_h, [src, #-32]
- stp C_l, C_h, [dst, #-48]
- ldp C_l, C_h, [src, #-48]
- stp D_l, D_h, [dst, #-64]!
- ldp D_l, D_h, [src, #-64]!
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst, #-16]
- stp B_l, B_h, [dst, #-32]
- stp C_l, C_h, [dst, #-48]
- stp D_l, D_h, [dst, #-64]!
-
- tst count, #0x3f
- b.ne .Ltail63
- ret
-SYM_FUNC_END_PI(memmove)
-EXPORT_SYMBOL(memmove)
-SYM_FUNC_END_ALIAS(__memmove)
-EXPORT_SYMBOL(__memmove)
diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S
index 351537c12f36..e83643b3995f 100644
--- a/arch/arm64/lib/mte.S
+++ b/arch/arm64/lib/mte.S
@@ -37,6 +37,26 @@ SYM_FUNC_START(mte_clear_page_tags)
SYM_FUNC_END(mte_clear_page_tags)
/*
+ * Zero the page and tags at the same time
+ *
+ * Parameters:
+ * x0 - address to the beginning of the page
+ */
+SYM_FUNC_START(mte_zero_clear_page_tags)
+ mrs x1, dczid_el0
+ and w1, w1, #0xf
+ mov x2, #4
+ lsl x1, x2, x1
+ and x0, x0, #(1 << MTE_TAG_SHIFT) - 1 // clear the tag
+
+1: dc gzva, x0
+ add x0, x0, x1
+ tst x0, #(PAGE_SIZE - 1)
+ b.ne 1b
+ ret
+SYM_FUNC_END(mte_zero_clear_page_tags)
+
+/*
* Copy the tags from the source page to the destination one
* x0 - address of the destination page
* x1 - address of the source page
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 4e79566726c8..d7bee210a798 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -1,84 +1,123 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2012-2021, Arm Limited.
*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/strcmp.S
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
-/*
- * compare two strings
+/* Assumptions:
*
- * Parameters:
- * x0 - const string 1 pointer
- * x1 - const string 2 pointer
- * Returns:
- * x0 - an integer less than, equal to, or greater than zero
- * if s1 is found, respectively, to be less than, to match,
- * or be greater than s2.
+ * ARMv8-a, AArch64
*/
+#define L(label) .L ## label
+
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
/* Parameters and result. */
-src1 .req x0
-src2 .req x1
-result .req x0
+#define src1 x0
+#define src2 x1
+#define result x0
/* Internal variables. */
-data1 .req x2
-data1w .req w2
-data2 .req x3
-data2w .req w3
-has_nul .req x4
-diff .req x5
-syndrome .req x6
-tmp1 .req x7
-tmp2 .req x8
-tmp3 .req x9
-zeroones .req x10
-pos .req x11
-
+#define data1 x2
+#define data1w w2
+#define data2 x3
+#define data2w w3
+#define has_nul x4
+#define diff x5
+#define syndrome x6
+#define tmp1 x7
+#define tmp2 x8
+#define tmp3 x9
+#define zeroones x10
+#define pos x11
+
+ /* Start of performance-critical section -- one 64B cache line. */
+ .align 6
SYM_FUNC_START_WEAK_PI(strcmp)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
- b.ne .Lmisaligned8
+ b.ne L(misaligned8)
ands tmp1, src1, #7
- b.ne .Lmutual_align
-
- /*
- * NUL detection works on the principle that (X - 1) & (~X) & 0x80
- * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- * can be done in parallel across the entire word.
- */
-.Lloop_aligned:
+ b.ne L(mutual_align)
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
-.Lstart_realigned:
+L(start_realigned):
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
orr syndrome, diff, has_nul
- cbz syndrome, .Lloop_aligned
- b .Lcal_cmpresult
+ cbz syndrome, L(loop_aligned)
+ /* End of performance-critical section -- one 64B cache line. */
+
+L(end):
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#else
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
-.Lmutual_align:
- /*
- * Sources are mutually aligned, but are not currently at an
- * alignment boundary. Round down the addresses and then mask off
- * the bytes that preceed the start point.
- */
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that preceed the start point. */
bic src1, src1, #7
bic src2, src2, #7
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
@@ -86,138 +125,52 @@ SYM_FUNC_START_WEAK_PI(strcmp)
neg tmp1, tmp1 /* Bits to alignment -64. */
ldr data2, [src2], #8
mov tmp2, #~0
+#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
-CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
/* Little-endian. Early bytes are at LSB. */
-CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
-
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
orr data1, data1, tmp2
orr data2, data2, tmp2
- b .Lstart_realigned
-
-.Lmisaligned8:
- /*
- * Get the align offset length to compare per byte first.
- * After this process, one string's address will be aligned.
- */
- and tmp1, src1, #7
- neg tmp1, tmp1
- add tmp1, tmp1, #8
- and tmp2, src2, #7
- neg tmp2, tmp2
- add tmp2, tmp2, #8
- subs tmp3, tmp1, tmp2
- csel pos, tmp1, tmp2, hi /*Choose the maximum. */
-.Ltinycmp:
+ b L(start_realigned)
+
+L(misaligned8):
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+ checking to make sure that we don't access beyond page boundary in
+ SRC2. */
+ tst src1, #7
+ b.eq L(loop_misaligned)
+L(do_misaligned):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
- subs pos, pos, #1
- ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq .Ltinycmp
- cbnz pos, 1f /*find the null or unequal...*/
cmp data1w, #1
- ccmp data1w, data2w, #0, cs
- b.eq .Lstart_align /*the last bytes are equal....*/
-1:
- sub result, data1, data2
- ret
-
-.Lstart_align:
- ands xzr, src1, #7
- b.eq .Lrecal_offset
- /*process more leading bytes to make str1 aligned...*/
- add src1, src1, tmp3
- add src2, src2, tmp3
- /*load 8 bytes from aligned str1 and non-aligned str2..*/
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ tst src1, #7
+ b.ne L(do_misaligned)
+
+L(loop_misaligned):
+ /* Test if we are within the last dword of the end of a 4K page. If
+ yes then jump back to the misaligned loop to copy a byte at a time. */
+ and tmp1, src2, #0xff8
+ eor tmp1, tmp1, #0xff8
+ cbz tmp1, L(do_misaligned)
ldr data1, [src1], #8
ldr data2, [src2], #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
- bic has_nul, tmp1, tmp2
- eor diff, data1, data2 /* Non-zero if differences found. */
- orr syndrome, diff, has_nul
- cbnz syndrome, .Lcal_cmpresult
- /*How far is the current str2 from the alignment boundary...*/
- and tmp3, tmp3, #7
-.Lrecal_offset:
- neg pos, tmp3
-.Lloopcmp_proc:
- /*
- * Divide the eight bytes into two parts. First,backwards the src2
- * to an alignment boundary,load eight bytes from the SRC2 alignment
- * boundary,then compare with the relative bytes from SRC1.
- * If all 8 bytes are equal,then start the second part's comparison.
- * Otherwise finish the comparison.
- * This special handle can garantee all the accesses are in the
- * thread/task space in avoid to overrange access.
- */
- ldr data1, [src1,pos]
- ldr data2, [src2,pos]
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bic has_nul, tmp1, tmp2
- eor diff, data1, data2 /* Non-zero if differences found. */
- orr syndrome, diff, has_nul
- cbnz syndrome, .Lcal_cmpresult
-
- /*The second part process*/
- ldr data1, [src1], #8
- ldr data2, [src2], #8
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bic has_nul, tmp1, tmp2
- eor diff, data1, data2 /* Non-zero if differences found. */
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
orr syndrome, diff, has_nul
- cbz syndrome, .Lloopcmp_proc
+ cbz syndrome, L(loop_misaligned)
+ b L(end)
-.Lcal_cmpresult:
- /*
- * reversed the byte-order as big-endian,then CLZ can find the most
- * significant zero bits.
- */
-CPU_LE( rev syndrome, syndrome )
-CPU_LE( rev data1, data1 )
-CPU_LE( rev data2, data2 )
-
- /*
- * For big-endian we cannot use the trick with the syndrome value
- * as carry-propagation can corrupt the upper bits if the trailing
- * bytes in the string contain 0x01.
- * However, if there is no NUL byte in the dword, we can generate
- * the result directly. We cannot just subtract the bytes as the
- * MSB might be significant.
- */
-CPU_BE( cbnz has_nul, 1f )
-CPU_BE( cmp data1, data2 )
-CPU_BE( cset result, ne )
-CPU_BE( cneg result, result, lo )
-CPU_BE( ret )
-CPU_BE( 1: )
- /*Re-compute the NUL-byte detection, using a byte-reversed value. */
-CPU_BE( rev tmp3, data1 )
-CPU_BE( sub tmp1, tmp3, zeroones )
-CPU_BE( orr tmp2, tmp3, #REP8_7f )
-CPU_BE( bic has_nul, tmp1, tmp2 )
-CPU_BE( rev has_nul, has_nul )
-CPU_BE( orr syndrome, diff, has_nul )
-
- clz pos, syndrome
- /*
- * The MS-non-zero bit of the syndrome marks either the first bit
- * that is different, or the top bit of the first zero byte.
- * Shifting left now will bring the critical information into the
- * top bits.
- */
- lsl data1, data1, pos
- lsl data2, data2, pos
- /*
- * But we need to zero-extend (char is unsigned) the value and then
- * perform a signed 32-bit subtraction.
- */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
+L(done):
+ sub result, data1, data2
ret
+
SYM_FUNC_END_PI(strcmp)
EXPORT_SYMBOL_NOKASAN(strcmp)
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S
index ee3ed882dd79..35fbdb7d6e1a 100644
--- a/arch/arm64/lib/strlen.S
+++ b/arch/arm64/lib/strlen.S
@@ -1,115 +1,203 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2013-2021, Arm Limited.
*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/98e4d6a5c13c8e54/string/aarch64/strlen.S
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
-/*
- * calculate the length of a string
+/* Assumptions:
*
- * Parameters:
- * x0 - const string pointer
- * Returns:
- * x0 - the return length of specific string
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
*/
+#define L(label) .L ## label
+
/* Arguments and results. */
-srcin .req x0
-len .req x0
+#define srcin x0
+#define len x0
/* Locals and temporaries. */
-src .req x1
-data1 .req x2
-data2 .req x3
-data2a .req x4
-has_nul1 .req x5
-has_nul2 .req x6
-tmp1 .req x7
-tmp2 .req x8
-tmp3 .req x9
-tmp4 .req x10
-zeroones .req x11
-pos .req x12
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. A faster check
+ (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+ false hits for characters 129..255. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
+#define MIN_PAGE_SIZE 4096
+
+ /* Since strings are short on average, we check the first 16 bytes
+ of the string for a NUL character. In order to do an unaligned ldp
+ safely we have to do a page cross check first. If there is a NUL
+ byte we calculate the length from the 2 8-byte words using
+ conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+ If the string is longer than 16 bytes, we align src so don't need
+ further page cross checks, and process 32 bytes per iteration
+ using the fast NUL check. If we encounter non-ASCII characters,
+ fallback to a second loop using the full NUL check.
+
+ If the page cross check fails, we read 16 bytes from an aligned
+ address, remove any characters before the string, and continue
+ in the main loop using aligned loads. Since strings crossing a
+ page in the first 16 bytes are rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+ AArch64 systems have a minimum page size of 4k. We don't bother
+ checking for larger page sizes - the cost of setting up the correct
+ page size is just not worth the extra gain from a small reduction in
+ the cases taking the slow path. Note that we only care about
+ whether the first fetch, which may be misaligned, crosses a page
+ boundary. */
+
SYM_FUNC_START_WEAK_PI(strlen)
- mov zeroones, #REP8_01
- bic src, srcin, #15
- ands tmp1, srcin, #15
- b.ne .Lmisaligned
- /*
- * NUL detection works on the principle that (X - 1) & (~X) & 0x80
- * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- * can be done in parallel across the entire word.
- */
- /*
- * The inner loop deals with two Dwords at a time. This has a
- * slightly higher start-up cost, but we should win quite quickly,
- * especially on cores with a high number of issue slots per
- * cycle, as we get much better parallelism out of the operations.
- */
-.Lloop:
- ldp data1, data2, [src], #16
-.Lrealigned:
+ and tmp1, srcin, MIN_PAGE_SIZE - 1
+ mov zeroones, REP8_01
+ cmp tmp1, MIN_PAGE_SIZE - 16
+ b.gt L(page_cross)
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly.
+ Since we expect strings to be small and early-exit,
+ byte-swap the data now so has_null1/2 will be correct. */
+ rev data1, data1
+ rev data2, data2
+#endif
sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
+ orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bics has_nul2, tmp3, tmp4
- ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq .Lloop
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(main_loop_entry)
+
+ /* Enter with C = has_nul1 == 0. */
+ csel has_nul1, has_nul1, has_nul2, cc
+ mov len, 8
+ rev has_nul1, has_nul1
+ clz tmp1, has_nul1
+ csel len, xzr, len, cc
+ add len, len, tmp1, lsr 3
+ ret
+ /* The inner loop processes 32 bytes per iteration and uses the fast
+ NUL check. If we encounter non-ASCII characters, use a second
+ loop with the accurate NUL check. */
+ .p2align 4
+L(main_loop_entry):
+ bic src, srcin, 15
+ sub src, src, 16
+L(main_loop):
+ ldp data1, data2, [src, 32]!
+L(page_cross_entry):
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ bne 1f
+ ldp data1, data2, [src, 16]
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ beq L(main_loop)
+ add src, src, 16
+1:
+ /* The fast check failed, so do the slower, accurate NUL check. */
+ orr tmp2, data1, REP8_7f
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+
+ /* Enter with C = has_nul1 == 0. */
+L(tail):
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ csel data1, data1, data2, cc
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, cc
+#endif
sub len, src, srcin
- cbz has_nul1, .Lnul_in_data2
-CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/
- sub len, len, #8
- mov has_nul2, has_nul1
-.Lnul_in_data2:
- /*
- * For big-endian, carry propagation (if the final byte in the
- * string is 0x01) means we cannot use has_nul directly. The
- * easiest way to get the correct byte is to byte-swap the data
- * and calculate the syndrome a second time.
- */
-CPU_BE( rev data2, data2 )
-CPU_BE( sub tmp1, data2, zeroones )
-CPU_BE( orr tmp2, data2, #REP8_7f )
-CPU_BE( bic has_nul2, tmp1, tmp2 )
-
- sub len, len, #8
- rev has_nul2, has_nul2
- clz pos, has_nul2
- add len, len, pos, lsr #3 /* Bits to bytes. */
+ rev has_nul1, has_nul1
+ add tmp2, len, 8
+ clz tmp1, has_nul1
+ csel len, len, tmp2, cc
+ add len, len, tmp1, lsr 3
ret
-.Lmisaligned:
- cmp tmp1, #8
- neg tmp1, tmp1
- ldp data1, data2, [src], #16
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- mov tmp2, #~0
- /* Big-endian. Early bytes are at MSB. */
-CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
+L(nonascii_loop):
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ bne L(tail)
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+ b L(tail)
+
+ /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+ srcin to 0x7f, so we ignore any NUL bytes before the string.
+ Then continue in the aligned loop. */
+L(page_cross):
+ bic src, srcin, 15
+ ldp data1, data2, [src]
+ lsl tmp1, srcin, 3
+ mov tmp4, -1
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+#else
/* Little-endian. Early bytes are at LSB. */
-CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
+ lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr tmp1, tmp1, REP8_80
+ orn data1, data1, tmp1
+ orn tmp2, data2, tmp1
+ tst srcin, 8
+ csel data1, data1, tmp4, eq
+ csel data2, data2, tmp2, eq
+ b L(page_cross_entry)
- orr data1, data1, tmp2
- orr data2a, data2, tmp2
- csinv data1, data1, xzr, le
- csel data2, data2, data2a, le
- b .Lrealigned
SYM_FUNC_END_PI(strlen)
EXPORT_SYMBOL_NOKASAN(strlen)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index 2a7ee949ed47..48d44f7fddb1 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -1,299 +1,261 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2013-2021, Arm Limited.
*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/strncmp.S
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
-/*
- * compare two strings
+/* Assumptions:
*
- * Parameters:
- * x0 - const string 1 pointer
- * x1 - const string 2 pointer
- * x2 - the maximal length to be compared
- * Returns:
- * x0 - an integer less than, equal to, or greater than zero if s1 is found,
- * respectively, to be less than, to match, or be greater than s2.
+ * ARMv8-a, AArch64
*/
+#define L(label) .L ## label
+
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
/* Parameters and result. */
-src1 .req x0
-src2 .req x1
-limit .req x2
-result .req x0
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
/* Internal variables. */
-data1 .req x3
-data1w .req w3
-data2 .req x4
-data2w .req w4
-has_nul .req x5
-diff .req x6
-syndrome .req x7
-tmp1 .req x8
-tmp2 .req x9
-tmp3 .req x10
-zeroones .req x11
-pos .req x12
-limit_wd .req x13
-mask .req x14
-endloop .req x15
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define has_nul x5
+#define diff x6
+#define syndrome x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define zeroones x11
+#define pos x12
+#define limit_wd x13
+#define mask x14
+#define endloop x15
+#define count mask
SYM_FUNC_START_WEAK_PI(strncmp)
- cbz limit, .Lret0
+ cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
- b.ne .Lmisaligned8
- ands tmp1, src1, #7
- b.ne .Lmutual_align
+ and count, src1, #7
+ b.ne L(misaligned8)
+ cbnz count, L(mutual_align)
/* Calculate the number of full and partial words -1. */
- /*
- * when limit is mulitply of 8, if not sub 1,
- * the judgement of last dword will wrong.
- */
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
- lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
+ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
- /*
- * NUL detection works on the principle that (X - 1) & (~X) & 0x80
- * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- * can be done in parallel across the entire word.
- */
-.Lloop_aligned:
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ .p2align 4
+L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
-.Lstart_realigned:
+L(start_realigned):
subs limit_wd, limit_wd, #1
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, pl /* Last Dword or differences.*/
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ csinv endloop, diff, xzr, pl /* Last Dword or differences. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
- b.eq .Lloop_aligned
+ b.eq L(loop_aligned)
+ /* End of main loop */
- /*Not reached the limit, must have found the end or a diff. */
- tbz limit_wd, #63, .Lnot_limit
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit_wd, #63, L(not_limit)
/* Limit % 8 == 0 => all bytes significant. */
ands limit, limit, #7
- b.eq .Lnot_limit
+ b.eq L(not_limit)
- lsl limit, limit, #3 /* Bits -> bytes. */
+ lsl limit, limit, #3 /* Bits -> bytes. */
mov mask, #~0
-CPU_BE( lsr mask, mask, limit )
-CPU_LE( lsl mask, mask, limit )
+#ifdef __AARCH64EB__
+ lsr mask, mask, limit
+#else
+ lsl mask, mask, limit
+#endif
bic data1, data1, mask
bic data2, data2, mask
/* Make sure that the NUL byte is marked in the syndrome. */
orr has_nul, has_nul, mask
-.Lnot_limit:
+L(not_limit):
orr syndrome, diff, has_nul
- b .Lcal_cmpresult
-.Lmutual_align:
- /*
- * Sources are mutually aligned, but are not currently at an
- * alignment boundary. Round down the addresses and then mask off
- * the bytes that precede the start point.
- * We also need to adjust the limit calculations, but without
- * overflowing if the limit is near ULONG_MAX.
- */
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#else
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point.
+ We also need to adjust the limit calculations, but without
+ overflowing if the limit is near ULONG_MAX. */
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
- neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
-CPU_BE( lsl tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */
+ lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
+#else
/* Little-endian. Early bytes are at LSB. */
-CPU_LE( lsr tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */
-
+ lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
+#endif
and tmp3, limit_wd, #7
lsr limit_wd, limit_wd, #3
- /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/
- add limit, limit, tmp1
- add tmp3, tmp3, tmp1
+ /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
+ add limit, limit, count
+ add tmp3, tmp3, count
orr data1, data1, tmp2
orr data2, data2, tmp2
add limit_wd, limit_wd, tmp3, lsr #3
- b .Lstart_realigned
+ b L(start_realigned)
+
+ .p2align 4
+ /* Don't bother with dwords for up to 16 bytes. */
+L(misaligned8):
+ cmp limit, #16
+ b.hs L(try_misaligned_words)
-/*when src1 offset is not equal to src2 offset...*/
-.Lmisaligned8:
- cmp limit, #8
- b.lo .Ltiny8proc /*limit < 8... */
- /*
- * Get the align offset length to compare per byte first.
- * After this process, one string's address will be aligned.*/
- and tmp1, src1, #7
- neg tmp1, tmp1
- add tmp1, tmp1, #8
- and tmp2, src2, #7
- neg tmp2, tmp2
- add tmp2, tmp2, #8
- subs tmp3, tmp1, tmp2
- csel pos, tmp1, tmp2, hi /*Choose the maximum. */
- /*
- * Here, limit is not less than 8, so directly run .Ltinycmp
- * without checking the limit.*/
- sub limit, limit, pos
-.Ltinycmp:
+L(byte_loop):
+ /* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
- subs pos, pos, #1
- ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq .Ltinycmp
- cbnz pos, 1f /*find the null or unequal...*/
- cmp data1w, #1
- ccmp data1w, data2w, #0, cs
- b.eq .Lstart_align /*the last bytes are equal....*/
-1:
+ subs limit, limit, #1
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+L(done):
sub result, data1, data2
ret
-
-.Lstart_align:
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+L(try_misaligned_words):
lsr limit_wd, limit, #3
- cbz limit_wd, .Lremain8
- /*process more leading bytes to make str1 aligned...*/
- ands xzr, src1, #7
- b.eq .Lrecal_offset
- add src1, src1, tmp3 /*tmp3 is positive in this branch.*/
- add src2, src2, tmp3
- ldr data1, [src1], #8
- ldr data2, [src2], #8
+ cbz count, L(do_misaligned)
- sub limit, limit, tmp3
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
lsr limit_wd, limit, #3
- subs limit_wd, limit_wd, #1
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
- bics has_nul, tmp1, tmp2
- ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/
- b.ne .Lunequal_proc
- /*How far is the current str2 from the alignment boundary...*/
- and tmp3, tmp3, #7
-.Lrecal_offset:
- neg pos, tmp3
-.Lloopcmp_proc:
- /*
- * Divide the eight bytes into two parts. First,backwards the src2
- * to an alignment boundary,load eight bytes from the SRC2 alignment
- * boundary,then compare with the relative bytes from SRC1.
- * If all 8 bytes are equal,then start the second part's comparison.
- * Otherwise finish the comparison.
- * This special handle can garantee all the accesses are in the
- * thread/task space in avoid to overrange access.
- */
- ldr data1, [src1,pos]
- ldr data2, [src2,pos]
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, eq
- cbnz endloop, .Lunequal_proc
+L(page_end_loop):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ subs count, count, #1
+ b.hi L(page_end_loop)
+
+L(do_misaligned):
+ /* Prepare ourselves for the next page crossing. Unlike the aligned
+ loop, we fetch 1 less dword because we risk crossing bounds on
+ SRC2. */
+ mov count, #8
+ subs limit_wd, limit_wd, #1
+ b.lo L(done_loop)
+L(loop_misaligned):
+ and tmp2, src2, #0xff8
+ eor tmp2, tmp2, #0xff8
+ cbz tmp2, L(page_end_loop)
- /*The second part process*/
ldr data1, [src1], #8
ldr data2, [src2], #8
- subs limit_wd, limit_wd, #1
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
- bics has_nul, tmp1, tmp2
- ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/
- b.eq .Lloopcmp_proc
-
-.Lunequal_proc:
- orr syndrome, diff, has_nul
- cbz syndrome, .Lremain8
-.Lcal_cmpresult:
- /*
- * reversed the byte-order as big-endian,then CLZ can find the most
- * significant zero bits.
- */
-CPU_LE( rev syndrome, syndrome )
-CPU_LE( rev data1, data1 )
-CPU_LE( rev data2, data2 )
- /*
- * For big-endian we cannot use the trick with the syndrome value
- * as carry-propagation can corrupt the upper bits if the trailing
- * bytes in the string contain 0x01.
- * However, if there is no NUL byte in the dword, we can generate
- * the result directly. We can't just subtract the bytes as the
- * MSB might be significant.
- */
-CPU_BE( cbnz has_nul, 1f )
-CPU_BE( cmp data1, data2 )
-CPU_BE( cset result, ne )
-CPU_BE( cneg result, result, lo )
-CPU_BE( ret )
-CPU_BE( 1: )
- /* Re-compute the NUL-byte detection, using a byte-reversed value.*/
-CPU_BE( rev tmp3, data1 )
-CPU_BE( sub tmp1, tmp3, zeroones )
-CPU_BE( orr tmp2, tmp3, #REP8_7f )
-CPU_BE( bic has_nul, tmp1, tmp2 )
-CPU_BE( rev has_nul, has_nul )
-CPU_BE( orr syndrome, diff, has_nul )
- /*
- * The MS-non-zero bit of the syndrome marks either the first bit
- * that is different, or the top bit of the first zero byte.
- * Shifting left now will bring the critical information into the
- * top bits.
- */
- clz pos, syndrome
- lsl data1, data1, pos
- lsl data2, data2, pos
- /*
- * But we need to zero-extend (char is unsigned) the value and then
- * perform a signed 32-bit subtraction.
- */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- ret
-
-.Lremain8:
- /* Limit % 8 == 0 => all bytes significant. */
- ands limit, limit, #7
- b.eq .Lret0
-.Ltiny8proc:
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- subs limit, limit, #1
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne L(not_limit)
+ subs limit_wd, limit_wd, #1
+ b.pl L(loop_misaligned)
- ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq .Ltiny8proc
- sub result, data1, data2
- ret
+L(done_loop):
+ /* We found a difference or a NULL before the limit was reached. */
+ and limit, limit, #7
+ cbz limit, L(not_limit)
+ /* Read the last word. */
+ sub src1, src1, 8
+ sub src2, src2, 8
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne L(not_limit)
-.Lret0:
+L(ret0):
mov result, #0
ret
+
SYM_FUNC_END_PI(strncmp)
EXPORT_SYMBOL_NOKASAN(strncmp)
diff --git a/arch/arm64/lib/uaccess_flushcache.c b/arch/arm64/lib/uaccess_flushcache.c
index c83bb5a4aad2..baee22961bdb 100644
--- a/arch/arm64/lib/uaccess_flushcache.c
+++ b/arch/arm64/lib/uaccess_flushcache.c
@@ -15,7 +15,7 @@ void memcpy_flushcache(void *dst, const void *src, size_t cnt)
* barrier to order the cache maintenance against the memcpy.
*/
memcpy(dst, src, cnt);
- __clean_dcache_area_pop(dst, cnt);
+ dcache_clean_pop((unsigned long)dst, (unsigned long)dst + cnt);
}
EXPORT_SYMBOL_GPL(memcpy_flushcache);
@@ -33,6 +33,6 @@ unsigned long __copy_user_flushcache(void *to, const void __user *from,
rc = raw_copy_from_user(to, from, n);
/* See above */
- __clean_dcache_area_pop(to, n - rc);
+ dcache_clean_pop((unsigned long)to, (unsigned long)to + n - rc);
return rc;
}