From 6b5360a5e0ad357b73776d092437715ba4a77865 Mon Sep 17 00:00:00 2001
From: Tianjia Zhang
Date: Thu, 27 Oct 2022 14:55:03 +0800
Subject: crypto: arm64/sm4 - add CE implementation for cmac/xcbc/cbcmac

This patch is a CE-optimized assembly implementation for cmac/xcbc/cbcmac.

Benchmark on T-Head Yitian-710 2.75 GHz, the data comes from the 300 mode of
tcrypt, and compared the performance before and after this patch (the driver
used before this patch is XXXmac(sm4-ce)). The abscissas are blocks of
different lengths. The data is tabulated and the unit is Mb/s:

Before:

update-size    |      16      64     256    1024    2048    4096    8192
---------------+--------------------------------------------------------
cmac(sm4-ce)   |  293.33  403.69  503.76  527.78  531.10  535.46  535.81
xcbc(sm4-ce)   |  292.83  402.50  504.02  529.08  529.87  536.55  538.24
cbcmac(sm4-ce) |  318.42  415.79  497.12  515.05  523.15  521.19  523.01

After:

update-size    |      16      64     256    1024    2048    4096    8192
---------------+--------------------------------------------------------
cmac-sm4-ce    |  371.99  675.28  903.56  971.65  980.57  990.40  991.04
xcbc-sm4-ce    |  372.11  674.55  903.47  971.61  980.96  990.42  991.10
cbcmac-sm4-ce  |  371.63  675.33  903.23  972.07  981.42  990.93  991.45

Signed-off-by: Tianjia Zhang
Signed-off-by: Herbert Xu
---
 arch/arm64/crypto/sm4-ce-core.S | 70 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

(limited to 'arch/arm64/crypto/sm4-ce-core.S')

diff --git a/arch/arm64/crypto/sm4-ce-core.S b/arch/arm64/crypto/sm4-ce-core.S
index ddd15ec09d38..877b80c54a0d 100644
--- a/arch/arm64/crypto/sm4-ce-core.S
+++ b/arch/arm64/crypto/sm4-ce-core.S
@@ -35,6 +35,7 @@
 #define RTMP3	v19
 
 #define RIV	v20
+#define RMAC	v20
 #define RMASK	v21
 
 
@@ -1007,6 +1008,75 @@ SYM_FUNC_START(sm4_ce_xts_dec)
 	ret
 SYM_FUNC_END(sm4_ce_xts_dec)
 
+.align 3
+SYM_FUNC_START(sm4_ce_mac_update)
+	/* Fold nblocks 16-byte blocks from src into the 16-byte digest.
+	 *   x0: round key array, CTX
+	 *   x1: digest (read on entry, written back on every exit path)
+	 *   x2: src
+	 *   w3: nblocks
+	 *   w4: enc_before, non-zero runs SM4_CRYPT_BLK on digest first
+	 *   w5: enc_after, zero leaves the last block XORed in, not crypted
+	 */
+	SM4_PREPARE(x0)
+
+	ld1		{RMAC.16b}, [x1]		// RMAC = running digest
+
+	cbz		w4, .Lmac_update		// enc_before == 0: skip the pre-pass
+
+	SM4_CRYPT_BLK(RMAC)
+
+.Lmac_update:
+	cbz		w3, .Lmac_ret			// no input blocks: store digest as is
+
+	sub		w6, w3, #1
+	cmp		w5, wzr
+	csel		w3, w3, w6, ne			// w3 = enc_after ? nblocks : nblocks - 1
+
+	cbz		w3, .Lmac_end			// only the deferred last block remains
+
+.Lmac_loop_4x:
+	cmp		w3, #4				// w3 is non-zero whenever we get here
+	blt		.Lmac_loop_1x			// fewer than 4 left: finish one by one
+
+	sub		w3, w3, #4
+
+	ld1		{v0.16b-v3.16b}, [x2], #64	// load 4 blocks, advance src by 64
+
+	eor		RMAC.16b, RMAC.16b, v0.16b	// chained: each step needs the previous RMAC
+	SM4_CRYPT_BLK(RMAC)
+	eor		RMAC.16b, RMAC.16b, v1.16b
+	SM4_CRYPT_BLK(RMAC)
+	eor		RMAC.16b, RMAC.16b, v2.16b
+	SM4_CRYPT_BLK(RMAC)
+	eor		RMAC.16b, RMAC.16b, v3.16b
+	SM4_CRYPT_BLK(RMAC)
+
+	cbz		w3, .Lmac_end			// count exhausted on a multiple of 4
+	b		.Lmac_loop_4x
+
+.Lmac_loop_1x:
+	sub		w3, w3, #1
+
+	ld1		{v0.16b}, [x2], #16		// load 1 block, advance src by 16
+
+	eor		RMAC.16b, RMAC.16b, v0.16b
+	SM4_CRYPT_BLK(RMAC)
+
+	cbnz		w3, .Lmac_loop_1x
+
+
+.Lmac_end:
+	cbnz		w5, .Lmac_ret			// enc_after set: every block already handled
+
+	ld1		{v0.16b}, [x2], #16		// deferred last block (w3 was reduced by 1)
+	eor		RMAC.16b, RMAC.16b, v0.16b	// XOR only, no SM4_CRYPT_BLK
+
+.Lmac_ret:
+	st1		{RMAC.16b}, [x1]		// write the updated digest back
+	ret
+SYM_FUNC_END(sm4_ce_mac_update)
+
 
 	.section	".rodata", "a"
 	.align 4
-- 
cgit