diff options
Diffstat (limited to 'arch/arm64/crypto/aes-modes.S')
-rw-r--r-- | arch/arm64/crypto/aes-modes.S | 495 |
1 files changed, 341 insertions, 154 deletions
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index 131618389f1f..0e834a2c062c 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -22,26 +22,26 @@ #define ST5(x...) x #endif -aes_encrypt_block4x: +SYM_FUNC_START_LOCAL(aes_encrypt_block4x) encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 ret -ENDPROC(aes_encrypt_block4x) +SYM_FUNC_END(aes_encrypt_block4x) -aes_decrypt_block4x: +SYM_FUNC_START_LOCAL(aes_decrypt_block4x) decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 ret -ENDPROC(aes_decrypt_block4x) +SYM_FUNC_END(aes_decrypt_block4x) #if MAX_STRIDE == 5 -aes_encrypt_block5x: +SYM_FUNC_START_LOCAL(aes_encrypt_block5x) encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 ret -ENDPROC(aes_encrypt_block5x) +SYM_FUNC_END(aes_encrypt_block5x) -aes_decrypt_block5x: +SYM_FUNC_START_LOCAL(aes_decrypt_block5x) decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 ret -ENDPROC(aes_decrypt_block5x) +SYM_FUNC_END(aes_decrypt_block5x) #endif /* @@ -51,9 +51,8 @@ ENDPROC(aes_decrypt_block5x) * int blocks) */ -AES_ENTRY(aes_ecb_encrypt) - stp x29, x30, [sp, #-16]! - mov x29, sp +AES_FUNC_START(aes_ecb_encrypt) + frame_push 0 enc_prepare w3, x2, x5 @@ -77,14 +76,13 @@ ST5( st1 {v4.16b}, [x0], #16 ) subs w4, w4, #1 bne .Lecbencloop .Lecbencout: - ldp x29, x30, [sp], #16 + frame_pop ret -AES_ENDPROC(aes_ecb_encrypt) +AES_FUNC_END(aes_ecb_encrypt) -AES_ENTRY(aes_ecb_decrypt) - stp x29, x30, [sp, #-16]! - mov x29, sp +AES_FUNC_START(aes_ecb_decrypt) + frame_push 0 dec_prepare w3, x2, x5 @@ -108,9 +106,9 @@ ST5( st1 {v4.16b}, [x0], #16 ) subs w4, w4, #1 bne .Lecbdecloop .Lecbdecout: - ldp x29, x30, [sp], #16 + frame_pop ret -AES_ENDPROC(aes_ecb_decrypt) +AES_FUNC_END(aes_ecb_decrypt) /* @@ -126,7 +124,7 @@ AES_ENDPROC(aes_ecb_decrypt) * u32 const rk2[]); */ -AES_ENTRY(aes_essiv_cbc_encrypt) +AES_FUNC_START(aes_essiv_cbc_encrypt) ld1 {v4.16b}, [x5] /* get iv */ mov w8, #14 /* AES-256: 14 rounds */ @@ -135,7 +133,7 @@ AES_ENTRY(aes_essiv_cbc_encrypt) enc_switch_key w3, x2, x6 b .Lcbcencloop4x -AES_ENTRY(aes_cbc_encrypt) +AES_FUNC_START(aes_cbc_encrypt) ld1 {v4.16b}, [x5] /* get iv */ enc_prepare w3, x2, x6 @@ -167,13 +165,10 @@ AES_ENTRY(aes_cbc_encrypt) .Lcbcencout: st1 {v4.16b}, [x5] /* return iv */ ret -AES_ENDPROC(aes_cbc_encrypt) -AES_ENDPROC(aes_essiv_cbc_encrypt) - -AES_ENTRY(aes_essiv_cbc_decrypt) - stp x29, x30, [sp, #-16]! - mov x29, sp +AES_FUNC_END(aes_cbc_encrypt) +AES_FUNC_END(aes_essiv_cbc_encrypt) +AES_FUNC_START(aes_essiv_cbc_decrypt) ld1 {cbciv.16b}, [x5] /* get iv */ mov w8, #14 /* AES-256: 14 rounds */ @@ -181,12 +176,10 @@ AES_ENTRY(aes_essiv_cbc_decrypt) encrypt_block cbciv, w8, x6, x7, w9 b .Lessivcbcdecstart -AES_ENTRY(aes_cbc_decrypt) - stp x29, x30, [sp, #-16]! - mov x29, sp - +AES_FUNC_START(aes_cbc_decrypt) ld1 {cbciv.16b}, [x5] /* get iv */ .Lessivcbcdecstart: + frame_push 0 dec_prepare w3, x2, x6 .LcbcdecloopNx: @@ -236,10 +229,10 @@ ST5( st1 {v4.16b}, [x0], #16 ) bne .Lcbcdecloop .Lcbcdecout: st1 {cbciv.16b}, [x5] /* return iv */ - ldp x29, x30, [sp], #16 + frame_pop ret -AES_ENDPROC(aes_cbc_decrypt) -AES_ENDPROC(aes_essiv_cbc_decrypt) +AES_FUNC_END(aes_cbc_decrypt) +AES_FUNC_END(aes_essiv_cbc_decrypt) /* @@ -249,7 +242,7 @@ AES_ENDPROC(aes_essiv_cbc_decrypt) * int rounds, int bytes, u8 const iv[]) */ -AES_ENTRY(aes_cbc_cts_encrypt) +AES_FUNC_START(aes_cbc_cts_encrypt) adr_l x8, .Lcts_permute_table sub x4, x4, #16 add x9, x8, #32 @@ -276,9 +269,9 @@ AES_ENTRY(aes_cbc_cts_encrypt) st1 {v0.16b}, [x4] /* overlapping stores */ st1 {v1.16b}, [x0] ret -AES_ENDPROC(aes_cbc_cts_encrypt) +AES_FUNC_END(aes_cbc_cts_encrypt) -AES_ENTRY(aes_cbc_cts_decrypt) +AES_FUNC_START(aes_cbc_cts_decrypt) adr_l x8, .Lcts_permute_table sub x4, x4, #16 add x9, x8, #32 @@ -305,7 +298,7 @@ AES_ENTRY(aes_cbc_cts_decrypt) st1 {v2.16b}, [x4] /* overlapping stores */ st1 {v0.16b}, [x0] ret -AES_ENDPROC(aes_cbc_cts_decrypt) +AES_FUNC_END(aes_cbc_cts_decrypt) .section ".rodata", "a" .align 6 @@ -318,98 +311,308 @@ AES_ENDPROC(aes_cbc_cts_decrypt) .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .previous - /* - * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int blocks, u8 ctr[]) + * This macro generates the code for CTR and XCTR mode. */ +.macro ctr_encrypt xctr + // Arguments + OUT .req x0 + IN .req x1 + KEY .req x2 + ROUNDS_W .req w3 + BYTES_W .req w4 + IV .req x5 + BYTE_CTR_W .req w6 // XCTR only + // Intermediate values + CTR_W .req w11 // XCTR only + CTR .req x11 // XCTR only + IV_PART .req x12 + BLOCKS .req x13 + BLOCKS_W .req w13 + + frame_push 0 + + enc_prepare ROUNDS_W, KEY, IV_PART + ld1 {vctr.16b}, [IV] -AES_ENTRY(aes_ctr_encrypt) - stp x29, x30, [sp, #-16]! - mov x29, sp - - enc_prepare w3, x2, x6 - ld1 {vctr.16b}, [x5] + /* + * Keep 64 bits of the IV in a register. For CTR mode this lets us + * easily increment the IV. For XCTR mode this lets us efficiently XOR + * the 64-bit counter with the IV. + */ + .if \xctr + umov IV_PART, vctr.d[0] + lsr CTR_W, BYTE_CTR_W, #4 + .else + umov IV_PART, vctr.d[1] + rev IV_PART, IV_PART + .endif + +.LctrloopNx\xctr: + add BLOCKS_W, BYTES_W, #15 + sub BYTES_W, BYTES_W, #MAX_STRIDE << 4 + lsr BLOCKS_W, BLOCKS_W, #4 + mov w8, #MAX_STRIDE + cmp BLOCKS_W, w8 + csel BLOCKS_W, BLOCKS_W, w8, lt - umov x6, vctr.d[1] /* keep swabbed ctr in reg */ - rev x6, x6 - cmn w6, w4 /* 32 bit overflow? */ - bcs .Lctrloop -.LctrloopNx: - subs w4, w4, #MAX_STRIDE - bmi .Lctr1x - add w7, w6, #1 + /* + * Set up the counter values in v0-v{MAX_STRIDE-1}. + * + * If we are encrypting less than MAX_STRIDE blocks, the tail block + * handling code expects the last keystream block to be in + * v{MAX_STRIDE-1}. For example: if encrypting two blocks with + * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks. + */ + .if \xctr + add CTR, CTR, BLOCKS + .else + adds IV_PART, IV_PART, BLOCKS + .endif mov v0.16b, vctr.16b - add w8, w6, #2 mov v1.16b, vctr.16b - add w9, w6, #3 mov v2.16b, vctr.16b - add w9, w6, #3 - rev w7, w7 mov v3.16b, vctr.16b - rev w8, w8 ST5( mov v4.16b, vctr.16b ) - mov v1.s[3], w7 - rev w9, w9 -ST5( add w10, w6, #4 ) - mov v2.s[3], w8 -ST5( rev w10, w10 ) - mov v3.s[3], w9 -ST5( mov v4.s[3], w10 ) - ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ + .if \xctr + sub x6, CTR, #MAX_STRIDE - 1 + sub x7, CTR, #MAX_STRIDE - 2 + sub x8, CTR, #MAX_STRIDE - 3 + sub x9, CTR, #MAX_STRIDE - 4 +ST5( sub x10, CTR, #MAX_STRIDE - 5 ) + eor x6, x6, IV_PART + eor x7, x7, IV_PART + eor x8, x8, IV_PART + eor x9, x9, IV_PART +ST5( eor x10, x10, IV_PART ) + mov v0.d[0], x6 + mov v1.d[0], x7 + mov v2.d[0], x8 + mov v3.d[0], x9 +ST5( mov v4.d[0], x10 ) + .else + bcs 0f + .subsection 1 + /* + * This subsection handles carries. + * + * Conditional branching here is allowed with respect to time + * invariance since the branches are dependent on the IV instead + * of the plaintext or key. This code is rarely executed in + * practice anyway. + */ + + /* Apply carry to outgoing counter. */ +0: umov x8, vctr.d[0] + rev x8, x8 + add x8, x8, #1 + rev x8, x8 + ins vctr.d[0], x8 + + /* + * Apply carry to counter blocks if needed. + * + * Since the carry flag was set, we know 0 <= IV_PART < + * MAX_STRIDE. Using the value of IV_PART we can determine how + * many counter blocks need to be updated. + */ + cbz IV_PART, 2f + adr x16, 1f + sub x16, x16, IV_PART, lsl #3 + br x16 + bti c + mov v0.d[0], vctr.d[0] + bti c + mov v1.d[0], vctr.d[0] + bti c + mov v2.d[0], vctr.d[0] + bti c + mov v3.d[0], vctr.d[0] +ST5( bti c ) +ST5( mov v4.d[0], vctr.d[0] ) +1: b 2f + .previous + +2: rev x7, IV_PART + ins vctr.d[1], x7 + sub x7, IV_PART, #MAX_STRIDE - 1 + sub x8, IV_PART, #MAX_STRIDE - 2 + sub x9, IV_PART, #MAX_STRIDE - 3 + rev x7, x7 + rev x8, x8 + mov v1.d[1], x7 + rev x9, x9 +ST5( sub x10, IV_PART, #MAX_STRIDE - 4 ) + mov v2.d[1], x8 +ST5( rev x10, x10 ) + mov v3.d[1], x9 +ST5( mov v4.d[1], x10 ) + .endif + + /* + * If there are at least MAX_STRIDE blocks left, XOR the data with + * keystream and store. Otherwise jump to tail handling. + */ + tbnz BYTES_W, #31, .Lctrtail\xctr + ld1 {v5.16b-v7.16b}, [IN], #48 ST4( bl aes_encrypt_block4x ) ST5( bl aes_encrypt_block5x ) eor v0.16b, v5.16b, v0.16b -ST4( ld1 {v5.16b}, [x1], #16 ) +ST4( ld1 {v5.16b}, [IN], #16 ) eor v1.16b, v6.16b, v1.16b -ST5( ld1 {v5.16b-v6.16b}, [x1], #32 ) +ST5( ld1 {v5.16b-v6.16b}, [IN], #32 ) eor v2.16b, v7.16b, v2.16b eor v3.16b, v5.16b, v3.16b ST5( eor v4.16b, v6.16b, v4.16b ) - st1 {v0.16b-v3.16b}, [x0], #64 -ST5( st1 {v4.16b}, [x0], #16 ) - add x6, x6, #MAX_STRIDE - rev x7, x6 - ins vctr.d[1], x7 - cbz w4, .Lctrout - b .LctrloopNx -.Lctr1x: - adds w4, w4, #MAX_STRIDE - beq .Lctrout -.Lctrloop: - mov v0.16b, vctr.16b - encrypt_block v0, w3, x2, x8, w7 + st1 {v0.16b-v3.16b}, [OUT], #64 +ST5( st1 {v4.16b}, [OUT], #16 ) + cbz BYTES_W, .Lctrout\xctr + b .LctrloopNx\xctr + +.Lctrout\xctr: + .if !\xctr + st1 {vctr.16b}, [IV] /* return next CTR value */ + .endif + frame_pop + ret - adds x6, x6, #1 /* increment BE ctr */ - rev x7, x6 - ins vctr.d[1], x7 - bcs .Lctrcarry /* overflow? */ +.Lctrtail\xctr: + /* + * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext + * + * This code expects the last keystream block to be in v{MAX_STRIDE-1}. + * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and + * v4 should have the next two counter blocks. + * + * This allows us to store the ciphertext by writing to overlapping + * regions of memory. Any invalid ciphertext blocks get overwritten by + * correctly computed blocks. This approach greatly simplifies the + * logic for storing the ciphertext. + */ + mov x16, #16 + ands w7, BYTES_W, #0xf + csel x13, x7, x16, ne + +ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4)) +ST5( csel x14, x16, xzr, gt ) + cmp BYTES_W, #48 - (MAX_STRIDE << 4) + csel x15, x16, xzr, gt + cmp BYTES_W, #32 - (MAX_STRIDE << 4) + csel x16, x16, xzr, gt + cmp BYTES_W, #16 - (MAX_STRIDE << 4) + + adr_l x9, .Lcts_permute_table + add x9, x9, x13 + ble .Lctrtail1x\xctr + +ST5( ld1 {v5.16b}, [IN], x14 ) + ld1 {v6.16b}, [IN], x15 + ld1 {v7.16b}, [IN], x16 -.Lctrcarrydone: - subs w4, w4, #1 - bmi .Lctrtailblock /* blocks <0 means tail block */ - ld1 {v3.16b}, [x1], #16 - eor v3.16b, v0.16b, v3.16b - st1 {v3.16b}, [x0], #16 - bne .Lctrloop - -.Lctrout: - st1 {vctr.16b}, [x5] /* return next CTR value */ - ldp x29, x30, [sp], #16 - ret +ST4( bl aes_encrypt_block4x ) +ST5( bl aes_encrypt_block5x ) -.Lctrtailblock: - st1 {v0.16b}, [x0] - b .Lctrout + ld1 {v8.16b}, [IN], x13 + ld1 {v9.16b}, [IN] + ld1 {v10.16b}, [x9] + +ST4( eor v6.16b, v6.16b, v0.16b ) +ST4( eor v7.16b, v7.16b, v1.16b ) +ST4( tbl v3.16b, {v3.16b}, v10.16b ) +ST4( eor v8.16b, v8.16b, v2.16b ) +ST4( eor v9.16b, v9.16b, v3.16b ) + +ST5( eor v5.16b, v5.16b, v0.16b ) +ST5( eor v6.16b, v6.16b, v1.16b ) +ST5( tbl v4.16b, {v4.16b}, v10.16b ) +ST5( eor v7.16b, v7.16b, v2.16b ) +ST5( eor v8.16b, v8.16b, v3.16b ) +ST5( eor v9.16b, v9.16b, v4.16b ) + +ST5( st1 {v5.16b}, [OUT], x14 ) + st1 {v6.16b}, [OUT], x15 + st1 {v7.16b}, [OUT], x16 + add x13, x13, OUT + st1 {v9.16b}, [x13] // overlapping stores + st1 {v8.16b}, [OUT] + b .Lctrout\xctr + +.Lctrtail1x\xctr: + /* + * Handle <= 16 bytes of plaintext + * + * This code always reads and writes 16 bytes. To avoid out of bounds + * accesses, XCTR and CTR modes must use a temporary buffer when + * encrypting/decrypting less than 16 bytes. + * + * This code is unusual in that it loads the input and stores the output + * relative to the end of the buffers rather than relative to the start. + * This causes unusual behaviour when encrypting/decrypting less than 16 + * bytes; the end of the data is expected to be at the end of the + * temporary buffer rather than the start of the data being at the start + * of the temporary buffer. + */ + sub x8, x7, #16 + csel x7, x7, x8, eq + add IN, IN, x7 + add OUT, OUT, x7 + ld1 {v5.16b}, [IN] + ld1 {v6.16b}, [OUT] +ST5( mov v3.16b, v4.16b ) + encrypt_block v3, ROUNDS_W, KEY, x8, w7 + ld1 {v10.16b-v11.16b}, [x9] + tbl v3.16b, {v3.16b}, v10.16b + sshr v11.16b, v11.16b, #7 + eor v5.16b, v5.16b, v3.16b + bif v5.16b, v6.16b, v11.16b + st1 {v5.16b}, [OUT] + b .Lctrout\xctr + + // Arguments + .unreq OUT + .unreq IN + .unreq KEY + .unreq ROUNDS_W + .unreq BYTES_W + .unreq IV + .unreq BYTE_CTR_W // XCTR only + // Intermediate values + .unreq CTR_W // XCTR only + .unreq CTR // XCTR only + .unreq IV_PART + .unreq BLOCKS + .unreq BLOCKS_W +.endm + + /* + * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int bytes, u8 ctr[]) + * + * The input and output buffers must always be at least 16 bytes even if + * encrypting/decrypting less than 16 bytes. Otherwise out of bounds + * accesses will occur. The data to be encrypted/decrypted is expected + * to be at the end of this 16-byte temporary buffer rather than the + * start. + */ + +AES_FUNC_START(aes_ctr_encrypt) + ctr_encrypt 0 +AES_FUNC_END(aes_ctr_encrypt) + + /* + * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int bytes, u8 const iv[], int byte_ctr) + * + * The input and output buffers must always be at least 16 bytes even if + * encrypting/decrypting less than 16 bytes. Otherwise out of bounds + * accesses will occur. The data to be encrypted/decrypted is expected + * to be at the end of this 16-byte temporary buffer rather than the + * start. + */ -.Lctrcarry: - umov x7, vctr.d[0] /* load upper word of ctr */ - rev x7, x7 /* ... to handle the carry */ - add x7, x7, #1 - rev x7, x7 - ins vctr.d[0], x7 - b .Lctrcarrydone -AES_ENDPROC(aes_ctr_encrypt) +AES_FUNC_START(aes_xctr_encrypt) + ctr_encrypt 1 +AES_FUNC_END(aes_xctr_encrypt) /* @@ -433,9 +636,8 @@ AES_ENDPROC(aes_ctr_encrypt) uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s .endm -AES_ENTRY(aes_xts_encrypt) - stp x29, x30, [sp, #-16]! - mov x29, sp +AES_FUNC_START(aes_xts_encrypt) + frame_push 0 ld1 {v4.16b}, [x6] xts_load_mask v8 @@ -493,7 +695,7 @@ AES_ENTRY(aes_xts_encrypt) st1 {v0.16b}, [x0] .Lxtsencret: st1 {v4.16b}, [x6] - ldp x29, x30, [sp], #16 + frame_pop ret .LxtsencctsNx: @@ -518,11 +720,10 @@ AES_ENTRY(aes_xts_encrypt) st1 {v2.16b}, [x4] /* overlapping stores */ mov w4, wzr b .Lxtsencctsout -AES_ENDPROC(aes_xts_encrypt) +AES_FUNC_END(aes_xts_encrypt) -AES_ENTRY(aes_xts_decrypt) - stp x29, x30, [sp, #-16]! - mov x29, sp +AES_FUNC_START(aes_xts_decrypt) + frame_push 0 /* subtract 16 bytes if we are doing CTS */ sub w8, w4, #0x10 @@ -583,7 +784,7 @@ AES_ENTRY(aes_xts_decrypt) b .Lxtsdecloop .Lxtsdecout: st1 {v4.16b}, [x6] - ldp x29, x30, [sp], #16 + frame_pop ret .Lxtsdeccts: @@ -612,68 +813,54 @@ AES_ENTRY(aes_xts_decrypt) st1 {v2.16b}, [x4] /* overlapping stores */ mov w4, wzr b .Lxtsdecctsout -AES_ENDPROC(aes_xts_decrypt) +AES_FUNC_END(aes_xts_decrypt) /* * aes_mac_update(u8 const in[], u32 const rk[], int rounds, * int blocks, u8 dg[], int enc_before, int enc_after) */ -AES_ENTRY(aes_mac_update) - frame_push 6 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x6 - - ld1 {v0.16b}, [x23] /* get dg */ +AES_FUNC_START(aes_mac_update) + ld1 {v0.16b}, [x4] /* get dg */ enc_prepare w2, x1, x7 cbz w5, .Lmacloop4x encrypt_block v0, w2, x1, x7, w8 .Lmacloop4x: - subs w22, w22, #4 + subs w3, w3, #4 bmi .Lmac1x - ld1 {v1.16b-v4.16b}, [x19], #64 /* get next pt block */ + ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */ eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ - encrypt_block v0, w21, x20, x7, w8 + encrypt_block v0, w2, x1, x7, w8 eor v0.16b, v0.16b, v2.16b - encrypt_block v0, w21, x20, x7, w8 + encrypt_block v0, w2, x1, x7, w8 eor v0.16b, v0.16b, v3.16b - encrypt_block v0, w21, x20, x7, w8 + encrypt_block v0, w2, x1, x7, w8 eor v0.16b, v0.16b, v4.16b - cmp w22, wzr - csinv x5, x24, xzr, eq + cmp w3, wzr + csinv x5, x6, xzr, eq cbz w5, .Lmacout - encrypt_block v0, w21, x20, x7, w8 - st1 {v0.16b}, [x23] /* return dg */ - cond_yield_neon .Lmacrestart + encrypt_block v0, w2, x1, x7, w8 + st1 {v0.16b}, [x4] /* return dg */ + cond_yield .Lmacout, x7, x8 b .Lmacloop4x .Lmac1x: - add w22, w22, #4 + add w3, w3, #4 .Lmacloop: - cbz w22, .Lmacout - ld1 {v1.16b}, [x19], #16 /* get next pt block */ + cbz w3, .Lmacout + ld1 {v1.16b}, [x0], #16 /* get next pt block */ eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ - subs w22, w22, #1 - csinv x5, x24, xzr, eq + subs w3, w3, #1 + csinv x5, x6, xzr, eq cbz w5, .Lmacout .Lmacenc: - encrypt_block v0, w21, x20, x7, w8 + encrypt_block v0, w2, x1, x7, w8 b .Lmacloop .Lmacout: - st1 {v0.16b}, [x23] /* return dg */ - frame_pop + st1 {v0.16b}, [x4] /* return dg */ + mov w0, w3 ret - -.Lmacrestart: - ld1 {v0.16b}, [x23] /* get dg */ - enc_prepare w21, x20, x0 - b .Lmacloop4x -AES_ENDPROC(aes_mac_update) +AES_FUNC_END(aes_mac_update) |