/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2017 Linaro Ltd.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

	.macro		__pmull_pre_p64
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
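	// ss1-ss4 below hold byte-rotated copies of the low half of
	// SHASH2, consumed by __pmull_p8_SHASH2, just as sh1-sh4 above
	// hold permuted copies of SHASH for __pmull_p8_SHASH and
	// __pmull2_p8_SHASH.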
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d

	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	b		1f

0:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

1:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)

	KS		.req	v8
	CTR		.req	v9
	INP		.req	v10

	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4]
	ld1		{XL.2d}, [x1]
	ldr		x8, [x5, #8]			// load lower counter

	movi		MASK.16b, #0xe1
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
CPU_LE(	rev		x8, x8		)
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	.if		\enc == 1
	ld1		{KS.16b}, [x7]
	.endif

0:	ld1		{CTR.8b}, [x5]			// load upper counter
	ld1		{INP.16b}, [x3], #16
	rev		x9, x8
	add		x8, x8, #1
	sub		w0, w0, #1
	ins		CTR.d[1], x9			// set lower counter

	.if		\enc == 1
	eor		INP.16b, INP.16b, KS.16b	// encrypt input
	st1		{INP.16b}, [x2], #16
	.endif

	rev64		T1.16b, INP.16b

	cmp		w6, #12
	b.ge		2f				// AES-192/256?
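
	// The loop at 1: below interleaves the AES rounds that encrypt
	// the counter block (round keys v21-v29 plus the final round
	// keys v30/v31, as loaded by load_round_keys) with the PMULL
	// based GHASH multiply and reduction of the current block.
	// AES-192/256 first branch to 2f for their extra initial rounds
	// (v17-v20 for AES-256, v19-v20 for AES-192) and then jump back
	// into the common sequence at 1b.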
1:	enc_round	CTR, v21

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	CTR, v22

	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	CTR, v23

	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b

	enc_round	CTR, v24

	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)

	enc_round	CTR, v25

	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	CTR, v26

	eor		XM.16b, XM.16b, T2.16b
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	CTR, v27

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	enc_round	CTR, v28

	eor		XL.16b, XM.16b, T2.16b

	enc_round	CTR, v29

	ext		T2.16b, XL.16b, XL.16b, #8

	aese		CTR.16b, v30.16b

	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS.16b, CTR.16b, v31.16b

	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP.16b, INP.16b, KS.16b
	st1		{INP.16b}, [x2], #16
	.endif

	cbnz		w0, 0b

CPU_LE(	rev		x8, x8		)
	st1		{XL.2d}, [x1]
	str		x8, [x5, #8]			// store lower counter

	.if		\enc == 1
	st1		{KS.16b}, [x7]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	CTR, v17
	enc_round	CTR, v18
3:	enc_round	CTR, v19
	enc_round	CTR, v20
	b		1b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds, u8 ks[])
	 */
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds)
	 */
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 */
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)
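
	/*
	 * For reference only (not assembled): a minimal bitwise GF(2^128)
	 * multiply in C, following the GHASH definition in NIST SP 800-38D,
	 * which is the operation the PMULL code above accelerates.  The
	 * names below (be128_t, gmul128) are illustrative, not kernel APIs,
	 * and the assembly operates on a hash key that the C glue code has
	 * already preprocessed, so this sketch shows the mathematics rather
	 * than the exact in-memory key representation.
	 *
	 *	#include <stdint.h>
	 *
	 *	typedef struct { uint64_t hi, lo; } be128_t;	// big-endian halves
	 *
	 *	// Z = X * Y in GF(2^128) per SP 800-38D, Algorithm 1, where
	 *	// the leftmost bit of a block is bit 0 and R = 0xe1 || 0^120
	 *	static be128_t gmul128(be128_t x, be128_t y)
	 *	{
	 *		be128_t z = { 0, 0 };
	 *		be128_t v = y;
	 *		int i;
	 *
	 *		for (i = 0; i < 128; i++) {
	 *			uint64_t xi = i < 64 ? (x.hi >> (63 - i)) & 1
	 *					     : (x.lo >> (127 - i)) & 1;
	 *			if (xi) {			// Z ^= V
	 *				z.hi ^= v.hi;
	 *				z.lo ^= v.lo;
	 *			}
	 *			uint64_t lsb = v.lo & 1;	// rightmost bit of V
	 *			v.lo = (v.lo >> 1) | (v.hi << 63);
	 *			v.hi >>= 1;
	 *			if (lsb)			// V = (V >> 1) ^ R
	 *				v.hi ^= 0xe100000000000000ULL;
	 *		}
	 *		return z;
	 *	}
	 */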