From 8b65f34c5821e7361488dc668d21195ea4c9f14d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Dec 2018 22:20:03 -0800 Subject: crypto: x86/chacha20 - refactor to allow varying number of rounds In preparation for adding XChaCha12 support, rename/refactor the x86_64 SIMD implementations of ChaCha20 to support different numbers of rounds. Reviewed-by: Martin Willi Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha_glue.c | 270 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 arch/x86/crypto/chacha_glue.c (limited to 'arch/x86/crypto/chacha_glue.c') diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c new file mode 100644 index 000000000000..35fd02b50d27 --- /dev/null +++ b/arch/x86/crypto/chacha_glue.c @@ -0,0 +1,270 @@ +/* + * x64 SIMD accelerated ChaCha and XChaCha stream ciphers, + * including ChaCha20 (RFC7539) + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define CHACHA_STATE_ALIGN 16 + +asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds); +#ifdef CONFIG_AS_AVX2 +asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +static bool chacha_use_avx2; +#ifdef CONFIG_AS_AVX512 +asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +static bool chacha_use_avx512vl; +#endif +#endif + +static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) +{ + len = min(len, maxblocks * CHACHA_BLOCK_SIZE); + return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; +} + +static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ +#ifdef CONFIG_AS_AVX2 +#ifdef CONFIG_AS_AVX512 + if (chacha_use_avx512vl) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_2block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state[12] += chacha_advance(bytes, 2); + return; + } + } +#endif + if (chacha_use_avx2) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + state[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); + state[12] += chacha_advance(bytes, 4); + return; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); + state[12] += chacha_advance(bytes, 2); + return; + } + } +#endif + while (bytes >= CHACHA_BLOCK_SIZE * 4) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; + state[12] += 4; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + state[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); + state[12]++; + } +} + +static int chacha_simd_stream_xor(struct skcipher_request *req, + struct chacha_ctx *ctx, u8 *iv) +{ + u32 *state, state_buf[16 + 2] __aligned(8); + struct skcipher_walk walk; + int err; + + BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); + state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); + + err = skcipher_walk_virt(&walk, req, true); + + crypto_chacha_init(state, ctx, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + nbytes, ctx->nrounds); + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +static int chacha_simd(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + int err; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + return crypto_chacha_crypt(req); + + kernel_fpu_begin(); + err = chacha_simd_stream_xor(req, ctx, req->iv); + kernel_fpu_end(); + return err; +} + +static int xchacha_simd(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 *state, state_buf[16 + 2] __aligned(8); + u8 real_iv[16]; + int err; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + return crypto_xchacha_crypt(req); + + BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); + state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); + crypto_chacha_init(state, ctx, req->iv); + + kernel_fpu_begin(); + + hchacha_block_ssse3(state, subctx.key, ctx->nrounds); + subctx.nrounds = ctx->nrounds; + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + err = chacha_simd_stream_xor(req, &subctx, real_iv); + + kernel_fpu_end(); + + return err; +} + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha_simd, + .decrypt = chacha_simd, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = xchacha_simd, + .decrypt = xchacha_simd, + }, +}; + +static int __init chacha_simd_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_SSSE3)) + return -ENODEV; + +#ifdef CONFIG_AS_AVX2 + chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); +#ifdef CONFIG_AS_AVX512 + chacha_use_avx512vl = chacha_use_avx2 && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */ +#endif +#endif + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); +} + +static void __exit chacha_simd_mod_fini(void) +{ + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); +} + +module_init(chacha_simd_mod_init); +module_exit(chacha_simd_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi "); +MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-simd"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-simd"); -- cgit