diff options
Diffstat (limited to 'arch/x86/crypto/crc32c-pcl-intel-asm_64.S')
-rw-r--r-- | arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 463 |
1 files changed, 0 insertions, 463 deletions
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S deleted file mode 100644 index bbcff1fb78cb..000000000000 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ /dev/null @@ -1,463 +0,0 @@ -/* - * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) - * - * The white papers on CRC32C calculations with PCLMULQDQ instruction can be - * downloaded from: - * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf - * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf - * - * Copyright (C) 2012 Intel Corporation. - * - * Authors: - * Wajdi Feghali <wajdi.k.feghali@intel.com> - * James Guilford <james.guilford@intel.com> - * David Cote <david.m.cote@intel.com> - * Tim Chen <tim.c.chen@linux.intel.com> - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include <linux/linkage.h> -#include <asm/nospec-branch.h> - -## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction - -.macro LABEL prefix n -.L\prefix\n\(): -.endm - -.macro JMPTBL_ENTRY i -.quad .Lcrc_\i -.endm - -.macro JNC_LESS_THAN j - jnc .Lless_than_\j -.endm - -# Define threshold where buffers are considered "small" and routed to more -# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so -# SMALL_SIZE can be no larger than 255. - -#define SMALL_SIZE 200 - -.if (SMALL_SIZE > 255) -.error "SMALL_ SIZE must be < 256" -.endif - -# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); - -.text -SYM_FUNC_START(crc_pcl) -#define bufp rdi -#define bufp_dw %edi -#define bufp_w %di -#define bufp_b %dil -#define bufptmp %rcx -#define block_0 %rcx -#define block_1 %rdx -#define block_2 %r11 -#define len %rsi -#define len_dw %esi -#define len_w %si -#define len_b %sil -#define crc_init_arg %rdx -#define tmp %rbx -#define crc_init %r8 -#define crc_init_dw %r8d -#define crc1 %r9 -#define crc2 %r10 - - pushq %rbx - pushq %rdi - pushq %rsi - - ## Move crc_init for Linux to a different - mov crc_init_arg, crc_init - - ################################################################ - ## 1) ALIGN: - ################################################################ - - mov %bufp, bufptmp # rdi = *buf - neg %bufp - and $7, %bufp # calculate the unalignment amount of - # the address - je .Lproc_block # Skip if aligned - - ## If len is less than 8 and we're unaligned, we need to jump - ## to special code to avoid reading beyond the end of the buffer - cmp $8, len - jae .Ldo_align - # less_than_8 expects length in upper 3 bits of len_dw - # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] - shl $32-3+1, len_dw - jmp .Lless_than_8_post_shl1 - -.Ldo_align: - #### Calculate CRC of unaligned bytes of the buffer (if any) - movq (bufptmp), tmp # load a quadward from the buffer - add %bufp, bufptmp # align buffer pointer for quadword - # processing - sub %bufp, len # update buffer length -.Lalign_loop: - crc32b %bl, crc_init_dw # compute crc32 of 1-byte - shr $8, tmp # get next byte - dec %bufp - jne .Lalign_loop - -.Lproc_block: - - ################################################################ - ## 2) PROCESS BLOCKS: - ################################################################ - - ## compute num of bytes to be processed - movq len, tmp # save num bytes in tmp - - cmpq $128*24, len - jae .Lfull_block - -.Lcontinue_block: - cmpq $SMALL_SIZE, len - jb .Lsmall - - ## len < 128*24 - movq $2731, %rax # 2731 = ceil(2^16 / 24) - mul len_dw - shrq $16, %rax - - ## eax contains floor(bytes / 24) = num 24-byte chunks to do - - ## process rax 24-byte chunks (128 >= rax >= 0) - - ## compute end address of each block - ## block 0 (base addr + RAX * 8) - ## block 1 (base addr + RAX * 16) - ## block 2 (base addr + RAX * 24) - lea (bufptmp, %rax, 8), block_0 - lea (block_0, %rax, 8), block_1 - lea (block_1, %rax, 8), block_2 - - xor crc1, crc1 - xor crc2, crc2 - - ## branch into array - leaq jump_table(%rip), %bufp - mov (%bufp,%rax,8), %bufp - JMP_NOSPEC bufp - - ################################################################ - ## 2a) PROCESS FULL BLOCKS: - ################################################################ -.Lfull_block: - movl $128,%eax - lea 128*8*2(block_0), block_1 - lea 128*8*3(block_0), block_2 - add $128*8*1, block_0 - - xor crc1,crc1 - xor crc2,crc2 - - # Fall through into top of crc array (crc_128) - - ################################################################ - ## 3) CRC Array: - ################################################################ - - i=128 -.rept 128-1 -.altmacro -LABEL crc_ %i -.noaltmacro - ENDBR - crc32q -i*8(block_0), crc_init - crc32q -i*8(block_1), crc1 - crc32q -i*8(block_2), crc2 - i=(i-1) -.endr - -.altmacro -LABEL crc_ %i -.noaltmacro - ENDBR - crc32q -i*8(block_0), crc_init - crc32q -i*8(block_1), crc1 -# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet - - mov block_2, block_0 - - ################################################################ - ## 4) Combine three results: - ################################################################ - - lea (K_table-8)(%rip), %bufp # first entry is for idx 1 - shlq $3, %rax # rax *= 8 - pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 - leal (%eax,%eax,2), %eax # rax *= 3 (total *24) - subq %rax, tmp # tmp -= rax*24 - - movq crc_init, %xmm1 # CRC for block 1 - pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 - - movq crc1, %xmm2 # CRC for block 2 - pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 - - pxor %xmm2,%xmm1 - movq %xmm1, %rax - xor -i*8(block_2), %rax - mov crc2, crc_init - crc32 %rax, crc_init - - ################################################################ - ## 5) Check for end: - ################################################################ - -LABEL crc_ 0 - ENDBR - mov tmp, len - cmp $128*24, tmp - jae .Lfull_block - cmp $24, tmp - jae .Lcontinue_block - -.Lless_than_24: - shl $32-4, len_dw # less_than_16 expects length - # in upper 4 bits of len_dw - jnc .Lless_than_16 - crc32q (bufptmp), crc_init - crc32q 8(bufptmp), crc_init - jz .Ldo_return - add $16, bufptmp - # len is less than 8 if we got here - # less_than_8 expects length in upper 3 bits of len_dw - # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] - shl $2, len_dw - jmp .Lless_than_8_post_shl1 - - ####################################################################### - ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) - ####################################################################### -.Lsmall: - shl $32-8, len_dw # Prepare len_dw for less_than_256 - j=256 -.rept 5 # j = {256, 128, 64, 32, 16} -.altmacro -LABEL less_than_ %j # less_than_j: Length should be in - # upper lg(j) bits of len_dw - j=(j/2) - shl $1, len_dw # Get next MSB - JNC_LESS_THAN %j -.noaltmacro - i=0 -.rept (j/8) - crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data - i=i+8 -.endr - jz .Ldo_return # Return if remaining length is zero - add $j, bufptmp # Advance buf -.endr - -.Lless_than_8: # Length should be stored in - # upper 3 bits of len_dw - shl $1, len_dw -.Lless_than_8_post_shl1: - jnc .Lless_than_4 - crc32l (bufptmp), crc_init_dw # CRC of 4 bytes - jz .Ldo_return # return if remaining data is zero - add $4, bufptmp -.Lless_than_4: # Length should be stored in - # upper 2 bits of len_dw - shl $1, len_dw - jnc .Lless_than_2 - crc32w (bufptmp), crc_init_dw # CRC of 2 bytes - jz .Ldo_return # return if remaining data is zero - add $2, bufptmp -.Lless_than_2: # Length should be stored in the MSB - # of len_dw - shl $1, len_dw - jnc .Lless_than_1 - crc32b (bufptmp), crc_init_dw # CRC of 1 byte -.Lless_than_1: # Length should be zero -.Ldo_return: - movq crc_init, %rax - popq %rsi - popq %rdi - popq %rbx - RET -SYM_FUNC_END(crc_pcl) - -.section .rodata, "a", @progbits - ################################################################ - ## jump table Table is 129 entries x 2 bytes each - ################################################################ -.align 4 -jump_table: - i=0 -.rept 129 -.altmacro -JMPTBL_ENTRY %i -.noaltmacro - i=i+1 -.endr - - - ################################################################ - ## PCLMULQDQ tables - ## Table is 128 entries x 2 words (8 bytes) each - ################################################################ -.align 8 -K_table: - .long 0x493c7d27, 0x00000001 - .long 0xba4fc28e, 0x493c7d27 - .long 0xddc0152b, 0xf20c0dfe - .long 0x9e4addf8, 0xba4fc28e - .long 0x39d3b296, 0x3da6d0cb - .long 0x0715ce53, 0xddc0152b - .long 0x47db8317, 0x1c291d04 - .long 0x0d3b6092, 0x9e4addf8 - .long 0xc96cfdc0, 0x740eef02 - .long 0x878a92a7, 0x39d3b296 - .long 0xdaece73e, 0x083a6eec - .long 0xab7aff2a, 0x0715ce53 - .long 0x2162d385, 0xc49f4f67 - .long 0x83348832, 0x47db8317 - .long 0x299847d5, 0x2ad91c30 - .long 0xb9e02b86, 0x0d3b6092 - .long 0x18b33a4e, 0x6992cea2 - .long 0xb6dd949b, 0xc96cfdc0 - .long 0x78d9ccb7, 0x7e908048 - .long 0xbac2fd7b, 0x878a92a7 - .long 0xa60ce07b, 0x1b3d8f29 - .long 0xce7f39f4, 0xdaece73e - .long 0x61d82e56, 0xf1d0f55e - .long 0xd270f1a2, 0xab7aff2a - .long 0xc619809d, 0xa87ab8a8 - .long 0x2b3cac5d, 0x2162d385 - .long 0x65863b64, 0x8462d800 - .long 0x1b03397f, 0x83348832 - .long 0xebb883bd, 0x71d111a8 - .long 0xb3e32c28, 0x299847d5 - .long 0x064f7f26, 0xffd852c6 - .long 0xdd7e3b0c, 0xb9e02b86 - .long 0xf285651c, 0xdcb17aa4 - .long 0x10746f3c, 0x18b33a4e - .long 0xc7a68855, 0xf37c5aee - .long 0x271d9844, 0xb6dd949b - .long 0x8e766a0c, 0x6051d5a2 - .long 0x93a5f730, 0x78d9ccb7 - .long 0x6cb08e5c, 0x18b0d4ff - .long 0x6b749fb2, 0xbac2fd7b - .long 0x1393e203, 0x21f3d99c - .long 0xcec3662e, 0xa60ce07b - .long 0x96c515bb, 0x8f158014 - .long 0xe6fc4e6a, 0xce7f39f4 - .long 0x8227bb8a, 0xa00457f7 - .long 0xb0cd4768, 0x61d82e56 - .long 0x39c7ff35, 0x8d6d2c43 - .long 0xd7a4825c, 0xd270f1a2 - .long 0x0ab3844b, 0x00ac29cf - .long 0x0167d312, 0xc619809d - .long 0xf6076544, 0xe9adf796 - .long 0x26f6a60a, 0x2b3cac5d - .long 0xa741c1bf, 0x96638b34 - .long 0x98d8d9cb, 0x65863b64 - .long 0x49c3cc9c, 0xe0e9f351 - .long 0x68bce87a, 0x1b03397f - .long 0x57a3d037, 0x9af01f2d - .long 0x6956fc3b, 0xebb883bd - .long 0x42d98888, 0x2cff42cf - .long 0x3771e98f, 0xb3e32c28 - .long 0xb42ae3d9, 0x88f25a3a - .long 0x2178513a, 0x064f7f26 - .long 0xe0ac139e, 0x4e36f0b0 - .long 0x170076fa, 0xdd7e3b0c - .long 0x444dd413, 0xbd6f81f8 - .long 0x6f345e45, 0xf285651c - .long 0x41d17b64, 0x91c9bd4b - .long 0xff0dba97, 0x10746f3c - .long 0xa2b73df1, 0x885f087b - .long 0xf872e54c, 0xc7a68855 - .long 0x1e41e9fc, 0x4c144932 - .long 0x86d8e4d2, 0x271d9844 - .long 0x651bd98b, 0x52148f02 - .long 0x5bb8f1bc, 0x8e766a0c - .long 0xa90fd27a, 0xa3c6f37a - .long 0xb3af077a, 0x93a5f730 - .long 0x4984d782, 0xd7c0557f - .long 0xca6ef3ac, 0x6cb08e5c - .long 0x234e0b26, 0x63ded06a - .long 0xdd66cbbb, 0x6b749fb2 - .long 0x4597456a, 0x4d56973c - .long 0xe9e28eb4, 0x1393e203 - .long 0x7b3ff57a, 0x9669c9df - .long 0xc9c8b782, 0xcec3662e - .long 0x3f70cc6f, 0xe417f38a - .long 0x93e106a4, 0x96c515bb - .long 0x62ec6c6d, 0x4b9e0f71 - .long 0xd813b325, 0xe6fc4e6a - .long 0x0df04680, 0xd104b8fc - .long 0x2342001e, 0x8227bb8a - .long 0x0a2a8d7e, 0x5b397730 - .long 0x6d9a4957, 0xb0cd4768 - .long 0xe8b6368b, 0xe78eb416 - .long 0xd2c3ed1a, 0x39c7ff35 - .long 0x995a5724, 0x61ff0e01 - .long 0x9ef68d35, 0xd7a4825c - .long 0x0c139b31, 0x8d96551c - .long 0xf2271e60, 0x0ab3844b - .long 0x0b0bf8ca, 0x0bf80dd2 - .long 0x2664fd8b, 0x0167d312 - .long 0xed64812d, 0x8821abed - .long 0x02ee03b2, 0xf6076544 - .long 0x8604ae0f, 0x6a45d2b2 - .long 0x363bd6b3, 0x26f6a60a - .long 0x135c83fd, 0xd8d26619 - .long 0x5fabe670, 0xa741c1bf - .long 0x35ec3279, 0xde87806c - .long 0x00bcf5f6, 0x98d8d9cb - .long 0x8ae00689, 0x14338754 - .long 0x17f27698, 0x49c3cc9c - .long 0x58ca5f00, 0x5bd2011f - .long 0xaa7c7ad5, 0x68bce87a - .long 0xb5cfca28, 0xdd07448e - .long 0xded288f8, 0x57a3d037 - .long 0x59f229bc, 0xdde8f5b9 - .long 0x6d390dec, 0x6956fc3b - .long 0x37170390, 0xa3e3e02c - .long 0x6353c1cc, 0x42d98888 - .long 0xc4584f5c, 0xd73c7bea - .long 0xf48642e9, 0x3771e98f - .long 0x531377e2, 0x80ff0093 - .long 0xdd35bc8d, 0xb42ae3d9 - .long 0xb25b29f2, 0x8fe4c34d - .long 0x9a5ede41, 0x2178513a - .long 0xa563905d, 0xdf99fc11 - .long 0x45cddf4e, 0xe0ac139e - .long 0xacfa3103, 0x6c23e841 - .long 0xa51b6135, 0x170076fa |