Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig  6
-rw-r--r--  arch/x86/boot/Makefile  2
-rw-r--r--  arch/x86/boot/compressed/Makefile  5
-rw-r--r--  arch/x86/boot/compressed/sbat.S  7
-rw-r--r--  arch/x86/boot/compressed/vmlinux.lds.S  8
-rw-r--r--  arch/x86/boot/header.S  31
-rw-r--r--  arch/x86/coco/sev/core.c  89
-rw-r--r--  arch/x86/coco/sev/vc-handle.c  9
-rw-r--r--  arch/x86/configs/i386_defconfig  19
-rw-r--r--  arch/x86/configs/x86_64_defconfig  9
-rw-r--r--  arch/x86/crypto/Kconfig  27
-rw-r--r--  arch/x86/crypto/Makefile  6
-rw-r--r--  arch/x86/crypto/sha1_avx2_x86_64_asm.S  700
-rw-r--r--  arch/x86/crypto/sha1_ni_asm.S  304
-rw-r--r--  arch/x86/crypto/sha1_ssse3_asm.S  554
-rw-r--r--  arch/x86/crypto/sha1_ssse3_glue.c  324
-rw-r--r--  arch/x86/crypto/sha512-avx-asm.S  423
-rw-r--r--  arch/x86/crypto/sha512-avx2-asm.S  750
-rw-r--r--  arch/x86/crypto/sha512-ssse3-asm.S  425
-rw-r--r--  arch/x86/crypto/sha512_ssse3_glue.c  322
-rw-r--r--  arch/x86/entry/calling.h  4
-rw-r--r--  arch/x86/entry/syscalls/syscall_32.tbl  2
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl  2
-rw-r--r--  arch/x86/entry/vdso/Makefile  3
-rw-r--r--  arch/x86/events/intel/uncore.c  7
-rw-r--r--  arch/x86/events/intel/uncore.h  2
-rw-r--r--  arch/x86/events/intel/uncore_discovery.c  89
-rw-r--r--  arch/x86/events/intel/uncore_discovery.h  7
-rw-r--r--  arch/x86/events/intel/uncore_snb.c  79
-rw-r--r--  arch/x86/events/intel/uncore_snbep.c  4
-rw-r--r--  arch/x86/include/asm/acpi.h  4
-rw-r--r--  arch/x86/include/asm/ce4100.h  6
-rw-r--r--  arch/x86/include/asm/cpufeatures.h  3
-rw-r--r--  arch/x86/include/asm/fpu/types.h  49
-rw-r--r--  arch/x86/include/asm/fpu/xstate.h  9
-rw-r--r--  arch/x86/include/asm/init.h  2
-rw-r--r--  arch/x86/include/asm/intel_telemetry.h  37
-rw-r--r--  arch/x86/include/asm/msr-index.h  6
-rw-r--r--  arch/x86/include/asm/pgtable_types.h  3
-rw-r--r--  arch/x86/include/asm/realmode.h  2
-rw-r--r--  arch/x86/include/asm/sev.h  14
-rw-r--r--  arch/x86/include/asm/smp.h  23
-rw-r--r--  arch/x86/include/asm/special_insns.h  29
-rw-r--r--  arch/x86/kernel/apic/vector.c  4
-rw-r--r--  arch/x86/kernel/cpu/amd.c  4
-rw-r--r--  arch/x86/kernel/cpu/bugs.c  465
-rw-r--r--  arch/x86/kernel/cpu/common.c  7
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c  14
-rw-r--r--  arch/x86/kernel/fpu/core.c  53
-rw-r--r--  arch/x86/kernel/fpu/init.c  1
-rw-r--r--  arch/x86/kernel/fpu/xstate.c  40
-rw-r--r--  arch/x86/kernel/itmt.c  23
-rw-r--r--  arch/x86/kernel/ksysfs.c  8
-rw-r--r--  arch/x86/kernel/kvm.c  2
-rw-r--r--  arch/x86/kernel/process.c  20
-rw-r--r--  arch/x86/kernel/process_64.c  4
-rw-r--r--  arch/x86/kernel/ptrace.c  22
-rw-r--r--  arch/x86/kernel/smpboot.c  51
-rw-r--r--  arch/x86/kvm/x86.c  3
-rw-r--r--  arch/x86/lib/.gitignore  4
-rw-r--r--  arch/x86/lib/Makefile  12
-rw-r--r--  arch/x86/lib/cache-smp.c  26
-rw-r--r--  arch/x86/lib/crc-pclmul-consts.h  195
-rw-r--r--  arch/x86/lib/crc-pclmul-template.S  582
-rw-r--r--  arch/x86/lib/crc-pclmul-template.h  76
-rw-r--r--  arch/x86/lib/crc-t10dif.c  40
-rw-r--r--  arch/x86/lib/crc16-msb-pclmul.S  6
-rw-r--r--  arch/x86/lib/crc32-pclmul.S  6
-rw-r--r--  arch/x86/lib/crc32.c  111
-rw-r--r--  arch/x86/lib/crc32c-3way.S  360
-rw-r--r--  arch/x86/lib/crc64-pclmul.S  7
-rw-r--r--  arch/x86/lib/crc64.c  50
-rw-r--r--  arch/x86/lib/crypto/.gitignore  2
-rw-r--r--  arch/x86/lib/crypto/Kconfig  34
-rw-r--r--  arch/x86/lib/crypto/Makefile  20
-rw-r--r--  arch/x86/lib/crypto/blake2s-core.S  252
-rw-r--r--  arch/x86/lib/crypto/blake2s-glue.c  70
-rw-r--r--  arch/x86/lib/crypto/chacha-avx2-x86_64.S  1021
-rw-r--r--  arch/x86/lib/crypto/chacha-avx512vl-x86_64.S  836
-rw-r--r--  arch/x86/lib/crypto/chacha-ssse3-x86_64.S  791
-rw-r--r--  arch/x86/lib/crypto/chacha_glue.c  196
-rw-r--r--  arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl  4253
-rw-r--r--  arch/x86/lib/crypto/poly1305_glue.c  129
-rw-r--r--  arch/x86/lib/crypto/sha256-avx-asm.S  499
-rw-r--r--  arch/x86/lib/crypto/sha256-avx2-asm.S  774
-rw-r--r--  arch/x86/lib/crypto/sha256-ni-asm.S  196
-rw-r--r--  arch/x86/lib/crypto/sha256-ssse3-asm.S  511
-rw-r--r--  arch/x86/lib/crypto/sha256.c  80
-rw-r--r--  arch/x86/mm/extable.c  5
-rw-r--r--  arch/x86/mm/init_64.c  2
-rw-r--r--  arch/x86/mm/pti.c  4
-rw-r--r--  arch/x86/platform/ce4100/ce4100.c  95
-rw-r--r--  arch/x86/platform/efi/efi_64.c  4
-rw-r--r--  arch/x86/purgatory/Makefile  2
-rw-r--r--  arch/x86/purgatory/purgatory.c  2
-rw-r--r--  arch/x86/tools/insn_decoder_test.c  2
-rw-r--r--  arch/x86/tools/insn_sanity.c  4
-rw-r--r--  arch/x86/um/asm/syscall.h  2
-rw-r--r--  arch/x86/um/ptrace.c  10
-rw-r--r--  arch/x86/um/shared/sysdep/ptrace.h  12
-rw-r--r--  arch/x86/um/shared/sysdep/syscalls.h  6
-rw-r--r--  arch/x86/um/shared/sysdep/syscalls_32.h  14
-rw-r--r--  arch/x86/um/shared/sysdep/syscalls_64.h  28
-rw-r--r--  arch/x86/um/tls_32.c  2
104 files changed, 947 insertions, 15513 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8bed9030ad47..f23919a7db40 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -75,13 +75,11 @@ config X86
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE)
select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
+ select ARCH_HAS_CPU_ATTACK_VECTORS if CPU_MITIGATIONS
select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
select ARCH_HAS_CPU_FINALIZE_INIT
select ARCH_HAS_CPU_PASID if IOMMU_SVA
- select ARCH_HAS_CRC32
- select ARCH_HAS_CRC64 if X86_64
- select ARCH_HAS_CRC_T10DIF
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
@@ -204,13 +202,13 @@ config X86
select HAVE_ARCH_KFENCE
select HAVE_ARCH_KMSAN if X86_64
select HAVE_ARCH_KGDB
+ select HAVE_ARCH_KSTACK_ERASE
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT
select HAVE_ARCH_PREL32_RELOCATIONS
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
- select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 640fcac3af74..3f9fb3698d66 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -71,7 +71,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
-sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|_e\?sbat\|z_.*\)$$/\#define ZO_\2 0x\1/p'
quiet_cmd_zoffset = ZOFFSET $@
cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f4f7b22d8113..3a38fdcdb9bd 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -106,6 +106,11 @@ vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o
vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o
vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
vmlinux-libs-$(CONFIG_X86_64) += $(objtree)/arch/x86/boot/startup/lib.a
+vmlinux-objs-$(CONFIG_EFI_SBAT) += $(obj)/sbat.o
+
+ifdef CONFIG_EFI_SBAT
+$(obj)/sbat.o: $(CONFIG_EFI_SBAT_FILE)
+endif
$(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE
$(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/sbat.S b/arch/x86/boot/compressed/sbat.S
new file mode 100644
index 000000000000..838f70a997dd
--- /dev/null
+++ b/arch/x86/boot/compressed/sbat.S
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Embed SBAT data in the kernel.
+ */
+ .pushsection ".sbat", "a", @progbits
+ .incbin CONFIG_EFI_SBAT_FILE
+ .popsection
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 3b2bc61c9408..587ce3e7c504 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -43,6 +43,14 @@ SECTIONS
*(.rodata.*)
_erodata = . ;
}
+#ifdef CONFIG_EFI_SBAT
+ .sbat : ALIGN(0x1000) {
+ _sbat = . ;
+ *(.sbat)
+ _esbat = ALIGN(0x1000);
+ . = _esbat;
+ }
+#endif
.data : ALIGN(0x1000) {
_data = . ;
*(.data)
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index e1f4fd5bc8ee..9bea5a1e2c52 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -179,15 +179,11 @@ pecompat_fstart:
#else
.set pecompat_fstart, setup_size
#endif
- .ascii ".text"
- .byte 0
- .byte 0
- .byte 0
- .long ZO__data
- .long setup_size
- .long ZO__data # Size of initialized data
- # on disk
- .long setup_size
+ .ascii ".text\0\0\0"
+ .long textsize # VirtualSize
+ .long setup_size # VirtualAddress
+ .long textsize # SizeOfRawData
+ .long setup_size # PointerToRawData
.long 0 # PointerToRelocations
.long 0 # PointerToLineNumbers
.word 0 # NumberOfRelocations
@@ -196,6 +192,23 @@ pecompat_fstart:
IMAGE_SCN_MEM_READ | \
IMAGE_SCN_MEM_EXECUTE # Characteristics
+#ifdef CONFIG_EFI_SBAT
+ .ascii ".sbat\0\0\0"
+ .long ZO__esbat - ZO__sbat # VirtualSize
+ .long setup_size + ZO__sbat # VirtualAddress
+ .long ZO__esbat - ZO__sbat # SizeOfRawData
+ .long setup_size + ZO__sbat # PointerToRawData
+
+ .long 0, 0, 0
+ .long IMAGE_SCN_CNT_INITIALIZED_DATA | \
+ IMAGE_SCN_MEM_READ | \
+ IMAGE_SCN_MEM_DISCARDABLE # Characteristics
+
+ .set textsize, ZO__sbat
+#else
+ .set textsize, ZO__data
+#endif
+
.ascii ".data\0\0\0"
.long ZO__end - ZO__data # VirtualSize
.long setup_size + ZO__data # VirtualAddress
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 7543a8b52c67..fc59ce78c477 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -1045,11 +1045,13 @@ int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
* This is needed by the OVMF UEFI firmware which will use whatever it finds in
* the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu
* runtime GHCBs used by the kernel are also mapped in the EFI page-table.
+ *
+ * When running under SVSM the CA page is needed too, so map it as well.
*/
-int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
+int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd)
{
+ unsigned long address, pflags, pflags_enc;
struct sev_es_runtime_data *data;
- unsigned long address, pflags;
int cpu;
u64 pfn;
@@ -1057,6 +1059,7 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
return 0;
pflags = _PAGE_NX | _PAGE_RW;
+ pflags_enc = cc_mkenc(pflags);
for_each_possible_cpu(cpu) {
data = per_cpu(runtime_data, cpu);
@@ -1066,6 +1069,16 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags))
return 1;
+
+ if (snp_vmpl) {
+ address = per_cpu(svsm_caa_pa, cpu);
+ if (!address)
+ return 1;
+
+ pfn = address >> PAGE_SHIFT;
+ if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags_enc))
+ return 1;
+ }
}
return 0;
@@ -1389,16 +1402,16 @@ int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call,
}
EXPORT_SYMBOL_GPL(snp_issue_svsm_attest_req);
-static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input,
- struct snp_guest_request_ioctl *rio)
+static int snp_issue_guest_request(struct snp_guest_req *req)
{
+ struct snp_req_data *input = &req->input;
struct ghcb_state state;
struct es_em_ctxt ctxt;
unsigned long flags;
struct ghcb *ghcb;
int ret;
- rio->exitinfo2 = SEV_RET_NO_FW_CALL;
+ req->exitinfo2 = SEV_RET_NO_FW_CALL;
/*
* __sev_get_ghcb() needs to run with IRQs disabled because it is using
@@ -1423,8 +1436,8 @@ static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_dat
if (ret)
goto e_put;
- rio->exitinfo2 = ghcb->save.sw_exit_info_2;
- switch (rio->exitinfo2) {
+ req->exitinfo2 = ghcb->save.sw_exit_info_2;
+ switch (req->exitinfo2) {
case 0:
break;
@@ -1919,8 +1932,7 @@ static int enc_payload(struct snp_msg_desc *mdesc, u64 seqno, struct snp_guest_r
return 0;
}
-static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
- struct snp_guest_request_ioctl *rio)
+static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req)
{
unsigned long req_start = jiffies;
unsigned int override_npages = 0;
@@ -1934,7 +1946,7 @@ retry_request:
* sequence number must be incremented or the VMPCK must be deleted to
* prevent reuse of the IV.
*/
- rc = snp_issue_guest_request(req, &req->input, rio);
+ rc = snp_issue_guest_request(req);
switch (rc) {
case -ENOSPC:
/*
@@ -1987,7 +1999,7 @@ retry_request:
snp_inc_msg_seqno(mdesc);
if (override_err) {
- rio->exitinfo2 = override_err;
+ req->exitinfo2 = override_err;
/*
* If an extended guest request was issued and the supplied certificate
@@ -2005,12 +2017,20 @@ retry_request:
return rc;
}
-int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
- struct snp_guest_request_ioctl *rio)
+int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req)
{
u64 seqno;
int rc;
+ /*
+ * enc_payload() calls aesgcm_encrypt(), which can potentially offload to HW.
+ * The offload's DMA SG list of data to encrypt has to be in linear mapping.
+ */
+ if (!virt_addr_valid(req->req_buf) || !virt_addr_valid(req->resp_buf)) {
+ pr_warn("AES-GSM buffers must be in linear mapping");
+ return -EINVAL;
+ }
+
guard(mutex)(&snp_cmd_mutex);
/* Check if the VMPCK is not empty */
@@ -2043,14 +2063,14 @@ int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req
req->input.resp_gpa = __pa(mdesc->response);
req->input.data_gpa = req->certs_data ? __pa(req->certs_data) : 0;
- rc = __handle_guest_request(mdesc, req, rio);
+ rc = __handle_guest_request(mdesc, req);
if (rc) {
if (rc == -EIO &&
- rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
+ req->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
return rc;
pr_alert("Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n",
- rc, rio->exitinfo2);
+ rc, req->exitinfo2);
snp_disable_vmpck(mdesc);
return rc;
@@ -2069,11 +2089,10 @@ EXPORT_SYMBOL_GPL(snp_send_guest_request);
static int __init snp_get_tsc_info(void)
{
- struct snp_guest_request_ioctl *rio;
struct snp_tsc_info_resp *tsc_resp;
struct snp_tsc_info_req *tsc_req;
struct snp_msg_desc *mdesc;
- struct snp_guest_req *req;
+ struct snp_guest_req req = {};
int rc = -ENOMEM;
tsc_req = kzalloc(sizeof(*tsc_req), GFP_KERNEL);
@@ -2089,32 +2108,24 @@ static int __init snp_get_tsc_info(void)
if (!tsc_resp)
goto e_free_tsc_req;
- req = kzalloc(sizeof(*req), GFP_KERNEL);
- if (!req)
- goto e_free_tsc_resp;
-
- rio = kzalloc(sizeof(*rio), GFP_KERNEL);
- if (!rio)
- goto e_free_req;
-
mdesc = snp_msg_alloc();
if (IS_ERR_OR_NULL(mdesc))
- goto e_free_rio;
+ goto e_free_tsc_resp;
rc = snp_msg_init(mdesc, snp_vmpl);
if (rc)
goto e_free_mdesc;
- req->msg_version = MSG_HDR_VER;
- req->msg_type = SNP_MSG_TSC_INFO_REQ;
- req->vmpck_id = snp_vmpl;
- req->req_buf = tsc_req;
- req->req_sz = sizeof(*tsc_req);
- req->resp_buf = (void *)tsc_resp;
- req->resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN;
- req->exit_code = SVM_VMGEXIT_GUEST_REQUEST;
+ req.msg_version = MSG_HDR_VER;
+ req.msg_type = SNP_MSG_TSC_INFO_REQ;
+ req.vmpck_id = snp_vmpl;
+ req.req_buf = tsc_req;
+ req.req_sz = sizeof(*tsc_req);
+ req.resp_buf = (void *)tsc_resp;
+ req.resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN;
+ req.exit_code = SVM_VMGEXIT_GUEST_REQUEST;
- rc = snp_send_guest_request(mdesc, req, rio);
+ rc = snp_send_guest_request(mdesc, &req);
if (rc)
goto e_request;
@@ -2135,11 +2146,7 @@ e_request:
memzero_explicit(tsc_resp, sizeof(*tsc_resp) + AUTHTAG_LEN);
e_free_mdesc:
snp_msg_free(mdesc);
-e_free_rio:
- kfree(rio);
-e_free_req:
- kfree(req);
- e_free_tsc_resp:
+e_free_tsc_resp:
kfree(tsc_resp);
e_free_tsc_req:
kfree(tsc_req);
diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c
index 0989d98da130..faf1fce89ed4 100644
--- a/arch/x86/coco/sev/vc-handle.c
+++ b/arch/x86/coco/sev/vc-handle.c
@@ -17,6 +17,7 @@
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/psp-sev.h>
+#include <linux/efi.h>
#include <uapi/linux/sev-guest.h>
#include <asm/init.h>
@@ -178,9 +179,15 @@ static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
return ES_OK;
}
+/*
+ * User instruction decoding is also required for the EFI runtime. Even though
+ * the EFI runtime is running in kernel mode, it uses special EFI virtual
+ * address mappings that require the use of efi_mm to properly address and
+ * decode.
+ */
static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
{
- if (user_mode(ctxt->regs))
+ if (user_mode(ctxt->regs) || mm_is_efi(current->active_mm))
return __vc_decode_user_insn(ctxt);
else
return __vc_decode_kern_insn(ctxt);
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 7cd2f395f301..79fa38ca954d 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -27,10 +27,12 @@ CONFIG_CGROUP_DEBUG=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_KALLSYMS_ALL=y
CONFIG_PROFILING=y
+CONFIG_KEXEC=y
+# Do not remove this as it results in non-bootable kernels
+# CONFIG_64BIT is not set
CONFIG_SMP=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
-CONFIG_NR_CPUS=8
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
@@ -39,9 +41,6 @@ CONFIG_X86_CHECK_BIOS_CORRUPTION=y
CONFIG_EFI=y
CONFIG_EFI_STUB=y
CONFIG_HZ_1000=y
-CONFIG_KEXEC=y
-CONFIG_CRASH_DUMP=y
-# CONFIG_MITIGATION_RETHUNK is not set
CONFIG_HIBERNATION=y
CONFIG_PM_DEBUG=y
CONFIG_PM_TRACE_RTC=y
@@ -52,7 +51,6 @@ CONFIG_CPU_FREQ_GOV_ONDEMAND=y
CONFIG_X86_ACPI_CPUFREQ=y
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
-CONFIG_COMPAT_32BIT_TIME=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
@@ -63,9 +61,7 @@ CONFIG_BINFMT_MISC=y
# CONFIG_COMPAT_BRK is not set
CONFIG_NET=y
CONFIG_PACKET=y
-CONFIG_UNIX=y
CONFIG_XFRM_USER=y
-CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_MULTIPLE_TABLES=y
@@ -134,7 +130,6 @@ CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_DEBUG_DEVRES=y
CONFIG_CONNECTOR=y
-CONFIG_EFI_CAPSULE_LOADER=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_VIRTIO_BLK=y
CONFIG_BLK_DEV_SD=y
@@ -210,7 +205,6 @@ CONFIG_SND_HDA_INTEL=y
CONFIG_SND_HDA_HWDEP=y
CONFIG_HIDRAW=y
CONFIG_HID_GYRATION=y
-CONFIG_LOGITECH_FF=y
CONFIG_HID_NTRIG=y
CONFIG_HID_PANTHERLORD=y
CONFIG_PANTHERLORD_FF=y
@@ -241,7 +235,6 @@ CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y
CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
-# CONFIG_PRINT_QUOTA_WARNING is not set
CONFIG_QFMT_V2=y
CONFIG_AUTOFS_FS=y
CONFIG_ISO9660_FS=y
@@ -266,19 +259,13 @@ CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_KERNEL=y
-CONFIG_FRAME_WARN=1024
CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_WX=y
CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_SCHED_DEBUG is not set
CONFIG_SCHEDSTATS=y
CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_BOOT_PARAMS=y
-CONFIG_UNWINDER_FRAME_POINTER=y
CONFIG_DEBUG_ENTRY=y
-# CONFIG_64BIT is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 61e25f6209ed..7d7310cdf8b0 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -27,6 +27,7 @@ CONFIG_CGROUP_DEBUG=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_KALLSYMS_ALL=y
CONFIG_PROFILING=y
+CONFIG_KEXEC=y
CONFIG_SMP=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
@@ -40,8 +41,6 @@ CONFIG_EFI=y
CONFIG_EFI_STUB=y
CONFIG_EFI_MIXED=y
CONFIG_HZ_1000=y
-CONFIG_KEXEC=y
-CONFIG_CRASH_DUMP=y
CONFIG_HIBERNATION=y
CONFIG_PM_DEBUG=y
CONFIG_PM_TRACE_RTC=y
@@ -63,9 +62,7 @@ CONFIG_BINFMT_MISC=y
# CONFIG_COMPAT_BRK is not set
CONFIG_NET=y
CONFIG_PACKET=y
-CONFIG_UNIX=y
CONFIG_XFRM_USER=y
-CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_MULTIPLE_TABLES=y
@@ -205,7 +202,6 @@ CONFIG_SND_HDA_INTEL=y
CONFIG_SND_HDA_HWDEP=y
CONFIG_HIDRAW=y
CONFIG_HID_GYRATION=y
-CONFIG_LOGITECH_FF=y
CONFIG_HID_NTRIG=y
CONFIG_HID_PANTHERLORD=y
CONFIG_PANTHERLORD_FF=y
@@ -239,7 +235,6 @@ CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y
CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
-# CONFIG_PRINT_QUOTA_WARNING is not set
CONFIG_QFMT_V2=y
CONFIG_AUTOFS_FS=y
CONFIG_ISO9660_FS=y
@@ -264,13 +259,11 @@ CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_KERNEL=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_WX=y
CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_SCHED_DEBUG is not set
CONFIG_SCHEDSTATS=y
CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 56cfdc79e2c6..94016c60561e 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -376,33 +376,6 @@ config CRYPTO_POLYVAL_CLMUL_NI
Architecture: x86_64 using:
- CLMUL-NI (carry-less multiplication new instructions)
-config CRYPTO_SHA1_SSSE3
- tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)"
- depends on 64BIT
- select CRYPTO_SHA1
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX (Advanced Vector Extensions)
- - AVX2 (Advanced Vector Extensions 2)
- - SHA-NI (SHA Extensions New Instructions)
-
-config CRYPTO_SHA512_SSSE3
- tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)"
- depends on 64BIT
- select CRYPTO_SHA512
- select CRYPTO_HASH
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX (Advanced Vector Extensions)
- - AVX2 (Advanced Vector Extensions 2)
-
config CRYPTO_SM3_AVX_X86_64
tristate "Hash functions: SM3 (AVX)"
depends on 64BIT
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index aa289a9e0153..d402963d6b57 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -51,12 +51,6 @@ ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy)
aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o
endif
-obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
-sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o
-
-obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
-sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
-
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
deleted file mode 100644
index 4b49bdc95265..000000000000
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ /dev/null
@@ -1,700 +0,0 @@
-/*
- * Implement fast SHA-1 with AVX2 instructions. (x86_64)
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Ilya Albrekht <ilya.albrekht@intel.com>
- * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
- * Ronen Zohar <ronen.zohar@intel.com>
- * Chandramouli Narayanan <mouli@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/*
- * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
- *
- *This implementation is based on the previous SSSE3 release:
- *Visit http://software.intel.com/en-us/articles/
- *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
- *
- *Updates 20-byte SHA-1 record at start of 'state', from 'input', for
- *even number of 'blocks' consecutive 64-byte blocks.
- *
- *extern "C" void sha1_transform_avx2(
- * struct sha1_state *state, const u8* input, int blocks );
- */
-
-#include <linux/linkage.h>
-
-#define CTX %rdi /* arg1 */
-#define BUF %rsi /* arg2 */
-#define CNT %rdx /* arg3 */
-
-#define REG_A %ecx
-#define REG_B %esi
-#define REG_C %edi
-#define REG_D %eax
-#define REG_E %edx
-#define REG_TB %ebx
-#define REG_TA %r12d
-#define REG_RA %rcx
-#define REG_RB %rsi
-#define REG_RC %rdi
-#define REG_RD %rax
-#define REG_RE %rdx
-#define REG_RTA %r12
-#define REG_RTB %rbx
-#define REG_T1 %r11d
-#define xmm_mov vmovups
-#define avx2_zeroupper vzeroupper
-#define RND_F1 1
-#define RND_F2 2
-#define RND_F3 3
-
-.macro REGALLOC
- .set A, REG_A
- .set B, REG_B
- .set C, REG_C
- .set D, REG_D
- .set E, REG_E
- .set TB, REG_TB
- .set TA, REG_TA
-
- .set RA, REG_RA
- .set RB, REG_RB
- .set RC, REG_RC
- .set RD, REG_RD
- .set RE, REG_RE
-
- .set RTA, REG_RTA
- .set RTB, REG_RTB
-
- .set T1, REG_T1
-.endm
-
-#define HASH_PTR %r9
-#define BLOCKS_CTR %r8
-#define BUFFER_PTR %r10
-#define BUFFER_PTR2 %r13
-
-#define PRECALC_BUF %r14
-#define WK_BUF %r15
-
-#define W_TMP %xmm0
-#define WY_TMP %ymm0
-#define WY_TMP2 %ymm9
-
-# AVX2 variables
-#define WY0 %ymm3
-#define WY4 %ymm5
-#define WY08 %ymm7
-#define WY12 %ymm8
-#define WY16 %ymm12
-#define WY20 %ymm13
-#define WY24 %ymm14
-#define WY28 %ymm15
-
-#define YMM_SHUFB_BSWAP %ymm10
-
-/*
- * Keep 2 iterations precalculated at a time:
- * - 80 DWORDs per iteration * 2
- */
-#define W_SIZE (80*2*2 +16)
-
-#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
-#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
-
-
-.macro UPDATE_HASH hash, val
- add \hash, \val
- mov \val, \hash
-.endm
-
-.macro PRECALC_RESET_WY
- .set WY_00, WY0
- .set WY_04, WY4
- .set WY_08, WY08
- .set WY_12, WY12
- .set WY_16, WY16
- .set WY_20, WY20
- .set WY_24, WY24
- .set WY_28, WY28
- .set WY_32, WY_00
-.endm
-
-.macro PRECALC_ROTATE_WY
- /* Rotate macros */
- .set WY_32, WY_28
- .set WY_28, WY_24
- .set WY_24, WY_20
- .set WY_20, WY_16
- .set WY_16, WY_12
- .set WY_12, WY_08
- .set WY_08, WY_04
- .set WY_04, WY_00
- .set WY_00, WY_32
-
- /* Define register aliases */
- .set WY, WY_00
- .set WY_minus_04, WY_04
- .set WY_minus_08, WY_08
- .set WY_minus_12, WY_12
- .set WY_minus_16, WY_16
- .set WY_minus_20, WY_20
- .set WY_minus_24, WY_24
- .set WY_minus_28, WY_28
- .set WY_minus_32, WY
-.endm
-
-.macro PRECALC_00_15
- .if (i == 0) # Initialize and rotate registers
- PRECALC_RESET_WY
- PRECALC_ROTATE_WY
- .endif
-
- /* message scheduling pre-compute for rounds 0-15 */
- .if ((i & 7) == 0)
- /*
- * blended AVX2 and ALU instruction scheduling
- * 1 vector iteration per 8 rounds
- */
- vmovdqu (i * 2)(BUFFER_PTR), W_TMP
- .elseif ((i & 7) == 1)
- vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
- WY_TMP, WY_TMP
- .elseif ((i & 7) == 2)
- vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
- .elseif ((i & 7) == 4)
- vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
- .elseif ((i & 7) == 7)
- vmovdqu WY_TMP, PRECALC_WK(i&~7)
-
- PRECALC_ROTATE_WY
- .endif
-.endm
-
-.macro PRECALC_16_31
- /*
- * message scheduling pre-compute for rounds 16-31
- * calculating last 32 w[i] values in 8 XMM registers
- * pre-calculate K+w[i] values and store to mem
- * for later load by ALU add instruction
- *
- * "brute force" vectorization for rounds 16-31 only
- * due to w[i]->w[i-3] dependency
- */
- .if ((i & 7) == 0)
- /*
- * blended AVX2 and ALU instruction scheduling
- * 1 vector iteration per 8 rounds
- */
- /* w[i-14] */
- vpalignr $8, WY_minus_16, WY_minus_12, WY
- vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
- .elseif ((i & 7) == 1)
- vpxor WY_minus_08, WY, WY
- vpxor WY_minus_16, WY_TMP, WY_TMP
- .elseif ((i & 7) == 2)
- vpxor WY_TMP, WY, WY
- vpslldq $12, WY, WY_TMP2
- .elseif ((i & 7) == 3)
- vpslld $1, WY, WY_TMP
- vpsrld $31, WY, WY
- .elseif ((i & 7) == 4)
- vpor WY, WY_TMP, WY_TMP
- vpslld $2, WY_TMP2, WY
- .elseif ((i & 7) == 5)
- vpsrld $30, WY_TMP2, WY_TMP2
- vpxor WY, WY_TMP, WY_TMP
- .elseif ((i & 7) == 7)
- vpxor WY_TMP2, WY_TMP, WY
- vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
- vmovdqu WY_TMP, PRECALC_WK(i&~7)
-
- PRECALC_ROTATE_WY
- .endif
-.endm
-
-.macro PRECALC_32_79
- /*
- * in SHA-1 specification:
- * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
- * instead we do equal:
- * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
- * allows more efficient vectorization
- * since w[i]=>w[i-3] dependency is broken
- */
-
- .if ((i & 7) == 0)
- /*
- * blended AVX2 and ALU instruction scheduling
- * 1 vector iteration per 8 rounds
- */
- vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
- .elseif ((i & 7) == 1)
- /* W is W_minus_32 before xor */
- vpxor WY_minus_28, WY, WY
- .elseif ((i & 7) == 2)
- vpxor WY_minus_16, WY_TMP, WY_TMP
- .elseif ((i & 7) == 3)
- vpxor WY_TMP, WY, WY
- .elseif ((i & 7) == 4)
- vpslld $2, WY, WY_TMP
- .elseif ((i & 7) == 5)
- vpsrld $30, WY, WY
- vpor WY, WY_TMP, WY
- .elseif ((i & 7) == 7)
- vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
- vmovdqu WY_TMP, PRECALC_WK(i&~7)
-
- PRECALC_ROTATE_WY
- .endif
-.endm
-
-.macro PRECALC r, s
- .set i, \r
-
- .if (i < 40)
- .set K_XMM, 32*0
- .elseif (i < 80)
- .set K_XMM, 32*1
- .elseif (i < 120)
- .set K_XMM, 32*2
- .else
- .set K_XMM, 32*3
- .endif
-
- .if (i<32)
- PRECALC_00_15 \s
- .elseif (i<64)
- PRECALC_16_31 \s
- .elseif (i < 160)
- PRECALC_32_79 \s
- .endif
-.endm
-
-.macro ROTATE_STATE
- .set T_REG, E
- .set E, D
- .set D, C
- .set C, B
- .set B, TB
- .set TB, A
- .set A, T_REG
-
- .set T_REG, RE
- .set RE, RD
- .set RD, RC
- .set RC, RB
- .set RB, RTB
- .set RTB, RA
- .set RA, T_REG
-.endm
-
-/* Macro relies on saved ROUND_Fx */
-
-.macro RND_FUN f, r
- .if (\f == RND_F1)
- ROUND_F1 \r
- .elseif (\f == RND_F2)
- ROUND_F2 \r
- .elseif (\f == RND_F3)
- ROUND_F3 \r
- .endif
-.endm
-
-.macro RR r
- .set round_id, (\r % 80)
-
- .if (round_id == 0) /* Precalculate F for first round */
- .set ROUND_FUNC, RND_F1
- mov B, TB
-
- rorx $(32-30), B, B /* b>>>2 */
- andn D, TB, T1
- and C, TB
- xor T1, TB
- .endif
-
- RND_FUN ROUND_FUNC, \r
- ROTATE_STATE
-
- .if (round_id == 18)
- .set ROUND_FUNC, RND_F2
- .elseif (round_id == 38)
- .set ROUND_FUNC, RND_F3
- .elseif (round_id == 58)
- .set ROUND_FUNC, RND_F2
- .endif
-
- .set round_id, ( (\r+1) % 80)
-
- RND_FUN ROUND_FUNC, (\r+1)
- ROTATE_STATE
-.endm
-
-.macro ROUND_F1 r
- add WK(\r), E
-
- andn C, A, T1 /* ~b&d */
- lea (RE,RTB), E /* Add F from the previous round */
-
- rorx $(32-5), A, TA /* T2 = A >>> 5 */
- rorx $(32-30),A, TB /* b>>>2 for next round */
-
- PRECALC (\r) /* msg scheduling for next 2 blocks */
-
- /*
- * Calculate F for the next round
- * (b & c) ^ andn[b, d]
- */
- and B, A /* b&c */
- xor T1, A /* F1 = (b&c) ^ (~b&d) */
-
- lea (RE,RTA), E /* E += A >>> 5 */
-.endm
-
-.macro ROUND_F2 r
- add WK(\r), E
- lea (RE,RTB), E /* Add F from the previous round */
-
- /* Calculate F for the next round */
- rorx $(32-5), A, TA /* T2 = A >>> 5 */
- .if ((round_id) < 79)
- rorx $(32-30), A, TB /* b>>>2 for next round */
- .endif
- PRECALC (\r) /* msg scheduling for next 2 blocks */
-
- .if ((round_id) < 79)
- xor B, A
- .endif
-
- add TA, E /* E += A >>> 5 */
-
- .if ((round_id) < 79)
- xor C, A
- .endif
-.endm
-
-.macro ROUND_F3 r
- add WK(\r), E
- PRECALC (\r) /* msg scheduling for next 2 blocks */
-
- lea (RE,RTB), E /* Add F from the previous round */
-
- mov B, T1
- or A, T1
-
- rorx $(32-5), A, TA /* T2 = A >>> 5 */
- rorx $(32-30), A, TB /* b>>>2 for next round */
-
- /* Calculate F for the next round
- * (b and c) or (d and (b or c))
- */
- and C, T1
- and B, A
- or T1, A
-
- add TA, E /* E += A >>> 5 */
-
-.endm
-
-/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
- * %1 + %2 >= %3 ? %4 : 0
- */
-.macro ADD_IF_GE a, b, c, d
- mov \a, RTA
- add $\d, RTA
- cmp $\c, \b
- cmovge RTA, \a
-.endm
-
-/*
- * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
- */
-.macro SHA1_PIPELINED_MAIN_BODY
-
- REGALLOC
-
- mov (HASH_PTR), A
- mov 4(HASH_PTR), B
- mov 8(HASH_PTR), C
- mov 12(HASH_PTR), D
- mov 16(HASH_PTR), E
-
- mov %rsp, PRECALC_BUF
- lea (2*4*80+32)(%rsp), WK_BUF
-
- # Precalc WK for first 2 blocks
- ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
- .set i, 0
- .rept 160
- PRECALC i
- .set i, i + 1
- .endr
-
- /* Go to next block if needed */
- ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
- ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
- xchg WK_BUF, PRECALC_BUF
-
- .align 32
-.L_loop:
- /*
- * code loops through more than one block
- * we use K_BASE value as a signal of a last block,
- * it is set below by: cmovae BUFFER_PTR, K_BASE
- */
- test BLOCKS_CTR, BLOCKS_CTR
- jnz .L_begin
- .align 32
- jmp .L_end
- .align 32
-.L_begin:
-
- /*
- * Do first block
- * rounds: 0,2,4,6,8
- */
- .set j, 0
- .rept 5
- RR j
- .set j, j+2
- .endr
-
- /*
- * rounds:
- * 10,12,14,16,18
- * 20,22,24,26,28
- * 30,32,34,36,38
- * 40,42,44,46,48
- * 50,52,54,56,58
- */
- .rept 25
- RR j
- .set j, j+2
- .endr
-
- /* Update Counter */
- sub $1, BLOCKS_CTR
- /* Move to the next block only if needed*/
- ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
- /*
- * rounds
- * 60,62,64,66,68
- * 70,72,74,76,78
- */
- .rept 10
- RR j
- .set j, j+2
- .endr
-
- UPDATE_HASH (HASH_PTR), A
- UPDATE_HASH 4(HASH_PTR), TB
- UPDATE_HASH 8(HASH_PTR), C
- UPDATE_HASH 12(HASH_PTR), D
- UPDATE_HASH 16(HASH_PTR), E
-
- test BLOCKS_CTR, BLOCKS_CTR
- jz .L_loop
-
- mov TB, B
-
- /* Process second block */
- /*
- * rounds
- * 0+80, 2+80, 4+80, 6+80, 8+80
- * 10+80,12+80,14+80,16+80,18+80
- */
-
- .set j, 0
- .rept 10
- RR j+80
- .set j, j+2
- .endr
-
- /*
- * rounds
- * 20+80,22+80,24+80,26+80,28+80
- * 30+80,32+80,34+80,36+80,38+80
- */
- .rept 10
- RR j+80
- .set j, j+2
- .endr
-
- /*
- * rounds
- * 40+80,42+80,44+80,46+80,48+80
- * 50+80,52+80,54+80,56+80,58+80
- */
- .rept 10
- RR j+80
- .set j, j+2
- .endr
-
- /* update counter */
- sub $1, BLOCKS_CTR
- /* Move to the next block only if needed*/
- ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
-
- /*
- * rounds
- * 60+80,62+80,64+80,66+80,68+80
- * 70+80,72+80,74+80,76+80,78+80
- */
- .rept 10
- RR j+80
- .set j, j+2
- .endr
-
- UPDATE_HASH (HASH_PTR), A
- UPDATE_HASH 4(HASH_PTR), TB
- UPDATE_HASH 8(HASH_PTR), C
- UPDATE_HASH 12(HASH_PTR), D
- UPDATE_HASH 16(HASH_PTR), E
-
- /* Reset state for AVX2 reg permutation */
- mov A, TA
- mov TB, A
- mov C, TB
- mov E, C
- mov D, B
- mov TA, D
-
- REGALLOC
-
- xchg WK_BUF, PRECALC_BUF
-
- jmp .L_loop
-
- .align 32
-.L_end:
-
-.endm
-/*
- * macro implements SHA-1 function's body for several 64-byte blocks
- * param: function's name
- */
-.macro SHA1_VECTOR_ASM name
- SYM_FUNC_START(\name)
-
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- RESERVE_STACK = (W_SIZE*4 + 8+24)
-
- /* Align stack */
- push %rbp
- mov %rsp, %rbp
- and $~(0x20-1), %rsp
- sub $RESERVE_STACK, %rsp
-
- avx2_zeroupper
-
- /* Setup initial values */
- mov CTX, HASH_PTR
- mov BUF, BUFFER_PTR
-
- mov BUF, BUFFER_PTR2
- mov CNT, BLOCKS_CTR
-
- xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
-
- SHA1_PIPELINED_MAIN_BODY
-
- avx2_zeroupper
-
- mov %rbp, %rsp
- pop %rbp
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
- RET
-
- SYM_FUNC_END(\name)
-.endm
-
-.section .rodata
-
-#define K1 0x5a827999
-#define K2 0x6ed9eba1
-#define K3 0x8f1bbcdc
-#define K4 0xca62c1d6
-
-.align 128
-K_XMM_AR:
- .long K1, K1, K1, K1
- .long K1, K1, K1, K1
- .long K2, K2, K2, K2
- .long K2, K2, K2, K2
- .long K3, K3, K3, K3
- .long K3, K3, K3, K3
- .long K4, K4, K4, K4
- .long K4, K4, K4, K4
-
-BSWAP_SHUFB_CTL:
- .long 0x00010203
- .long 0x04050607
- .long 0x08090a0b
- .long 0x0c0d0e0f
- .long 0x00010203
- .long 0x04050607
- .long 0x08090a0b
- .long 0x0c0d0e0f
-.text
-
-SHA1_VECTOR_ASM sha1_transform_avx2
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
deleted file mode 100644
index cade913d4882..000000000000
--- a/arch/x86/crypto/sha1_ni_asm.S
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * Intel SHA Extensions optimized implementation of a SHA-1 update function
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Sean Gulley <sean.m.gulley@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-#define DIGEST_PTR %rdi /* 1st arg */
-#define DATA_PTR %rsi /* 2nd arg */
-#define NUM_BLKS %rdx /* 3rd arg */
-
-/* gcc conversion */
-#define FRAME_SIZE 32 /* space for 2x16 bytes */
-
-#define ABCD %xmm0
-#define E0 %xmm1 /* Need two E's b/c they ping pong */
-#define E1 %xmm2
-#define MSG0 %xmm3
-#define MSG1 %xmm4
-#define MSG2 %xmm5
-#define MSG3 %xmm6
-#define SHUF_MASK %xmm7
-
-
-/*
- * Intel SHA Extensions optimized implementation of a SHA-1 update function
- *
- * The function takes a pointer to the current hash values, a pointer to the
- * input data, and a number of 64 byte blocks to process. Once all blocks have
- * been processed, the digest pointer is updated with the resulting hash value.
- * The function only processes complete blocks, there is no functionality to
- * store partial blocks. All message padding and hash value initialization must
- * be done outside the update function.
- *
- * The indented lines in the loop are instructions related to rounds processing.
- * The non-indented lines are instructions related to the message schedule.
- *
- * void sha1_ni_transform(uint32_t *digest, const void *data,
- uint32_t numBlocks)
- * digest : pointer to digest
- * data: pointer to input data
- * numBlocks: Number of blocks to process
- */
-.text
-SYM_TYPED_FUNC_START(sha1_ni_transform)
- push %rbp
- mov %rsp, %rbp
- sub $FRAME_SIZE, %rsp
- and $~0xF, %rsp
-
- shl $6, NUM_BLKS /* convert to bytes */
- jz .Ldone_hash
- add DATA_PTR, NUM_BLKS /* pointer to end of data */
-
- /* load initial hash values */
- pinsrd $3, 1*16(DIGEST_PTR), E0
- movdqu 0*16(DIGEST_PTR), ABCD
- pand UPPER_WORD_MASK(%rip), E0
- pshufd $0x1B, ABCD, ABCD
-
- movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
-
-.Lloop0:
- /* Save hash values for addition after rounds */
- movdqa E0, (0*16)(%rsp)
- movdqa ABCD, (1*16)(%rsp)
-
- /* Rounds 0-3 */
- movdqu 0*16(DATA_PTR), MSG0
- pshufb SHUF_MASK, MSG0
- paddd MSG0, E0
- movdqa ABCD, E1
- sha1rnds4 $0, E0, ABCD
-
- /* Rounds 4-7 */
- movdqu 1*16(DATA_PTR), MSG1
- pshufb SHUF_MASK, MSG1
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1rnds4 $0, E1, ABCD
- sha1msg1 MSG1, MSG0
-
- /* Rounds 8-11 */
- movdqu 2*16(DATA_PTR), MSG2
- pshufb SHUF_MASK, MSG2
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1rnds4 $0, E0, ABCD
- sha1msg1 MSG2, MSG1
- pxor MSG2, MSG0
-
- /* Rounds 12-15 */
- movdqu 3*16(DATA_PTR), MSG3
- pshufb SHUF_MASK, MSG3
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1msg2 MSG3, MSG0
- sha1rnds4 $0, E1, ABCD
- sha1msg1 MSG3, MSG2
- pxor MSG3, MSG1
-
- /* Rounds 16-19 */
- sha1nexte MSG0, E0
- movdqa ABCD, E1
- sha1msg2 MSG0, MSG1
- sha1rnds4 $0, E0, ABCD
- sha1msg1 MSG0, MSG3
- pxor MSG0, MSG2
-
- /* Rounds 20-23 */
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1msg2 MSG1, MSG2
- sha1rnds4 $1, E1, ABCD
- sha1msg1 MSG1, MSG0
- pxor MSG1, MSG3
-
- /* Rounds 24-27 */
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1msg2 MSG2, MSG3
- sha1rnds4 $1, E0, ABCD
- sha1msg1 MSG2, MSG1
- pxor MSG2, MSG0
-
- /* Rounds 28-31 */
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1msg2 MSG3, MSG0
- sha1rnds4 $1, E1, ABCD
- sha1msg1 MSG3, MSG2
- pxor MSG3, MSG1
-
- /* Rounds 32-35 */
- sha1nexte MSG0, E0
- movdqa ABCD, E1
- sha1msg2 MSG0, MSG1
- sha1rnds4 $1, E0, ABCD
- sha1msg1 MSG0, MSG3
- pxor MSG0, MSG2
-
- /* Rounds 36-39 */
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1msg2 MSG1, MSG2
- sha1rnds4 $1, E1, ABCD
- sha1msg1 MSG1, MSG0
- pxor MSG1, MSG3
-
- /* Rounds 40-43 */
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1msg2 MSG2, MSG3
- sha1rnds4 $2, E0, ABCD
- sha1msg1 MSG2, MSG1
- pxor MSG2, MSG0
-
- /* Rounds 44-47 */
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1msg2 MSG3, MSG0
- sha1rnds4 $2, E1, ABCD
- sha1msg1 MSG3, MSG2
- pxor MSG3, MSG1
-
- /* Rounds 48-51 */
- sha1nexte MSG0, E0
- movdqa ABCD, E1
- sha1msg2 MSG0, MSG1
- sha1rnds4 $2, E0, ABCD
- sha1msg1 MSG0, MSG3
- pxor MSG0, MSG2
-
- /* Rounds 52-55 */
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1msg2 MSG1, MSG2
- sha1rnds4 $2, E1, ABCD
- sha1msg1 MSG1, MSG0
- pxor MSG1, MSG3
-
- /* Rounds 56-59 */
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1msg2 MSG2, MSG3
- sha1rnds4 $2, E0, ABCD
- sha1msg1 MSG2, MSG1
- pxor MSG2, MSG0
-
- /* Rounds 60-63 */
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1msg2 MSG3, MSG0
- sha1rnds4 $3, E1, ABCD
- sha1msg1 MSG3, MSG2
- pxor MSG3, MSG1
-
- /* Rounds 64-67 */
- sha1nexte MSG0, E0
- movdqa ABCD, E1
- sha1msg2 MSG0, MSG1
- sha1rnds4 $3, E0, ABCD
- sha1msg1 MSG0, MSG3
- pxor MSG0, MSG2
-
- /* Rounds 68-71 */
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1msg2 MSG1, MSG2
- sha1rnds4 $3, E1, ABCD
- pxor MSG1, MSG3
-
- /* Rounds 72-75 */
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1msg2 MSG2, MSG3
- sha1rnds4 $3, E0, ABCD
-
- /* Rounds 76-79 */
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1rnds4 $3, E1, ABCD
-
- /* Add current hash values with previously saved */
- sha1nexte (0*16)(%rsp), E0
- paddd (1*16)(%rsp), ABCD
-
- /* Increment data pointer and loop if more to process */
- add $64, DATA_PTR
- cmp NUM_BLKS, DATA_PTR
- jne .Lloop0
-
- /* Write hash values back in the correct order */
- pshufd $0x1B, ABCD, ABCD
- movdqu ABCD, 0*16(DIGEST_PTR)
- pextrd $3, E0, 1*16(DIGEST_PTR)
-
-.Ldone_hash:
- mov %rbp, %rsp
- pop %rbp
-
- RET
-SYM_FUNC_END(sha1_ni_transform)
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x000102030405060708090a0b0c0d0e0f
-
-.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
-.align 16
-UPPER_WORD_MASK:
- .octa 0xFFFFFFFF000000000000000000000000
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
deleted file mode 100644
index f54988c80eb4..000000000000
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ /dev/null
@@ -1,554 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
- * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
- * processors. CPUs supporting Intel(R) AVX extensions will get an additional
- * boost.
- *
- * This work was inspired by the vectorized implementation of Dean Gaudet.
- * Additional information on it can be found at:
- * http://www.arctic.org/~dean/crypto/sha1.html
- *
- * It was improved upon with more efficient vectorization of the message
- * scheduling. This implementation has also been optimized for all current and
- * several future generations of Intel CPUs.
- *
- * See this article for more information about the implementation details:
- * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
- *
- * Copyright (C) 2010, Intel Corp.
- * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
- * Ronen Zohar <ronen.zohar@intel.com>
- *
- * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
- * Author: Mathias Krause <minipli@googlemail.com>
- */
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-#define CTX %rdi // arg1
-#define BUF %rsi // arg2
-#define CNT %rdx // arg3
-
-#define REG_A %ecx
-#define REG_B %esi
-#define REG_C %edi
-#define REG_D %r12d
-#define REG_E %edx
-
-#define REG_T1 %eax
-#define REG_T2 %ebx
-
-#define K_BASE %r8
-#define HASH_PTR %r9
-#define BUFFER_PTR %r10
-#define BUFFER_END %r11
-
-#define W_TMP1 %xmm0
-#define W_TMP2 %xmm9
-
-#define W0 %xmm1
-#define W4 %xmm2
-#define W8 %xmm3
-#define W12 %xmm4
-#define W16 %xmm5
-#define W20 %xmm6
-#define W24 %xmm7
-#define W28 %xmm8
-
-#define XMM_SHUFB_BSWAP %xmm10
-
-/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */
-#define WK(t) (((t) & 15) * 4)(%rsp)
-#define W_PRECALC_AHEAD 16
-
-/*
- * This macro implements the SHA-1 function's body for single 64-byte block
- * param: function's name
- */
-.macro SHA1_VECTOR_ASM name
- SYM_TYPED_FUNC_START(\name)
-
- push %rbx
- push %r12
- push %rbp
- mov %rsp, %rbp
-
- sub $64, %rsp # allocate workspace
- and $~15, %rsp # align stack
-
- mov CTX, HASH_PTR
- mov BUF, BUFFER_PTR
-
- shl $6, CNT # multiply by 64
- add BUF, CNT
- mov CNT, BUFFER_END
-
- lea K_XMM_AR(%rip), K_BASE
- xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
-
- SHA1_PIPELINED_MAIN_BODY
-
- # cleanup workspace
- mov $8, %ecx
- mov %rsp, %rdi
- xor %eax, %eax
- rep stosq
-
- mov %rbp, %rsp # deallocate workspace
- pop %rbp
- pop %r12
- pop %rbx
- RET
-
- SYM_FUNC_END(\name)
-.endm
-
-/*
- * This macro implements 80 rounds of SHA-1 for one 64-byte block
- */
-.macro SHA1_PIPELINED_MAIN_BODY
- INIT_REGALLOC
-
- mov (HASH_PTR), A
- mov 4(HASH_PTR), B
- mov 8(HASH_PTR), C
- mov 12(HASH_PTR), D
- mov 16(HASH_PTR), E
-
- .set i, 0
- .rept W_PRECALC_AHEAD
- W_PRECALC i
- .set i, (i+1)
- .endr
-
-.align 4
-1:
- RR F1,A,B,C,D,E,0
- RR F1,D,E,A,B,C,2
- RR F1,B,C,D,E,A,4
- RR F1,E,A,B,C,D,6
- RR F1,C,D,E,A,B,8
-
- RR F1,A,B,C,D,E,10
- RR F1,D,E,A,B,C,12
- RR F1,B,C,D,E,A,14
- RR F1,E,A,B,C,D,16
- RR F1,C,D,E,A,B,18
-
- RR F2,A,B,C,D,E,20
- RR F2,D,E,A,B,C,22
- RR F2,B,C,D,E,A,24
- RR F2,E,A,B,C,D,26
- RR F2,C,D,E,A,B,28
-
- RR F2,A,B,C,D,E,30
- RR F2,D,E,A,B,C,32
- RR F2,B,C,D,E,A,34
- RR F2,E,A,B,C,D,36
- RR F2,C,D,E,A,B,38
-
- RR F3,A,B,C,D,E,40
- RR F3,D,E,A,B,C,42
- RR F3,B,C,D,E,A,44
- RR F3,E,A,B,C,D,46
- RR F3,C,D,E,A,B,48
-
- RR F3,A,B,C,D,E,50
- RR F3,D,E,A,B,C,52
- RR F3,B,C,D,E,A,54
- RR F3,E,A,B,C,D,56
- RR F3,C,D,E,A,B,58
-
- add $64, BUFFER_PTR # move to the next 64-byte block
- cmp BUFFER_END, BUFFER_PTR # if the current is the last one use
- cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun
-
- RR F4,A,B,C,D,E,60
- RR F4,D,E,A,B,C,62
- RR F4,B,C,D,E,A,64
- RR F4,E,A,B,C,D,66
- RR F4,C,D,E,A,B,68
-
- RR F4,A,B,C,D,E,70
- RR F4,D,E,A,B,C,72
- RR F4,B,C,D,E,A,74
- RR F4,E,A,B,C,D,76
- RR F4,C,D,E,A,B,78
-
- UPDATE_HASH (HASH_PTR), A
- UPDATE_HASH 4(HASH_PTR), B
- UPDATE_HASH 8(HASH_PTR), C
- UPDATE_HASH 12(HASH_PTR), D
- UPDATE_HASH 16(HASH_PTR), E
-
- RESTORE_RENAMED_REGS
- cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end
- jne 1b
-.endm
-
-.macro INIT_REGALLOC
- .set A, REG_A
- .set B, REG_B
- .set C, REG_C
- .set D, REG_D
- .set E, REG_E
- .set T1, REG_T1
- .set T2, REG_T2
-.endm
-
-.macro RESTORE_RENAMED_REGS
- # order is important (REG_C is where it should be)
- mov B, REG_B
- mov D, REG_D
- mov A, REG_A
- mov E, REG_E
-.endm
-
-.macro SWAP_REG_NAMES a, b
- .set _T, \a
- .set \a, \b
- .set \b, _T
-.endm
-
-.macro F1 b, c, d
- mov \c, T1
- SWAP_REG_NAMES \c, T1
- xor \d, T1
- and \b, T1
- xor \d, T1
-.endm
-
-.macro F2 b, c, d
- mov \d, T1
- SWAP_REG_NAMES \d, T1
- xor \c, T1
- xor \b, T1
-.endm
-
-.macro F3 b, c ,d
- mov \c, T1
- SWAP_REG_NAMES \c, T1
- mov \b, T2
- or \b, T1
- and \c, T2
- and \d, T1
- or T2, T1
-.endm
-
-.macro F4 b, c, d
- F2 \b, \c, \d
-.endm
-
-.macro UPDATE_HASH hash, val
- add \hash, \val
- mov \val, \hash
-.endm
-
-/*
- * RR does two rounds of SHA-1 back to back with W[] pre-calc
- * t1 = F(b, c, d); e += w(i)
- * e += t1; b <<= 30; d += w(i+1);
- * t1 = F(a, b, c);
- * d += t1; a <<= 5;
- * e += a;
- * t1 = e; a >>= 7;
- * t1 <<= 5;
- * d += t1;
- */
-.macro RR F, a, b, c, d, e, round
- add WK(\round), \e
- \F \b, \c, \d # t1 = F(b, c, d);
- W_PRECALC (\round + W_PRECALC_AHEAD)
- rol $30, \b
- add T1, \e
- add WK(\round + 1), \d
-
- \F \a, \b, \c
- W_PRECALC (\round + W_PRECALC_AHEAD + 1)
- rol $5, \a
- add \a, \e
- add T1, \d
- ror $7, \a # (a <<r 5) >>r 7) => a <<r 30)
-
- mov \e, T1
- SWAP_REG_NAMES \e, T1
-
- rol $5, T1
- add T1, \d
-
- # write: \a, \b
- # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
-.endm
-
-.macro W_PRECALC r
- .set i, \r
-
- .if (i < 20)
- .set K_XMM, 0
- .elseif (i < 40)
- .set K_XMM, 16
- .elseif (i < 60)
- .set K_XMM, 32
- .elseif (i < 80)
- .set K_XMM, 48
- .endif
-
- .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
- .set i, ((\r) % 80) # pre-compute for the next iteration
- .if (i == 0)
- W_PRECALC_RESET
- .endif
- W_PRECALC_00_15
- .elseif (i<32)
- W_PRECALC_16_31
- .elseif (i < 80) // rounds 32-79
- W_PRECALC_32_79
- .endif
-.endm
-
-.macro W_PRECALC_RESET
- .set W, W0
- .set W_minus_04, W4
- .set W_minus_08, W8
- .set W_minus_12, W12
- .set W_minus_16, W16
- .set W_minus_20, W20
- .set W_minus_24, W24
- .set W_minus_28, W28
- .set W_minus_32, W
-.endm
-
-.macro W_PRECALC_ROTATE
- .set W_minus_32, W_minus_28
- .set W_minus_28, W_minus_24
- .set W_minus_24, W_minus_20
- .set W_minus_20, W_minus_16
- .set W_minus_16, W_minus_12
- .set W_minus_12, W_minus_08
- .set W_minus_08, W_minus_04
- .set W_minus_04, W
- .set W, W_minus_32
-.endm
-
-.macro W_PRECALC_SSSE3
-
-.macro W_PRECALC_00_15
- W_PRECALC_00_15_SSSE3
-.endm
-.macro W_PRECALC_16_31
- W_PRECALC_16_31_SSSE3
-.endm
-.macro W_PRECALC_32_79
- W_PRECALC_32_79_SSSE3
-.endm
-
-/* message scheduling pre-compute for rounds 0-15 */
-.macro W_PRECALC_00_15_SSSE3
- .if ((i & 3) == 0)
- movdqu (i*4)(BUFFER_PTR), W_TMP1
- .elseif ((i & 3) == 1)
- pshufb XMM_SHUFB_BSWAP, W_TMP1
- movdqa W_TMP1, W
- .elseif ((i & 3) == 2)
- paddd (K_BASE), W_TMP1
- .elseif ((i & 3) == 3)
- movdqa W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-/* message scheduling pre-compute for rounds 16-31
- *
- * - calculating last 32 w[i] values in 8 XMM registers
- * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
- * instruction
- *
- * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
- * dependency, but improves for 32-79
- */
-.macro W_PRECALC_16_31_SSSE3
- # blended scheduling of vector and scalar instruction streams, one 4-wide
- # vector iteration / 4 scalar rounds
- .if ((i & 3) == 0)
- movdqa W_minus_12, W
- palignr $8, W_minus_16, W # w[i-14]
- movdqa W_minus_04, W_TMP1
- psrldq $4, W_TMP1 # w[i-3]
- pxor W_minus_08, W
- .elseif ((i & 3) == 1)
- pxor W_minus_16, W_TMP1
- pxor W_TMP1, W
- movdqa W, W_TMP2
- movdqa W, W_TMP1
- pslldq $12, W_TMP2
- .elseif ((i & 3) == 2)
- psrld $31, W
- pslld $1, W_TMP1
- por W, W_TMP1
- movdqa W_TMP2, W
- psrld $30, W_TMP2
- pslld $2, W
- .elseif ((i & 3) == 3)
- pxor W, W_TMP1
- pxor W_TMP2, W_TMP1
- movdqa W_TMP1, W
- paddd K_XMM(K_BASE), W_TMP1
- movdqa W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-/* message scheduling pre-compute for rounds 32-79
- *
- * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
- * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
- * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
- */
-.macro W_PRECALC_32_79_SSSE3
- .if ((i & 3) == 0)
- movdqa W_minus_04, W_TMP1
- pxor W_minus_28, W # W is W_minus_32 before xor
- palignr $8, W_minus_08, W_TMP1
- .elseif ((i & 3) == 1)
- pxor W_minus_16, W
- pxor W_TMP1, W
- movdqa W, W_TMP1
- .elseif ((i & 3) == 2)
- psrld $30, W
- pslld $2, W_TMP1
- por W, W_TMP1
- .elseif ((i & 3) == 3)
- movdqa W_TMP1, W
- paddd K_XMM(K_BASE), W_TMP1
- movdqa W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-.endm // W_PRECALC_SSSE3
-
-
-#define K1 0x5a827999
-#define K2 0x6ed9eba1
-#define K3 0x8f1bbcdc
-#define K4 0xca62c1d6
-
-.section .rodata
-.align 16
-
-K_XMM_AR:
- .long K1, K1, K1, K1
- .long K2, K2, K2, K2
- .long K3, K3, K3, K3
- .long K4, K4, K4, K4
-
-BSWAP_SHUFB_CTL:
- .long 0x00010203
- .long 0x04050607
- .long 0x08090a0b
- .long 0x0c0d0e0f
-
-
-.section .text
-
-W_PRECALC_SSSE3
-.macro xmm_mov a, b
- movdqu \a,\b
-.endm
-
-/*
- * SSSE3 optimized implementation:
- *
- * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
- * const u8 *data, int blocks);
- *
- * Note that struct sha1_state is assumed to begin with u32 state[5].
- */
-SHA1_VECTOR_ASM sha1_transform_ssse3
-
-.macro W_PRECALC_AVX
-
-.purgem W_PRECALC_00_15
-.macro W_PRECALC_00_15
- W_PRECALC_00_15_AVX
-.endm
-.purgem W_PRECALC_16_31
-.macro W_PRECALC_16_31
- W_PRECALC_16_31_AVX
-.endm
-.purgem W_PRECALC_32_79
-.macro W_PRECALC_32_79
- W_PRECALC_32_79_AVX
-.endm
-
-.macro W_PRECALC_00_15_AVX
- .if ((i & 3) == 0)
- vmovdqu (i*4)(BUFFER_PTR), W_TMP1
- .elseif ((i & 3) == 1)
- vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
- .elseif ((i & 3) == 2)
- vpaddd (K_BASE), W, W_TMP1
- .elseif ((i & 3) == 3)
- vmovdqa W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-.macro W_PRECALC_16_31_AVX
- .if ((i & 3) == 0)
- vpalignr $8, W_minus_16, W_minus_12, W # w[i-14]
- vpsrldq $4, W_minus_04, W_TMP1 # w[i-3]
- vpxor W_minus_08, W, W
- vpxor W_minus_16, W_TMP1, W_TMP1
- .elseif ((i & 3) == 1)
- vpxor W_TMP1, W, W
- vpslldq $12, W, W_TMP2
- vpslld $1, W, W_TMP1
- .elseif ((i & 3) == 2)
- vpsrld $31, W, W
- vpor W, W_TMP1, W_TMP1
- vpslld $2, W_TMP2, W
- vpsrld $30, W_TMP2, W_TMP2
- .elseif ((i & 3) == 3)
- vpxor W, W_TMP1, W_TMP1
- vpxor W_TMP2, W_TMP1, W
- vpaddd K_XMM(K_BASE), W, W_TMP1
- vmovdqu W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-.macro W_PRECALC_32_79_AVX
- .if ((i & 3) == 0)
- vpalignr $8, W_minus_08, W_minus_04, W_TMP1
- vpxor W_minus_28, W, W # W is W_minus_32 before xor
- .elseif ((i & 3) == 1)
- vpxor W_minus_16, W_TMP1, W_TMP1
- vpxor W_TMP1, W, W
- .elseif ((i & 3) == 2)
- vpslld $2, W, W_TMP1
- vpsrld $30, W, W
- vpor W, W_TMP1, W
- .elseif ((i & 3) == 3)
- vpaddd K_XMM(K_BASE), W, W_TMP1
- vmovdqu W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-.endm // W_PRECALC_AVX
-
-W_PRECALC_AVX
-.purgem xmm_mov
-.macro xmm_mov a, b
- vmovdqu \a,\b
-.endm
-
-
-/* AVX optimized implementation:
- * extern "C" void sha1_transform_avx(struct sha1_state *state,
- * const u8 *data, int blocks);
- */
-SHA1_VECTOR_ASM sha1_transform_avx
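
For reference, the contract shared by sha1_transform_ssse3 and sha1_transform_avx (five 32-bit chaining words at the start of the state, data supplied as whole 64-byte blocks, no finalization) corresponds to this scalar model. It is an illustrative sketch, not kernel code; sha1_transform_ref is a made-up name, checked here against the FIPS 180 "abc" test vector:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define K1 0x5a827999u
#define K2 0x6ed9eba1u
#define K3 0x8f1bbcdcu
#define K4 0xca62c1d6u

static uint32_t rol32(uint32_t x, unsigned int n)
{
	return (x << n) | (x >> (32 - n));
}

/* Same contract as the asm routines: 'state' holds the five 32-bit chaining
 * values, 'data' points to 'blocks' consecutive 64-byte message blocks. */
static void sha1_transform_ref(uint32_t state[5], const unsigned char *data,
			       int blocks)
{
	uint32_t w[80];

	for (; blocks > 0; blocks--, data += 64) {
		uint32_t a = state[0], b = state[1], c = state[2];
		uint32_t d = state[3], e = state[4];
		int i;

		/* big-endian load, the job of the BSWAP_SHUFB_CTL shuffle */
		for (i = 0; i < 16; i++)
			w[i] = (uint32_t)data[4 * i] << 24 |
			       (uint32_t)data[4 * i + 1] << 16 |
			       (uint32_t)data[4 * i + 2] << 8 |
			       (uint32_t)data[4 * i + 3];
		for (i = 16; i < 80; i++)
			w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);

		for (i = 0; i < 80; i++) {
			uint32_t f, k, tmp;

			if (i < 20) {
				f = (b & c) | (~b & d);
				k = K1;
			} else if (i < 40) {
				f = b ^ c ^ d;
				k = K2;
			} else if (i < 60) {
				f = (b & c) | (b & d) | (c & d);
				k = K3;
			} else {
				f = b ^ c ^ d;
				k = K4;
			}
			tmp = rol32(a, 5) + f + e + k + w[i];
			e = d;
			d = c;
			c = rol32(b, 30);
			b = a;
			a = tmp;
		}

		state[0] += a;
		state[1] += b;
		state[2] += c;
		state[3] += d;
		state[4] += e;
	}
}

int main(void)
{
	/* one padded block for the message "abc" (FIPS 180 test vector) */
	unsigned char block[64] = { 'a', 'b', 'c', 0x80 };
	uint32_t st[5] = { 0x67452301, 0xefcdab89, 0x98badcfe,
			   0x10325476, 0xc3d2e1f0 };
	const uint32_t exp[5] = { 0xa9993e36, 0x4706816a, 0xba3e2571,
				  0x7850c26c, 0x9cd0d89d };

	block[63] = 24;	/* message length in bits */
	sha1_transform_ref(st, block, 1);
	printf("%s\n", memcmp(st, exp, sizeof(exp)) ? "FAIL"
						    : "matches a9993e36...d0d89d");
	return 0;
}
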
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
deleted file mode 100644
index 0a912bfc86c5..000000000000
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ /dev/null
@@ -1,324 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API.
- *
- * Glue code for the SHA1 Secure Hash Algorithm assembler implementations
- * using SSSE3, AVX, AVX2, and SHA-NI instructions.
- *
- * This file is based on sha1_generic.c
- *
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- * Copyright (c) Mathias Krause <minipli@googlemail.com>
- * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-static const struct x86_cpu_id module_cpu_ids[] = {
- X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
-
-static inline int sha1_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, sha1_block_fn *sha1_xform)
-{
- int remain;
-
- /*
- * Make sure struct sha1_state begins directly with the SHA1
- * 160-bit internal state, as this is what the asm functions expect.
- */
- BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
-
- kernel_fpu_begin();
- remain = sha1_base_do_update_blocks(desc, data, len, sha1_xform);
- kernel_fpu_end();
-
- return remain;
-}
-
-static inline int sha1_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out,
- sha1_block_fn *sha1_xform)
-{
- kernel_fpu_begin();
- sha1_base_do_finup(desc, data, len, sha1_xform);
- kernel_fpu_end();
-
- return sha1_base_finish(desc, out);
-}
-
-asmlinkage void sha1_transform_ssse3(struct sha1_state *state,
- const u8 *data, int blocks);
-
-static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_update(desc, data, len, sha1_transform_ssse3);
-}
-
-static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha1_finup(desc, data, len, out, sha1_transform_ssse3);
-}
-
-static struct shash_alg sha1_ssse3_alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_ssse3_update,
- .finup = sha1_ssse3_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-ssse3",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int register_sha1_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- return crypto_register_shash(&sha1_ssse3_alg);
- return 0;
-}
-
-static void unregister_sha1_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_shash(&sha1_ssse3_alg);
-}
-
-asmlinkage void sha1_transform_avx(struct sha1_state *state,
- const u8 *data, int blocks);
-
-static int sha1_avx_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_update(desc, data, len, sha1_transform_avx);
-}
-
-static int sha1_avx_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha1_finup(desc, data, len, out, sha1_transform_avx);
-}
-
-static struct shash_alg sha1_avx_alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_avx_update,
- .finup = sha1_avx_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-avx",
- .cra_priority = 160,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static bool avx_usable(void)
-{
- if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
- if (boot_cpu_has(X86_FEATURE_AVX))
- pr_info("AVX detected but unusable.\n");
- return false;
- }
-
- return true;
-}
-
-static int register_sha1_avx(void)
-{
- if (avx_usable())
- return crypto_register_shash(&sha1_avx_alg);
- return 0;
-}
-
-static void unregister_sha1_avx(void)
-{
- if (avx_usable())
- crypto_unregister_shash(&sha1_avx_alg);
-}
-
-#define SHA1_AVX2_BLOCK_OPTSIZE 4	/* AVX2 pays off only for 4 or more 64-byte blocks */
-
-asmlinkage void sha1_transform_avx2(struct sha1_state *state,
- const u8 *data, int blocks);
-
-static bool avx2_usable(void)
-{
- if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
- && boot_cpu_has(X86_FEATURE_BMI1)
- && boot_cpu_has(X86_FEATURE_BMI2))
- return true;
-
- return false;
-}
-
-static inline void sha1_apply_transform_avx2(struct sha1_state *state,
- const u8 *data, int blocks)
-{
- /* Select the optimal transform based on data block size */
- if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE)
- sha1_transform_avx2(state, data, blocks);
- else
- sha1_transform_avx(state, data, blocks);
-}
-
-static int sha1_avx2_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_update(desc, data, len, sha1_apply_transform_avx2);
-}
-
-static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2);
-}
-
-static struct shash_alg sha1_avx2_alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_avx2_update,
- .finup = sha1_avx2_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-avx2",
- .cra_priority = 170,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int register_sha1_avx2(void)
-{
- if (avx2_usable())
- return crypto_register_shash(&sha1_avx2_alg);
- return 0;
-}
-
-static void unregister_sha1_avx2(void)
-{
- if (avx2_usable())
- crypto_unregister_shash(&sha1_avx2_alg);
-}
-
-asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data,
- int rounds);
-
-static int sha1_ni_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_update(desc, data, len, sha1_ni_transform);
-}
-
-static int sha1_ni_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha1_finup(desc, data, len, out, sha1_ni_transform);
-}
-
-static struct shash_alg sha1_ni_alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_ni_update,
- .finup = sha1_ni_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-ni",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int register_sha1_ni(void)
-{
- if (boot_cpu_has(X86_FEATURE_SHA_NI))
- return crypto_register_shash(&sha1_ni_alg);
- return 0;
-}
-
-static void unregister_sha1_ni(void)
-{
- if (boot_cpu_has(X86_FEATURE_SHA_NI))
- crypto_unregister_shash(&sha1_ni_alg);
-}
-
-static int __init sha1_ssse3_mod_init(void)
-{
- if (!x86_match_cpu(module_cpu_ids))
- return -ENODEV;
-
- if (register_sha1_ssse3())
- goto fail;
-
- if (register_sha1_avx()) {
- unregister_sha1_ssse3();
- goto fail;
- }
-
- if (register_sha1_avx2()) {
- unregister_sha1_avx();
- unregister_sha1_ssse3();
- goto fail;
- }
-
- if (register_sha1_ni()) {
- unregister_sha1_avx2();
- unregister_sha1_avx();
- unregister_sha1_ssse3();
- goto fail;
- }
-
- return 0;
-fail:
- return -ENODEV;
-}
-
-static void __exit sha1_ssse3_mod_fini(void)
-{
- unregister_sha1_ni();
- unregister_sha1_avx2();
- unregister_sha1_avx();
- unregister_sha1_ssse3();
-}
-
-module_init(sha1_ssse3_mod_init);
-module_exit(sha1_ssse3_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-
-MODULE_ALIAS_CRYPTO("sha1");
-MODULE_ALIAS_CRYPTO("sha1-ssse3");
-MODULE_ALIAS_CRYPTO("sha1-avx");
-MODULE_ALIAS_CRYPTO("sha1-avx2");
-MODULE_ALIAS_CRYPTO("sha1-ni");
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
deleted file mode 100644
index 5bfce4b045fd..000000000000
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ /dev/null
@@ -1,423 +0,0 @@
-########################################################################
-# Implement fast SHA-512 with AVX instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# David Cote <david.m.cote@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-512 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-.text
-
-# Virtual Registers
-# ARG1
-digest = %rdi
-# ARG2
-msg = %rsi
-# ARG3
-msglen = %rdx
-T1 = %rcx
-T2 = %r8
-a_64 = %r9
-b_64 = %r10
-c_64 = %r11
-d_64 = %r12
-e_64 = %r13
-f_64 = %r14
-g_64 = %r15
-h_64 = %rbx
-tmp0 = %rax
-
-# Local variables (stack frame)
-
-# Message Schedule
-W_SIZE = 80*8
-# W[t] + K[t] | W[t+1] + K[t+1]
-WK_SIZE = 2*8
-
-frame_W = 0
-frame_WK = frame_W + W_SIZE
-frame_size = frame_WK + WK_SIZE
-
-# Useful QWORD "arrays" for simpler memory references
-# MSG, DIGEST, K_t, W_t are arrays
-# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
-
-# Input message (arg1)
-#define MSG(i) 8*i(msg)
-
-# Output Digest (arg2)
-#define DIGEST(i) 8*i(digest)
-
-# SHA Constants (static mem)
-#define K_t(i) 8*i+K512(%rip)
-
-# Message Schedule (stack frame)
-#define W_t(i) 8*i+frame_W(%rsp)
-
-# W[t]+K[t] (stack frame)
-#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
-
-.macro RotateState
- # Rotate symbols a..h right
- TMP = h_64
- h_64 = g_64
- g_64 = f_64
- f_64 = e_64
- e_64 = d_64
- d_64 = c_64
- c_64 = b_64
- b_64 = a_64
- a_64 = TMP
-.endm
-
-.macro RORQ p1 p2
- # shld is faster than ror on Sandybridge
- shld $(64-\p2), \p1, \p1
-.endm
-
-.macro SHA512_Round rnd
- # Compute Round %%t
- mov f_64, T1 # T1 = f
- mov e_64, tmp0 # tmp = e
- xor g_64, T1 # T1 = f ^ g
- RORQ tmp0, 23 # 41 # tmp = e ror 23
- and e_64, T1 # T1 = (f ^ g) & e
- xor e_64, tmp0 # tmp = (e ror 23) ^ e
- xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
- idx = \rnd
- add WK_2(idx), T1 # W[t] + K[t] from message scheduler
- RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4
- xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
- mov a_64, T2 # T2 = a
- add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
- RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
- add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
- mov a_64, tmp0 # tmp = a
- xor c_64, T2 # T2 = a ^ c
- and c_64, tmp0 # tmp = a & c
- and b_64, T2 # T2 = (a ^ c) & b
- xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
- mov a_64, tmp0 # tmp = a
- RORQ tmp0, 5 # 39 # tmp = a ror 5
- xor a_64, tmp0 # tmp = (a ror 5) ^ a
- add T1, d_64 # e(next_state) = d + T1
- RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6
- xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
- lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
- RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
- add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
- RotateState
-.endm
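
The "ror 23 # 41" style comments encode an identity worth spelling out: shld $(64-n) of a register with itself is a rotate right by n, and the nested 23/4/14 (resp. 5/6/28) rotate-and-xor sequence yields exactly Sigma1(e) = (e ror 14) ^ (e ror 18) ^ (e ror 41) (resp. Sigma0(a)). A throwaway user-space check of that identity, not taken from this tree:

#include <stdint.h>
#include <stdio.h>

static uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

int main(void)
{
	uint64_t x = 0x0123456789abcdefULL;
	int i;

	for (i = 0; i < 8; i++, x = x * 6364136223846793005ULL + 1442695040888963407ULL) {
		/* Sigma1(e): direct form vs the nested 23/4/14 rotates used above */
		uint64_t s1_ref = ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41);
		uint64_t s1_asm = ror64(ror64(ror64(x, 23) ^ x, 4) ^ x, 14);
		/* Sigma0(a): direct form vs the nested 5/6/28 rotates */
		uint64_t s0_ref = ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39);
		uint64_t s0_asm = ror64(ror64(ror64(x, 5) ^ x, 6) ^ x, 28);

		if (s1_ref != s1_asm || s0_ref != s0_asm) {
			printf("identity broken\n");
			return 1;
		}
	}
	printf("nested rotates match Sigma0/Sigma1\n");
	return 0;
}
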
-
-.macro SHA512_2Sched_2Round_avx rnd
- # Compute rounds t-2 and t-1
- # Compute message schedule QWORDS t and t+1
-
- # Two rounds are computed based on the values for K[t-2]+W[t-2] and
- # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
- # scheduler.
- # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
- # They are then added to their respective SHA512 constants at
- # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
- # For brevity, the comments following vectored instructions only refer to
- # the first of a pair of QWORDS.
- # E.g. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
- # The computation of the message schedule and the rounds are tightly
- # stitched to take advantage of instruction-level parallelism.
-
- idx = \rnd - 2
- vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2]
- idx = \rnd - 15
- vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
- mov f_64, T1
- vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61
- mov e_64, tmp0
- vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1
- xor g_64, T1
- RORQ tmp0, 23 # 41
- vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19
- and e_64, T1
- xor e_64, tmp0
- vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19
- xor g_64, T1
- idx = \rnd
- add WK_2(idx), T1
- vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8
- RORQ tmp0, 4 # 18
- vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6
- xor e_64, tmp0
- mov a_64, T2
- add h_64, T1
- vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8
- RORQ tmp0, 14 # 14
- add tmp0, T1
- vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7
- mov a_64, tmp0
- xor c_64, T2
- vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3
- and c_64, tmp0
- and b_64, T2
- vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3
- xor tmp0, T2
- mov a_64, tmp0
- vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63
- RORQ tmp0, 5 # 39
- vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63
- xor a_64, tmp0
- add T1, d_64
- RORQ tmp0, 6 # 34
- xor a_64, tmp0
- vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
- # W[t-15]>>7 ^ W[t-15]<<63
- lea (T1, T2), h_64
- RORQ tmp0, 28 # 28
- vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25
- add tmp0, h_64
- RotateState
- vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
- # W[t-2]<<25
- mov f_64, T1
- vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2])
- mov e_64, tmp0
- xor g_64, T1
- idx = \rnd - 16
- vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16]
- idx = \rnd - 7
- vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
- RORQ tmp0, 23 # 41
- and e_64, T1
- xor e_64, tmp0
- xor g_64, T1
- vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56
- idx = \rnd + 1
- add WK_2(idx), T1
- vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15])
- RORQ tmp0, 4 # 18
- vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
- xor e_64, tmp0
- vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
- # s0(W[t-15]) + W[t-16]
- mov a_64, T2
- add h_64, T1
- RORQ tmp0, 14 # 14
- add tmp0, T1
- idx = \rnd
- vmovdqa %xmm0, W_t(idx) # Store W[t]
- vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t]
- vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
- mov a_64, tmp0
- xor c_64, T2
- and c_64, tmp0
- and b_64, T2
- xor tmp0, T2
- mov a_64, tmp0
- RORQ tmp0, 5 # 39
- xor a_64, tmp0
- add T1, d_64
- RORQ tmp0, 6 # 34
- xor a_64, tmp0
- lea (T1, T2), h_64
- RORQ tmp0, 28 # 28
- add tmp0, h_64
- RotateState
-.endm
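
Written out as plain C, the vector half of the macro above computes the SHA-512 small sigma functions and the schedule recurrence W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16], after the byte swap that XMM_QWORD_BSWAP performs on the first 16 qwords. The sketch below is illustrative only; sha512_schedule is a made-up name and the sample block is arbitrary:

#include <stdint.h>
#include <stdio.h>

static uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

static uint64_t sigma0(uint64_t x)	/* s0, applied to W[t-15] */
{
	return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
}

static uint64_t sigma1(uint64_t x)	/* s1, applied to W[t-2] */
{
	return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
}

/* Expand one 128-byte big-endian block into the 80-qword schedule. */
static void sha512_schedule(const unsigned char *block, uint64_t w[80])
{
	int t, i;

	for (t = 0; t < 16; t++) {
		uint64_t v = 0;

		for (i = 0; i < 8; i++)
			v = (v << 8) | block[t * 8 + i];
		w[t] = v;	/* this is what the BSWAP shuffle achieves */
	}
	for (t = 16; t < 80; t++)
		w[t] = sigma1(w[t - 2]) + w[t - 7] + sigma0(w[t - 15]) + w[t - 16];
}

int main(void)
{
	unsigned char block[128];
	uint64_t w[80];
	int i;

	for (i = 0; i < 128; i++)
		block[i] = (unsigned char)i;
	sha512_schedule(block, w);
	printf("w[16] = %016llx, w[79] = %016llx\n",
	       (unsigned long long)w[16], (unsigned long long)w[79]);
	return 0;
}

Each pair of new qwords is what the asm stores at W_t(t) and, once the K512 constants are added, at WK_2(t) for the rounds that follow.
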
-
-########################################################################
-# void sha512_transform_avx(sha512_state *state, const u8 *data, int blocks)
-# Purpose: Updates the SHA512 digest stored at "state" with the message
-# stored in "data".
-# The size of the message pointed to by "data" must be an integer multiple
-# of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks
-########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_avx)
- test msglen, msglen
- je .Lnowork
-
- # Save GPRs
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- # Allocate Stack Space
- push %rbp
- mov %rsp, %rbp
- sub $frame_size, %rsp
- and $~(0x20 - 1), %rsp
-
-.Lupdateblock:
-
- # Load state variables
- mov DIGEST(0), a_64
- mov DIGEST(1), b_64
- mov DIGEST(2), c_64
- mov DIGEST(3), d_64
- mov DIGEST(4), e_64
- mov DIGEST(5), f_64
- mov DIGEST(6), g_64
- mov DIGEST(7), h_64
-
- t = 0
- .rept 80/2 + 1
- # (80 rounds) / (2 rounds/iteration) + (1 iteration)
- # +1 iteration because the scheduler leads hashing by 1 iteration
- .if t < 2
- # BSWAP 2 QWORDS
- vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1
- vmovdqu MSG(t), %xmm0
- vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
- vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
- vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
- vmovdqa %xmm0, WK_2(t) # Store into WK for rounds
- .elseif t < 16
- # BSWAP 2 QWORDS; Compute 2 Rounds
- vmovdqu MSG(t), %xmm0
- vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
- SHA512_Round t-2 # Round t-2
- vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
- vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
- SHA512_Round t-1 # Round t-1
- vmovdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK
- .elseif t < 79
- # Schedule 2 QWORDS; Compute 2 Rounds
- SHA512_2Sched_2Round_avx t
- .else
- # Compute 2 Rounds
- SHA512_Round t-2
- SHA512_Round t-1
- .endif
- t = t+2
- .endr
-
- # Update digest
- add a_64, DIGEST(0)
- add b_64, DIGEST(1)
- add c_64, DIGEST(2)
- add d_64, DIGEST(3)
- add e_64, DIGEST(4)
- add f_64, DIGEST(5)
- add g_64, DIGEST(6)
- add h_64, DIGEST(7)
-
- # Advance to next message block
- add $16*8, msg
- dec msglen
- jnz .Lupdateblock
-
- # Restore Stack Pointer
- mov %rbp, %rsp
- pop %rbp
-
- # Restore GPRs
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
-.Lnowork:
- RET
-SYM_FUNC_END(sha512_transform_avx)
-
-########################################################################
-### Binary Data
-
-.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
-.align 16
-# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
-XMM_QWORD_BSWAP:
- .octa 0x08090a0b0c0d0e0f0001020304050607
-
-# Mergeable 640-byte rodata section. This allows the linker to merge the table
-# with other, exactly the same 640-byte fragment of another rodata section
-# (if such section exists).
-.section .rodata.cst640.K512, "aM", @progbits, 640
-.align 64
-# K[t] used in SHA512 hashing
-K512:
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
deleted file mode 100644
index 24973f42c43f..000000000000
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ /dev/null
@@ -1,750 +0,0 @@
-########################################################################
-# Implement fast SHA-512 with AVX2 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# David Cote <david.m.cote@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-512 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-# This code schedules 1 block at a time, with 4 lanes per block
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-.text
-
-# Virtual Registers
-Y_0 = %ymm4
-Y_1 = %ymm5
-Y_2 = %ymm6
-Y_3 = %ymm7
-
-YTMP0 = %ymm0
-YTMP1 = %ymm1
-YTMP2 = %ymm2
-YTMP3 = %ymm3
-YTMP4 = %ymm8
-XFER = YTMP0
-
-BYTE_FLIP_MASK = %ymm9
-
-# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
-CTX1 = %rdi
-CTX2 = %r12
-# 2nd arg
-INP = %rsi
-# 3rd arg
-NUM_BLKS = %rdx
-
-c = %rcx
-d = %r8
-e = %rdx
-y3 = %rsi
-
-TBL = %rdi # clobbers CTX1
-
-a = %rax
-b = %rbx
-
-f = %r9
-g = %r10
-h = %r11
-old_h = %r11
-
-T1 = %r12 # clobbers CTX2
-y0 = %r13
-y1 = %r14
-y2 = %r15
-
-# Local variables (stack frame)
-XFER_SIZE = 4*8
-SRND_SIZE = 1*8
-INP_SIZE = 1*8
-INPEND_SIZE = 1*8
-CTX_SIZE = 1*8
-
-frame_XFER = 0
-frame_SRND = frame_XFER + XFER_SIZE
-frame_INP = frame_SRND + SRND_SIZE
-frame_INPEND = frame_INP + INP_SIZE
-frame_CTX = frame_INPEND + INPEND_SIZE
-frame_size = frame_CTX + CTX_SIZE
-
-## assume buffers not aligned
-#define VMOVDQ vmovdqu
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-
-# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
-# Load ymm with mem and byte swap each qword
-.macro COPY_YMM_AND_BSWAP p1 p2 p3
- VMOVDQ \p2, \p1
- vpshufb \p3, \p1, \p1
-.endm
-# rotate_Ys
-# Rotate values of symbols Y0...Y3
-.macro rotate_Ys
- Y_ = Y_0
- Y_0 = Y_1
- Y_1 = Y_2
- Y_2 = Y_3
- Y_3 = Y_
-.endm
-
-# RotateState
-.macro RotateState
- # Rotate symbols a..h right
- old_h = h
- TMP_ = h
- h = g
- g = f
- f = e
- e = d
- d = c
- c = b
- b = a
- a = TMP_
-.endm
-
-# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
-# YDST = {YSRC1, YSRC2} >> RVAL*8
-.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
- vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI}
- vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YSRC1, YSRC2} >> RVAL*8
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
-################################### RND N + 0 #########################################
-
- # Extract w[t-7]
- MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
- # Calculate w[t-16] + w[t-7]
- vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
- # Extract w[t-15]
- MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
-
- # Calculate sigma0
-
- # Calculate w[t-15] ror 1
- vpsrlq $1, YTMP1, YTMP2
- vpsllq $(64-1), YTMP1, YTMP3
- vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
- # Calculate w[t-15] shr 7
- vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- add frame_XFER(%rsp),h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
- mov f, y2 # y2 = f # CH
- rorx $34, a, T1 # T1 = a >> 34 # S0B
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- xor g, y2 # y2 = f^g # CH
- rorx $14, e, y1 # y1 = (e >> 14) # S1
-
- and e, y2 # y2 = (f^g)&e # CH
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- add h, d # d = k + w + h + d # --
-
- and b, y3 # y3 = (a|c)&b # MAJA
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
-
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
-
- add y0, y2 # y2 = S1 + CH # --
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-################################### RND N + 1 #########################################
-
- # Calculate w[t-15] ror 8
- vpsrlq $8, YTMP1, YTMP2
- vpsllq $(64-8), YTMP1, YTMP1
- vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
- # XOR the three components
- vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
- vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0
-
-
- # Add three components, w[t-16], w[t-7] and sigma0
- vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
- # Move to appropriate lanes for calculating w[16] and w[17]
- vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
- # Move to appropriate lanes for calculating w[18] and w[19]
- vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
-
- # Calculate w[16] and w[17] in both 128 bit lanes
-
- # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
- vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
- vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
-
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- mov f, y2 # y2 = f # CH
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
-
- and b, y3 # y3 = (a|c)&b # MAJA
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
-
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-
-################################### RND N + 2 #########################################
-
- vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
- vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
- vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
- vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
- # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
-
- # Add sigma1 to the other components to get w[16] and w[17]
- vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
-
- # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
- vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
-
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- or c, y3 # y3 = a|c # MAJA
- mov f, y2 # y2 = f # CH
- xor g, y2 # y2 = f^g # CH
-
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- and e, y2 # y2 = (f^g)&e # CH
-
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-################################### RND N + 3 #########################################
-
- vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
- vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
- vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
- vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
- # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
-
- # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
- # to newly calculated sigma1 to get w[18] and w[19]
- vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
-
- # Form w[19], w[18], w[17], w[16]
- vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- mov f, y2 # y2 = f # CH
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- add y0, y2 # y2 = S1 + CH # --
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- rorx $28, a, T1 # T1 = (a >> 28) # S0
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
-
- add y1, h # h = k + w + h + S0 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
- rotate_Ys
-.endm
-
-.macro DO_4ROUNDS
-
-################################### RND N + 0 #########################################
-
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- RotateState
-
-################################### RND N + 1 #########################################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- RotateState
-
-################################### RND N + 2 #########################################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- RotateState
-
-################################### RND N + 3 #########################################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-.endm
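
The y0/y2/y3 annotations in the two macros above follow the standard SHA-512 round. A compact scalar model of one round, with k_plus_w standing in for the precomputed K[t]+W[t] taken from frame_XFER; sha512_round and the dummy driver in main are made up for illustration:

#include <stdint.h>
#include <stdio.h>

static uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

/*
 * One SHA-512 round in the order the comments annotate it:
 * y0 = S1(e), y2 = CH(e,f,g) (+ S1 + k + w via h), y3 = MAJ(a,b,c),
 * then d += t1 and the new a becomes t1 + S0 + MAJ.
 */
static void sha512_round(uint64_t s[8], uint64_t k_plus_w)
{
	uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
	uint64_t S1 = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);
	uint64_t S0 = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);
	uint64_t ch = ((f ^ g) & e) ^ g;		/* CH(e,f,g) */
	uint64_t maj = ((a | c) & b) | (a & c);		/* MAJ(a,b,c) */
	uint64_t t1 = h + S1 + ch + k_plus_w;
	uint64_t t2 = S0 + maj;

	s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;	/* RotateState */
	s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}

int main(void)
{
	/* run 80 dummy rounds over the SHA-512 IV just to exercise the model */
	uint64_t s[8] = {
		0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
		0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
		0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
		0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
	};
	int t;

	for (t = 0; t < 80; t++)
		sha512_round(s, (uint64_t)t);
	printf("a = %016llx\n", (unsigned long long)s[0]);
	return 0;
}
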
-
-########################################################################
-# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks)
-# Purpose: Updates the SHA512 digest stored at "state" with the message
-# stored in "data".
-# The size of the message pointed to by "data" must be an integer multiple
-# of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks
-########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_rorx)
- # Save GPRs
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- # Allocate Stack Space
- push %rbp
- mov %rsp, %rbp
- sub $frame_size, %rsp
- and $~(0x20 - 1), %rsp
-
- shl $7, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- add INP, NUM_BLKS # pointer to end of data
- mov NUM_BLKS, frame_INPEND(%rsp)
-
- ## load initial digest
- mov 8*0(CTX1), a
- mov 8*1(CTX1), b
- mov 8*2(CTX1), c
- mov 8*3(CTX1), d
- mov 8*4(CTX1), e
- mov 8*5(CTX1), f
- mov 8*6(CTX1), g
- mov 8*7(CTX1), h
-
- # save %rdi (CTX) before it gets clobbered
- mov %rdi, frame_CTX(%rsp)
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
-
-.Lloop0:
- lea K512(%rip), TBL
-
- ## byte swap first 16 qwords
- COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK
-
- mov INP, frame_INP(%rsp)
-
- ## schedule the remaining 64 message qwords while doing 64 rounds, 4 at a time
- movq $4, frame_SRND(%rsp)
-
-.align 16
-.Lloop1:
- vpaddq (TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddq 1*32(TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddq 2*32(TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddq 3*32(TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- add $(4*32), TBL
- FOUR_ROUNDS_AND_SCHED
-
- subq $1, frame_SRND(%rsp)
- jne .Lloop1
-
- movq $2, frame_SRND(%rsp)
-.Lloop2:
- vpaddq (TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- DO_4ROUNDS
- vpaddq 1*32(TBL), Y_1, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- add $(2*32), TBL
- DO_4ROUNDS
-
- vmovdqa Y_2, Y_0
- vmovdqa Y_3, Y_1
-
- subq $1, frame_SRND(%rsp)
- jne .Lloop2
-
- mov frame_CTX(%rsp), CTX2
- addm 8*0(CTX2), a
- addm 8*1(CTX2), b
- addm 8*2(CTX2), c
- addm 8*3(CTX2), d
- addm 8*4(CTX2), e
- addm 8*5(CTX2), f
- addm 8*6(CTX2), g
- addm 8*7(CTX2), h
-
- mov frame_INP(%rsp), INP
- add $128, INP
- cmp frame_INPEND(%rsp), INP
- jne .Lloop0
-
-.Ldone_hash:
-
- # Restore Stack Pointer
- mov %rbp, %rsp
- pop %rbp
-
- # Restore GPRs
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
- vzeroupper
- RET
-SYM_FUNC_END(sha512_transform_rorx)
-
-########################################################################
-### Binary Data
-
-
-# Mergeable 640-byte rodata section. This allows the linker to merge the table
-# with other, exactly the same 640-byte fragment of another rodata section
-# (if such section exists).
-.section .rodata.cst640.K512, "aM", @progbits, 640
-.align 64
-# K[t] used in SHA512 hashing
-K512:
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
-
-.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
-.align 32
-# Mask for byte-swapping the four qwords in a YMM register using vpshufb.
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x08090a0b0c0d0e0f0001020304050607
- .octa 0x18191a1b1c1d1e1f1011121314151617
-
-.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
-.align 32
-MASK_YMM_LO:
- .octa 0x00000000000000000000000000000000
- .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
deleted file mode 100644
index 30a2c4777f9d..000000000000
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ /dev/null
@@ -1,425 +0,0 @@
-########################################################################
-# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# David Cote <david.m.cote@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-512 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-.text
-
-# Virtual Registers
-# ARG1
-digest = %rdi
-# ARG2
-msg = %rsi
-# ARG3
-msglen = %rdx
-T1 = %rcx
-T2 = %r8
-a_64 = %r9
-b_64 = %r10
-c_64 = %r11
-d_64 = %r12
-e_64 = %r13
-f_64 = %r14
-g_64 = %r15
-h_64 = %rbx
-tmp0 = %rax
-
-# Local variables (stack frame)
-
-W_SIZE = 80*8
-WK_SIZE = 2*8
-
-frame_W = 0
-frame_WK = frame_W + W_SIZE
-frame_size = frame_WK + WK_SIZE
-
-# Useful QWORD "arrays" for simpler memory references
-# MSG, DIGEST, K_t, W_t are arrays
-# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
-
-# Input message (arg1)
-#define MSG(i) 8*i(msg)
-
-# Output Digest (arg2)
-#define DIGEST(i) 8*i(digest)
-
-# SHA Constants (static mem)
-#define K_t(i) 8*i+K512(%rip)
-
-# Message Schedule (stack frame)
-#define W_t(i) 8*i+frame_W(%rsp)
-
-# W[t]+K[t] (stack frame)
-#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
-
-.macro RotateState
- # Rotate symbols a..h right
- TMP = h_64
- h_64 = g_64
- g_64 = f_64
- f_64 = e_64
- e_64 = d_64
- d_64 = c_64
- c_64 = b_64
- b_64 = a_64
- a_64 = TMP
-.endm
-
-.macro SHA512_Round rnd
-
- # Compute Round %%t
- mov f_64, T1 # T1 = f
- mov e_64, tmp0 # tmp = e
- xor g_64, T1 # T1 = f ^ g
- ror $23, tmp0 # 41 # tmp = e ror 23
- and e_64, T1 # T1 = (f ^ g) & e
- xor e_64, tmp0 # tmp = (e ror 23) ^ e
- xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
- idx = \rnd
- add WK_2(idx), T1 # W[t] + K[t] from message scheduler
- ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4
- xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
- mov a_64, T2 # T2 = a
- add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
- ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
- add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
- mov a_64, tmp0 # tmp = a
- xor c_64, T2 # T2 = a ^ c
- and c_64, tmp0 # tmp = a & c
- and b_64, T2 # T2 = (a ^ c) & b
- xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
- mov a_64, tmp0 # tmp = a
- ror $5, tmp0 # 39 # tmp = a ror 5
- xor a_64, tmp0 # tmp = (a ror 5) ^ a
- add T1, d_64 # e(next_state) = d + T1
- ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6
- xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
- lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
- ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
- add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
- RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_sse rnd
-
- # Compute rounds t-2 and t-1
- # Compute message schedule QWORDS t and t+1
-
- # Two rounds are computed based on the values for K[t-2]+W[t-2] and
- # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
- # scheduler.
- # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
- # They are then added to their respective SHA512 constants at
- # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
- # For brevity, the comments following vectored instructions only refer to
- # the first of a pair of QWORDS.
- # E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
- # The computation of the message schedule and the rounds are tightly
- # stitched to take advantage of instruction-level parallelism.
- # For clarity, integer instructions (for the rounds calculation) are indented
- # by one tab. Vectored instructions (for the message scheduler) are indented
- # by two tabs.
-
- mov f_64, T1
- idx = \rnd -2
- movdqa W_t(idx), %xmm2 # XMM2 = W[t-2]
- xor g_64, T1
- and e_64, T1
- movdqa %xmm2, %xmm0 # XMM0 = W[t-2]
- xor g_64, T1
- idx = \rnd
- add WK_2(idx), T1
- idx = \rnd - 15
- movdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
- mov e_64, tmp0
- ror $23, tmp0 # 41
- movdqa %xmm5, %xmm3 # XMM3 = W[t-15]
- xor e_64, tmp0
- ror $4, tmp0 # 18
- psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42
- xor e_64, tmp0
- ror $14, tmp0 # 14
- psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1
- add tmp0, T1
- add h_64, T1
- pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2]
- mov a_64, T2
- xor c_64, T2
- pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15]
- and b_64, T2
- mov a_64, tmp0
- psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
- and c_64, tmp0
- xor tmp0, T2
- psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
- mov a_64, tmp0
- ror $5, tmp0 # 39
- pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
- xor a_64, tmp0
- ror $6, tmp0 # 34
- pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
- xor a_64, tmp0
- ror $28, tmp0 # 28
- psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
- add tmp0, T2
- add T1, d_64
- psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
- lea (T1, T2), h_64
- RotateState
- movdqa %xmm2, %xmm1 # XMM1 = W[t-2]
- mov f_64, T1
- xor g_64, T1
- movdqa %xmm5, %xmm4 # XMM4 = W[t-15]
- and e_64, T1
- xor g_64, T1
- psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42
- idx = \rnd + 1
- add WK_2(idx), T1
- mov e_64, tmp0
- psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7
- ror $23, tmp0 # 41
- xor e_64, tmp0
- pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2]
- ror $4, tmp0 # 18
- xor e_64, tmp0
- pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15]
- ror $14, tmp0 # 14
- add tmp0, T1
- psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
- add h_64, T1
- mov a_64, T2
- psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
- xor c_64, T2
- and b_64, T2
- pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2])
- mov a_64, tmp0
- and c_64, tmp0
- idx = \rnd - 7
- movdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
- xor tmp0, T2
- pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15])
- mov a_64, tmp0
- paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15])
- ror $5, tmp0 # 39
- idx =\rnd-16
- paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
- xor a_64, tmp0
- paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
- ror $6, tmp0 # 34
- movdqa %xmm0, W_t(\rnd) # Store scheduled qwords
- xor a_64, tmp0
- paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t]
- ror $28, tmp0 # 28
- idx = \rnd
- movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
- add tmp0, T2
- add T1, d_64
- lea (T1, T2), h_64
- RotateState
-.endm
-
-########################################################################
-## void sha512_transform_ssse3(struct sha512_state *state, const u8 *data,
-## int blocks);
-# (struct sha512_state is assumed to begin with u64 state[8])
-# Purpose: Updates the SHA512 digest stored at "state" with the message
-# stored in "data".
-# The size of the message pointed to by "data" must be an integer multiple
-# of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks.
-########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_ssse3)
-
- test msglen, msglen
- je .Lnowork
-
- # Save GPRs
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- # Allocate Stack Space
- push %rbp
- mov %rsp, %rbp
- sub $frame_size, %rsp
- and $~(0x20 - 1), %rsp
-
-.Lupdateblock:
-
-# Load state variables
- mov DIGEST(0), a_64
- mov DIGEST(1), b_64
- mov DIGEST(2), c_64
- mov DIGEST(3), d_64
- mov DIGEST(4), e_64
- mov DIGEST(5), f_64
- mov DIGEST(6), g_64
- mov DIGEST(7), h_64
-
- t = 0
- .rept 80/2 + 1
- # (80 rounds) / (2 rounds/iteration) + (1 iteration)
- # +1 iteration because the scheduler leads hashing by 1 iteration
- .if t < 2
- # BSWAP 2 QWORDS
- movdqa XMM_QWORD_BSWAP(%rip), %xmm1
- movdqu MSG(t), %xmm0
- pshufb %xmm1, %xmm0 # BSWAP
- movdqa %xmm0, W_t(t) # Store Scheduled Pair
- paddq K_t(t), %xmm0 # Compute W[t]+K[t]
- movdqa %xmm0, WK_2(t) # Store into WK for rounds
- .elseif t < 16
- # BSWAP 2 QWORDS; Compute 2 Rounds
- movdqu MSG(t), %xmm0
- pshufb %xmm1, %xmm0 # BSWAP
- SHA512_Round t-2 # Round t-2
- movdqa %xmm0, W_t(t) # Store Scheduled Pair
- paddq K_t(t), %xmm0 # Compute W[t]+K[t]
- SHA512_Round t-1 # Round t-1
- movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK
- .elseif t < 79
- # Schedule 2 QWORDS; Compute 2 Rounds
- SHA512_2Sched_2Round_sse t
- .else
- # Compute 2 Rounds
- SHA512_Round t-2
- SHA512_Round t-1
- .endif
- t = t+2
- .endr
-
- # Update digest
- add a_64, DIGEST(0)
- add b_64, DIGEST(1)
- add c_64, DIGEST(2)
- add d_64, DIGEST(3)
- add e_64, DIGEST(4)
- add f_64, DIGEST(5)
- add g_64, DIGEST(6)
- add h_64, DIGEST(7)
-
- # Advance to next message block
- add $16*8, msg
- dec msglen
- jnz .Lupdateblock
-
- # Restore Stack Pointer
- mov %rbp, %rsp
- pop %rbp
-
- # Restore GPRs
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
-.Lnowork:
- RET
-SYM_FUNC_END(sha512_transform_ssse3)
-
-########################################################################
-### Binary Data
-
-.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
-.align 16
-# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
-XMM_QWORD_BSWAP:
- .octa 0x08090a0b0c0d0e0f0001020304050607
-
-# Mergeable 640-byte rodata section. This allows the linker to merge the table
-# with other, exactly the same 640-byte fragment of another rodata section
-# (if such section exists).
-.section .rodata.cst640.K512, "aM", @progbits, 640
-.align 64
-# K[t] used in SHA512 hashing
-K512:
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
deleted file mode 100644
index 067684c54395..000000000000
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ /dev/null
@@ -1,322 +0,0 @@
-/*
- * Cryptographic API.
- *
- * Glue code for the SHA512 Secure Hash Algorithm assembler
- * implementation using supplemental SSE3 / AVX / AVX2 instructions.
- *
- * This file is based on sha512_generic.c
- *
- * Copyright (C) 2013 Intel Corporation
- * Author: Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-
-asmlinkage void sha512_transform_ssse3(struct sha512_state *state,
- const u8 *data, int blocks);
-
-static int sha512_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, sha512_block_fn *sha512_xform)
-{
- int remain;
-
- /*
- * Make sure struct sha512_state begins directly with the SHA512
- * 512-bit internal state, as this is what the asm functions expect.
- */
- BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
-
- kernel_fpu_begin();
- remain = sha512_base_do_update_blocks(desc, data, len, sha512_xform);
- kernel_fpu_end();
-
- return remain;
-}
-
-static int sha512_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out, sha512_block_fn *sha512_xform)
-{
- kernel_fpu_begin();
- sha512_base_do_finup(desc, data, len, sha512_xform);
- kernel_fpu_end();
-
- return sha512_base_finish(desc, out);
-}
-
-static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_update(desc, data, len, sha512_transform_ssse3);
-}
-
-static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha512_finup(desc, data, len, out, sha512_transform_ssse3);
-}
-
-static struct shash_alg sha512_ssse3_algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_ssse3_update,
- .finup = sha512_ssse3_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-ssse3",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_ssse3_update,
- .finup = sha512_ssse3_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-ssse3",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int register_sha512_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- return crypto_register_shashes(sha512_ssse3_algs,
- ARRAY_SIZE(sha512_ssse3_algs));
- return 0;
-}
-
-static void unregister_sha512_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_shashes(sha512_ssse3_algs,
- ARRAY_SIZE(sha512_ssse3_algs));
-}
-
-asmlinkage void sha512_transform_avx(struct sha512_state *state,
- const u8 *data, int blocks);
-static bool avx_usable(void)
-{
- if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
- if (boot_cpu_has(X86_FEATURE_AVX))
- pr_info("AVX detected but unusable.\n");
- return false;
- }
-
- return true;
-}
-
-static int sha512_avx_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_update(desc, data, len, sha512_transform_avx);
-}
-
-static int sha512_avx_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha512_finup(desc, data, len, out, sha512_transform_avx);
-}
-
-static struct shash_alg sha512_avx_algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_avx_update,
- .finup = sha512_avx_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-avx",
- .cra_priority = 160,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_avx_update,
- .finup = sha512_avx_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-avx",
- .cra_priority = 160,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int register_sha512_avx(void)
-{
- if (avx_usable())
- return crypto_register_shashes(sha512_avx_algs,
- ARRAY_SIZE(sha512_avx_algs));
- return 0;
-}
-
-static void unregister_sha512_avx(void)
-{
- if (avx_usable())
- crypto_unregister_shashes(sha512_avx_algs,
- ARRAY_SIZE(sha512_avx_algs));
-}
-
-asmlinkage void sha512_transform_rorx(struct sha512_state *state,
- const u8 *data, int blocks);
-
-static int sha512_avx2_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_update(desc, data, len, sha512_transform_rorx);
-}
-
-static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha512_finup(desc, data, len, out, sha512_transform_rorx);
-}
-
-static struct shash_alg sha512_avx2_algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_avx2_update,
- .finup = sha512_avx2_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-avx2",
- .cra_priority = 170,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_avx2_update,
- .finup = sha512_avx2_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-avx2",
- .cra_priority = 170,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static bool avx2_usable(void)
-{
- if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_BMI2))
- return true;
-
- return false;
-}
-
-static int register_sha512_avx2(void)
-{
- if (avx2_usable())
- return crypto_register_shashes(sha512_avx2_algs,
- ARRAY_SIZE(sha512_avx2_algs));
- return 0;
-}
-static const struct x86_cpu_id module_cpu_ids[] = {
- X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
-
-static void unregister_sha512_avx2(void)
-{
- if (avx2_usable())
- crypto_unregister_shashes(sha512_avx2_algs,
- ARRAY_SIZE(sha512_avx2_algs));
-}
-
-static int __init sha512_ssse3_mod_init(void)
-{
- if (!x86_match_cpu(module_cpu_ids))
- return -ENODEV;
-
- if (register_sha512_ssse3())
- goto fail;
-
- if (register_sha512_avx()) {
- unregister_sha512_ssse3();
- goto fail;
- }
-
- if (register_sha512_avx2()) {
- unregister_sha512_avx();
- unregister_sha512_ssse3();
- goto fail;
- }
-
- return 0;
-fail:
- return -ENODEV;
-}
-
-static void __exit sha512_ssse3_mod_fini(void)
-{
- unregister_sha512_avx2();
- unregister_sha512_avx();
- unregister_sha512_ssse3();
-}
-
-module_init(sha512_ssse3_mod_init);
-module_exit(sha512_ssse3_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-
-MODULE_ALIAS_CRYPTO("sha512");
-MODULE_ALIAS_CRYPTO("sha512-ssse3");
-MODULE_ALIAS_CRYPTO("sha512-avx");
-MODULE_ALIAS_CRYPTO("sha512-avx2");
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha384-ssse3");
-MODULE_ALIAS_CRYPTO("sha384-avx");
-MODULE_ALIAS_CRYPTO("sha384-avx2");
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index d83236b96f22..94519688b007 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -369,7 +369,7 @@ For 32-bit we have the following conventions - kernel is built with
.endm
.macro STACKLEAK_ERASE_NOCLOBBER
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
PUSH_AND_CLEAR_REGS
call stackleak_erase
POP_REGS
@@ -388,7 +388,7 @@ For 32-bit we have the following conventions - kernel is built with
#endif /* !CONFIG_X86_64 */
.macro STACKLEAK_ERASE
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
call stackleak_erase
#endif
.endm
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ac007ea00979..4877e16da69a 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -473,3 +473,5 @@
465 i386 listxattrat sys_listxattrat
466 i386 removexattrat sys_removexattrat
467 i386 open_tree_attr sys_open_tree_attr
+468 i386 file_getattr sys_file_getattr
+469 i386 file_setattr sys_file_setattr
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index cfb5ca41e30d..92cf0fe2291e 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -391,6 +391,8 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
#
# Due to a historical design error, certain syscalls are numbered differently
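
A minimal userspace probe for the two syscall numbers added above (468/469). This is an illustrative sketch, not part of the patch: only the numbers and names come from the tables in this diff, the argument layout of file_getattr()/file_setattr() is not shown here, so the call passes zeros purely to distinguish ENOSYS from other errors.

/* probe_file_getattr.c: check whether the running kernel wires up nr 468. */
#include <errno.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_file_getattr
#define __NR_file_getattr 468	/* from the syscall tables above */
#endif

int main(void)
{
	/* Zero arguments are placeholders; the real ABI is defined elsewhere. */
	long ret = syscall(__NR_file_getattr, 0L, 0L, 0L, 0L, 0L, 0L);

	if (ret < 0 && errno == ENOSYS)
		printf("file_getattr: not implemented by this kernel\n");
	else
		printf("file_getattr: present (ret=%ld, errno=%d)\n", ret, errno);
	return 0;
}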
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 54d3e9774d62..f247f5f5cb44 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -62,7 +62,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),)
endif
endif
-$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
$(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO
#
@@ -123,6 +123,7 @@ KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(KSTACK_ERASE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32))
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index e0815a12db90..a762f7f5b161 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1807,6 +1807,12 @@ static const struct intel_uncore_init_fun lnl_uncore_init __initconst = {
.mmio_init = lnl_uncore_mmio_init,
};
+static const struct intel_uncore_init_fun ptl_uncore_init __initconst = {
+ .cpu_init = ptl_uncore_cpu_init,
+ .mmio_init = ptl_uncore_mmio_init,
+ .use_discovery = true,
+};
+
static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
@@ -1888,6 +1894,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_VFM(INTEL_ARROWLAKE_U, &mtl_uncore_init),
X86_MATCH_VFM(INTEL_ARROWLAKE_H, &mtl_uncore_init),
X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_uncore_init),
+ X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &ptl_uncore_init),
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init),
X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_uncore_init),
X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init),
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 3dcb88c0ecfa..d8815fff7588 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -612,10 +612,12 @@ void tgl_uncore_cpu_init(void);
void adl_uncore_cpu_init(void);
void lnl_uncore_cpu_init(void);
void mtl_uncore_cpu_init(void);
+void ptl_uncore_cpu_init(void);
void tgl_uncore_mmio_init(void);
void tgl_l_uncore_mmio_init(void);
void adl_uncore_mmio_init(void);
void lnl_uncore_mmio_init(void);
+void ptl_uncore_mmio_init(void);
int snb_pci2phy_map_init(int devid);
/* uncore_snbep.c */
diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
index 18a3022f26a0..7d57ce706feb 100644
--- a/arch/x86/events/intel/uncore_discovery.c
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -274,32 +274,15 @@ uncore_ignore_unit(struct uncore_unit_discovery *unit, int *ignore)
return false;
}
-static int parse_discovery_table(struct pci_dev *dev, int die,
- u32 bar_offset, bool *parsed,
- int *ignore)
+static int __parse_discovery_table(resource_size_t addr, int die,
+ bool *parsed, int *ignore)
{
struct uncore_global_discovery global;
struct uncore_unit_discovery unit;
void __iomem *io_addr;
- resource_size_t addr;
unsigned long size;
- u32 val;
int i;
- pci_read_config_dword(dev, bar_offset, &val);
-
- if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64)
- return -EINVAL;
-
- addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK);
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
- if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) {
- u32 val2;
-
- pci_read_config_dword(dev, bar_offset + 4, &val2);
- addr |= ((resource_size_t)val2) << 32;
- }
-#endif
size = UNCORE_DISCOVERY_GLOBAL_MAP_SIZE;
io_addr = ioremap(addr, size);
if (!io_addr)
@@ -342,7 +325,32 @@ static int parse_discovery_table(struct pci_dev *dev, int die,
return 0;
}
-bool intel_uncore_has_discovery_tables(int *ignore)
+static int parse_discovery_table(struct pci_dev *dev, int die,
+ u32 bar_offset, bool *parsed,
+ int *ignore)
+{
+ resource_size_t addr;
+ u32 val;
+
+ pci_read_config_dword(dev, bar_offset, &val);
+
+ if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64)
+ return -EINVAL;
+
+ addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK);
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) {
+ u32 val2;
+
+ pci_read_config_dword(dev, bar_offset + 4, &val2);
+ addr |= ((resource_size_t)val2) << 32;
+ }
+#endif
+
+ return __parse_discovery_table(addr, die, parsed, ignore);
+}
+
+static bool intel_uncore_has_discovery_tables_pci(int *ignore)
{
u32 device, val, entry_id, bar_offset;
int die, dvsec = 0, ret = true;
@@ -391,6 +399,45 @@ err:
return ret;
}
+static bool intel_uncore_has_discovery_tables_msr(int *ignore)
+{
+ unsigned long *die_mask;
+ bool parsed = false;
+ int cpu, die;
+ u64 base;
+
+ die_mask = kcalloc(BITS_TO_LONGS(uncore_max_dies()),
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!die_mask)
+ return false;
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ die = topology_logical_die_id(cpu);
+ if (__test_and_set_bit(die, die_mask))
+ continue;
+
+ if (rdmsrq_safe_on_cpu(cpu, UNCORE_DISCOVERY_MSR, &base))
+ continue;
+
+ if (!base)
+ continue;
+
+ __parse_discovery_table(base, die, &parsed, ignore);
+ }
+
+ cpus_read_unlock();
+
+ kfree(die_mask);
+ return parsed;
+}
+
+bool intel_uncore_has_discovery_tables(int *ignore)
+{
+ return intel_uncore_has_discovery_tables_msr(ignore) ||
+ intel_uncore_has_discovery_tables_pci(ignore);
+}
+
void intel_uncore_clear_discovery_tables(void)
{
struct intel_uncore_discovery_type *type, *next;
@@ -604,7 +651,7 @@ void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box)
}
addr = unit->addr;
- box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE);
+ box->io_addr = ioremap(addr, type->mmio_map_size);
if (!box->io_addr) {
pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n",
type->type_id, unit->id, (unsigned long long)addr);
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
index 0e94aa7db8e7..dff75c98e22f 100644
--- a/arch/x86/events/intel/uncore_discovery.h
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -1,5 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0-only */
+/* Store the full address of the global discovery table */
+#define UNCORE_DISCOVERY_MSR 0x201e
+
/* Generic device ID of a discovery table device */
#define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7
/* Capability ID for a discovery table device */
@@ -168,3 +171,7 @@ bool intel_generic_uncore_assign_hw_event(struct perf_event *event,
struct intel_uncore_box *box);
void uncore_find_add_unit(struct intel_uncore_discovery_unit *node,
struct rb_root *root, u16 *num_units);
+struct intel_uncore_type **
+uncore_get_uncores(enum uncore_access_type type_id, int num_extra,
+ struct intel_uncore_type **extra, int max_num_types,
+ struct intel_uncore_type **uncores);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index a1a96833e30e..807e582b8f17 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -1855,3 +1855,82 @@ void lnl_uncore_mmio_init(void)
}
/* end of Lunar Lake MMIO uncore support */
+
+/* Panther Lake uncore support */
+
+#define UNCORE_PTL_MAX_NUM_UNCORE_TYPES 42
+#define UNCORE_PTL_TYPE_IMC 6
+#define UNCORE_PTL_TYPE_SNCU 34
+#define UNCORE_PTL_TYPE_HBO 41
+
+#define PTL_UNCORE_GLOBAL_CTL_OFFSET 0x380
+
+static struct intel_uncore_type ptl_uncore_imc = {
+ .name = "imc",
+ .mmio_map_size = 0xf00,
+};
+
+static void ptl_uncore_sncu_init_box(struct intel_uncore_box *box)
+{
+ intel_generic_uncore_mmio_init_box(box);
+
+ /* Clear the global freeze bit */
+ if (box->io_addr)
+ writel(0, box->io_addr + PTL_UNCORE_GLOBAL_CTL_OFFSET);
+}
+
+static struct intel_uncore_ops ptl_uncore_sncu_ops = {
+ .init_box = ptl_uncore_sncu_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = intel_generic_uncore_mmio_disable_box,
+ .enable_box = intel_generic_uncore_mmio_enable_box,
+ .disable_event = intel_generic_uncore_mmio_disable_event,
+ .enable_event = intel_generic_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+static struct intel_uncore_type ptl_uncore_sncu = {
+ .name = "sncu",
+ .ops = &ptl_uncore_sncu_ops,
+ .mmio_map_size = 0xf00,
+};
+
+static struct intel_uncore_type ptl_uncore_hbo = {
+ .name = "hbo",
+ .mmio_map_size = 0xf00,
+};
+
+static struct intel_uncore_type *ptl_uncores[UNCORE_PTL_MAX_NUM_UNCORE_TYPES] = {
+ [UNCORE_PTL_TYPE_IMC] = &ptl_uncore_imc,
+ [UNCORE_PTL_TYPE_SNCU] = &ptl_uncore_sncu,
+ [UNCORE_PTL_TYPE_HBO] = &ptl_uncore_hbo,
+};
+
+#define UNCORE_PTL_MMIO_EXTRA_UNCORES 1
+
+static struct intel_uncore_type *ptl_mmio_extra_uncores[UNCORE_PTL_MMIO_EXTRA_UNCORES] = {
+ &adl_uncore_imc_free_running,
+};
+
+void ptl_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO,
+ UNCORE_PTL_MMIO_EXTRA_UNCORES,
+ ptl_mmio_extra_uncores,
+ UNCORE_PTL_MAX_NUM_UNCORE_TYPES,
+ ptl_uncores);
+}
+
+static struct intel_uncore_type *ptl_msr_uncores[] = {
+ &mtl_uncore_cbox,
+ NULL
+};
+
+void ptl_uncore_cpu_init(void)
+{
+ mtl_uncore_cbox.num_boxes = 6;
+ mtl_uncore_cbox.ops = &lnl_uncore_msr_ops;
+ uncore_msr_uncores = ptl_msr_uncores;
+}
+
+/* end of Panther Lake uncore support */
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 2824dc9950be..e1f370b8d065 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -6409,9 +6409,11 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type,
to_type->get_topology = from_type->get_topology;
if (from_type->cleanup_mapping)
to_type->cleanup_mapping = from_type->cleanup_mapping;
+ if (from_type->mmio_map_size)
+ to_type->mmio_map_size = from_type->mmio_map_size;
}
-static struct intel_uncore_type **
+struct intel_uncore_type **
uncore_get_uncores(enum uncore_access_type type_id, int num_extra,
struct intel_uncore_type **extra, int max_num_types,
struct intel_uncore_type **uncores)
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 5ab1a4598d00..a03aa6f999d1 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -158,13 +158,13 @@ static inline bool acpi_has_cpu_in_madt(void)
}
#define ACPI_HAVE_ARCH_SET_ROOT_POINTER
-static inline void acpi_arch_set_root_pointer(u64 addr)
+static __always_inline void acpi_arch_set_root_pointer(u64 addr)
{
x86_init.acpi.set_root_pointer(addr);
}
#define ACPI_HAVE_ARCH_GET_ROOT_POINTER
-static inline u64 acpi_arch_get_root_pointer(void)
+static __always_inline u64 acpi_arch_get_root_pointer(void)
{
return x86_init.acpi.get_root_pointer();
}
diff --git a/arch/x86/include/asm/ce4100.h b/arch/x86/include/asm/ce4100.h
index 2930f560d7f3..e1f965bb1e31 100644
--- a/arch/x86/include/asm/ce4100.h
+++ b/arch/x86/include/asm/ce4100.h
@@ -4,4 +4,10 @@
int ce4100_pci_init(void);
+#ifdef CONFIG_SERIAL_8250
+void __init sdv_serial_fixup(void);
+#else
+static inline void sdv_serial_fixup(void) {};
+#endif
+
#endif
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 286d509f9363..602957dd2609 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -458,9 +458,12 @@
#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */
#define X86_FEATURE_VERW_CLEAR (20*32+ 5) /* The memory form of VERW mitigates TSA */
#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */
+
#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */
#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */
+#define X86_FEATURE_GP_ON_USER_CPUID (20*32+17) /* User CPUID faulting */
+
#define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */
#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */
#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 1c94121acd3d..93e99d2583d6 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -118,7 +118,7 @@ enum xfeature {
XFEATURE_PKRU,
XFEATURE_PASID,
XFEATURE_CET_USER,
- XFEATURE_CET_KERNEL_UNUSED,
+ XFEATURE_CET_KERNEL,
XFEATURE_RSRVD_COMP_13,
XFEATURE_RSRVD_COMP_14,
XFEATURE_LBR,
@@ -142,7 +142,7 @@ enum xfeature {
#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
#define XFEATURE_MASK_PASID (1 << XFEATURE_PASID)
#define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER)
-#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL_UNUSED)
+#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL)
#define XFEATURE_MASK_LBR (1 << XFEATURE_LBR)
#define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG)
#define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA)
@@ -269,6 +269,16 @@ struct cet_user_state {
};
/*
+ * State component 12 is Control-flow Enforcement supervisor states.
+ * This state includes SSP pointers for privilege levels 0 through 2.
+ */
+struct cet_supervisor_state {
+ u64 pl0_ssp;
+ u64 pl1_ssp;
+ u64 pl2_ssp;
+} __packed;
+
+/*
* State component 15: Architectural LBR configuration state.
* The size of Arch LBR state depends on the number of LBRs (lbr_depth).
*/
@@ -552,6 +562,31 @@ struct fpu_guest {
};
/*
+ * FPU state configuration data for fpu_guest.
+ * Initialized at boot time. Read only after init.
+ */
+struct vcpu_fpu_config {
+ /*
+ * @size:
+ *
+ * The default size of the register state buffer in guest FPUs.
+ * Includes all supported features except independent managed
+ * features and features which have to be requested by user space
+ * before usage.
+ */
+ unsigned int size;
+
+ /*
+ * @features:
+ *
+ * The default supported features bitmap in guest FPUs. Does not
+ * include independent managed features and features which have to
+ * be requested by user space before usage.
+ */
+ u64 features;
+};
+
+/*
* FPU state configuration data. Initialized at boot time. Read only after init.
*/
struct fpu_state_config {
@@ -567,8 +602,9 @@ struct fpu_state_config {
* @default_size:
*
* The default size of the register state buffer. Includes all
- * supported features except independent managed features and
- * features which have to be requested by user space before usage.
+ * supported features except independent managed features,
+ * guest-only features and features which have to be requested by
+ * user space before usage.
*/
unsigned int default_size;
@@ -584,8 +620,8 @@ struct fpu_state_config {
* @default_features:
*
* The default supported features bitmap. Does not include
- * independent managed features and features which have to
- * be requested by user space before usage.
+ * independent managed features, guest-only features and features
+ * which have to be requested by user space before usage.
*/
u64 default_features;
/*
@@ -606,5 +642,6 @@ struct fpu_state_config {
/* FPU state configuration information */
extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg;
+extern struct vcpu_fpu_config guest_default_cfg;
#endif /* _ASM_X86_FPU_TYPES_H */
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index b308a76afbb7..7a7dc9d56027 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -46,9 +46,13 @@
/* Features which are dynamically enabled for a process on request */
#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
+/* Supervisor features which are enabled only in guest FPUs */
+#define XFEATURE_MASK_GUEST_SUPERVISOR XFEATURE_MASK_CET_KERNEL
+
/* All currently supported supervisor features */
#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \
- XFEATURE_MASK_CET_USER)
+ XFEATURE_MASK_CET_USER | \
+ XFEATURE_MASK_GUEST_SUPERVISOR)
/*
* A supervisor state component may not always contain valuable information,
@@ -75,8 +79,7 @@
* Unsupported supervisor features. When a supervisor feature in this mask is
* supported in the future, move it to the supported supervisor feature mask.
*/
-#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \
- XFEATURE_MASK_CET_KERNEL)
+#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT)
/* All supervisor states including supported and unsupported states. */
#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 8b1b1abcef15..5a68e9db6518 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -5,7 +5,7 @@
#if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000
#define __head __section(".head.text") __no_sanitize_undefined __no_stack_protector
#else
-#define __head __section(".head.text") __no_sanitize_undefined
+#define __head __section(".head.text") __no_sanitize_undefined __no_kstack_erase
#endif
struct x86_mapping_info {
diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h
index 43b7657febca..944637a4e6de 100644
--- a/arch/x86/include/asm/intel_telemetry.h
+++ b/arch/x86/include/asm/intel_telemetry.h
@@ -59,18 +59,6 @@ struct telemetry_plt_config {
};
struct telemetry_core_ops {
- int (*get_sampling_period)(u8 *pss_min_period, u8 *pss_max_period,
- u8 *ioss_min_period, u8 *ioss_max_period);
-
- int (*get_eventconfig)(struct telemetry_evtconfig *pss_evtconfig,
- struct telemetry_evtconfig *ioss_evtconfig,
- int pss_len, int ioss_len);
-
- int (*update_events)(struct telemetry_evtconfig pss_evtconfig,
- struct telemetry_evtconfig ioss_evtconfig);
-
- int (*set_sampling_period)(u8 pss_period, u8 ioss_period);
-
int (*get_trace_verbosity)(enum telemetry_unit telem_unit,
u32 *verbosity);
@@ -84,11 +72,6 @@ struct telemetry_core_ops {
int (*read_eventlog)(enum telemetry_unit telem_unit,
struct telemetry_evtlog *evtlog,
int len, int log_all_evts);
-
- int (*add_events)(u8 num_pss_evts, u8 num_ioss_evts,
- u32 *pss_evtmap, u32 *ioss_evtmap);
-
- int (*reset_events)(void);
};
int telemetry_set_pltdata(const struct telemetry_core_ops *ops,
@@ -101,35 +84,15 @@ struct telemetry_plt_config *telemetry_get_pltdata(void);
int telemetry_get_evtname(enum telemetry_unit telem_unit,
const char **name, int len);
-int telemetry_update_events(struct telemetry_evtconfig pss_evtconfig,
- struct telemetry_evtconfig ioss_evtconfig);
-
-int telemetry_add_events(u8 num_pss_evts, u8 num_ioss_evts,
- u32 *pss_evtmap, u32 *ioss_evtmap);
-
-int telemetry_reset_events(void);
-
-int telemetry_get_eventconfig(struct telemetry_evtconfig *pss_config,
- struct telemetry_evtconfig *ioss_config,
- int pss_len, int ioss_len);
-
int telemetry_read_events(enum telemetry_unit telem_unit,
struct telemetry_evtlog *evtlog, int len);
-int telemetry_raw_read_events(enum telemetry_unit telem_unit,
- struct telemetry_evtlog *evtlog, int len);
-
int telemetry_read_eventlog(enum telemetry_unit telem_unit,
struct telemetry_evtlog *evtlog, int len);
int telemetry_raw_read_eventlog(enum telemetry_unit telem_unit,
struct telemetry_evtlog *evtlog, int len);
-int telemetry_get_sampling_period(u8 *pss_min_period, u8 *pss_max_period,
- u8 *ioss_min_period, u8 *ioss_max_period);
-
-int telemetry_set_sampling_period(u8 pss_period, u8 ioss_period);
-
int telemetry_set_trace_verbosity(enum telemetry_unit telem_unit,
u32 verbosity);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 5cfb5d74dd5f..7490bb5c0776 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -733,6 +733,11 @@
#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301
#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302
+/* AMD Hardware Feedback Support MSRs */
+#define MSR_AMD_WORKLOAD_CLASS_CONFIG 0xc0000500
+#define MSR_AMD_WORKLOAD_CLASS_ID 0xc0000501
+#define MSR_AMD_WORKLOAD_HRST 0xc0000502
+
/* AMD Last Branch Record MSRs */
#define MSR_AMD64_LBR_SELECT 0xc000010e
@@ -831,6 +836,7 @@
#define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT)
#define MSR_K7_HWCR_IRPERF_EN_BIT 30
#define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT)
+#define MSR_K7_HWCR_CPUID_USER_DIS_BIT 35
#define MSR_K7_FID_VID_CTL 0xc0010041
#define MSR_K7_FID_VID_STATUS 0xc0010042
#define MSR_K7_HWCR_CPB_DIS_BIT 25
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b74ec5c3643b..a5731fb1e9dd 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -214,9 +214,6 @@ enum page_cache_mode {
#define PAGE_READONLY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0)
#define PAGE_READONLY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0)
-#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G)
-#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G)
-
/*
* Page tables needs to have Write=1 in order for any lower PTEs to be
* writable. This includes shadow stack memory (Write=0, Dirty=1)
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index f607081a022a..e406a1e92c63 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -78,7 +78,7 @@ extern unsigned char secondary_startup_64[];
extern unsigned char secondary_startup_64_no_verify[];
#endif
-static inline size_t real_mode_size_needed(void)
+static __always_inline size_t real_mode_size_needed(void)
{
if (real_mode_header)
return 0; /* already allocated. */
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index a631f7d7c0c0..89075ff19afa 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -243,6 +243,7 @@ struct snp_guest_req {
size_t resp_sz;
u64 exit_code;
+ u64 exitinfo2;
unsigned int vmpck_id;
u8 msg_version;
u8 msg_type;
@@ -460,7 +461,7 @@ static __always_inline void sev_es_nmi_complete(void)
cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
__sev_es_nmi_complete();
}
-extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd);
+extern int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd);
extern void sev_enable(struct boot_params *bp);
/*
@@ -501,8 +502,6 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
return rc;
}
-struct snp_guest_request_ioctl;
-
void setup_ghcb(void);
void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
unsigned long npages);
@@ -528,8 +527,7 @@ void snp_kexec_begin(void);
int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id);
struct snp_msg_desc *snp_msg_alloc(void);
void snp_msg_free(struct snp_msg_desc *mdesc);
-int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
- struct snp_guest_request_ioctl *rio);
+int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req);
int snp_svsm_vtpm_send_command(u8 *buffer);
@@ -571,7 +569,7 @@ static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; }
static inline void sev_es_nmi_complete(void) { }
-static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; }
+static inline int sev_es_efi_map_ghcbs_cas(pgd_t *pgd) { return 0; }
static inline void sev_enable(struct boot_params *bp) { }
static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; }
static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; }
@@ -602,8 +600,8 @@ static inline void snp_kexec_begin(void) { }
static inline int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id) { return -1; }
static inline struct snp_msg_desc *snp_msg_alloc(void) { return NULL; }
static inline void snp_msg_free(struct snp_msg_desc *mdesc) { }
-static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
- struct snp_guest_request_ioctl *rio) { return -ENODEV; }
+static inline int snp_send_guest_request(struct snp_msg_desc *mdesc,
+ struct snp_guest_req *req) { return -ENODEV; }
static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; }
static inline void __init snp_secure_tsc_prepare(void) { }
static inline void __init snp_secure_tsc_init(void) { }
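
A hypothetical caller sketch for the reworked snp_send_guest_request() prototype above: with the ioctl struct dropped from the signature, the exit information is presumably read back from the new @exitinfo2 field of struct snp_guest_req. Only the prototype and the field come from this diff; the surrounding driver code and the helper name are assumptions.

/* Kernel-context sketch, not part of this patch. */
static int example_issue_guest_request(struct snp_msg_desc *mdesc,
				       struct snp_guest_req *req)
{
	int rc = snp_send_guest_request(mdesc, req);

	if (rc)
		pr_err("SNP guest request failed: rc=%d exitinfo2=0x%llx\n",
		       rc, req->exitinfo2);
	return rc;
}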
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 0c1c68039d6f..22bfebe6776d 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -112,7 +112,10 @@ void __noreturn hlt_play_dead(void);
void native_play_dead(void);
void play_dead_common(void);
void wbinvd_on_cpu(int cpu);
-int wbinvd_on_all_cpus(void);
+void wbinvd_on_all_cpus(void);
+void wbinvd_on_cpus_mask(struct cpumask *cpus);
+void wbnoinvd_on_all_cpus(void);
+void wbnoinvd_on_cpus_mask(struct cpumask *cpus);
void smp_kick_mwait_play_dead(void);
void __noreturn mwait_play_dead(unsigned int eax_hint);
@@ -148,10 +151,24 @@ static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
#else /* !CONFIG_SMP */
#define wbinvd_on_cpu(cpu) wbinvd()
-static inline int wbinvd_on_all_cpus(void)
+static inline void wbinvd_on_all_cpus(void)
{
wbinvd();
- return 0;
+}
+
+static inline void wbinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ wbinvd();
+}
+
+static inline void wbnoinvd_on_all_cpus(void)
+{
+ wbnoinvd();
+}
+
+static inline void wbnoinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ wbnoinvd();
}
static inline struct cpumask *cpu_llc_shared_mask(int cpu)
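
A short kernel-context sketch (not part of this patch) of how the new mask-based helpers pair with the shared-cache masks already declared in this header; cpu_llc_shared_mask() is taken from the surrounding context, the wrapper name is hypothetical.

/* Write back, without invalidating, the caches of every CPU that shares
 * the last-level cache with @cpu. */
static void example_writeback_llc_peers(int cpu)
{
	wbnoinvd_on_cpus_mask(cpu_llc_shared_mask(cpu));
}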
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index ecda17efa042..fde2bd7af19e 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -104,9 +104,36 @@ static inline void wrpkru(u32 pkru)
}
#endif
+/*
+ * Write back all modified lines in all levels of cache associated with this
+ * logical processor to main memory, and then invalidate all caches. Depending
+ * on the micro-architecture, WBINVD (and WBNOINVD below) may or may not affect
+ * lower level caches associated with another logical processor that shares any
+ * level of this processor's cache hierarchy.
+ */
static __always_inline void wbinvd(void)
{
- asm volatile("wbinvd": : :"memory");
+ asm volatile("wbinvd" : : : "memory");
+}
+
+/* Instruction encoding provided for binutils backwards compatibility. */
+#define ASM_WBNOINVD _ASM_BYTES(0xf3,0x0f,0x09)
+
+/*
+ * Write back all modified lines in all levels of cache associated with this
+ * logical processor to main memory, but do NOT explicitly invalidate caches,
+ * i.e. leave all/most cache lines in the hierarchy in non-modified state.
+ */
+static __always_inline void wbnoinvd(void)
+{
+ /*
+ * Explicitly encode WBINVD if X86_FEATURE_WBNOINVD is unavailable even
+ * though WBNOINVD is backwards compatible (it's simply WBINVD with an
+ * ignored REP prefix), to guarantee that WBNOINVD isn't used if it
+ * needs to be avoided for any reason. For all supported usage in the
+ * kernel, WBINVD is functionally a superset of WBNOINVD.
+ */
+ alternative("wbinvd", ASM_WBNOINVD, X86_FEATURE_WBNOINVD);
}
static inline unsigned long __read_cr4(void)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 93069b13d3af..a947b46a8b64 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -183,6 +183,7 @@ setnew:
apicd->cpu = newcpu;
BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));
per_cpu(vector_irq, newcpu)[newvec] = desc;
+ apic_update_irq_cfg(irqd, newvec, newcpu);
}
static void vector_assign_managed_shutdown(struct irq_data *irqd)
@@ -261,7 +262,6 @@ assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest)
if (vector < 0)
return vector;
apic_update_vector(irqd, vector, cpu);
- apic_update_irq_cfg(irqd, vector, cpu);
return 0;
}
@@ -338,7 +338,7 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest)
if (vector < 0)
return vector;
apic_update_vector(irqd, vector, cpu);
- apic_update_irq_cfg(irqd, vector, cpu);
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 329ee185d8cc..a5ece6ebe8a7 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -530,9 +530,11 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
}
bsp_determine_snp(c);
-
tsa_init(c);
+ if (cpu_has(c, X86_FEATURE_GP_ON_USER_CPUID))
+ setup_force_cpu_cap(X86_FEATURE_CPUID_FAULT);
+
return;
warn:
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index f4d3abb12317..b74bf937cd9f 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -115,10 +115,9 @@ void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
static void __init set_return_thunk(void *thunk)
{
- if (x86_return_thunk != __x86_return_thunk)
- pr_warn("x86/bugs: return thunk changed\n");
-
x86_return_thunk = thunk;
+
+ pr_info("active return thunk: %ps\n", thunk);
}
/* Update SPEC_CTRL MSR and its cached copy unconditionally */
@@ -190,6 +189,39 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear);
EXPORT_SYMBOL_GPL(cpu_buf_vm_clear);
+#undef pr_fmt
+#define pr_fmt(fmt) "mitigations: " fmt
+
+static void __init cpu_print_attack_vectors(void)
+{
+ pr_info("Enabled attack vectors: ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
+ pr_cont("user_kernel, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER))
+ pr_cont("user_user, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST))
+ pr_cont("guest_host, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST))
+ pr_cont("guest_guest, ");
+
+ pr_cont("SMT mitigations: ");
+
+ switch (smt_mitigations) {
+ case SMT_MITIGATIONS_OFF:
+ pr_cont("off\n");
+ break;
+ case SMT_MITIGATIONS_AUTO:
+ pr_cont("auto\n");
+ break;
+ case SMT_MITIGATIONS_ON:
+ pr_cont("on\n");
+ }
+}
+
void __init cpu_select_mitigations(void)
{
/*
@@ -210,6 +242,8 @@ void __init cpu_select_mitigations(void)
x86_arch_cap_msr = x86_read_arch_cap_msr();
+ cpu_print_attack_vectors();
+
/* Select the proper CPU mitigations before patching alternatives: */
spectre_v1_select_mitigation();
spectre_v2_select_mitigation();
@@ -333,6 +367,62 @@ static void x86_amd_ssb_disable(void)
#undef pr_fmt
#define pr_fmt(fmt) "MDS: " fmt
+/*
+ * Returns true if vulnerability should be mitigated based on the
+ * selected attack vector controls.
+ *
+ * See Documentation/admin-guide/hw-vuln/attack_vector_controls.rst
+ */
+static bool __init should_mitigate_vuln(unsigned int bug)
+{
+ switch (bug) {
+ /*
+ * The only runtime-selected spectre_v1 mitigations in the kernel are
+ * related to SWAPGS protection on kernel entry. Therefore, protection
+ * is only required for the user->kernel attack vector.
+ */
+ case X86_BUG_SPECTRE_V1:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL);
+
+ case X86_BUG_SPECTRE_V2:
+ case X86_BUG_RETBLEED:
+ case X86_BUG_SRSO:
+ case X86_BUG_L1TF:
+ case X86_BUG_ITS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST);
+
+ case X86_BUG_SPECTRE_V2_USER:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST);
+
+ /*
+ * All the vulnerabilities below allow potentially leaking data
+ * across address spaces. Therefore, mitigation is required for
+ * any of these 4 attack vectors.
+ */
+ case X86_BUG_MDS:
+ case X86_BUG_TAA:
+ case X86_BUG_MMIO_STALE_DATA:
+ case X86_BUG_RFDS:
+ case X86_BUG_SRBDS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST);
+
+ case X86_BUG_GDS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST) ||
+ (smt_mitigations != SMT_MITIGATIONS_OFF);
+ default:
+ WARN(1, "Unknown bug %x\n", bug);
+ return false;
+ }
+}
+
/* Default mitigation for MDS-affected CPUs */
static enum mds_mitigations mds_mitigation __ro_after_init =
IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF;
@@ -386,13 +476,17 @@ static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init;
static void __init mds_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_MDS)) {
mds_mitigation = MDS_MITIGATION_OFF;
return;
}
- if (mds_mitigation == MDS_MITIGATION_AUTO)
- mds_mitigation = MDS_MITIGATION_FULL;
+ if (mds_mitigation == MDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_MDS))
+ mds_mitigation = MDS_MITIGATION_FULL;
+ else
+ mds_mitigation = MDS_MITIGATION_OFF;
+ }
if (mds_mitigation == MDS_MITIGATION_OFF)
return;
@@ -402,7 +496,7 @@ static void __init mds_select_mitigation(void)
static void __init mds_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
return;
/* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. */
@@ -423,7 +517,7 @@ static void __init mds_apply_mitigation(void)
mds_mitigation == MDS_MITIGATION_VMWERV) {
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
- (mds_nosmt || cpu_mitigations_auto_nosmt()))
+ (mds_nosmt || smt_mitigations == SMT_MITIGATIONS_ON))
cpu_smt_disable(false);
}
}
@@ -479,12 +573,13 @@ static void __init taa_select_mitigation(void)
return;
}
- if (cpu_mitigations_off())
- taa_mitigation = TAA_MITIGATION_OFF;
-
/* Microcode will be checked in taa_update_mitigation(). */
- if (taa_mitigation == TAA_MITIGATION_AUTO)
- taa_mitigation = TAA_MITIGATION_VERW;
+ if (taa_mitigation == TAA_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_TAA))
+ taa_mitigation = TAA_MITIGATION_VERW;
+ else
+ taa_mitigation = TAA_MITIGATION_OFF;
+ }
if (taa_mitigation != TAA_MITIGATION_OFF)
verw_clear_cpu_buf_mitigation_selected = true;
@@ -492,7 +587,7 @@ static void __init taa_select_mitigation(void)
static void __init taa_update_mitigation(void)
{
- if (!taa_vulnerable() || cpu_mitigations_off())
+ if (!taa_vulnerable())
return;
if (verw_clear_cpu_buf_mitigation_selected)
@@ -533,7 +628,7 @@ static void __init taa_apply_mitigation(void)
*/
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
- if (taa_nosmt || cpu_mitigations_auto_nosmt())
+ if (taa_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)
cpu_smt_disable(false);
}
}
@@ -579,8 +674,12 @@ static void __init mmio_select_mitigation(void)
}
/* Microcode will be checked in mmio_update_mitigation(). */
- if (mmio_mitigation == MMIO_MITIGATION_AUTO)
- mmio_mitigation = MMIO_MITIGATION_VERW;
+ if (mmio_mitigation == MMIO_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_MMIO_STALE_DATA))
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ else
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ }
if (mmio_mitigation == MMIO_MITIGATION_OFF)
return;
@@ -595,7 +694,7 @@ static void __init mmio_select_mitigation(void)
static void __init mmio_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
return;
if (verw_clear_cpu_buf_mitigation_selected)
@@ -643,7 +742,7 @@ static void __init mmio_apply_mitigation(void)
if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO))
static_branch_enable(&cpu_buf_idle_clear);
- if (mmio_nosmt || cpu_mitigations_auto_nosmt())
+ if (mmio_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)
cpu_smt_disable(false);
}
@@ -684,13 +783,17 @@ static inline bool __init verw_clears_cpu_reg_file(void)
static void __init rfds_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_RFDS)) {
rfds_mitigation = RFDS_MITIGATION_OFF;
return;
}
- if (rfds_mitigation == RFDS_MITIGATION_AUTO)
- rfds_mitigation = RFDS_MITIGATION_VERW;
+ if (rfds_mitigation == RFDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_RFDS))
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+ else
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ }
if (rfds_mitigation == RFDS_MITIGATION_OFF)
return;
@@ -701,7 +804,7 @@ static void __init rfds_select_mitigation(void)
static void __init rfds_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
return;
if (verw_clear_cpu_buf_mitigation_selected)
@@ -802,13 +905,19 @@ void update_srbds_msr(void)
static void __init srbds_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_SRBDS) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS)) {
srbds_mitigation = SRBDS_MITIGATION_OFF;
return;
}
- if (srbds_mitigation == SRBDS_MITIGATION_AUTO)
- srbds_mitigation = SRBDS_MITIGATION_FULL;
+ if (srbds_mitigation == SRBDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_SRBDS))
+ srbds_mitigation = SRBDS_MITIGATION_FULL;
+ else {
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+ return;
+ }
+ }
/*
* Check to see if this is one of the MDS_NO systems supporting TSX that
@@ -956,12 +1065,15 @@ static void __init gds_select_mitigation(void)
return;
}
- if (cpu_mitigations_off())
- gds_mitigation = GDS_MITIGATION_OFF;
/* Will verify below that mitigation _can_ be disabled */
-
- if (gds_mitigation == GDS_MITIGATION_AUTO)
- gds_mitigation = GDS_MITIGATION_FULL;
+ if (gds_mitigation == GDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_GDS))
+ gds_mitigation = GDS_MITIGATION_FULL;
+ else {
+ gds_mitigation = GDS_MITIGATION_OFF;
+ return;
+ }
+ }
/* No microcode */
if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) {
@@ -1067,13 +1179,16 @@ static bool smap_works_speculatively(void)
static void __init spectre_v1_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V1))
spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
}
static void __init spectre_v1_apply_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
return;
if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) {
@@ -1124,6 +1239,20 @@ early_param("nospectre_v1", nospectre_v1_cmdline);
enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE;
+/* Depends on spectre_v2 mitigation selected already */
+static inline bool cdt_possible(enum spectre_v2_mitigation mode)
+{
+ if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) ||
+ !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE))
+ return false;
+
+ if (mode == SPECTRE_V2_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE)
+ return true;
+
+ return false;
+}
+
#undef pr_fmt
#define pr_fmt(fmt) "RETBleed: " fmt
@@ -1162,6 +1291,21 @@ static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
static int __ro_after_init retbleed_nosmt = false;
+enum srso_mitigation {
+ SRSO_MITIGATION_NONE,
+ SRSO_MITIGATION_AUTO,
+ SRSO_MITIGATION_UCODE_NEEDED,
+ SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED,
+ SRSO_MITIGATION_MICROCODE,
+ SRSO_MITIGATION_NOSMT,
+ SRSO_MITIGATION_SAFE_RET,
+ SRSO_MITIGATION_IBPB,
+ SRSO_MITIGATION_IBPB_ON_VMEXIT,
+ SRSO_MITIGATION_BP_SPEC_REDUCE,
+};
+
+static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO;
+
static int __init retbleed_parse_cmdline(char *str)
{
if (!str)
@@ -1204,7 +1348,7 @@ early_param("retbleed", retbleed_parse_cmdline);
static void __init retbleed_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED)) {
retbleed_mitigation = RETBLEED_MITIGATION_NONE;
return;
}
@@ -1241,6 +1385,11 @@ static void __init retbleed_select_mitigation(void)
if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO)
return;
+ if (!should_mitigate_vuln(X86_BUG_RETBLEED)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ return;
+ }
+
/* Intel mitigation selected in retbleed_update_mitigation() */
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
@@ -1251,35 +1400,36 @@ static void __init retbleed_select_mitigation(void)
retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
else
retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ /* Final mitigation depends on spectre-v2 selection */
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED))
+ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+ else if (boot_cpu_has(X86_FEATURE_IBRS))
+ retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
+ else
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
}
}
static void __init retbleed_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED))
return;
- if (retbleed_mitigation == RETBLEED_MITIGATION_NONE)
- goto out;
+ /* ITS can also enable stuffing */
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF)
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
- /*
- * retbleed=stuff is only allowed on Intel. If stuffing can't be used
- * then a different mitigation will be selected below.
- *
- * its=stuff will also attempt to enable stuffing.
- */
- if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF ||
- its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF) {
- if (spectre_v2_enabled != SPECTRE_V2_RETPOLINE) {
- pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n");
- retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
- } else {
- if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
- pr_info("Retbleed mitigation updated to stuffing\n");
+ /* If SRSO is using IBPB, that works for retbleed too */
+ if (srso_mitigation == SRSO_MITIGATION_IBPB)
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
- retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
- }
+ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF &&
+ !cdt_possible(spectre_v2_enabled)) {
+ pr_err("WARNING: retbleed=stuff depends on retpoline\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
}
+
/*
* Let IBRS trump all on Intel without affecting the effects of the
* retbleed= cmdline option except for call depth based stuffing
@@ -1298,15 +1448,11 @@ static void __init retbleed_update_mitigation(void)
if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
pr_err(RETBLEED_INTEL_MSG);
}
- /* If nothing has set the mitigation yet, default to NONE. */
- if (retbleed_mitigation == RETBLEED_MITIGATION_AUTO)
- retbleed_mitigation = RETBLEED_MITIGATION_NONE;
}
-out:
+
pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
}
-
static void __init retbleed_apply_mitigation(void)
{
bool mitigate_smt = false;
@@ -1362,7 +1508,7 @@ static void __init retbleed_apply_mitigation(void)
}
if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) &&
- (retbleed_nosmt || cpu_mitigations_auto_nosmt()))
+ (retbleed_nosmt || smt_mitigations == SMT_MITIGATIONS_ON))
cpu_smt_disable(false);
}
@@ -1407,13 +1553,17 @@ early_param("indirect_target_selection", its_parse_cmdline);
static void __init its_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_ITS)) {
its_mitigation = ITS_MITIGATION_OFF;
return;
}
- if (its_mitigation == ITS_MITIGATION_AUTO)
- its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ if (its_mitigation == ITS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_ITS))
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ else
+ its_mitigation = ITS_MITIGATION_OFF;
+ }
if (its_mitigation == ITS_MITIGATION_OFF)
return;
@@ -1444,15 +1594,17 @@ static void __init its_select_mitigation(void)
static void __init its_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_ITS))
return;
switch (spectre_v2_enabled) {
case SPECTRE_V2_NONE:
- pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n");
+ if (its_mitigation != ITS_MITIGATION_OFF)
+ pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n");
its_mitigation = ITS_MITIGATION_OFF;
break;
case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
/* Retpoline+CDT mitigates ITS */
if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF)
its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
@@ -1466,13 +1618,8 @@ static void __init its_update_mitigation(void)
break;
}
- /*
- * retbleed_update_mitigation() will try to do stuffing if its=stuff.
- * If it can't, such as if spectre_v2!=retpoline, then fall back to
- * aligned thunks.
- */
if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF &&
- retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
+ !cdt_possible(spectre_v2_enabled))
its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
pr_info("%s\n", its_strings[its_mitigation]);
@@ -1480,15 +1627,24 @@ static void __init its_update_mitigation(void)
static void __init its_apply_mitigation(void)
{
- /* its=stuff forces retbleed stuffing and is enabled there. */
- if (its_mitigation != ITS_MITIGATION_ALIGNED_THUNKS)
- return;
-
- if (!boot_cpu_has(X86_FEATURE_RETPOLINE))
- setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS);
+ switch (its_mitigation) {
+ case ITS_MITIGATION_OFF:
+ case ITS_MITIGATION_AUTO:
+ case ITS_MITIGATION_VMEXIT_ONLY:
+ break;
+ case ITS_MITIGATION_ALIGNED_THUNKS:
+ if (!boot_cpu_has(X86_FEATURE_RETPOLINE))
+ setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS);
- setup_force_cpu_cap(X86_FEATURE_RETHUNK);
- set_return_thunk(its_return_thunk);
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ set_return_thunk(its_return_thunk);
+ break;
+ case ITS_MITIGATION_RETPOLINE_STUFF:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
+ set_return_thunk(call_depth_return_thunk);
+ break;
+ }
}
#undef pr_fmt
@@ -1536,28 +1692,43 @@ early_param("tsa", tsa_parse_cmdline);
static void __init tsa_select_mitigation(void)
{
- if (cpu_mitigations_off() || !boot_cpu_has_bug(X86_BUG_TSA)) {
+ if (!boot_cpu_has_bug(X86_BUG_TSA)) {
tsa_mitigation = TSA_MITIGATION_NONE;
return;
}
+ if (tsa_mitigation == TSA_MITIGATION_AUTO) {
+ bool vm = false, uk = false;
+
+ tsa_mitigation = TSA_MITIGATION_NONE;
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER)) {
+ tsa_mitigation = TSA_MITIGATION_USER_KERNEL;
+ uk = true;
+ }
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) {
+ tsa_mitigation = TSA_MITIGATION_VM;
+ vm = true;
+ }
+
+ if (uk && vm)
+ tsa_mitigation = TSA_MITIGATION_FULL;
+ }
+
if (tsa_mitigation == TSA_MITIGATION_NONE)
return;
- if (!boot_cpu_has(X86_FEATURE_VERW_CLEAR)) {
+ if (!boot_cpu_has(X86_FEATURE_VERW_CLEAR))
tsa_mitigation = TSA_MITIGATION_UCODE_NEEDED;
- goto out;
- }
-
- if (tsa_mitigation == TSA_MITIGATION_AUTO)
- tsa_mitigation = TSA_MITIGATION_FULL;
/*
* No need to set verw_clear_cpu_buf_mitigation_selected - it
* doesn't fit all cases here and it is not needed because this
* is the only VERW-based mitigation on AMD.
*/
-out:
pr_info("%s\n", tsa_strings[tsa_mitigation]);
}
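
The TSA auto-selection above is a small decision table over the attack-vector controls; the same cpu_attack_vector_mitigated() pattern drives the spectre_v2_user and BHI AUTO cases further down. A standalone model of the TSA table, with illustrative names rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

enum tsa_mit { TSA_NONE, TSA_USER_KERNEL, TSA_VM, TSA_FULL };

/* Model of the AUTO selection: user-facing vectors enable the user/kernel
 * half, guest-facing vectors enable the VM half, both together mean FULL. */
static enum tsa_mit tsa_auto(bool user_kernel, bool user_user,
                             bool guest_host, bool guest_guest)
{
        bool uk = user_kernel || user_user;
        bool vm = guest_host || guest_guest;

        if (uk && vm)
                return TSA_FULL;
        if (vm)
                return TSA_VM;
        if (uk)
                return TSA_USER_KERNEL;
        return TSA_NONE;
}

int main(void)
{
        /* e.g. a host that only cares about guest isolation */
        printf("%d\n", tsa_auto(false, false, true, false)); /* prints 2 (TSA_VM) */
        return 0;
}
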
@@ -1701,7 +1872,7 @@ static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void)
char arg[20];
int ret, i;
- if (cpu_mitigations_off() || !IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2))
+ if (!IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2))
return SPECTRE_V2_USER_CMD_NONE;
ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
@@ -1739,6 +1910,13 @@ static void __init spectre_v2_user_select_mitigation(void)
spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
break;
case SPECTRE_V2_USER_CMD_AUTO:
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2_USER))
+ break;
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ if (smt_mitigations == SMT_MITIGATIONS_OFF)
+ break;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
case SPECTRE_V2_USER_CMD_PRCTL:
spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
@@ -1890,8 +2068,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
int ret, i;
cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE;
- if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
- cpu_mitigations_off())
+ if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
return SPECTRE_V2_CMD_NONE;
ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
@@ -2094,11 +2271,20 @@ early_param("spectre_bhi", spectre_bhi_parse_cmdline);
static void __init bhi_select_mitigation(void)
{
- if (!boot_cpu_has(X86_BUG_BHI) || cpu_mitigations_off())
+ if (!boot_cpu_has(X86_BUG_BHI))
bhi_mitigation = BHI_MITIGATION_OFF;
- if (bhi_mitigation == BHI_MITIGATION_AUTO)
- bhi_mitigation = BHI_MITIGATION_ON;
+ if (bhi_mitigation != BHI_MITIGATION_AUTO)
+ return;
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST)) {
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
+ bhi_mitigation = BHI_MITIGATION_ON;
+ else
+ bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
+ } else {
+ bhi_mitigation = BHI_MITIGATION_OFF;
+ }
}
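
As written in this hunk, the BHI AUTO logic nests the two checks rather than OR-ing them: the guest-to-host vector gates whether any mitigation is applied at all, and the user-to-kernel vector then picks full protection versus the VM-exit-only variant. A compact model with illustrative names:

/* Model of bhi_select_mitigation()'s AUTO branch as shown above. */
enum bhi_mit { BHI_OFF, BHI_VMEXIT_ONLY, BHI_ON };

static enum bhi_mit bhi_auto(bool guest_host, bool user_kernel)
{
        if (!guest_host)
                return BHI_OFF;
        return user_kernel ? BHI_ON : BHI_VMEXIT_ONLY;
}
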
static void __init bhi_update_mitigation(void)
@@ -2154,8 +2340,11 @@ static void __init spectre_v2_select_mitigation(void)
case SPECTRE_V2_CMD_NONE:
return;
- case SPECTRE_V2_CMD_FORCE:
case SPECTRE_V2_CMD_AUTO:
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2))
+ break;
+ fallthrough;
+ case SPECTRE_V2_CMD_FORCE:
if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
spectre_v2_enabled = SPECTRE_V2_EIBRS;
break;
@@ -2209,7 +2398,7 @@ static void __init spectre_v2_update_mitigation(void)
}
}
- if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && !cpu_mitigations_off())
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]);
}
@@ -2861,17 +3050,23 @@ static void override_cache_bits(struct cpuinfo_x86 *c)
static void __init l1tf_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_L1TF) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
l1tf_mitigation = L1TF_MITIGATION_OFF;
return;
}
- if (l1tf_mitigation == L1TF_MITIGATION_AUTO) {
- if (cpu_mitigations_auto_nosmt())
- l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
- else
- l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+ if (l1tf_mitigation != L1TF_MITIGATION_AUTO)
+ return;
+
+ if (!should_mitigate_vuln(X86_BUG_L1TF)) {
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ return;
}
+
+ if (smt_mitigations == SMT_MITIGATIONS_ON)
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
}
static void __init l1tf_apply_mitigation(void)
@@ -2945,31 +3140,18 @@ early_param("l1tf", l1tf_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt
-enum srso_mitigation {
- SRSO_MITIGATION_NONE,
- SRSO_MITIGATION_AUTO,
- SRSO_MITIGATION_UCODE_NEEDED,
- SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED,
- SRSO_MITIGATION_MICROCODE,
- SRSO_MITIGATION_SAFE_RET,
- SRSO_MITIGATION_IBPB,
- SRSO_MITIGATION_IBPB_ON_VMEXIT,
- SRSO_MITIGATION_BP_SPEC_REDUCE,
-};
-
static const char * const srso_strings[] = {
[SRSO_MITIGATION_NONE] = "Vulnerable",
[SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
[SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode",
[SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET",
+ [SRSO_MITIGATION_NOSMT] = "Mitigation: SMT disabled",
[SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET",
[SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
[SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only",
[SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation"
};
-static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO;
-
static int __init srso_parse_cmdline(char *str)
{
if (!str)
@@ -2996,35 +3178,44 @@ early_param("spec_rstack_overflow", srso_parse_cmdline);
static void __init srso_select_mitigation(void)
{
- bool has_microcode;
-
- if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_SRSO)) {
srso_mitigation = SRSO_MITIGATION_NONE;
-
- if (srso_mitigation == SRSO_MITIGATION_NONE)
return;
+ }
- if (srso_mitigation == SRSO_MITIGATION_AUTO)
- srso_mitigation = SRSO_MITIGATION_SAFE_RET;
-
- has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
- if (has_microcode) {
- /*
- * Zen1/2 with SMT off aren't vulnerable after the right
- * IBPB microcode has been applied.
- */
- if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
- setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
+ if (srso_mitigation == SRSO_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_SRSO)) {
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ } else {
srso_mitigation = SRSO_MITIGATION_NONE;
return;
}
- } else {
+ }
+
+ /* Zen1/2 with SMT off aren't vulnerable to SRSO. */
+ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
+ srso_mitigation = SRSO_MITIGATION_NOSMT;
+ return;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) {
pr_warn("IBPB-extending microcode not applied!\n");
pr_warn(SRSO_NOTICE);
+
+ /*
+ * Safe-RET provides partial mitigation even without the updated
+ * microcode, but the other mitigations need the microcode to
+ * provide any protection.
+ */
+ if (srso_mitigation == SRSO_MITIGATION_SAFE_RET)
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
+ else
+ srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
}
switch (srso_mitigation) {
case SRSO_MITIGATION_SAFE_RET:
+ case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED:
if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) {
srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
goto ibpb_on_vmexit;
@@ -3034,9 +3225,6 @@ static void __init srso_select_mitigation(void)
pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
srso_mitigation = SRSO_MITIGATION_NONE;
}
-
- if (!has_microcode)
- srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
break;
ibpb_on_vmexit:
case SRSO_MITIGATION_IBPB_ON_VMEXIT:
@@ -3051,9 +3239,6 @@ ibpb_on_vmexit:
pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
srso_mitigation = SRSO_MITIGATION_NONE;
}
-
- if (!has_microcode)
- srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
break;
default:
break;
@@ -3068,8 +3253,7 @@ static void __init srso_update_mitigation(void)
srso_mitigation = SRSO_MITIGATION_IBPB;
if (boot_cpu_has_bug(X86_BUG_SRSO) &&
- !cpu_mitigations_off() &&
- !boot_cpu_has(X86_FEATURE_SRSO_NO))
+ !cpu_mitigations_off())
pr_info("%s\n", srso_strings[srso_mitigation]);
}
@@ -3365,9 +3549,6 @@ static ssize_t retbleed_show_state(char *buf)
static ssize_t srso_show_state(char *buf)
{
- if (boot_cpu_has(X86_FEATURE_SRSO_NO))
- return sysfs_emit(buf, "Mitigation: SMT disabled\n");
-
return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]);
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index fb50c1dd53ef..34a054181c4d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -26,6 +26,7 @@
#include <linux/pgtable.h>
#include <linux/stackprotector.h>
#include <linux/utsname.h>
+#include <linux/efi.h>
#include <asm/alternative.h>
#include <asm/cmdline.h>
@@ -2538,6 +2539,12 @@ void __init arch_cpu_finalize_init(void)
fpu__init_cpu();
/*
+ * This needs to follow the FPU initialization, since EFI depends on it.
+ */
+ if (efi_enabled(EFI_RUNTIME_SERVICES))
+ efi_enter_virtual_mode();
+
+ /*
* Ensure that access to the per CPU representation has the initial
* boot CPU configuration.
*/
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index fe50eb5b7c4a..b92e09a87c69 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -17,8 +17,8 @@
#define pr_fmt(fmt) "microcode: " fmt
-#include <linux/platform_device.h>
#include <linux/stop_machine.h>
+#include <linux/device/faux.h>
#include <linux/syscore_ops.h>
#include <linux/miscdevice.h>
#include <linux/capability.h>
@@ -249,7 +249,7 @@ static void reload_early_microcode(unsigned int cpu)
}
/* fake device for request_firmware */
-static struct platform_device *microcode_pdev;
+static struct faux_device *microcode_fdev;
#ifdef CONFIG_MICROCODE_LATE_LOADING
/*
@@ -690,7 +690,7 @@ static int load_late_locked(void)
if (!setup_cpus())
return -EBUSY;
- switch (microcode_ops->request_microcode_fw(0, &microcode_pdev->dev)) {
+ switch (microcode_ops->request_microcode_fw(0, &microcode_fdev->dev)) {
case UCODE_NEW:
return load_late_stop_cpus(false);
case UCODE_NEW_SAFE:
@@ -841,9 +841,9 @@ static int __init microcode_init(void)
if (early_data.new_rev)
pr_info_once("Updated early from: 0x%08x\n", early_data.old_rev);
- microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0);
- if (IS_ERR(microcode_pdev))
- return PTR_ERR(microcode_pdev);
+ microcode_fdev = faux_device_create("microcode", NULL, NULL);
+ if (!microcode_fdev)
+ return -ENODEV;
dev_root = bus_get_dev_root(&cpu_subsys);
if (dev_root) {
@@ -862,7 +862,7 @@ static int __init microcode_init(void)
return 0;
out_pdev:
- platform_device_unregister(microcode_pdev);
+ faux_device_destroy(microcode_fdev);
return error;
}
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index ea138583dd92..aefd412a23dc 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -37,6 +37,7 @@ DEFINE_PER_CPU(u64, xfd_state);
/* The FPU state configuration data for kernel and user space */
struct fpu_state_config fpu_kernel_cfg __ro_after_init;
struct fpu_state_config fpu_user_cfg __ro_after_init;
+struct vcpu_fpu_config guest_default_cfg __ro_after_init;
/*
* Represents the initial FPU state. It's mostly (but not completely) zeroes,
@@ -217,7 +218,7 @@ void fpu_reset_from_exception_fixup(void)
}
#if IS_ENABLED(CONFIG_KVM)
-static void __fpstate_reset(struct fpstate *fpstate, u64 xfd);
+static void __fpstate_reset(struct fpstate *fpstate);
static void fpu_lock_guest_permissions(void)
{
@@ -242,19 +243,21 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
struct fpstate *fpstate;
unsigned int size;
- size = fpu_kernel_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64);
+ size = guest_default_cfg.size + ALIGN(offsetof(struct fpstate, regs), 64);
+
fpstate = vzalloc(size);
if (!fpstate)
return false;
- /* Leave xfd to 0 (the reset value defined by spec) */
- __fpstate_reset(fpstate, 0);
- fpstate_init_user(fpstate);
+ /* Initialize indicators to reflect properties of the fpstate */
fpstate->is_valloc = true;
fpstate->is_guest = true;
+ __fpstate_reset(fpstate);
+ fpstate_init_user(fpstate);
+
gfpu->fpstate = fpstate;
- gfpu->xfeatures = fpu_kernel_cfg.default_features;
+ gfpu->xfeatures = guest_default_cfg.features;
/*
* KVM sets the FP+SSE bits in the XSAVE header when copying FPU state
@@ -541,28 +544,50 @@ void fpstate_init_user(struct fpstate *fpstate)
fpstate_init_fstate(fpstate);
}
-static void __fpstate_reset(struct fpstate *fpstate, u64 xfd)
+static void __fpstate_reset(struct fpstate *fpstate)
{
- /* Initialize sizes and feature masks */
- fpstate->size = fpu_kernel_cfg.default_size;
+ /*
+ * Supervisor features (and thus sizes) may diverge between guest
+ * FPUs and host FPUs, as some supervisor features are supported
+ * for guests despite not being utilized by the host. User
+ * features and sizes are always identical, which allows for
+ * common guest and userspace ABI.
+ *
+ * For the host, set XFD to the kernel's desired initialization
+ * value. For guests, set XFD to its architectural RESET value.
+ */
+ if (fpstate->is_guest) {
+ fpstate->size = guest_default_cfg.size;
+ fpstate->xfeatures = guest_default_cfg.features;
+ fpstate->xfd = 0;
+ } else {
+ fpstate->size = fpu_kernel_cfg.default_size;
+ fpstate->xfeatures = fpu_kernel_cfg.default_features;
+ fpstate->xfd = init_fpstate.xfd;
+ }
+
fpstate->user_size = fpu_user_cfg.default_size;
- fpstate->xfeatures = fpu_kernel_cfg.default_features;
fpstate->user_xfeatures = fpu_user_cfg.default_features;
- fpstate->xfd = xfd;
}
void fpstate_reset(struct fpu *fpu)
{
/* Set the fpstate pointer to the default fpstate */
fpu->fpstate = &fpu->__fpstate;
- __fpstate_reset(fpu->fpstate, init_fpstate.xfd);
+ __fpstate_reset(fpu->fpstate);
/* Initialize the permission related info in fpu */
fpu->perm.__state_perm = fpu_kernel_cfg.default_features;
fpu->perm.__state_size = fpu_kernel_cfg.default_size;
fpu->perm.__user_state_size = fpu_user_cfg.default_size;
- /* Same defaults for guests */
- fpu->guest_perm = fpu->perm;
+
+ fpu->guest_perm.__state_perm = guest_default_cfg.features;
+ fpu->guest_perm.__state_size = guest_default_cfg.size;
+ /*
+ * User features and sizes are always identical between host and
+ * guest FPUs, which allows for common guest and userspace ABI.
+ */
+ fpu->guest_perm.__user_state_size = fpu_user_cfg.default_size;
}
static inline void fpu_inherit_perms(struct fpu *dst_fpu)
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 99db41bf9fa6..ff988b9ea39f 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -205,6 +205,7 @@ static void __init fpu__init_system_xstate_size_legacy(void)
fpu_kernel_cfg.default_size = size;
fpu_user_cfg.max_size = size;
fpu_user_cfg.default_size = size;
+ guest_default_cfg.size = size;
}
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 9aa9ac8399ae..12ed75c1b567 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -57,7 +57,7 @@ static const char *xfeature_names[] =
"Protection Keys User registers",
"PASID state",
"Control-flow User registers",
- "Control-flow Kernel registers (unused)",
+ "Control-flow Kernel registers (KVM only)",
"unknown xstate feature",
"unknown xstate feature",
"unknown xstate feature",
@@ -81,6 +81,7 @@ static unsigned short xsave_cpuid_features[] __initdata = {
[XFEATURE_PKRU] = X86_FEATURE_OSPKE,
[XFEATURE_PASID] = X86_FEATURE_ENQCMD,
[XFEATURE_CET_USER] = X86_FEATURE_SHSTK,
+ [XFEATURE_CET_KERNEL] = X86_FEATURE_SHSTK,
[XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE,
[XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE,
[XFEATURE_APX] = X86_FEATURE_APX,
@@ -372,6 +373,7 @@ static __init void os_xrstor_booting(struct xregs_state *xstate)
XFEATURE_MASK_BNDCSR | \
XFEATURE_MASK_PASID | \
XFEATURE_MASK_CET_USER | \
+ XFEATURE_MASK_CET_KERNEL | \
XFEATURE_MASK_XTILE | \
XFEATURE_MASK_APX)
@@ -573,6 +575,7 @@ static bool __init check_xstate_against_struct(int nr)
case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg);
case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state);
+ case XFEATURE_CET_KERNEL: return XCHECK_SZ(sz, nr, struct cet_supervisor_state);
case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state);
case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
default:
@@ -743,6 +746,9 @@ static int __init init_xstate_size(void)
fpu_user_cfg.default_size =
xstate_calculate_size(fpu_user_cfg.default_features, false);
+ guest_default_cfg.size =
+ xstate_calculate_size(guest_default_cfg.features, compacted);
+
return 0;
}
@@ -763,6 +769,7 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
fpu_kernel_cfg.default_size = legacy_size;
fpu_user_cfg.max_size = legacy_size;
fpu_user_cfg.default_size = legacy_size;
+ guest_default_cfg.size = legacy_size;
/*
* Prevent enabling the static branch which enables writes to the
@@ -773,6 +780,24 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
fpstate_reset(x86_task_fpu(current));
}
+static u64 __init host_default_mask(void)
+{
+ /*
+ * Exclude dynamic features (require userspace opt-in) and features
+ * that are supported only for KVM guests.
+ */
+ return ~((u64)XFEATURE_MASK_USER_DYNAMIC | XFEATURE_MASK_GUEST_SUPERVISOR);
+}
+
+static u64 __init guest_default_mask(void)
+{
+ /*
+ * Exclude dynamic features, which require userspace opt-in even
+ * for KVM guests.
+ */
+ return ~(u64)XFEATURE_MASK_USER_DYNAMIC;
+}
+
/*
* Enable and initialize the xsave feature.
* Called once per system bootup.
@@ -855,12 +880,13 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
- /* Clean out dynamic features from default */
- fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
- fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
-
- fpu_user_cfg.default_features = fpu_user_cfg.max_features;
- fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+ /*
+ * Now, given the maximum feature set, determine the default values by
+ * applying the default masks.
+ */
+ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features & host_default_mask();
+ fpu_user_cfg.default_features = fpu_user_cfg.max_features & host_default_mask();
+ guest_default_cfg.features = fpu_kernel_cfg.max_features & guest_default_mask();
/* Store it for paranoia check at the end */
xfeatures = fpu_kernel_cfg.max_features;
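
The effect of the two mask helpers is easiest to see with concrete bits. Below is a standalone arithmetic sketch using made-up mask values (the real XFEATURE_MASK_* constants differ); it only illustrates that the guest default keeps the guest-only supervisor bits while both defaults drop the dynamic ones:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Illustrative stand-ins, not the kernel's real bit assignments. */
        const uint64_t DYNAMIC    = 1u << 3;  /* e.g. AMX tile data    */
        const uint64_t GUEST_ONLY = 1u << 4;  /* e.g. CET kernel state */
        const uint64_t max        = 0x1f;     /* everything enumerated */

        uint64_t host_default  = max & ~(DYNAMIC | GUEST_ONLY);
        uint64_t guest_default = max & ~DYNAMIC;

        printf("host  default: %#llx\n", (unsigned long long)host_default);  /* 0x7  */
        printf("guest default: %#llx\n", (unsigned long long)guest_default); /* 0x17 */
        return 0;
}
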
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 9cea1fc36c18..243a769fdd97 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -59,6 +59,18 @@ static ssize_t sched_itmt_enabled_write(struct file *filp,
return result;
}
+static int sched_core_priority_show(struct seq_file *s, void *unused)
+{
+ int cpu;
+
+ seq_puts(s, "CPU #\tPriority\n");
+ for_each_possible_cpu(cpu)
+ seq_printf(s, "%d\t%d\n", cpu, arch_asym_cpu_priority(cpu));
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(sched_core_priority);
+
static const struct file_operations dfs_sched_itmt_fops = {
.read = debugfs_read_file_bool,
.write = sched_itmt_enabled_write,
@@ -67,6 +79,7 @@ static const struct file_operations dfs_sched_itmt_fops = {
};
static struct dentry *dfs_sched_itmt;
+static struct dentry *dfs_sched_core_prio;
/**
* sched_set_itmt_support() - Indicate platform supports ITMT
@@ -102,6 +115,14 @@ int sched_set_itmt_support(void)
return -ENOMEM;
}
+ dfs_sched_core_prio = debugfs_create_file("sched_core_priority", 0644,
+ arch_debugfs_dir, NULL,
+ &sched_core_priority_fops);
+ if (IS_ERR_OR_NULL(dfs_sched_core_prio)) {
+ dfs_sched_core_prio = NULL;
+ return -ENOMEM;
+ }
+
sched_itmt_capable = true;
sysctl_sched_itmt_enabled = 1;
@@ -133,6 +154,8 @@ void sched_clear_itmt_support(void)
debugfs_remove(dfs_sched_itmt);
dfs_sched_itmt = NULL;
+ debugfs_remove(dfs_sched_core_prio);
+ dfs_sched_core_prio = NULL;
if (sysctl_sched_itmt_enabled) {
/* disable sched_itmt if we are no longer ITMT capable */
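
For reference, the new file should appear under the arch debugfs directory, typically /sys/kernel/debug/x86/sched_core_priority, though the exact path depends on where debugfs is mounted. A minimal reader, with the path treated as an assumption:

#include <stdio.h>

int main(void)
{
        /* Assumed location; adjust if debugfs is mounted elsewhere. */
        const char *path = "/sys/kernel/debug/x86/sched_core_priority";
        char line[128];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return 1;
        }
        while (fgets(line, sizeof(line), f))  /* "CPU #\tPriority" rows */
                fputs(line, stdout);
        fclose(f);
        return 0;
}
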
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index b68d4be9464e..d547de9b3ed8 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -40,7 +40,7 @@ static const struct bin_attribute boot_params_data_attr = {
.name = "data",
.mode = S_IRUGO,
},
- .read_new = boot_params_data_read,
+ .read = boot_params_data_read,
.size = sizeof(boot_params),
};
@@ -56,7 +56,7 @@ static const struct bin_attribute *const boot_params_data_attrs[] = {
static const struct attribute_group boot_params_attr_group = {
.attrs = boot_params_version_attrs,
- .bin_attrs_new = boot_params_data_attrs,
+ .bin_attrs = boot_params_data_attrs,
};
static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
@@ -250,7 +250,7 @@ static struct bin_attribute data_attr __ro_after_init = {
.name = "data",
.mode = S_IRUGO,
},
- .read_new = setup_data_data_read,
+ .read = setup_data_data_read,
};
static struct attribute *setup_data_type_attrs[] = {
@@ -265,7 +265,7 @@ static const struct bin_attribute *const setup_data_data_attrs[] = {
static const struct attribute_group setup_data_attr_group = {
.attrs = setup_data_type_attrs,
- .bin_attrs_new = setup_data_data_attrs,
+ .bin_attrs = setup_data_data_attrs,
};
static int __init create_setup_data_node(struct kobject *parent,
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 921c1c783bc1..8ae750cde0c6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -420,7 +420,7 @@ static u64 kvm_steal_clock(int cpu)
return steal;
}
-static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
+static inline __init void __set_percpu_decrypted(void *ptr, unsigned long size)
{
early_set_memory_decrypted((unsigned long) ptr, size);
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index a838be04f874..1b7960cf6eb0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -334,13 +334,21 @@ DEFINE_PER_CPU(u64, msr_misc_features_shadow);
static void set_cpuid_faulting(bool on)
{
- u64 msrval;
- msrval = this_cpu_read(msr_misc_features_shadow);
- msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
- msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
- this_cpu_write(msr_misc_features_shadow, msrval);
- wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval);
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ u64 msrval;
+
+ msrval = this_cpu_read(msr_misc_features_shadow);
+ msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+ msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
+ this_cpu_write(msr_misc_features_shadow, msrval);
+ wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval);
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (on)
+ msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_CPUID_USER_DIS_BIT);
+ else
+ msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_CPUID_USER_DIS_BIT);
+ }
}
static void disable_cpuid(void)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b972bf72fb8b..52a5c03c353c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -707,6 +707,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Load the Intel cache allocation PQR MSR. */
resctrl_arch_sched_in(next_p);
+ /* Reset hw history on AMD CPUs */
+ if (cpu_feature_enabled(X86_FEATURE_AMD_WORKLOAD_CLASS))
+ wrmsrl(MSR_AMD_WORKLOAD_HRST, 0x1);
+
return prev_p;
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 095f04bdabdc..3dcadc13f09a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1236,7 +1236,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
static struct user_regset x86_64_regsets[] __ro_after_init = {
[REGSET64_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(struct user_regs_struct) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
@@ -1244,7 +1244,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
.set = genregs_set
},
[REGSET64_FP] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct fxregs_state) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
@@ -1253,7 +1253,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
.set = xfpregs_set
},
[REGSET64_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
+ USER_REGSET_NOTE_TYPE(X86_XSTATE),
.size = sizeof(u64),
.align = sizeof(u64),
.active = xstateregs_active,
@@ -1261,7 +1261,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
.set = xstateregs_set
},
[REGSET64_IOPERM] = {
- .core_note_type = NT_386_IOPERM,
+ USER_REGSET_NOTE_TYPE(386_IOPERM),
.n = IO_BITMAP_LONGS,
.size = sizeof(long),
.align = sizeof(long),
@@ -1270,7 +1270,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
},
#ifdef CONFIG_X86_USER_SHADOW_STACK
[REGSET64_SSP] = {
- .core_note_type = NT_X86_SHSTK,
+ USER_REGSET_NOTE_TYPE(X86_SHSTK),
.n = 1,
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1297,7 +1297,7 @@ static const struct user_regset_view user_x86_64_view = {
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
static struct user_regset x86_32_regsets[] __ro_after_init = {
[REGSET32_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(struct user_regs_struct32) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1305,7 +1305,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = genregs32_set
},
[REGSET32_FP] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1314,7 +1314,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = fpregs_set
},
[REGSET32_XFP] = {
- .core_note_type = NT_PRXFPREG,
+ USER_REGSET_NOTE_TYPE(PRXFPREG),
.n = sizeof(struct fxregs_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1323,7 +1323,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = xfpregs_set
},
[REGSET32_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
+ USER_REGSET_NOTE_TYPE(X86_XSTATE),
.size = sizeof(u64),
.align = sizeof(u64),
.active = xstateregs_active,
@@ -1331,7 +1331,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = xstateregs_set
},
[REGSET32_TLS] = {
- .core_note_type = NT_386_TLS,
+ USER_REGSET_NOTE_TYPE(386_TLS),
.n = GDT_ENTRY_TLS_ENTRIES,
.bias = GDT_ENTRY_TLS_MIN,
.size = sizeof(struct user_desc),
@@ -1341,7 +1341,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = regset_tls_set
},
[REGSET32_IOPERM] = {
- .core_note_type = NT_386_IOPERM,
+ USER_REGSET_NOTE_TYPE(386_IOPERM),
.n = IO_BITMAP_BYTES / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
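
USER_REGSET_NOTE_TYPE() comes from <linux/regset.h> and is not shown in this diff. A hypothetical sketch of its shape (the real macro may also initialize a core-note name field); the point is that callers no longer open-code .core_note_type:

/* Hypothetical expansion, for illustration only. */
#define USER_REGSET_NOTE_TYPE(type)	\
	.core_note_type = NT_ ## type

/* So the REGSET64_GENERAL initializer still ends up with
 * .core_note_type = NT_PRSTATUS, as before. */
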
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 58ede3fa6a75..33e166f6ab12 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -478,44 +478,41 @@ static int x86_cluster_flags(void)
*/
static bool x86_has_numa_in_package;
-static struct sched_domain_topology_level x86_topology[6];
-
-static void __init build_sched_topology(void)
-{
- int i = 0;
-
-#ifdef CONFIG_SCHED_SMT
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT)
- };
-#endif
+static struct sched_domain_topology_level x86_topology[] = {
+ SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
#ifdef CONFIG_SCHED_CLUSTER
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
- };
+ SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
- };
+ SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC),
#endif
+ SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG),
+ { NULL },
+};
+
+static void __init build_sched_topology(void)
+{
+ struct sched_domain_topology_level *topology = x86_topology;
+
/*
- * When there is NUMA topology inside the package skip the PKG domain
- * since the NUMA domains will auto-magically create the right spanning
- * domains based on the SLIT.
+ * When there is NUMA topology inside the package, invalidate the
+ * PKG domain, since the NUMA domains will auto-magically create the
+ * right spanning domains based on the SLIT.
*/
- if (!x86_has_numa_in_package) {
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG)
- };
+ if (x86_has_numa_in_package) {
+ unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
+
+ memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
}
/*
- * There must be one trailing NULL entry left.
+ * Drop the SMT domain if there is only one thread per core,
+ * since the scheduler would degenerate it anyway.
*/
- BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
+ if (cpu_smt_num_threads <= 1)
+ ++topology;
- set_sched_topology(x86_topology);
+ set_sched_topology(topology);
}
void set_cpu_sibling_map(int cpu)
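
The rework above replaces the imperative table builder with a static, NULL-terminated table that is trimmed in place. A standalone model of the two trims (skip the leading SMT level when each core has a single thread, blank out the next-to-last PKG slot when the package contains NUMA nodes); the types are simplified stand-ins:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

struct level { const char *name; };   /* stand-in for the real type */

static struct level topo[] = {
        { "SMT" }, { "CLS" }, { "MC" }, { "PKG" },
        { NULL },                      /* terminator */
};

static struct level *trim_topology(bool has_smt, bool numa_in_pkg)
{
        struct level *t = topo;

        if (numa_in_pkg) {
                size_t pkg = sizeof(topo) / sizeof(topo[0]) - 2; /* last real slot */
                /* A zeroed slot effectively becomes the new terminator. */
                memset(&topo[pkg], 0, sizeof(topo[pkg]));
        }
        if (!has_smt)
                ++t;                   /* drop the SMT level */
        return t;
}
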
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 93636f77c42d..f2a57a1136d2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8295,8 +8295,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
- on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
- wbinvd_ipi, NULL, 1);
+ wbinvd_on_cpus_mask(vcpu->arch.wbinvd_dirty_mask);
put_cpu();
cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
} else
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
index 8ae0f93ecbfd..ec2131c9fd20 100644
--- a/arch/x86/lib/.gitignore
+++ b/arch/x86/lib/.gitignore
@@ -1,2 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
+
inat-tables.c
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 4fa5c4e1ba8a..2dba7f83ef97 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -3,8 +3,6 @@
# Makefile for x86 specific library files.
#
-obj-y += crypto/
-
# Produces uninteresting flaky coverage.
KCOV_INSTRUMENT_delay.o := n
@@ -40,16 +38,6 @@ lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o
-obj-$(CONFIG_CRC32_ARCH) += crc32-x86.o
-crc32-x86-y := crc32.o crc32-pclmul.o
-crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o
-
-obj-$(CONFIG_CRC64_ARCH) += crc64-x86.o
-crc64-x86-y := crc64.o crc64-pclmul.o
-
-obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o
-crc-t10dif-x86-y := crc-t10dif.o crc16-msb-pclmul.o
-
obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
obj-y += iomem.o
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
index 7af743bd3b13..c5c60d07308c 100644
--- a/arch/x86/lib/cache-smp.c
+++ b/arch/x86/lib/cache-smp.c
@@ -14,9 +14,31 @@ void wbinvd_on_cpu(int cpu)
}
EXPORT_SYMBOL(wbinvd_on_cpu);
-int wbinvd_on_all_cpus(void)
+void wbinvd_on_all_cpus(void)
{
on_each_cpu(__wbinvd, NULL, 1);
- return 0;
}
EXPORT_SYMBOL(wbinvd_on_all_cpus);
+
+void wbinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ on_each_cpu_mask(cpus, __wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(wbinvd_on_cpus_mask);
+
+static void __wbnoinvd(void *dummy)
+{
+ wbnoinvd();
+}
+
+void wbnoinvd_on_all_cpus(void)
+{
+ on_each_cpu(__wbnoinvd, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(wbnoinvd_on_all_cpus);
+
+void wbnoinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ on_each_cpu_mask(cpus, __wbnoinvd, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(wbnoinvd_on_cpus_mask);
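
The wbnoinvd() used by the new helpers is declared elsewhere (<asm/special_insns.h>), not in this file. A plausible sketch, assuming it degrades to plain WBINVD on CPUs without the WBNOINVD instruction:

/* Hypothetical sketch; a real helper would use the kernel's alternative()
 * patching so that CPUs without X86_FEATURE_WBNOINVD execute WBINVD. */
static __always_inline void wbnoinvd(void)
{
	alternative("wbinvd", "wbnoinvd", X86_FEATURE_WBNOINVD);
}
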
diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h
deleted file mode 100644
index fcc63c064333..000000000000
--- a/arch/x86/lib/crc-pclmul-consts.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * CRC constants generated by:
- *
- * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
- *
- * Do not edit manually.
- */
-
-/*
- * CRC folding constants generated for most-significant-bit-first CRC-16 using
- * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
- */
-static const struct {
- u8 bswap_mask[16];
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = {
- .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
- .fold_across_2048_bits_consts = {
- 0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */
- 0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */
- },
- .fold_across_1024_bits_consts = {
- 0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */
- 0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */
- },
- .fold_across_512_bits_consts = {
- 0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */
- 0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */
- },
- .fold_across_256_bits_consts = {
- 0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */
- 0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */
- },
- .fold_across_128_bits_consts = {
- 0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */
- 0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */
- 0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */
- },
-};
-
-/*
- * CRC folding constants generated for least-significant-bit-first CRC-32 using
- * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
- * x^5 + x^4 + x^2 + x^1 + x^0
- */
-static const struct {
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = {
- .fold_across_2048_bits_consts = {
- 0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */
- 0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */
- },
- .fold_across_1024_bits_consts = {
- 0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */
- 0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */
- },
- .fold_across_512_bits_consts = {
- 0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */
- 0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */
- },
- .fold_across_256_bits_consts = {
- 0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */
- 0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */
- },
- .fold_across_128_bits_consts = {
- 0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */
- 0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */
- 0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */
- },
-};
-
-/*
- * CRC folding constants generated for most-significant-bit-first CRC-64 using
- * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
- * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
- * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
- * x^7 + x^4 + x^1 + x^0
- */
-static const struct {
- u8 bswap_mask[16];
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc64_msb_0x42f0e1eba9ea3693_consts ____cacheline_aligned __maybe_unused = {
- .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
- .fold_across_2048_bits_consts = {
- 0x7f52691a60ddc70d, /* LO64_TERMS: (x^2048 mod G) * x^0 */
- 0x7036b0389f6a0c82, /* HI64_TERMS: (x^2112 mod G) * x^0 */
- },
- .fold_across_1024_bits_consts = {
- 0x05cf79dea9ac37d6, /* LO64_TERMS: (x^1024 mod G) * x^0 */
- 0x001067e571d7d5c2, /* HI64_TERMS: (x^1088 mod G) * x^0 */
- },
- .fold_across_512_bits_consts = {
- 0x5f6843ca540df020, /* LO64_TERMS: (x^512 mod G) * x^0 */
- 0xddf4b6981205b83f, /* HI64_TERMS: (x^576 mod G) * x^0 */
- },
- .fold_across_256_bits_consts = {
- 0x571bee0a227ef92b, /* LO64_TERMS: (x^256 mod G) * x^0 */
- 0x44bef2a201b5200c, /* HI64_TERMS: (x^320 mod G) * x^0 */
- },
- .fold_across_128_bits_consts = {
- 0x05f5c3c7eb52fab6, /* LO64_TERMS: (x^128 mod G) * x^0 */
- 0x4eb938a7d257740e, /* HI64_TERMS: (x^192 mod G) * x^0 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0x42f0e1eba9ea3693, /* LO64_TERMS: (G - x^64) * x^0 */
- 0x578d29d06cc4f872, /* HI64_TERMS: (floor(x^127 / G) * x) - x^64 */
- },
-};
-
-/*
- * CRC folding constants generated for least-significant-bit-first CRC-64 using
- * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 +
- * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 +
- * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 +
- * x^4 + x^3 + x^0
- */
-static const struct {
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc64_lsb_0x9a6c9329ac4bc9b5_consts ____cacheline_aligned __maybe_unused = {
- .fold_across_2048_bits_consts = {
- 0x37ccd3e14069cabc, /* HI64_TERMS: (x^2111 mod G) * x^0 */
- 0xa043808c0f782663, /* LO64_TERMS: (x^2047 mod G) * x^0 */
- },
- .fold_across_1024_bits_consts = {
- 0xa1ca681e733f9c40, /* HI64_TERMS: (x^1087 mod G) * x^0 */
- 0x5f852fb61e8d92dc, /* LO64_TERMS: (x^1023 mod G) * x^0 */
- },
- .fold_across_512_bits_consts = {
- 0x0c32cdb31e18a84a, /* HI64_TERMS: (x^575 mod G) * x^0 */
- 0x62242240ace5045a, /* LO64_TERMS: (x^511 mod G) * x^0 */
- },
- .fold_across_256_bits_consts = {
- 0xb0bc2e589204f500, /* HI64_TERMS: (x^319 mod G) * x^0 */
- 0xe1e0bb9d45d7a44c, /* LO64_TERMS: (x^255 mod G) * x^0 */
- },
- .fold_across_128_bits_consts = {
- 0xeadc41fd2ba3d420, /* HI64_TERMS: (x^191 mod G) * x^0 */
- 0x21e9761e252621ac, /* LO64_TERMS: (x^127 mod G) * x^0 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0x27ecfa329aef9f77, /* HI64_TERMS: floor(x^127 / G) */
- 0x34d926535897936a, /* LO64_TERMS: (G - x^64 - x^0) / x */
- },
-};
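
All of the folding in the (now removed) PCLMUL template ultimately reduces to the carry-less multiply that PCLMULQDQ performs on two selected 64-bit polynomial halves. A scalar model of that primitive, handy for cross-checking constants produced by gen-crc-consts.py:

#include <stdint.h>

/* Carry-less (GF(2)) multiply of two 64-bit polynomials, as PCLMULQDQ
 * does for one selected qword of each source operand.  *hi receives the
 * upper 64 bits of the 128-bit product. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
        uint64_t l = 0, h = 0;

        for (int i = 0; i < 64; i++) {
                if (b & (1ull << i)) {
                        l ^= a << i;
                        h ^= i ? a >> (64 - i) : 0;
                }
        }
        *lo = l;
        *hi = h;
}
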
diff --git a/arch/x86/lib/crc-pclmul-template.S b/arch/x86/lib/crc-pclmul-template.S
deleted file mode 100644
index ae0b6144c503..000000000000
--- a/arch/x86/lib/crc-pclmul-template.S
+++ /dev/null
@@ -1,582 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-//
-// Template to generate [V]PCLMULQDQ-based CRC functions for x86
-//
-// Copyright 2025 Google LLC
-//
-// Author: Eric Biggers <ebiggers@google.com>
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-// Offsets within the generated constants table
-.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only
-.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0
-.set OFFSETOF_SHUF_TABLE, 1*16
-.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16
-
-// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
-// corresponding non-VEX instruction plus any needed moves. The supported
-// instruction formats are:
-//
-// - Two-arg [src, dst], where the non-VEX format is the same.
-// - Three-arg [src1, src2, dst] where the non-VEX format is
-// [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too.
-//
-// \insn gives the instruction without a "v" prefix and including any immediate
-// argument if needed to make the instruction follow one of the above formats.
-// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
-// it first; this is needed when \arg1 is an unaligned mem operand.
-.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
-.if AVX_LEVEL == 0
- // VEX not allowed. Emulate it.
- .ifnb \arg3 // Three-arg [src1, src2, dst]
- .ifc "\arg2", "\arg3" // src2 == dst?
- .ifnb \unaligned_mem_tmp
- movdqu \arg1, \unaligned_mem_tmp
- \insn \unaligned_mem_tmp, \arg3
- .else
- \insn \arg1, \arg3
- .endif
- .else // src2 != dst
- .ifc "\arg1", "\arg3"
- .error "Can't have src1 == dst when src2 != dst"
- .endif
- .ifnb \unaligned_mem_tmp
- movdqu \arg1, \unaligned_mem_tmp
- movdqa \arg2, \arg3
- \insn \unaligned_mem_tmp, \arg3
- .else
- movdqa \arg2, \arg3
- \insn \arg1, \arg3
- .endif
- .endif
- .else // Two-arg [src, dst]
- .ifnb \unaligned_mem_tmp
- movdqu \arg1, \unaligned_mem_tmp
- \insn \unaligned_mem_tmp, \arg2
- .else
- \insn \arg1, \arg2
- .endif
- .endif
-.else
- // VEX is allowed. Emit the desired instruction directly.
- .ifnb \arg3
- v\insn \arg1, \arg2, \arg3
- .else
- v\insn \arg1, \arg2
- .endif
-.endif
-.endm
-
-// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
-// register of length VL.
-.macro _vbroadcast src, dst
-.if VL == 16
- _cond_vex movdqa, \src, \dst
-.elseif VL == 32
- vbroadcasti128 \src, \dst
-.else
- vbroadcasti32x4 \src, \dst
-.endif
-.endm
-
-// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
-// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
-.macro _load_data vl, src, bswap_mask, dst
-.if \vl < 64
- _cond_vex movdqu, "\src", \dst
-.else
- vmovdqu8 \src, \dst
-.endif
-.if !LSB_CRC
- _cond_vex pshufb, \bswap_mask, \dst, \dst
-.endif
-.endm
-
-.macro _prepare_v0 vl, v0, v1, bswap_mask
-.if LSB_CRC
- .if \vl < 64
- _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1
- .else
- vpxorq (BUF), \v0, \v0
- .endif
-.else
- _load_data \vl, (BUF), \bswap_mask, \v1
- .if \vl < 64
- _cond_vex pxor, \v1, \v0, \v0
- .else
- vpxorq \v1, \v0, \v0
- .endif
-.endif
-.endm
-
-// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
-// msb-first order or the physically high qword for lsb-first order
-#define LO64_TERMS 0
-
-// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
-// qword for msb-first order or the physically low qword for lsb-first order
-#define HI64_TERMS 1
-
-// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
-// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
-.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst
- _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
- \src1, \src2, \dst
-.endm
-
-// Fold \acc into \data and store the result back into \acc. \data can be an
-// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
-// byte-reflection is needed; otherwise it must be a vector register. \consts
-// is a vector register containing the needed fold constants, and \tmp is a
-// temporary vector register. All arguments must be the same length.
-.macro _fold_vec acc, data, consts, tmp
- _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
- _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc
-.if AVX_LEVEL <= 2
- _cond_vex pxor, \data, \tmp, \tmp
- _cond_vex pxor, \tmp, \acc, \acc
-.else
- vpternlogq $0x96, \data, \tmp, \acc
-.endif
-.endm
-
-// Fold \acc into \data and store the result back into \acc. \data is an
-// unaligned mem operand, \consts is a vector register containing the needed
-// fold constants, \bswap_mask is a vector register containing the
-// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
-// temporary vector registers. All arguments must have length \vl.
-.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2
-.if AVX_LEVEL == 0 || !LSB_CRC
- _load_data \vl, \data, \bswap_mask, \tmp1
- _fold_vec \acc, \tmp1, \consts, \tmp2
-.else
- _fold_vec \acc, \data, \consts, \tmp1
-.endif
-.endm
-
-// Load the constants for folding across 2**i vectors of length VL at a time
-// into all 128-bit lanes of the vector register CONSTS.
-.macro _load_vec_folding_consts i
- _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
- CONSTS
-.endm
-
-// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
-// the result back into \v0. If the remaining length mod \vl is nonzero, also
-// fold \vl data bytes from BUF. For both operations the fold distance is \vl.
-// \consts must be a register of length \vl containing the fold constants.
-.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2
- _fold_vec \v0, \v1, \consts, \tmp1
- test $\vl, LEN8
- jz .Lfold_vec_final_done\@
- _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
- add $\vl, BUF
-.Lfold_vec_final_done\@:
-.endm
-
-// This macro generates the body of a CRC function with the following prototype:
-//
-// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
-//
-// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
-// |buf| is the data to checksum. |len| is the data length in bytes, which must
-// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts
-// field of the constants struct that was generated for the chosen CRC variant.
-//
-// Moving onto the macro parameters, \n is the number of bits in the CRC, e.g.
-// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
-// the file is compiled in i386 mode, then the maximum supported value is 32.
-//
-// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
-// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0
-// if the CRC processes the most significant bit of each byte first, i.e. maps
-// bit0 to x^0, bit1 to x^1, bit7 to x^7.
-//
-// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
-//
-// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
-// 512 for AVX512.
-//
-// If \vl == 16 && \avx_level == 0, the generated code requires:
-// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
-//
-// If \vl == 32 && \avx_level == 2, the generated code requires:
-// VPCLMULQDQ && AVX2.
-//
-// If \vl == 64 && \avx_level == 512, the generated code requires:
-// VPCLMULQDQ && AVX512BW && AVX512VL.
-//
-// Other \vl and \avx_level combinations are either not supported or not useful.
-.macro _crc_pclmul n, lsb_crc, vl, avx_level
- .set LSB_CRC, \lsb_crc
- .set VL, \vl
- .set AVX_LEVEL, \avx_level
-
- // Define aliases for the xmm, ymm, or zmm registers according to VL.
-.irp i, 0,1,2,3,4,5,6,7
- .if VL == 16
- .set V\i, %xmm\i
- .set LOG2_VL, 4
- .elseif VL == 32
- .set V\i, %ymm\i
- .set LOG2_VL, 5
- .elseif VL == 64
- .set V\i, %zmm\i
- .set LOG2_VL, 6
- .else
- .error "Unsupported vector length"
- .endif
-.endr
- // Define aliases for the function parameters.
- // Note: when crc_t is shorter than u32, zero-extension to 32 bits is
- // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
- // when crc_t is shorter than u64.
-#ifdef __x86_64__
-.if \n <= 32
- .set CRC, %edi
-.else
- .set CRC, %rdi
-.endif
- .set BUF, %rsi
- .set LEN, %rdx
- .set LEN32, %edx
- .set LEN8, %dl
- .set CONSTS_PTR, %rcx
-#else
- // 32-bit support, assuming -mregparm=3 and not including support for
- // CRC-64 (which would use both eax and edx to pass the crc parameter).
- .set CRC, %eax
- .set BUF, %edx
- .set LEN, %ecx
- .set LEN32, %ecx
- .set LEN8, %cl
- .set CONSTS_PTR, %ebx // Passed on stack
-#endif
-
- // Define aliases for some local variables. V0-V5 are used without
- // aliases (for accumulators, data, temporary values, etc). Staying
- // within the first 8 vector registers keeps the code 32-bit SSE
- // compatible and reduces the size of 64-bit SSE code slightly.
- .set BSWAP_MASK, V6
- .set BSWAP_MASK_YMM, %ymm6
- .set BSWAP_MASK_XMM, %xmm6
- .set CONSTS, V7
- .set CONSTS_YMM, %ymm7
- .set CONSTS_XMM, %xmm7
-
- // Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
- // functions generated by this macro are called only by static_call.
- ANNOTATE_NOENDBR
-
-#ifdef __i386__
- push CONSTS_PTR
- mov 8(%esp), CONSTS_PTR
-#endif
-
- // Create a 128-bit vector that contains the initial CRC in the end
- // representing the high-order polynomial coefficients, and the rest 0.
- // If the CRC is msb-first, also load the byte-reflection table.
-.if \n <= 32
- _cond_vex movd, CRC, %xmm0
-.else
- _cond_vex movq, CRC, %xmm0
-.endif
-.if !LSB_CRC
- _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
- _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
-.endif
-
- // Load the first vector of data and XOR the initial CRC into the
- // appropriate end of the first 128-bit lane of data. If LEN < VL, then
- // use a short vector and jump ahead to the final reduction. (LEN >= 16
- // is guaranteed here but not necessarily LEN >= VL.)
-.if VL >= 32
- cmp $VL, LEN
- jae .Lat_least_1vec\@
- .if VL == 64
- cmp $32, LEN32
- jb .Lless_than_32bytes\@
- _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM
- add $32, BUF
- jmp .Lreduce_256bits_to_128bits\@
-.Lless_than_32bytes\@:
- .endif
- _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM
- add $16, BUF
- vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
- jmp .Lcheck_for_partial_block\@
-.Lat_least_1vec\@:
-.endif
- _prepare_v0 VL, V0, V1, BSWAP_MASK
-
- // Handle VL <= LEN < 4*VL.
- cmp $4*VL-1, LEN
- ja .Lat_least_4vecs\@
- add $VL, BUF
- // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
- // If VL==16 then load fold_across_128_bits_consts first, as the final
- // reduction depends on it and it won't be loaded anywhere else.
- cmp $2*VL-1, LEN32
-.if VL == 16
- _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
-.endif
- jbe .Lreduce_1vec_to_128bits\@
- // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to
- // the reduction from 2 vectors.
- _load_data VL, (BUF), BSWAP_MASK, V1
- add $VL, BUF
- jmp .Lreduce_2vecs_to_1\@
-
-.Lat_least_4vecs\@:
- // Load 3 more vectors of data.
- _load_data VL, 1*VL(BUF), BSWAP_MASK, V1
- _load_data VL, 2*VL(BUF), BSWAP_MASK, V2
- _load_data VL, 3*VL(BUF), BSWAP_MASK, V3
- sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32
- add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32
-
- // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
- // 4 vectors of data and write the result back to V0-V3.
- cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32
- jbe .Lreduce_4vecs_to_2\@
- _load_vec_folding_consts 2
-.Lfold_4vecs_loop\@:
- _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- sub $-4*VL, BUF
- add $-4*VL, LEN
- cmp $4*VL-1, LEN
- ja .Lfold_4vecs_loop\@
-
- // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold
- // two more vectors of data from BUF, if at least that much remains.
-.Lreduce_4vecs_to_2\@:
- _load_vec_folding_consts 1
- _fold_vec V0, V2, CONSTS, V4
- _fold_vec V1, V3, CONSTS, V4
- test $2*VL, LEN8
- jz .Lreduce_2vecs_to_1\@
- _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- sub $-2*VL, BUF
-
- // Fold V0 into V1 and write the result back to V0. Then fold one more
- // vector of data from BUF, if at least that much remains.
-.Lreduce_2vecs_to_1\@:
- _load_vec_folding_consts 0
- _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5
-
-.Lreduce_1vec_to_128bits\@:
-.if VL == 64
- // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
- // data from BUF, if at least that much remains.
- vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
- vextracti64x4 $1, %zmm0, %ymm1
- _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
-.Lreduce_256bits_to_128bits\@:
-.endif
-.if VL >= 32
- // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
- // data from BUF, if at least that much remains.
- vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
- vextracti128 $1, %ymm0, %xmm1
- _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
-.Lcheck_for_partial_block\@:
-.endif
- and $15, LEN32
- jz .Lreduce_128bits_to_crc\@
-
- // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now
- // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
- // and B is the polynomial of the remaining LEN data bytes. To reduce
- // this to 128 bits without needing fold constants for each possible
- // LEN, rearrange this expression into C1*(x^128) + C2, where
- // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
- // Then fold C1 into C2, which is just another fold across 128 bits.
-
-.if !LSB_CRC || AVX_LEVEL == 0
- // Load the last 16 data bytes. Note that originally LEN was >= 16.
- _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
-.endif // Else will use vpblendvb mem operand later.
-.if !LSB_CRC
- neg LEN // Needed for indexing shuf_table
-.endif
-
- // tmp = A*x^(8*LEN) mod x^128
- // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
- // i.e. right-shift by LEN bytes.
- // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
- // i.e. left-shift by LEN bytes.
- _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
- _cond_vex pshufb, %xmm3, %xmm0, %xmm1
-
- // C1 = floor(A / x^(128 - 8*LEN))
- // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
- // i.e. left-shift by 16-LEN bytes.
- // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
- // i.e. right-shift by 16-LEN bytes.
- _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
- %xmm0, %xmm0, unaligned_mem_tmp=%xmm4
-
- // C2 = tmp + B. This is just a blend of tmp with the last 16 data
- // bytes (reflected if msb-first). The blend mask is the shuffle table
- // that was used to create tmp. 0 selects tmp, and 1 last16databytes.
-.if AVX_LEVEL == 0
- movdqa %xmm0, %xmm4
- movdqa %xmm3, %xmm0
- pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand
- movdqa %xmm4, %xmm0
-.elseif LSB_CRC
- vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1
-.else
- vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-.endif
-
- // Fold C1 into C2 and store the 128-bit result in %xmm0.
- _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4
-
-.Lreduce_128bits_to_crc\@:
- // Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
- // polynomial stored in %xmm0 (using either lsb-first or msb-first bit
- // order according to LSB_CRC), and G is the CRC's generator polynomial.
-
- // First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
- //
- // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
- // x^n * (%xmm0 mod x^64)
- //
- // Store t0 * x^(64-n) in %xmm0. I.e., actually do:
- //
- // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
- // x^64 * (%xmm0 mod x^64)
- //
- // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
- // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
- // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
- // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
- // (considering the extra factor of x that gets implicitly introduced by
- // each pclmulqdq when using lsb-first order), is identical to the
- // constant that was used earlier for folding the LO64_TERMS across 128
- // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM.
- _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
-.if LSB_CRC
- _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
-.else
- _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
-.endif
- _cond_vex pxor, %xmm1, %xmm0, %xmm0
- // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
- // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).
-
- // First step of Barrett reduction: Compute floor(t0 / G). This is the
- // polynomial by which G needs to be multiplied to cancel out the x^n
- // and higher terms of t0, i.e. to reduce t0 mod G. First do:
- //
- // t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
- //
- // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
- // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest
- // value that makes enough precision be carried through the calculation.
- //
- // The '* x' makes it so the result is floor(t1 / x^64) rather than
- // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
- // can be extracted much more easily in the next step. In the lsb-first
- // case the '* x' happens implicitly. In the msb-first case it must be
- // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
- // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
- // the multiplication by the x^64 term is handled using a pxor. The
- // pxor causes the low 64 terms of t1 to be wrong, but they are unused.
- _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
- _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
-.if !LSB_CRC
- _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
-.endif
- // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).
-
- // Second step of Barrett reduction: Cancel out the x^n and higher terms
- // of t0 by subtracting the needed multiple of G. This gives the CRC:
- //
- // crc := t0 - (G * floor(t0 / G))
- //
- // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
- //
- // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
- //
- // Furthermore, since the resulting CRC is n-bit, if mod x^n is
- // explicitly applied to it then the x^n term of G makes no difference
- // in the result and can be omitted. This helps keep the constant
- // multiplier in 64 bits in most cases. This gives the following:
- //
- // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
- // crc := (%xmm0 / x^(64-n)) mod x^n
- //
- // In the lsb-first case, each pclmulqdq implicitly introduces
- // an extra factor of x, so in that case the constant that needs to be
- // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
- // For lsb-first CRCs where n=64, the extra factor of x cannot be as
- // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to
- // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC
- // polynomials have nonzero x^n and x^0 terms.) It works out as: the
-	// CRC has to be XORed with the physically low qword of %xmm1, representing
- // floor(t0 / G). The most efficient way to do that is to move it to
- // the physically high qword and use a ternlog to combine the two XORs.
-.if LSB_CRC && \n == 64
- _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2
- _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
- .if AVX_LEVEL <= 2
- _cond_vex pxor, %xmm2, %xmm0, %xmm0
- _cond_vex pxor, %xmm1, %xmm0, %xmm0
- .else
- vpternlogq $0x96, %xmm2, %xmm1, %xmm0
- .endif
- _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
-.else
- _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
- _cond_vex pxor, %xmm1, %xmm0, %xmm0
- .if \n == 8
- _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
- .elseif \n == 16
- _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
- .elseif \n == 32
- _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
- .else // \n == 64 && !LSB_CRC
- _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
- .endif
-.endif
-
-.if VL > 16
- vzeroupper // Needed when ymm or zmm registers may have been used.
-.endif
-#ifdef __i386__
- pop CONSTS_PTR
-#endif
- RET
-.endm
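
The two-step Barrett reduction described in the comments above can be modelled directly in GF(2)[x] with Python integers. This is only an illustrative reference: it skips the fold that first narrows the value to fewer than 64+n bits and ignores the lsb-first extra-factor-of-x details, and none of the helper names come from the kernel sources:

    import random

    def clmul(a, b):                       # carryless multiply in GF(2)[x]
        r = 0
        while b:
            if b & 1:
                r ^= a
            a <<= 1
            b >>= 1
        return r

    def polydiv(a, g):                     # floor(a / g) in GF(2)[x]
        q, dg = 0, g.bit_length() - 1
        while a and a.bit_length() - 1 >= dg:
            s = a.bit_length() - 1 - dg
            q |= 1 << s
            a ^= g << s
        return q

    def barrett_crc(t0, G, n):
        # floor(t0 / G) via one carryless multiply by the precomputed
        # constant floor(x^(63+n) / G); exact while t0 has fewer than
        # 64+n bits.
        mu = polydiv(1 << (63 + n), G)
        q  = clmul(t0 >> n, mu) >> 63      # floor(t1 / x^63) = floor(t0 / G)
        return t0 ^ clmul(q, G)            # t0 - G*floor(t0 / G) = t0 mod G

    G = (1 << 16) | 0x8bb7                 # CRC-T10DIF generator polynomial
    for _ in range(1000):
        t0 = random.getrandbits(64 + 16)
        ref = t0
        while ref.bit_length() > 16:       # naive long division as reference
            ref ^= G << (ref.bit_length() - 1 - 16)
        assert barrett_crc(t0, G, 16) == ref
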
-
-#ifdef CONFIG_AS_VPCLMULQDQ
-#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
-SYM_FUNC_START(prefix##_pclmul_sse); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
-SYM_FUNC_END(prefix##_pclmul_sse); \
- \
-SYM_FUNC_START(prefix##_vpclmul_avx2); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \
-SYM_FUNC_END(prefix##_vpclmul_avx2); \
- \
-SYM_FUNC_START(prefix##_vpclmul_avx512); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \
-SYM_FUNC_END(prefix##_vpclmul_avx512);
-#else
-#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
-SYM_FUNC_START(prefix##_pclmul_sse); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
-SYM_FUNC_END(prefix##_pclmul_sse);
-#endif // !CONFIG_AS_VPCLMULQDQ
diff --git a/arch/x86/lib/crc-pclmul-template.h b/arch/x86/lib/crc-pclmul-template.h
deleted file mode 100644
index c5b3bfe11d8d..000000000000
--- a/arch/x86/lib/crc-pclmul-template.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are
- * instantiated by crc-pclmul-template.S
- *
- * Copyright 2025 Google LLC
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-#ifndef _CRC_PCLMUL_TEMPLATE_H
-#define _CRC_PCLMUL_TEMPLATE_H
-
-#include <asm/cpufeatures.h>
-#include <asm/simd.h>
-#include <crypto/internal/simd.h>
-#include <linux/static_call.h>
-#include "crc-pclmul-consts.h"
-
-#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \
-crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \
- const void *consts_ptr); \
-crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \
- const void *consts_ptr); \
-crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len, \
- const void *consts_ptr); \
-DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)
-
-#define INIT_CRC_PCLMUL(prefix) \
-do { \
- if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) && \
- boot_cpu_has(X86_FEATURE_VPCLMULQDQ) && \
- boot_cpu_has(X86_FEATURE_AVX2) && \
- cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) { \
- if (boot_cpu_has(X86_FEATURE_AVX512BW) && \
- boot_cpu_has(X86_FEATURE_AVX512VL) && \
- !boot_cpu_has(X86_FEATURE_PREFER_YMM) && \
- cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) { \
- static_call_update(prefix##_pclmul, \
- prefix##_vpclmul_avx512); \
- } else { \
- static_call_update(prefix##_pclmul, \
- prefix##_vpclmul_avx2); \
- } \
- } \
-} while (0)
-
-/*
- * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16
- * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD.
- *
- * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions.
- * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(),
- * varying by CPU and factors such as which parts of the "FPU" state userspace
- * has touched, which could result in a larger cutoff being better. Indeed, a
- * larger cutoff is usually better for a *single* message. However, the
- * overhead of the FPU section gets amortized if multiple FPU sections get
- * executed before returning to userspace, since the XSAVE and XRSTOR occur only
- * once. Considering that and the fact that the [V]PCLMULQDQ code is lighter on
- * the dcache than the table-based code is, a 16-byte cutoff seems to work well.
- */
-#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \
-do { \
- if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \
- crypto_simd_usable()) { \
- const void *consts_ptr; \
- \
- consts_ptr = (consts).fold_across_128_bits_consts; \
- kernel_fpu_begin(); \
- crc = static_call(prefix##_pclmul)((crc), (p), (len), \
- consts_ptr); \
- kernel_fpu_end(); \
- return crc; \
- } \
-} while (0)
-
-#endif /* _CRC_PCLMUL_TEMPLATE_H */
diff --git a/arch/x86/lib/crc-t10dif.c b/arch/x86/lib/crc-t10dif.c
deleted file mode 100644
index db7ce59c31ac..000000000000
--- a/arch/x86/lib/crc-t10dif.c
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * CRC-T10DIF using [V]PCLMULQDQ instructions
- *
- * Copyright 2024 Google LLC
- */
-
-#include <linux/crc-t10dif.h>
-#include <linux/module.h>
-#include "crc-pclmul-template.h"
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
-
-DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16);
-
-u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts,
- have_pclmulqdq);
- return crc_t10dif_generic(crc, p, len);
-}
-EXPORT_SYMBOL(crc_t10dif_arch);
-
-static int __init crc_t10dif_x86_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
- static_branch_enable(&have_pclmulqdq);
- INIT_CRC_PCLMUL(crc16_msb);
- }
- return 0;
-}
-subsys_initcall(crc_t10dif_x86_init);
-
-static void __exit crc_t10dif_x86_exit(void)
-{
-}
-module_exit(crc_t10dif_x86_exit);
-
-MODULE_DESCRIPTION("CRC-T10DIF using [V]PCLMULQDQ instructions");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crc16-msb-pclmul.S b/arch/x86/lib/crc16-msb-pclmul.S
deleted file mode 100644
index e9fe248093a8..000000000000
--- a/arch/x86/lib/crc16-msb-pclmul.S
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-// Copyright 2025 Google LLC
-
-#include "crc-pclmul-template.S"
-
-DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0)
diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S
deleted file mode 100644
index f20f40fb0172..000000000000
--- a/arch/x86/lib/crc32-pclmul.S
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-// Copyright 2025 Google LLC
-
-#include "crc-pclmul-template.S"
-
-DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
diff --git a/arch/x86/lib/crc32.c b/arch/x86/lib/crc32.c
deleted file mode 100644
index d09343e2cea9..000000000000
--- a/arch/x86/lib/crc32.c
+++ /dev/null
@@ -1,111 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * x86-optimized CRC32 functions
- *
- * Copyright (C) 2008 Intel Corporation
- * Copyright 2012 Xyratex Technology Limited
- * Copyright 2024 Google LLC
- */
-
-#include <linux/crc32.h>
-#include <linux/module.h>
-#include "crc-pclmul-template.h"
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
-
-DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
- have_pclmulqdq);
- return crc32_le_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-#ifdef CONFIG_X86_64
-#define CRC32_INST "crc32q %1, %q0"
-#else
-#define CRC32_INST "crc32l %1, %0"
-#endif
-
-/*
- * Use carryless multiply version of crc32c when buffer size is >= 512 to
- * account for FPU state save/restore overhead.
- */
-#define CRC32C_PCLMUL_BREAKEVEN 512
-
-asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- size_t num_longs;
-
- if (!static_branch_likely(&have_crc32))
- return crc32c_base(crc, p, len);
-
- if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
- static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
- kernel_fpu_begin();
- crc = crc32c_x86_3way(crc, p, len);
- kernel_fpu_end();
- return crc;
- }
-
- for (num_longs = len / sizeof(unsigned long);
- num_longs != 0; num_longs--, p += sizeof(unsigned long))
- asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
-
- if (sizeof(unsigned long) > 4 && (len & 4)) {
- asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
- p += 4;
- }
- if (len & 2) {
- asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
- p += 2;
- }
- if (len & 1)
- asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
-
- return crc;
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- return crc32_be_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-static int __init crc32_x86_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_XMM4_2))
- static_branch_enable(&have_crc32);
- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
- static_branch_enable(&have_pclmulqdq);
- INIT_CRC_PCLMUL(crc32_lsb);
- }
- return 0;
-}
-subsys_initcall(crc32_x86_init);
-
-static void __exit crc32_x86_exit(void)
-{
-}
-module_exit(crc32_x86_exit);
-
-u32 crc32_optimizations(void)
-{
- u32 optimizations = 0;
-
- if (static_key_enabled(&have_crc32))
- optimizations |= CRC32C_OPTIMIZATION;
- if (static_key_enabled(&have_pclmulqdq))
- optimizations |= CRC32_LE_OPTIMIZATION;
- return optimizations;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_DESCRIPTION("x86-optimized CRC32 functions");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crc32c-3way.S b/arch/x86/lib/crc32c-3way.S
deleted file mode 100644
index 9b8770503bbc..000000000000
--- a/arch/x86/lib/crc32c-3way.S
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
- *
- * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
- * downloaded from:
- * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
- * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
- *
- * Copyright (C) 2012 Intel Corporation.
- * Copyright 2024 Google LLC
- *
- * Authors:
- * Wajdi Feghali <wajdi.k.feghali@intel.com>
- * James Guilford <james.guilford@intel.com>
- * David Cote <david.m.cote@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/linkage.h>
-
-## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
-
-# Define threshold below which buffers are considered "small" and routed to
-# regular CRC code that does not interleave the CRC instructions.
-#define SMALL_SIZE 200
-
-# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
-
-.text
-SYM_FUNC_START(crc32c_x86_3way)
-#define crc0 %edi
-#define crc0_q %rdi
-#define bufp %rsi
-#define bufp_d %esi
-#define len %rdx
-#define len_dw %edx
-#define n_misaligned %ecx /* overlaps chunk_bytes! */
-#define n_misaligned_q %rcx
-#define chunk_bytes %ecx /* overlaps n_misaligned! */
-#define chunk_bytes_q %rcx
-#define crc1 %r8
-#define crc2 %r9
-
- cmp $SMALL_SIZE, len
- jb .Lsmall
-
- ################################################################
- ## 1) ALIGN:
- ################################################################
- mov bufp_d, n_misaligned
- neg n_misaligned
- and $7, n_misaligned # calculate the misalignment amount of
- # the address
- je .Laligned # Skip if aligned
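
The neg/and pair above computes how many bytes are needed to reach the next 8-byte boundary. A small illustrative check of that idiom (not from the kernel sources):

    def bytes_to_align(addr, align=8):
        # Mirrors the "neg; and $7" sequence: (-addr) mod 8 bytes bring
        # addr up to the next multiple of 8.
        return (-addr) & (align - 1)

    assert [bytes_to_align(a) for a in range(16, 25)] == [0, 7, 6, 5, 4, 3, 2, 1, 0]
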
-
- # Process 1 <= n_misaligned <= 7 bytes individually in order to align
- # the remaining data to an 8-byte boundary.
-.Ldo_align:
- movq (bufp), %rax
- add n_misaligned_q, bufp
- sub n_misaligned_q, len
-.Lalign_loop:
- crc32b %al, crc0 # compute crc32 of 1-byte
- shr $8, %rax # get next byte
- dec n_misaligned
- jne .Lalign_loop
-.Laligned:
-
- ################################################################
- ## 2) PROCESS BLOCK:
- ################################################################
-
- cmp $128*24, len
- jae .Lfull_block
-
-.Lpartial_block:
- # Compute floor(len / 24) to get num qwords to process from each lane.
- imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
- shr $16, %eax
- jmp .Lcrc_3lanes
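
The imul above implements division by 24 with a fixed-point reciprocal; the approximation is exact for every length that can reach this path (len < 128*24). A one-line check, illustrative only:

    # floor(len / 24) == (len * 2731) >> 16 for every length this path can see.
    assert all((n * 2731) >> 16 == n // 24 for n in range(128 * 24))
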
-
-.Lfull_block:
- # Processing 128 qwords from each lane.
- mov $128, %eax
-
- ################################################################
- ## 3) CRC each of three lanes:
- ################################################################
-
-.Lcrc_3lanes:
- xor crc1,crc1
- xor crc2,crc2
- mov %eax, chunk_bytes
- shl $3, chunk_bytes # num bytes to process from each lane
- sub $5, %eax # 4 for 4x_loop, 1 for special last iter
- jl .Lcrc_3lanes_4x_done
-
- # Unroll the loop by a factor of 4 to reduce the overhead of the loop
- # bookkeeping instructions, which can compete with crc32q for the ALUs.
-.Lcrc_3lanes_4x_loop:
- crc32q (bufp), crc0_q
- crc32q (bufp,chunk_bytes_q), crc1
- crc32q (bufp,chunk_bytes_q,2), crc2
- crc32q 8(bufp), crc0_q
- crc32q 8(bufp,chunk_bytes_q), crc1
- crc32q 8(bufp,chunk_bytes_q,2), crc2
- crc32q 16(bufp), crc0_q
- crc32q 16(bufp,chunk_bytes_q), crc1
- crc32q 16(bufp,chunk_bytes_q,2), crc2
- crc32q 24(bufp), crc0_q
- crc32q 24(bufp,chunk_bytes_q), crc1
- crc32q 24(bufp,chunk_bytes_q,2), crc2
- add $32, bufp
- sub $4, %eax
- jge .Lcrc_3lanes_4x_loop
-
-.Lcrc_3lanes_4x_done:
- add $4, %eax
- jz .Lcrc_3lanes_last_qword
-
-.Lcrc_3lanes_1x_loop:
- crc32q (bufp), crc0_q
- crc32q (bufp,chunk_bytes_q), crc1
- crc32q (bufp,chunk_bytes_q,2), crc2
- add $8, bufp
- dec %eax
- jnz .Lcrc_3lanes_1x_loop
-
-.Lcrc_3lanes_last_qword:
- crc32q (bufp), crc0_q
- crc32q (bufp,chunk_bytes_q), crc1
-# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
-
- ################################################################
- ## 4) Combine three results:
- ################################################################
-
- lea (K_table-8)(%rip), %rax # first entry is for idx 1
- pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
- lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
- sub %rax, len # len -= chunk_bytes * 3
-
- movq crc0_q, %xmm1 # CRC for block 1
- pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
-
- movq crc1, %xmm2 # CRC for block 2
- pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
-
- pxor %xmm2,%xmm1
- movq %xmm1, %rax
- xor (bufp,chunk_bytes_q,2), %rax
- mov crc2, crc0_q
- crc32 %rax, crc0_q
- lea 8(bufp,chunk_bytes_q,2), bufp
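
Step 4 relies on the linearity of CRC32C: the CRC of a leading chunk can be carried across the bytes of the following chunks by multiplying it by a suitable power of x modulo the CRC polynomial, which is the role the pclmulqdq operations with the K_table constants play here. A small pure-Python model of the underlying identity, naively feeding zero bytes instead of using precomputed constants (names are illustrative, not from the kernel):

    def crc32c_raw(data, crc=0):
        # Bit-reflected CRC32C update, matching what the crc32 instruction
        # computes (no initial/final inversion here).
        for b in data:
            crc ^= b
            for _ in range(8):
                crc = (crc >> 1) ^ (0x82F63B78 if crc & 1 else 0)
        return crc

    A, B = b"three-way crc ", b"lane combination"
    # Carrying crc(A) over len(B) zero bytes plays the part of the
    # "multiply by K" step; XORing with crc(B) then yields crc(A || B).
    assert crc32c_raw(A + B) == crc32c_raw(B) ^ crc32c_raw(b"\0" * len(B), crc32c_raw(A))
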
-
- ################################################################
- ## 5) If more blocks remain, goto (2):
- ################################################################
-
- cmp $128*24, len
- jae .Lfull_block
- cmp $SMALL_SIZE, len
- jae .Lpartial_block
-
- #######################################################################
- ## 6) Process any remainder without interleaving:
- #######################################################################
-.Lsmall:
- test len_dw, len_dw
- jz .Ldone
- mov len_dw, %eax
- shr $3, %eax
- jz .Ldo_dword
-.Ldo_qwords:
- crc32q (bufp), crc0_q
- add $8, bufp
- dec %eax
- jnz .Ldo_qwords
-.Ldo_dword:
- test $4, len_dw
- jz .Ldo_word
- crc32l (bufp), crc0
- add $4, bufp
-.Ldo_word:
- test $2, len_dw
- jz .Ldo_byte
- crc32w (bufp), crc0
- add $2, bufp
-.Ldo_byte:
- test $1, len_dw
- jz .Ldone
- crc32b (bufp), crc0
-.Ldone:
- mov crc0, %eax
- RET
-SYM_FUNC_END(crc32c_x86_3way)
-
-.section .rodata, "a", @progbits
- ################################################################
- ## PCLMULQDQ tables
- ## Table is 128 entries x 2 words (8 bytes) each
- ################################################################
-.align 8
-K_table:
- .long 0x493c7d27, 0x00000001
- .long 0xba4fc28e, 0x493c7d27
- .long 0xddc0152b, 0xf20c0dfe
- .long 0x9e4addf8, 0xba4fc28e
- .long 0x39d3b296, 0x3da6d0cb
- .long 0x0715ce53, 0xddc0152b
- .long 0x47db8317, 0x1c291d04
- .long 0x0d3b6092, 0x9e4addf8
- .long 0xc96cfdc0, 0x740eef02
- .long 0x878a92a7, 0x39d3b296
- .long 0xdaece73e, 0x083a6eec
- .long 0xab7aff2a, 0x0715ce53
- .long 0x2162d385, 0xc49f4f67
- .long 0x83348832, 0x47db8317
- .long 0x299847d5, 0x2ad91c30
- .long 0xb9e02b86, 0x0d3b6092
- .long 0x18b33a4e, 0x6992cea2
- .long 0xb6dd949b, 0xc96cfdc0
- .long 0x78d9ccb7, 0x7e908048
- .long 0xbac2fd7b, 0x878a92a7
- .long 0xa60ce07b, 0x1b3d8f29
- .long 0xce7f39f4, 0xdaece73e
- .long 0x61d82e56, 0xf1d0f55e
- .long 0xd270f1a2, 0xab7aff2a
- .long 0xc619809d, 0xa87ab8a8
- .long 0x2b3cac5d, 0x2162d385
- .long 0x65863b64, 0x8462d800
- .long 0x1b03397f, 0x83348832
- .long 0xebb883bd, 0x71d111a8
- .long 0xb3e32c28, 0x299847d5
- .long 0x064f7f26, 0xffd852c6
- .long 0xdd7e3b0c, 0xb9e02b86
- .long 0xf285651c, 0xdcb17aa4
- .long 0x10746f3c, 0x18b33a4e
- .long 0xc7a68855, 0xf37c5aee
- .long 0x271d9844, 0xb6dd949b
- .long 0x8e766a0c, 0x6051d5a2
- .long 0x93a5f730, 0x78d9ccb7
- .long 0x6cb08e5c, 0x18b0d4ff
- .long 0x6b749fb2, 0xbac2fd7b
- .long 0x1393e203, 0x21f3d99c
- .long 0xcec3662e, 0xa60ce07b
- .long 0x96c515bb, 0x8f158014
- .long 0xe6fc4e6a, 0xce7f39f4
- .long 0x8227bb8a, 0xa00457f7
- .long 0xb0cd4768, 0x61d82e56
- .long 0x39c7ff35, 0x8d6d2c43
- .long 0xd7a4825c, 0xd270f1a2
- .long 0x0ab3844b, 0x00ac29cf
- .long 0x0167d312, 0xc619809d
- .long 0xf6076544, 0xe9adf796
- .long 0x26f6a60a, 0x2b3cac5d
- .long 0xa741c1bf, 0x96638b34
- .long 0x98d8d9cb, 0x65863b64
- .long 0x49c3cc9c, 0xe0e9f351
- .long 0x68bce87a, 0x1b03397f
- .long 0x57a3d037, 0x9af01f2d
- .long 0x6956fc3b, 0xebb883bd
- .long 0x42d98888, 0x2cff42cf
- .long 0x3771e98f, 0xb3e32c28
- .long 0xb42ae3d9, 0x88f25a3a
- .long 0x2178513a, 0x064f7f26
- .long 0xe0ac139e, 0x4e36f0b0
- .long 0x170076fa, 0xdd7e3b0c
- .long 0x444dd413, 0xbd6f81f8
- .long 0x6f345e45, 0xf285651c
- .long 0x41d17b64, 0x91c9bd4b
- .long 0xff0dba97, 0x10746f3c
- .long 0xa2b73df1, 0x885f087b
- .long 0xf872e54c, 0xc7a68855
- .long 0x1e41e9fc, 0x4c144932
- .long 0x86d8e4d2, 0x271d9844
- .long 0x651bd98b, 0x52148f02
- .long 0x5bb8f1bc, 0x8e766a0c
- .long 0xa90fd27a, 0xa3c6f37a
- .long 0xb3af077a, 0x93a5f730
- .long 0x4984d782, 0xd7c0557f
- .long 0xca6ef3ac, 0x6cb08e5c
- .long 0x234e0b26, 0x63ded06a
- .long 0xdd66cbbb, 0x6b749fb2
- .long 0x4597456a, 0x4d56973c
- .long 0xe9e28eb4, 0x1393e203
- .long 0x7b3ff57a, 0x9669c9df
- .long 0xc9c8b782, 0xcec3662e
- .long 0x3f70cc6f, 0xe417f38a
- .long 0x93e106a4, 0x96c515bb
- .long 0x62ec6c6d, 0x4b9e0f71
- .long 0xd813b325, 0xe6fc4e6a
- .long 0x0df04680, 0xd104b8fc
- .long 0x2342001e, 0x8227bb8a
- .long 0x0a2a8d7e, 0x5b397730
- .long 0x6d9a4957, 0xb0cd4768
- .long 0xe8b6368b, 0xe78eb416
- .long 0xd2c3ed1a, 0x39c7ff35
- .long 0x995a5724, 0x61ff0e01
- .long 0x9ef68d35, 0xd7a4825c
- .long 0x0c139b31, 0x8d96551c
- .long 0xf2271e60, 0x0ab3844b
- .long 0x0b0bf8ca, 0x0bf80dd2
- .long 0x2664fd8b, 0x0167d312
- .long 0xed64812d, 0x8821abed
- .long 0x02ee03b2, 0xf6076544
- .long 0x8604ae0f, 0x6a45d2b2
- .long 0x363bd6b3, 0x26f6a60a
- .long 0x135c83fd, 0xd8d26619
- .long 0x5fabe670, 0xa741c1bf
- .long 0x35ec3279, 0xde87806c
- .long 0x00bcf5f6, 0x98d8d9cb
- .long 0x8ae00689, 0x14338754
- .long 0x17f27698, 0x49c3cc9c
- .long 0x58ca5f00, 0x5bd2011f
- .long 0xaa7c7ad5, 0x68bce87a
- .long 0xb5cfca28, 0xdd07448e
- .long 0xded288f8, 0x57a3d037
- .long 0x59f229bc, 0xdde8f5b9
- .long 0x6d390dec, 0x6956fc3b
- .long 0x37170390, 0xa3e3e02c
- .long 0x6353c1cc, 0x42d98888
- .long 0xc4584f5c, 0xd73c7bea
- .long 0xf48642e9, 0x3771e98f
- .long 0x531377e2, 0x80ff0093
- .long 0xdd35bc8d, 0xb42ae3d9
- .long 0xb25b29f2, 0x8fe4c34d
- .long 0x9a5ede41, 0x2178513a
- .long 0xa563905d, 0xdf99fc11
- .long 0x45cddf4e, 0xe0ac139e
- .long 0xacfa3103, 0x6c23e841
- .long 0xa51b6135, 0x170076fa
diff --git a/arch/x86/lib/crc64-pclmul.S b/arch/x86/lib/crc64-pclmul.S
deleted file mode 100644
index 4173051b5197..000000000000
--- a/arch/x86/lib/crc64-pclmul.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-// Copyright 2025 Google LLC
-
-#include "crc-pclmul-template.S"
-
-DEFINE_CRC_PCLMUL_FUNCS(crc64_msb, /* bits= */ 64, /* lsb= */ 0)
-DEFINE_CRC_PCLMUL_FUNCS(crc64_lsb, /* bits= */ 64, /* lsb= */ 1)
diff --git a/arch/x86/lib/crc64.c b/arch/x86/lib/crc64.c
deleted file mode 100644
index 351a09f5813e..000000000000
--- a/arch/x86/lib/crc64.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * CRC64 using [V]PCLMULQDQ instructions
- *
- * Copyright 2025 Google LLC
- */
-
-#include <linux/crc64.h>
-#include <linux/module.h>
-#include "crc-pclmul-template.h"
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
-
-DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64);
-DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64);
-
-u64 crc64_be_arch(u64 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc64_msb, crc64_msb_0x42f0e1eba9ea3693_consts,
- have_pclmulqdq);
- return crc64_be_generic(crc, p, len);
-}
-EXPORT_SYMBOL_GPL(crc64_be_arch);
-
-u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc64_lsb, crc64_lsb_0x9a6c9329ac4bc9b5_consts,
- have_pclmulqdq);
- return crc64_nvme_generic(crc, p, len);
-}
-EXPORT_SYMBOL_GPL(crc64_nvme_arch);
-
-static int __init crc64_x86_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
- static_branch_enable(&have_pclmulqdq);
- INIT_CRC_PCLMUL(crc64_msb);
- INIT_CRC_PCLMUL(crc64_lsb);
- }
- return 0;
-}
-subsys_initcall(crc64_x86_init);
-
-static void __exit crc64_x86_exit(void)
-{
-}
-module_exit(crc64_x86_exit);
-
-MODULE_DESCRIPTION("CRC64 using [V]PCLMULQDQ instructions");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crypto/.gitignore b/arch/x86/lib/crypto/.gitignore
deleted file mode 100644
index 580c839bb177..000000000000
--- a/arch/x86/lib/crypto/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-poly1305-x86_64-cryptogams.S
diff --git a/arch/x86/lib/crypto/Kconfig b/arch/x86/lib/crypto/Kconfig
deleted file mode 100644
index 5e94cdee492c..000000000000
--- a/arch/x86/lib/crypto/Kconfig
+++ /dev/null
@@ -1,34 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_BLAKE2S_X86
- bool "Hash functions: BLAKE2s (SSSE3/AVX-512)"
- depends on 64BIT
- select CRYPTO_LIB_BLAKE2S_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- help
- BLAKE2s cryptographic hash function (RFC 7693)
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX-512 (Advanced Vector Extensions-512)
-
-config CRYPTO_CHACHA20_X86_64
- tristate
- depends on 64BIT
- default CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_X86_64
- tristate
- depends on 64BIT
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
-
-config CRYPTO_SHA256_X86_64
- tristate
- depends on 64BIT
- default CRYPTO_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
- select CRYPTO_LIB_SHA256_GENERIC
diff --git a/arch/x86/lib/crypto/Makefile b/arch/x86/lib/crypto/Makefile
deleted file mode 100644
index abceca3d31c0..000000000000
--- a/arch/x86/lib/crypto/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
-libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
-
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
-chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
-poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
-targets += poly1305-x86_64-cryptogams.S
-
-obj-$(CONFIG_CRYPTO_SHA256_X86_64) += sha256-x86_64.o
-sha256-x86_64-y := sha256.o sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256-ni-asm.o
-
-quiet_cmd_perlasm = PERLASM $@
- cmd_perlasm = $(PERL) $< > $@
-
-$(obj)/%.S: $(src)/%.pl FORCE
- $(call if_changed,perlasm)
diff --git a/arch/x86/lib/crypto/blake2s-core.S b/arch/x86/lib/crypto/blake2s-core.S
deleted file mode 100644
index ac1c845445a4..000000000000
--- a/arch/x86/lib/crypto/blake2s-core.S
+++ /dev/null
@@ -1,252 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
-.align 32
-IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
- .octa 0x5BE0CD191F83D9AB9B05688C510E527F
-.section .rodata.cst16.ROT16, "aM", @progbits, 16
-.align 16
-ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
-.section .rodata.cst16.ROR328, "aM", @progbits, 16
-.align 16
-ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
-.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
-.align 64
-SIGMA:
-.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
-.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
-.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
-.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
-.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
-.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
-.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
-.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
-.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
-.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
-.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
-.align 64
-SIGMA2:
-.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
-.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
-.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
-.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
-.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
-.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
-.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
-.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
-.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
-.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
-
-.text
-SYM_FUNC_START(blake2s_compress_ssse3)
- testq %rdx,%rdx
- je .Lendofloop
- movdqu (%rdi),%xmm0
- movdqu 0x10(%rdi),%xmm1
- movdqa ROT16(%rip),%xmm12
- movdqa ROR328(%rip),%xmm13
- movdqu 0x20(%rdi),%xmm14
- movq %rcx,%xmm15
- leaq SIGMA+0xa0(%rip),%r8
- jmp .Lbeginofloop
- .align 32
-.Lbeginofloop:
- movdqa %xmm0,%xmm10
- movdqa %xmm1,%xmm11
- paddq %xmm15,%xmm14
- movdqa IV(%rip),%xmm2
- movdqa %xmm14,%xmm3
- pxor IV+0x10(%rip),%xmm3
- leaq SIGMA(%rip),%rcx
-.Lroundloop:
- movzbl (%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0x1(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0x2(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x3(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- punpckldq %xmm5,%xmm4
- punpckldq %xmm7,%xmm6
- punpcklqdq %xmm6,%xmm4
- paddd %xmm4,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movzbl 0x4(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0x5(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x6(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0x7(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- punpckldq %xmm6,%xmm5
- punpckldq %xmm4,%xmm7
- punpcklqdq %xmm7,%xmm5
- paddd %xmm5,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x93,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x39,%xmm2,%xmm2
- movzbl 0x8(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x9(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0xa(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0xb(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- punpckldq %xmm7,%xmm6
- punpckldq %xmm5,%xmm4
- punpcklqdq %xmm4,%xmm6
- paddd %xmm6,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movzbl 0xc(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0xd(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0xe(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0xf(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- punpckldq %xmm4,%xmm7
- punpckldq %xmm6,%xmm5
- punpcklqdq %xmm5,%xmm7
- paddd %xmm7,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x39,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x93,%xmm2,%xmm2
- addq $0x10,%rcx
- cmpq %r8,%rcx
- jnz .Lroundloop
- pxor %xmm2,%xmm0
- pxor %xmm3,%xmm1
- pxor %xmm10,%xmm0
- pxor %xmm11,%xmm1
- addq $0x40,%rsi
- decq %rdx
- jnz .Lbeginofloop
- movdqu %xmm0,(%rdi)
- movdqu %xmm1,0x10(%rdi)
- movdqu %xmm14,0x20(%rdi)
-.Lendofloop:
- RET
-SYM_FUNC_END(blake2s_compress_ssse3)
-
-SYM_FUNC_START(blake2s_compress_avx512)
- vmovdqu (%rdi),%xmm0
- vmovdqu 0x10(%rdi),%xmm1
- vmovdqu 0x20(%rdi),%xmm4
- vmovq %rcx,%xmm5
- vmovdqa IV(%rip),%xmm14
- vmovdqa IV+16(%rip),%xmm15
- jmp .Lblake2s_compress_avx512_mainloop
-.align 32
-.Lblake2s_compress_avx512_mainloop:
- vmovdqa %xmm0,%xmm10
- vmovdqa %xmm1,%xmm11
- vpaddq %xmm5,%xmm4,%xmm4
- vmovdqa %xmm14,%xmm2
- vpxor %xmm15,%xmm4,%xmm3
- vmovdqu (%rsi),%ymm6
- vmovdqu 0x20(%rsi),%ymm7
- addq $0x40,%rsi
- leaq SIGMA2(%rip),%rax
- movb $0xa,%cl
-.Lblake2s_compress_avx512_roundloop:
- addq $0x40,%rax
- vmovdqa -0x40(%rax),%ymm8
- vmovdqa -0x20(%rax),%ymm9
- vpermi2d %ymm7,%ymm6,%ymm8
- vpermi2d %ymm7,%ymm6,%ymm9
- vmovdqa %ymm8,%ymm6
- vmovdqa %ymm9,%ymm7
- vpaddd %xmm8,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x10,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0xc,%xmm1,%xmm1
- vextracti128 $0x1,%ymm8,%xmm8
- vpaddd %xmm8,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x8,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0x7,%xmm1,%xmm1
- vpshufd $0x93,%xmm0,%xmm0
- vpshufd $0x4e,%xmm3,%xmm3
- vpshufd $0x39,%xmm2,%xmm2
- vpaddd %xmm9,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x10,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0xc,%xmm1,%xmm1
- vextracti128 $0x1,%ymm9,%xmm9
- vpaddd %xmm9,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x8,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0x7,%xmm1,%xmm1
- vpshufd $0x39,%xmm0,%xmm0
- vpshufd $0x4e,%xmm3,%xmm3
- vpshufd $0x93,%xmm2,%xmm2
- decb %cl
- jne .Lblake2s_compress_avx512_roundloop
- vpxor %xmm10,%xmm0,%xmm0
- vpxor %xmm11,%xmm1,%xmm1
- vpxor %xmm2,%xmm0,%xmm0
- vpxor %xmm3,%xmm1,%xmm1
- decq %rdx
- jne .Lblake2s_compress_avx512_mainloop
- vmovdqu %xmm0,(%rdi)
- vmovdqu %xmm1,0x10(%rdi)
- vmovdqu %xmm4,0x20(%rdi)
- vzeroupper
- RET
-SYM_FUNC_END(blake2s_compress_avx512)
diff --git a/arch/x86/lib/crypto/blake2s-glue.c b/arch/x86/lib/crypto/blake2s-glue.c
deleted file mode 100644
index adc296cd17c9..000000000000
--- a/arch/x86/lib/crypto/blake2s-glue.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <asm/cpufeature.h>
-#include <asm/fpu/api.h>
-#include <asm/processor.h>
-#include <asm/simd.h>
-#include <crypto/internal/blake2s.h>
-#include <linux/init.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/sizes.h>
-
-asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
- const u8 *block, const size_t nblocks,
- const u32 inc);
-asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
- const u8 *block, const size_t nblocks,
- const u32 inc);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
-
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
-{
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
-
- if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
- blake2s_compress_generic(state, block, nblocks, inc);
- return;
- }
-
- do {
- const size_t blocks = min_t(size_t, nblocks,
- SZ_4K / BLAKE2S_BLOCK_SIZE);
-
- kernel_fpu_begin();
- if (static_branch_likely(&blake2s_use_avx512))
- blake2s_compress_avx512(state, block, blocks, inc);
- else
- blake2s_compress_ssse3(state, block, blocks, inc);
- kernel_fpu_end();
-
- nblocks -= blocks;
- block += blocks * BLAKE2S_BLOCK_SIZE;
- } while (nblocks);
-}
-EXPORT_SYMBOL(blake2s_compress);
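
The per-page splitting above bounds how long preemption stays disabled: with 64-byte BLAKE2s blocks, each kernel_fpu_begin()/end() section covers at most SZ_4K / 64 = 64 blocks. A trivial model of that chunking, illustrative only:

    def fpu_sections(nblocks, max_blocks=4096 // 64):
        # Yield how many blocks each FPU section would compress.
        while nblocks:
            n = min(nblocks, max_blocks)
            yield n
            nblocks -= n

    assert sum(fpu_sections(1000)) == 1000 and max(fpu_sections(1000)) == 64
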
-
-static int __init blake2s_mod_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- static_branch_enable(&blake2s_use_ssse3);
-
- if (boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- boot_cpu_has(X86_FEATURE_AVX512VL) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
- XFEATURE_MASK_AVX512, NULL))
- static_branch_enable(&blake2s_use_avx512);
-
- return 0;
-}
-
-subsys_initcall(blake2s_mod_init);
diff --git a/arch/x86/lib/crypto/chacha-avx2-x86_64.S b/arch/x86/lib/crypto/chacha-avx2-x86_64.S
deleted file mode 100644
index f3d8fc018249..000000000000
--- a/arch/x86/lib/crypto/chacha-avx2-x86_64.S
+++ /dev/null
@@ -1,1021 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.ROT8, "aM", @progbits, 32
-.align 32
-ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
- .octa 0x0e0d0c0f0a09080b0605040702010003
-
-.section .rodata.cst32.ROT16, "aM", @progbits, 32
-.align 32
-ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
- .octa 0x0d0c0f0e09080b0a0504070601000302
-
-.section .rodata.cst32.CTRINC, "aM", @progbits, 32
-.align 32
-CTRINC: .octa 0x00000003000000020000000100000000
- .octa 0x00000007000000060000000500000004
-
-.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
-.align 32
-CTR2BL: .octa 0x00000000000000000000000000000000
- .octa 0x00000000000000000000000000000001
-
-.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
-.align 32
-CTR4BL: .octa 0x00000000000000000000000000000002
- .octa 0x00000000000000000000000000000003
-
-.text
-
-SYM_FUNC_START(chacha_2block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 2 data blocks output, o
- # %rdx: up to 2 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts two ChaCha blocks by loading the state
- # matrix twice across four AVX registers. It performs matrix operations
- # on four words in each matrix in parallel, but requires shuffling to
- # rearrange the words after each round.
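
For reference, the scalar operations that the comments below describe are the standard ChaCha quarter round; the vector code applies them to whole 128-bit rows at once and uses the vpshufd shuffles to switch between the column and diagonal rounds. A compact Python description (illustrative, not derived from this file):

    def rotl32(v, n):
        return ((v << n) | (v >> (32 - n))) & 0xffffffff

    def quarterround(x, a, b, c, d):
        x[a] = (x[a] + x[b]) & 0xffffffff; x[d] = rotl32(x[d] ^ x[a], 16)
        x[c] = (x[c] + x[d]) & 0xffffffff; x[b] = rotl32(x[b] ^ x[c], 12)
        x[a] = (x[a] + x[b]) & 0xffffffff; x[d] = rotl32(x[d] ^ x[a], 8)
        x[c] = (x[c] + x[d]) & 0xffffffff; x[b] = rotl32(x[b] ^ x[c], 7)

    def doubleround(x):                      # x is the 16-word ChaCha state
        quarterround(x, 0, 4,  8, 12); quarterround(x, 1, 5,  9, 13)  # columns
        quarterround(x, 2, 6, 10, 14); quarterround(x, 3, 7, 11, 15)
        quarterround(x, 0, 5, 10, 15); quarterround(x, 1, 6, 11, 12)  # diagonals
        quarterround(x, 2, 7,  8, 13); quarterround(x, 3, 4,  9, 14)
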
-
- vzeroupper
-
- # x0..3[0-2] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
-
- vmovdqa %ymm0,%ymm8
- vmovdqa %ymm1,%ymm9
- vmovdqa %ymm2,%ymm10
- vmovdqa %ymm3,%ymm11
-
- vmovdqa ROT8(%rip),%ymm4
- vmovdqa ROT16(%rip),%ymm5
-
- mov %rcx,%rax
-
-.Ldoubleround:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm5,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm6
- vpslld $12,%ymm6,%ymm6
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm6,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm4,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm7
- vpslld $7,%ymm7,%ymm7
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm5,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm6
- vpslld $12,%ymm6,%ymm6
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm6,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm4,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm7
- vpslld $7,%ymm7,%ymm7
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
-
- sub $2,%r8d
- jnz .Ldoubleround
-
- # o0 = i0 ^ (x0 + s0)
- vpaddd %ymm8,%ymm0,%ymm7
- cmp $0x10,%rax
- jl .Lxorpart2
- vpxor 0x00(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x00(%rsi)
- vextracti128 $1,%ymm7,%xmm0
- # o1 = i1 ^ (x1 + s1)
- vpaddd %ymm9,%ymm1,%ymm7
- cmp $0x20,%rax
- jl .Lxorpart2
- vpxor 0x10(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x10(%rsi)
- vextracti128 $1,%ymm7,%xmm1
- # o2 = i2 ^ (x2 + s2)
- vpaddd %ymm10,%ymm2,%ymm7
- cmp $0x30,%rax
- jl .Lxorpart2
- vpxor 0x20(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x20(%rsi)
- vextracti128 $1,%ymm7,%xmm2
- # o3 = i3 ^ (x3 + s3)
- vpaddd %ymm11,%ymm3,%ymm7
- cmp $0x40,%rax
- jl .Lxorpart2
- vpxor 0x30(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x30(%rsi)
- vextracti128 $1,%ymm7,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm7
- cmp $0x50,%rax
- jl .Lxorpart2
- vpxor 0x40(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm7
- cmp $0x60,%rax
- jl .Lxorpart2
- vpxor 0x50(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm7
- cmp $0x70,%rax
- jl .Lxorpart2
- vpxor 0x60(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm7
- cmp $0x80,%rax
- jl .Lxorpart2
- vpxor 0x70(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x70(%rsi)
-
-.Ldone2:
- vzeroupper
- RET
-
-.Lxorpart2:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone2
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea 8(%rsp),%r10
- sub $0x10,%rsp
- and $~31,%rsp
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- vpxor 0x00(%rsp),%xmm7,%xmm7
- vmovdqa %xmm7,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- lea -8(%r10),%rsp
- jmp .Ldone2
-
-SYM_FUNC_END(chacha_2block_xor_avx2)
-
-SYM_FUNC_START(chacha_4block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts four ChaCha blocks by loading the state
- # matrix four times across eight AVX registers. It performs matrix
-	# operations on four words in two matrices in parallel, followed by the
-	# same operations on the four words of the other two matrices.  The
-	# required word shuffling has a rather high latency, so we can do the
-	# arithmetic on two matrix-pairs without much slowdown.
-
- vzeroupper
-
- # x0..3[0-4] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vmovdqa %ymm0,%ymm4
- vmovdqa %ymm1,%ymm5
- vmovdqa %ymm2,%ymm6
- vmovdqa %ymm3,%ymm7
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
- vpaddd CTR4BL(%rip),%ymm7,%ymm7
-
- vmovdqa %ymm0,%ymm11
- vmovdqa %ymm1,%ymm12
- vmovdqa %ymm2,%ymm13
- vmovdqa %ymm3,%ymm14
- vmovdqa %ymm7,%ymm15
-
- vmovdqa ROT8(%rip),%ymm8
- vmovdqa ROT16(%rip),%ymm9
-
- mov %rcx,%rax
-
-.Ldoubleround4:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm9,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm9,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- vpshufd $0x39,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
- vpshufd $0x93,%ymm7,%ymm7
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm9,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm9,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- vpshufd $0x93,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
- vpshufd $0x39,%ymm7,%ymm7
-
- sub $2,%r8d
- jnz .Ldoubleround4
-
- # o0 = i0 ^ (x0 + s0), first block
- vpaddd %ymm11,%ymm0,%ymm10
- cmp $0x10,%rax
- jl .Lxorpart4
- vpxor 0x00(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x00(%rsi)
- vextracti128 $1,%ymm10,%xmm0
- # o1 = i1 ^ (x1 + s1), first block
- vpaddd %ymm12,%ymm1,%ymm10
- cmp $0x20,%rax
- jl .Lxorpart4
- vpxor 0x10(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x10(%rsi)
- vextracti128 $1,%ymm10,%xmm1
- # o2 = i2 ^ (x2 + s2), first block
- vpaddd %ymm13,%ymm2,%ymm10
- cmp $0x30,%rax
- jl .Lxorpart4
- vpxor 0x20(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x20(%rsi)
- vextracti128 $1,%ymm10,%xmm2
- # o3 = i3 ^ (x3 + s3), first block
- vpaddd %ymm14,%ymm3,%ymm10
- cmp $0x40,%rax
- jl .Lxorpart4
- vpxor 0x30(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x30(%rsi)
- vextracti128 $1,%ymm10,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm10
- cmp $0x50,%rax
- jl .Lxorpart4
- vpxor 0x40(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm10
- cmp $0x60,%rax
- jl .Lxorpart4
- vpxor 0x50(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm10
- cmp $0x70,%rax
- jl .Lxorpart4
- vpxor 0x60(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm10
- cmp $0x80,%rax
- jl .Lxorpart4
- vpxor 0x70(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x70(%rsi)
-
- # o0 = i0 ^ (x0 + s0), third block
- vpaddd %ymm11,%ymm4,%ymm10
- cmp $0x90,%rax
- jl .Lxorpart4
- vpxor 0x80(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x80(%rsi)
- vextracti128 $1,%ymm10,%xmm4
- # o1 = i1 ^ (x1 + s1), third block
- vpaddd %ymm12,%ymm5,%ymm10
- cmp $0xa0,%rax
- jl .Lxorpart4
- vpxor 0x90(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x90(%rsi)
- vextracti128 $1,%ymm10,%xmm5
- # o2 = i2 ^ (x2 + s2), third block
- vpaddd %ymm13,%ymm6,%ymm10
- cmp $0xb0,%rax
- jl .Lxorpart4
- vpxor 0xa0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xa0(%rsi)
- vextracti128 $1,%ymm10,%xmm6
- # o3 = i3 ^ (x3 + s3), third block
- vpaddd %ymm15,%ymm7,%ymm10
- cmp $0xc0,%rax
- jl .Lxorpart4
- vpxor 0xb0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xb0(%rsi)
- vextracti128 $1,%ymm10,%xmm7
-
- # xor and write fourth block
- vmovdqa %xmm4,%xmm10
- cmp $0xd0,%rax
- jl .Lxorpart4
- vpxor 0xc0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xc0(%rsi)
-
- vmovdqa %xmm5,%xmm10
- cmp $0xe0,%rax
- jl .Lxorpart4
- vpxor 0xd0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xd0(%rsi)
-
- vmovdqa %xmm6,%xmm10
- cmp $0xf0,%rax
- jl .Lxorpart4
- vpxor 0xe0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xe0(%rsi)
-
- vmovdqa %xmm7,%xmm10
- cmp $0x100,%rax
- jl .Lxorpart4
- vpxor 0xf0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xf0(%rsi)
-
-.Ldone4:
- vzeroupper
- RET
-
-.Lxorpart4:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone4
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea 8(%rsp),%r10
- sub $0x10,%rsp
- and $~31,%rsp
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- vpxor 0x00(%rsp),%xmm10,%xmm10
- vmovdqa %xmm10,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- lea -8(%r10),%rsp
- jmp .Ldone4
-
-SYM_FUNC_END(chacha_4block_xor_avx2)
-
-SYM_FUNC_START(chacha_8block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 8 data blocks output, o
- # %rdx: up to 8 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts eight consecutive ChaCha blocks by loading
- # the state matrix in AVX registers eight times. As we need some
- # scratch registers, we save the first four registers on the stack. The
- # algorithm performs each operation on the corresponding word of each
-	# state matrix, hence requires no word shuffling.  For the final XORing
-	# step we transpose the matrix by interleaving 32-, 64- and then 128-bit
-	# words, which allows us to do the XOR in AVX registers.  8/16-bit word
-	# rotation is done with the slightly better performing byte shuffling,
-	# while 7/12-bit word rotation uses the traditional shift+OR.
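
The transpose mentioned above is the classic interleave-based matrix transpose; the 4x4 sketch below shows the idea behind the vpunpckldq/vpunpcklqdq sequence (the real code does it on eight 32-bit lanes, with an extra 128-bit interleave at the end). Purely illustrative:

    def unpack_lo32(a, b):   # like punpckldq within one 128-bit lane
        return [a[0], b[0], a[1], b[1]]
    def unpack_hi32(a, b):   # like punpckhdq
        return [a[2], b[2], a[3], b[3]]
    def unpack_lo64(a, b):   # like punpcklqdq (64 bits = a pair of 32-bit words)
        return a[0:2] + b[0:2]
    def unpack_hi64(a, b):   # like punpckhqdq
        return a[2:4] + b[2:4]

    rows = [[4 * r + c for c in range(4)] for r in range(4)]
    t0, t1 = unpack_lo32(rows[0], rows[1]), unpack_hi32(rows[0], rows[1])
    t2, t3 = unpack_lo32(rows[2], rows[3]), unpack_hi32(rows[2], rows[3])
    cols = [unpack_lo64(t0, t2), unpack_hi64(t0, t2),
            unpack_lo64(t1, t3), unpack_hi64(t1, t3)]
    assert cols == [[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]]
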
-
- vzeroupper
- # 4 * 32 byte stack, 32-byte aligned
- lea 8(%rsp),%r10
- and $~31, %rsp
- sub $0x80, %rsp
- mov %rcx,%rax
-
- # x0..15[0-7] = s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpbroadcastd 0x04(%rdi),%ymm1
- vpbroadcastd 0x08(%rdi),%ymm2
- vpbroadcastd 0x0c(%rdi),%ymm3
- vpbroadcastd 0x10(%rdi),%ymm4
- vpbroadcastd 0x14(%rdi),%ymm5
- vpbroadcastd 0x18(%rdi),%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm7
- vpbroadcastd 0x20(%rdi),%ymm8
- vpbroadcastd 0x24(%rdi),%ymm9
- vpbroadcastd 0x28(%rdi),%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm11
- vpbroadcastd 0x30(%rdi),%ymm12
- vpbroadcastd 0x34(%rdi),%ymm13
- vpbroadcastd 0x38(%rdi),%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm15
- # x0..3 on stack
- vmovdqa %ymm0,0x00(%rsp)
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa %ymm3,0x60(%rsp)
-
- vmovdqa CTRINC(%rip),%ymm1
- vmovdqa ROT8(%rip),%ymm2
- vmovdqa ROT16(%rip),%ymm3
-
- # x12 += counter values 0-3
- vpaddd %ymm1,%ymm12,%ymm12
-
-.Ldoubleround8:
- # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- vpaddd 0x00(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm3,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- vpaddd 0x20(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm3,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- vpaddd 0x40(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm3,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- vpaddd 0x60(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm3,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $12,%ymm4,%ymm0
- vpsrld $20,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $12,%ymm5,%ymm0
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $12,%ymm6,%ymm0
- vpsrld $20,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm11,%ymm7,%ymm7
- vpslld $12,%ymm7,%ymm0
- vpsrld $20,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
-
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- vpaddd 0x00(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm2,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- vpaddd 0x20(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm2,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- vpaddd 0x40(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm2,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- vpaddd 0x60(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm2,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm0
- vpsrld $25,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm0
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm0
- vpsrld $25,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm11,%ymm7,%ymm7
- vpslld $7,%ymm7,%ymm0
- vpsrld $25,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- vpaddd 0x00(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm3,%ymm15,%ymm15
-	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- vpaddd 0x20(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm3,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- vpaddd 0x40(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm3,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- vpaddd 0x60(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm3,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxor %ymm10,%ymm5,%ymm5
- vpslld $12,%ymm5,%ymm0
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxor %ymm11,%ymm6,%ymm6
- vpslld $12,%ymm6,%ymm0
- vpsrld $20,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpslld $12,%ymm7,%ymm0
- vpsrld $20,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm9,%ymm4,%ymm4
- vpslld $12,%ymm4,%ymm0
- vpsrld $20,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- vpaddd 0x00(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm2,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- vpaddd 0x20(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm2,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- vpaddd 0x40(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm2,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- vpaddd 0x60(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm2,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxor %ymm10,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm0
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxor %ymm11,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm0
- vpsrld $25,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpslld $7,%ymm7,%ymm0
- vpsrld $25,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm9,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm0
- vpsrld $25,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
-
- sub $2,%r8d
- jnz .Ldoubleround8
-
-	# x0..15[0-7] += s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpaddd 0x00(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpbroadcastd 0x04(%rdi),%ymm0
- vpaddd 0x20(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpbroadcastd 0x08(%rdi),%ymm0
- vpaddd 0x40(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpbroadcastd 0x0c(%rdi),%ymm0
- vpaddd 0x60(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpbroadcastd 0x10(%rdi),%ymm0
- vpaddd %ymm0,%ymm4,%ymm4
- vpbroadcastd 0x14(%rdi),%ymm0
- vpaddd %ymm0,%ymm5,%ymm5
- vpbroadcastd 0x18(%rdi),%ymm0
- vpaddd %ymm0,%ymm6,%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm0
- vpaddd %ymm0,%ymm7,%ymm7
- vpbroadcastd 0x20(%rdi),%ymm0
- vpaddd %ymm0,%ymm8,%ymm8
- vpbroadcastd 0x24(%rdi),%ymm0
- vpaddd %ymm0,%ymm9,%ymm9
- vpbroadcastd 0x28(%rdi),%ymm0
- vpaddd %ymm0,%ymm10,%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm0
- vpaddd %ymm0,%ymm11,%ymm11
- vpbroadcastd 0x30(%rdi),%ymm0
- vpaddd %ymm0,%ymm12,%ymm12
- vpbroadcastd 0x34(%rdi),%ymm0
- vpaddd %ymm0,%ymm13,%ymm13
- vpbroadcastd 0x38(%rdi),%ymm0
- vpaddd %ymm0,%ymm14,%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm0
- vpaddd %ymm0,%ymm15,%ymm15
-
-	# x12 += counter values 0-7
- vpaddd %ymm1,%ymm12,%ymm12
-
- # interleave 32-bit words in state n, n+1
- vmovdqa 0x00(%rsp),%ymm0
- vmovdqa 0x20(%rsp),%ymm1
- vpunpckldq %ymm1,%ymm0,%ymm2
- vpunpckhdq %ymm1,%ymm0,%ymm1
- vmovdqa %ymm2,0x00(%rsp)
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa 0x40(%rsp),%ymm0
- vmovdqa 0x60(%rsp),%ymm1
- vpunpckldq %ymm1,%ymm0,%ymm2
- vpunpckhdq %ymm1,%ymm0,%ymm1
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa %ymm1,0x60(%rsp)
- vmovdqa %ymm4,%ymm0
- vpunpckldq %ymm5,%ymm0,%ymm4
- vpunpckhdq %ymm5,%ymm0,%ymm5
- vmovdqa %ymm6,%ymm0
- vpunpckldq %ymm7,%ymm0,%ymm6
- vpunpckhdq %ymm7,%ymm0,%ymm7
- vmovdqa %ymm8,%ymm0
- vpunpckldq %ymm9,%ymm0,%ymm8
- vpunpckhdq %ymm9,%ymm0,%ymm9
- vmovdqa %ymm10,%ymm0
- vpunpckldq %ymm11,%ymm0,%ymm10
- vpunpckhdq %ymm11,%ymm0,%ymm11
- vmovdqa %ymm12,%ymm0
- vpunpckldq %ymm13,%ymm0,%ymm12
- vpunpckhdq %ymm13,%ymm0,%ymm13
- vmovdqa %ymm14,%ymm0
- vpunpckldq %ymm15,%ymm0,%ymm14
- vpunpckhdq %ymm15,%ymm0,%ymm15
-
- # interleave 64-bit words in state n, n+2
- vmovdqa 0x00(%rsp),%ymm0
- vmovdqa 0x40(%rsp),%ymm2
- vpunpcklqdq %ymm2,%ymm0,%ymm1
- vpunpckhqdq %ymm2,%ymm0,%ymm2
- vmovdqa %ymm1,0x00(%rsp)
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa 0x20(%rsp),%ymm0
- vmovdqa 0x60(%rsp),%ymm2
- vpunpcklqdq %ymm2,%ymm0,%ymm1
- vpunpckhqdq %ymm2,%ymm0,%ymm2
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa %ymm2,0x60(%rsp)
- vmovdqa %ymm4,%ymm0
- vpunpcklqdq %ymm6,%ymm0,%ymm4
- vpunpckhqdq %ymm6,%ymm0,%ymm6
- vmovdqa %ymm5,%ymm0
- vpunpcklqdq %ymm7,%ymm0,%ymm5
- vpunpckhqdq %ymm7,%ymm0,%ymm7
- vmovdqa %ymm8,%ymm0
- vpunpcklqdq %ymm10,%ymm0,%ymm8
- vpunpckhqdq %ymm10,%ymm0,%ymm10
- vmovdqa %ymm9,%ymm0
- vpunpcklqdq %ymm11,%ymm0,%ymm9
- vpunpckhqdq %ymm11,%ymm0,%ymm11
- vmovdqa %ymm12,%ymm0
- vpunpcklqdq %ymm14,%ymm0,%ymm12
- vpunpckhqdq %ymm14,%ymm0,%ymm14
- vmovdqa %ymm13,%ymm0
- vpunpcklqdq %ymm15,%ymm0,%ymm13
- vpunpckhqdq %ymm15,%ymm0,%ymm15
-
- # interleave 128-bit words in state n, n+4
- # xor/write first four blocks
- vmovdqa 0x00(%rsp),%ymm1
- vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
- cmp $0x0020,%rax
- jl .Lxorpart8
- vpxor 0x0000(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0000(%rsi)
- vperm2i128 $0x31,%ymm4,%ymm1,%ymm4
-
- vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
- cmp $0x0040,%rax
- jl .Lxorpart8
- vpxor 0x0020(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0020(%rsi)
- vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
-
- vmovdqa 0x40(%rsp),%ymm1
- vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
- cmp $0x0060,%rax
- jl .Lxorpart8
- vpxor 0x0040(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0040(%rsi)
- vperm2i128 $0x31,%ymm6,%ymm1,%ymm6
-
- vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
- cmp $0x0080,%rax
- jl .Lxorpart8
- vpxor 0x0060(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0060(%rsi)
- vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
-
- vmovdqa 0x20(%rsp),%ymm1
- vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
- cmp $0x00a0,%rax
- jl .Lxorpart8
- vpxor 0x0080(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0080(%rsi)
- vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
-
- vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
- cmp $0x00c0,%rax
- jl .Lxorpart8
- vpxor 0x00a0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x00a0(%rsi)
- vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
-
- vmovdqa 0x60(%rsp),%ymm1
- vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
- cmp $0x00e0,%rax
- jl .Lxorpart8
- vpxor 0x00c0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x00c0(%rsi)
- vperm2i128 $0x31,%ymm7,%ymm1,%ymm7
-
- vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
- cmp $0x0100,%rax
- jl .Lxorpart8
- vpxor 0x00e0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x00e0(%rsi)
- vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
-
- # xor remaining blocks, write to output
- vmovdqa %ymm4,%ymm0
- cmp $0x0120,%rax
- jl .Lxorpart8
- vpxor 0x0100(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0100(%rsi)
-
- vmovdqa %ymm12,%ymm0
- cmp $0x0140,%rax
- jl .Lxorpart8
- vpxor 0x0120(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0120(%rsi)
-
- vmovdqa %ymm6,%ymm0
- cmp $0x0160,%rax
- jl .Lxorpart8
- vpxor 0x0140(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0140(%rsi)
-
- vmovdqa %ymm14,%ymm0
- cmp $0x0180,%rax
- jl .Lxorpart8
- vpxor 0x0160(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0160(%rsi)
-
- vmovdqa %ymm5,%ymm0
- cmp $0x01a0,%rax
- jl .Lxorpart8
- vpxor 0x0180(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0180(%rsi)
-
- vmovdqa %ymm13,%ymm0
- cmp $0x01c0,%rax
- jl .Lxorpart8
- vpxor 0x01a0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x01a0(%rsi)
-
- vmovdqa %ymm7,%ymm0
- cmp $0x01e0,%rax
- jl .Lxorpart8
- vpxor 0x01c0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x01c0(%rsi)
-
- vmovdqa %ymm15,%ymm0
- cmp $0x0200,%rax
- jl .Lxorpart8
- vpxor 0x01e0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x01e0(%rsi)
-
-.Ldone8:
- vzeroupper
- lea -8(%r10),%rsp
- RET
-
-.Lxorpart8:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x1f,%r9
- jz .Ldone8
- and $~0x1f,%rax
-
- mov %rsi,%r11
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- vpxor 0x00(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- jmp .Ldone8
-
-SYM_FUNC_END(chacha_8block_xor_avx2)
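The .Lxorpart8 tail above handles a trailing partial 32-byte chunk by copying the remaining input bytes onto the aligned stack scratch area with rep movsb, XORing the whole 32-byte keystream register %ymm0 against that buffer, and copying only the partial result back to the output. The following is a minimal C sketch of that bounce-buffer idea, illustrative only and not the kernel's code; the function and parameter names are made up.

#include <stdint.h>
#include <string.h>

/* XOR a partial trailing chunk through a scratch buffer, as the
 * .Lxorpart8 tail does with the stack area and rep movsb.
 * tail_len is the remaining length modulo 32. */
static void xor_partial_chunk(uint8_t *dst, const uint8_t *src,
			      const uint8_t keystream[32], size_t tail_len)
{
	uint8_t buf[32] = { 0 };	/* stands in for the stack scratch area */
	size_t i;

	memcpy(buf, src, tail_len);	/* rep movsb: input -> scratch */
	for (i = 0; i < 32; i++)
		buf[i] ^= keystream[i];	/* vpxor against the saved register */
	memcpy(dst, buf, tail_len);	/* rep movsb: scratch -> output */
}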
diff --git a/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S b/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S
deleted file mode 100644
index 259383e1ad44..000000000000
--- a/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S
+++ /dev/null
@@ -1,836 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
- *
- * Copyright (C) 2018 Martin Willi
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
-.align 32
-CTR2BL: .octa 0x00000000000000000000000000000000
- .octa 0x00000000000000000000000000000001
-
-.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
-.align 32
-CTR4BL: .octa 0x00000000000000000000000000000002
- .octa 0x00000000000000000000000000000003
-
-.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
-.align 32
-CTR8BL: .octa 0x00000003000000020000000100000000
- .octa 0x00000007000000060000000500000004
-
-.text
-
-SYM_FUNC_START(chacha_2block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 2 data blocks output, o
- # %rdx: up to 2 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts two ChaCha blocks by loading the state
- # matrix twice across four AVX registers. It performs matrix operations
- # on four words in each matrix in parallel, but requires shuffling to
- # rearrange the words after each round.
-
- vzeroupper
-
-	# x0..3[0-1] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
-
- vmovdqa %ymm0,%ymm8
- vmovdqa %ymm1,%ymm9
- vmovdqa %ymm2,%ymm10
- vmovdqa %ymm3,%ymm11
-
-.Ldoubleround:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
-
- sub $2,%r8d
- jnz .Ldoubleround
-
- # o0 = i0 ^ (x0 + s0)
- vpaddd %ymm8,%ymm0,%ymm7
- cmp $0x10,%rcx
- jl .Lxorpart2
- vpxord 0x00(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x00(%rsi)
- vextracti128 $1,%ymm7,%xmm0
- # o1 = i1 ^ (x1 + s1)
- vpaddd %ymm9,%ymm1,%ymm7
- cmp $0x20,%rcx
- jl .Lxorpart2
- vpxord 0x10(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x10(%rsi)
- vextracti128 $1,%ymm7,%xmm1
- # o2 = i2 ^ (x2 + s2)
- vpaddd %ymm10,%ymm2,%ymm7
- cmp $0x30,%rcx
- jl .Lxorpart2
- vpxord 0x20(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x20(%rsi)
- vextracti128 $1,%ymm7,%xmm2
- # o3 = i3 ^ (x3 + s3)
- vpaddd %ymm11,%ymm3,%ymm7
- cmp $0x40,%rcx
- jl .Lxorpart2
- vpxord 0x30(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x30(%rsi)
- vextracti128 $1,%ymm7,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm7
- cmp $0x50,%rcx
- jl .Lxorpart2
- vpxord 0x40(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm7
- cmp $0x60,%rcx
- jl .Lxorpart2
- vpxord 0x50(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm7
- cmp $0x70,%rcx
- jl .Lxorpart2
- vpxord 0x60(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm7
- cmp $0x80,%rcx
- jl .Lxorpart2
- vpxord 0x70(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x70(%rsi)
-
-.Ldone2:
- vzeroupper
- RET
-
-.Lxorpart2:
- # xor remaining bytes from partial register into output
- mov %rcx,%rax
- and $0xf,%rcx
- jz .Ldone2
- mov %rax,%r9
- and $~0xf,%r9
-
- mov $1,%rax
- shld %cl,%rax,%rax
- sub $1,%rax
- kmovq %rax,%k1
-
- vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
- vpxord %xmm7,%xmm1,%xmm1
- vmovdqu8 %xmm1,(%rsi,%r9){%k1}
-
- jmp .Ldone2
-
-SYM_FUNC_END(chacha_2block_xor_avx512vl)
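Unlike the AVX2 tails, the .Lxorpart2 path above needs no bounce buffer: it builds a byte mask of (1 << (len & 15)) - 1 with the shld/sub sequence, moves it into %k1 with kmovq, and then the byte-masked vmovdqu8 load and store touch only the remaining bytes. A small standalone C sketch of the mask computation follows; it is illustrative only, and the names are hypothetical.

#include <assert.h>
#include <stdint.h>

/* Mask with one set bit per remaining byte of a partial 16-byte chunk,
 * mirroring the mov/shld/sub/kmovq sequence in .Lxorpart2 above. */
static uint64_t tail_byte_mask(uint64_t len)
{
	uint64_t tail = len & 15;	/* bytes left in the last xmm chunk */

	return tail ? (1ULL << tail) - 1 : 0;	/* zero tail is handled earlier */
}

int main(void)
{
	assert(tail_byte_mask(0x25) == 0x1f);	/* 5 trailing bytes -> 5 mask bits */
	assert(tail_byte_mask(0x30) == 0);	/* no partial chunk */
	return 0;
}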
-
-SYM_FUNC_START(chacha_4block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts four ChaCha blocks by loading the state
- # matrix four times across eight AVX registers. It performs matrix
-	# operations on four words in two matrices in parallel, and the same
-	# operations on the other two matrices sequentially after them. As
-	# the required word shuffling has a rather high latency, we can do
-	# the arithmetic on two matrix pairs without much slowdown.
-
- vzeroupper
-
-	# x0..3[0-3] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vmovdqa %ymm0,%ymm4
- vmovdqa %ymm1,%ymm5
- vmovdqa %ymm2,%ymm6
- vmovdqa %ymm3,%ymm7
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
- vpaddd CTR4BL(%rip),%ymm7,%ymm7
-
- vmovdqa %ymm0,%ymm11
- vmovdqa %ymm1,%ymm12
- vmovdqa %ymm2,%ymm13
- vmovdqa %ymm3,%ymm14
- vmovdqa %ymm7,%ymm15
-
-.Ldoubleround4:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $16,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- vpshufd $0x39,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
- vpshufd $0x93,%ymm7,%ymm7
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $16,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- vpshufd $0x93,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
- vpshufd $0x39,%ymm7,%ymm7
-
- sub $2,%r8d
- jnz .Ldoubleround4
-
- # o0 = i0 ^ (x0 + s0), first block
- vpaddd %ymm11,%ymm0,%ymm10
- cmp $0x10,%rcx
- jl .Lxorpart4
- vpxord 0x00(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x00(%rsi)
- vextracti128 $1,%ymm10,%xmm0
- # o1 = i1 ^ (x1 + s1), first block
- vpaddd %ymm12,%ymm1,%ymm10
- cmp $0x20,%rcx
- jl .Lxorpart4
- vpxord 0x10(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x10(%rsi)
- vextracti128 $1,%ymm10,%xmm1
- # o2 = i2 ^ (x2 + s2), first block
- vpaddd %ymm13,%ymm2,%ymm10
- cmp $0x30,%rcx
- jl .Lxorpart4
- vpxord 0x20(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x20(%rsi)
- vextracti128 $1,%ymm10,%xmm2
- # o3 = i3 ^ (x3 + s3), first block
- vpaddd %ymm14,%ymm3,%ymm10
- cmp $0x40,%rcx
- jl .Lxorpart4
- vpxord 0x30(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x30(%rsi)
- vextracti128 $1,%ymm10,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm10
- cmp $0x50,%rcx
- jl .Lxorpart4
- vpxord 0x40(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm10
- cmp $0x60,%rcx
- jl .Lxorpart4
- vpxord 0x50(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm10
- cmp $0x70,%rcx
- jl .Lxorpart4
- vpxord 0x60(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm10
- cmp $0x80,%rcx
- jl .Lxorpart4
- vpxord 0x70(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x70(%rsi)
-
- # o0 = i0 ^ (x0 + s0), third block
- vpaddd %ymm11,%ymm4,%ymm10
- cmp $0x90,%rcx
- jl .Lxorpart4
- vpxord 0x80(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x80(%rsi)
- vextracti128 $1,%ymm10,%xmm4
- # o1 = i1 ^ (x1 + s1), third block
- vpaddd %ymm12,%ymm5,%ymm10
- cmp $0xa0,%rcx
- jl .Lxorpart4
- vpxord 0x90(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x90(%rsi)
- vextracti128 $1,%ymm10,%xmm5
- # o2 = i2 ^ (x2 + s2), third block
- vpaddd %ymm13,%ymm6,%ymm10
- cmp $0xb0,%rcx
- jl .Lxorpart4
- vpxord 0xa0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xa0(%rsi)
- vextracti128 $1,%ymm10,%xmm6
- # o3 = i3 ^ (x3 + s3), third block
- vpaddd %ymm15,%ymm7,%ymm10
- cmp $0xc0,%rcx
- jl .Lxorpart4
- vpxord 0xb0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xb0(%rsi)
- vextracti128 $1,%ymm10,%xmm7
-
- # xor and write fourth block
- vmovdqa %xmm4,%xmm10
- cmp $0xd0,%rcx
- jl .Lxorpart4
- vpxord 0xc0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xc0(%rsi)
-
- vmovdqa %xmm5,%xmm10
- cmp $0xe0,%rcx
- jl .Lxorpart4
- vpxord 0xd0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xd0(%rsi)
-
- vmovdqa %xmm6,%xmm10
- cmp $0xf0,%rcx
- jl .Lxorpart4
- vpxord 0xe0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xe0(%rsi)
-
- vmovdqa %xmm7,%xmm10
- cmp $0x100,%rcx
- jl .Lxorpart4
- vpxord 0xf0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xf0(%rsi)
-
-.Ldone4:
- vzeroupper
- RET
-
-.Lxorpart4:
- # xor remaining bytes from partial register into output
- mov %rcx,%rax
- and $0xf,%rcx
- jz .Ldone4
- mov %rax,%r9
- and $~0xf,%r9
-
- mov $1,%rax
- shld %cl,%rax,%rax
- sub $1,%rax
- kmovq %rax,%k1
-
- vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
- vpxord %xmm10,%xmm1,%xmm1
- vmovdqu8 %xmm1,(%rsi,%r9){%k1}
-
- jmp .Ldone4
-
-SYM_FUNC_END(chacha_4block_xor_avx512vl)
-
-SYM_FUNC_START(chacha_8block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 8 data blocks output, o
- # %rdx: up to 8 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts eight consecutive ChaCha blocks by loading
- # the state matrix in AVX registers eight times. Compared to AVX2, this
- # mostly benefits from the new rotate instructions in VL and the
- # additional registers.
-
- vzeroupper
-
- # x0..15[0-7] = s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpbroadcastd 0x04(%rdi),%ymm1
- vpbroadcastd 0x08(%rdi),%ymm2
- vpbroadcastd 0x0c(%rdi),%ymm3
- vpbroadcastd 0x10(%rdi),%ymm4
- vpbroadcastd 0x14(%rdi),%ymm5
- vpbroadcastd 0x18(%rdi),%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm7
- vpbroadcastd 0x20(%rdi),%ymm8
- vpbroadcastd 0x24(%rdi),%ymm9
- vpbroadcastd 0x28(%rdi),%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm11
- vpbroadcastd 0x30(%rdi),%ymm12
- vpbroadcastd 0x34(%rdi),%ymm13
- vpbroadcastd 0x38(%rdi),%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm15
-
-	# x12 += counter values 0-7
- vpaddd CTR8BL(%rip),%ymm12,%ymm12
-
- vmovdqa64 %ymm0,%ymm16
- vmovdqa64 %ymm1,%ymm17
- vmovdqa64 %ymm2,%ymm18
- vmovdqa64 %ymm3,%ymm19
- vmovdqa64 %ymm4,%ymm20
- vmovdqa64 %ymm5,%ymm21
- vmovdqa64 %ymm6,%ymm22
- vmovdqa64 %ymm7,%ymm23
- vmovdqa64 %ymm8,%ymm24
- vmovdqa64 %ymm9,%ymm25
- vmovdqa64 %ymm10,%ymm26
- vmovdqa64 %ymm11,%ymm27
- vmovdqa64 %ymm12,%ymm28
- vmovdqa64 %ymm13,%ymm29
- vmovdqa64 %ymm14,%ymm30
- vmovdqa64 %ymm15,%ymm31
-
-.Ldoubleround8:
- # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- vpaddd %ymm0,%ymm4,%ymm0
- vpxord %ymm0,%ymm12,%ymm12
- vprold $16,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- vpaddd %ymm1,%ymm5,%ymm1
- vpxord %ymm1,%ymm13,%ymm13
- vprold $16,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- vpaddd %ymm2,%ymm6,%ymm2
- vpxord %ymm2,%ymm14,%ymm14
- vprold $16,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- vpaddd %ymm3,%ymm7,%ymm3
- vpxord %ymm3,%ymm15,%ymm15
- vprold $16,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxord %ymm8,%ymm4,%ymm4
- vprold $12,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxord %ymm9,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxord %ymm10,%ymm6,%ymm6
- vprold $12,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxord %ymm11,%ymm7,%ymm7
- vprold $12,%ymm7,%ymm7
-
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- vpaddd %ymm0,%ymm4,%ymm0
- vpxord %ymm0,%ymm12,%ymm12
- vprold $8,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- vpaddd %ymm1,%ymm5,%ymm1
- vpxord %ymm1,%ymm13,%ymm13
- vprold $8,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- vpaddd %ymm2,%ymm6,%ymm2
- vpxord %ymm2,%ymm14,%ymm14
- vprold $8,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- vpaddd %ymm3,%ymm7,%ymm3
- vpxord %ymm3,%ymm15,%ymm15
- vprold $8,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxord %ymm8,%ymm4,%ymm4
- vprold $7,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxord %ymm9,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxord %ymm10,%ymm6,%ymm6
- vprold $7,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxord %ymm11,%ymm7,%ymm7
- vprold $7,%ymm7,%ymm7
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- vpaddd %ymm0,%ymm5,%ymm0
- vpxord %ymm0,%ymm15,%ymm15
- vprold $16,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- vpaddd %ymm1,%ymm6,%ymm1
- vpxord %ymm1,%ymm12,%ymm12
- vprold $16,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- vpaddd %ymm2,%ymm7,%ymm2
- vpxord %ymm2,%ymm13,%ymm13
- vprold $16,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- vpaddd %ymm3,%ymm4,%ymm3
- vpxord %ymm3,%ymm14,%ymm14
- vprold $16,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxord %ymm10,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxord %ymm11,%ymm6,%ymm6
- vprold $12,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxord %ymm8,%ymm7,%ymm7
- vprold $12,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxord %ymm9,%ymm4,%ymm4
- vprold $12,%ymm4,%ymm4
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- vpaddd %ymm0,%ymm5,%ymm0
- vpxord %ymm0,%ymm15,%ymm15
- vprold $8,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- vpaddd %ymm1,%ymm6,%ymm1
- vpxord %ymm1,%ymm12,%ymm12
- vprold $8,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- vpaddd %ymm2,%ymm7,%ymm2
- vpxord %ymm2,%ymm13,%ymm13
- vprold $8,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- vpaddd %ymm3,%ymm4,%ymm3
- vpxord %ymm3,%ymm14,%ymm14
- vprold $8,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxord %ymm10,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxord %ymm11,%ymm6,%ymm6
- vprold $7,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxord %ymm8,%ymm7,%ymm7
- vprold $7,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxord %ymm9,%ymm4,%ymm4
- vprold $7,%ymm4,%ymm4
-
- sub $2,%r8d
- jnz .Ldoubleround8
-
-	# x0..15[0-7] += s[0..15]
- vpaddd %ymm16,%ymm0,%ymm0
- vpaddd %ymm17,%ymm1,%ymm1
- vpaddd %ymm18,%ymm2,%ymm2
- vpaddd %ymm19,%ymm3,%ymm3
- vpaddd %ymm20,%ymm4,%ymm4
- vpaddd %ymm21,%ymm5,%ymm5
- vpaddd %ymm22,%ymm6,%ymm6
- vpaddd %ymm23,%ymm7,%ymm7
- vpaddd %ymm24,%ymm8,%ymm8
- vpaddd %ymm25,%ymm9,%ymm9
- vpaddd %ymm26,%ymm10,%ymm10
- vpaddd %ymm27,%ymm11,%ymm11
- vpaddd %ymm28,%ymm12,%ymm12
- vpaddd %ymm29,%ymm13,%ymm13
- vpaddd %ymm30,%ymm14,%ymm14
- vpaddd %ymm31,%ymm15,%ymm15
-
- # interleave 32-bit words in state n, n+1
- vpunpckldq %ymm1,%ymm0,%ymm16
- vpunpckhdq %ymm1,%ymm0,%ymm17
- vpunpckldq %ymm3,%ymm2,%ymm18
- vpunpckhdq %ymm3,%ymm2,%ymm19
- vpunpckldq %ymm5,%ymm4,%ymm20
- vpunpckhdq %ymm5,%ymm4,%ymm21
- vpunpckldq %ymm7,%ymm6,%ymm22
- vpunpckhdq %ymm7,%ymm6,%ymm23
- vpunpckldq %ymm9,%ymm8,%ymm24
- vpunpckhdq %ymm9,%ymm8,%ymm25
- vpunpckldq %ymm11,%ymm10,%ymm26
- vpunpckhdq %ymm11,%ymm10,%ymm27
- vpunpckldq %ymm13,%ymm12,%ymm28
- vpunpckhdq %ymm13,%ymm12,%ymm29
- vpunpckldq %ymm15,%ymm14,%ymm30
- vpunpckhdq %ymm15,%ymm14,%ymm31
-
- # interleave 64-bit words in state n, n+2
- vpunpcklqdq %ymm18,%ymm16,%ymm0
- vpunpcklqdq %ymm19,%ymm17,%ymm1
- vpunpckhqdq %ymm18,%ymm16,%ymm2
- vpunpckhqdq %ymm19,%ymm17,%ymm3
- vpunpcklqdq %ymm22,%ymm20,%ymm4
- vpunpcklqdq %ymm23,%ymm21,%ymm5
- vpunpckhqdq %ymm22,%ymm20,%ymm6
- vpunpckhqdq %ymm23,%ymm21,%ymm7
- vpunpcklqdq %ymm26,%ymm24,%ymm8
- vpunpcklqdq %ymm27,%ymm25,%ymm9
- vpunpckhqdq %ymm26,%ymm24,%ymm10
- vpunpckhqdq %ymm27,%ymm25,%ymm11
- vpunpcklqdq %ymm30,%ymm28,%ymm12
- vpunpcklqdq %ymm31,%ymm29,%ymm13
- vpunpckhqdq %ymm30,%ymm28,%ymm14
- vpunpckhqdq %ymm31,%ymm29,%ymm15
-
- # interleave 128-bit words in state n, n+4
- # xor/write first four blocks
- vmovdqa64 %ymm0,%ymm16
- vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
- cmp $0x0020,%rcx
- jl .Lxorpart8
- vpxord 0x0000(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0000(%rsi)
- vmovdqa64 %ymm16,%ymm0
- vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
-
- vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
- cmp $0x0040,%rcx
- jl .Lxorpart8
- vpxord 0x0020(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0020(%rsi)
- vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
-
- vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
- cmp $0x0060,%rcx
- jl .Lxorpart8
- vpxord 0x0040(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0040(%rsi)
- vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
-
- vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
- cmp $0x0080,%rcx
- jl .Lxorpart8
- vpxord 0x0060(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0060(%rsi)
- vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
-
- vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
- cmp $0x00a0,%rcx
- jl .Lxorpart8
- vpxord 0x0080(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0080(%rsi)
- vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
-
- vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
- cmp $0x00c0,%rcx
- jl .Lxorpart8
- vpxord 0x00a0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x00a0(%rsi)
- vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
-
- vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
- cmp $0x00e0,%rcx
- jl .Lxorpart8
- vpxord 0x00c0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x00c0(%rsi)
- vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
-
- vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
- cmp $0x0100,%rcx
- jl .Lxorpart8
- vpxord 0x00e0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x00e0(%rsi)
- vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
-
- # xor remaining blocks, write to output
- vmovdqa64 %ymm4,%ymm0
- cmp $0x0120,%rcx
- jl .Lxorpart8
- vpxord 0x0100(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0100(%rsi)
-
- vmovdqa64 %ymm12,%ymm0
- cmp $0x0140,%rcx
- jl .Lxorpart8
- vpxord 0x0120(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0120(%rsi)
-
- vmovdqa64 %ymm6,%ymm0
- cmp $0x0160,%rcx
- jl .Lxorpart8
- vpxord 0x0140(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0140(%rsi)
-
- vmovdqa64 %ymm14,%ymm0
- cmp $0x0180,%rcx
- jl .Lxorpart8
- vpxord 0x0160(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0160(%rsi)
-
- vmovdqa64 %ymm5,%ymm0
- cmp $0x01a0,%rcx
- jl .Lxorpart8
- vpxord 0x0180(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0180(%rsi)
-
- vmovdqa64 %ymm13,%ymm0
- cmp $0x01c0,%rcx
- jl .Lxorpart8
- vpxord 0x01a0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x01a0(%rsi)
-
- vmovdqa64 %ymm7,%ymm0
- cmp $0x01e0,%rcx
- jl .Lxorpart8
- vpxord 0x01c0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x01c0(%rsi)
-
- vmovdqa64 %ymm15,%ymm0
- cmp $0x0200,%rcx
- jl .Lxorpart8
- vpxord 0x01e0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x01e0(%rsi)
-
-.Ldone8:
- vzeroupper
- RET
-
-.Lxorpart8:
- # xor remaining bytes from partial register into output
- mov %rcx,%rax
- and $0x1f,%rcx
- jz .Ldone8
- mov %rax,%r9
- and $~0x1f,%r9
-
- mov $1,%rax
- shld %cl,%rax,%rax
- sub $1,%rax
- kmovq %rax,%k1
-
- vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
- vpxord %ymm0,%ymm1,%ymm1
- vmovdqu8 %ymm1,(%rsi,%r9){%k1}
-
- jmp .Ldone8
-
-SYM_FUNC_END(chacha_8block_xor_avx512vl)
diff --git a/arch/x86/lib/crypto/chacha-ssse3-x86_64.S b/arch/x86/lib/crypto/chacha-ssse3-x86_64.S
deleted file mode 100644
index 7111949cd5b9..000000000000
--- a/arch/x86/lib/crypto/chacha-ssse3-x86_64.S
+++ /dev/null
@@ -1,791 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-.section .rodata.cst16.ROT8, "aM", @progbits, 16
-.align 16
-ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
-.section .rodata.cst16.ROT16, "aM", @progbits, 16
-.align 16
-ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
-.section .rodata.cst16.CTRINC, "aM", @progbits, 16
-.align 16
-CTRINC: .octa 0x00000003000000020000000100000000
-
-.text
-
-/*
- * chacha_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
- * function performs matrix operations on four words in parallel, but requires
- * shuffling to rearrange the words after each round. 8/16-bit word rotation is
- * done with the slightly better performing SSSE3 byte shuffling, while
- * 7/12-bit word rotation uses the traditional shift+OR.
- *
- * The round count is given in %r8d.
- *
- * Clobbers: %r8d, %xmm4-%xmm7
- */
-SYM_FUNC_START_LOCAL(chacha_permute)
-
- movdqa ROT8(%rip),%xmm4
- movdqa ROT16(%rip),%xmm5
-
-.Ldoubleround:
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm5,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm6
- pslld $12,%xmm6
- psrld $20,%xmm1
- por %xmm6,%xmm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm4,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm7
- pslld $7,%xmm7
- psrld $25,%xmm1
- por %xmm7,%xmm1
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- pshufd $0x39,%xmm1,%xmm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- pshufd $0x4e,%xmm2,%xmm2
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- pshufd $0x93,%xmm3,%xmm3
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm5,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm6
- pslld $12,%xmm6
- psrld $20,%xmm1
- por %xmm6,%xmm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm4,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm7
- pslld $7,%xmm7
- psrld $25,%xmm1
- por %xmm7,%xmm1
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- pshufd $0x93,%xmm1,%xmm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- pshufd $0x4e,%xmm2,%xmm2
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- pshufd $0x39,%xmm3,%xmm3
-
- sub $2,%r8d
- jnz .Ldoubleround
-
- RET
-SYM_FUNC_END(chacha_permute)
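For reference, the operation chacha_permute evaluates four lanes at a time is the standard ChaCha quarter round, sketched below in scalar C (not kernel code). The 16- and 8-bit rotations correspond to the pshufb shuffles with ROT16 and ROT8, while the 12- and 7-bit rotations correspond to the pslld/psrld/por sequences.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* One ChaCha quarter round on scalar words; the SSSE3 code above runs
 * four of these in parallel, one per 32-bit lane of an xmm register. */
static void chacha_quarter_round(uint32_t *a, uint32_t *b,
				 uint32_t *c, uint32_t *d)
{
	*a += *b; *d = rotl32(*d ^ *a, 16);	/* pshufb with ROT16 */
	*c += *d; *b = rotl32(*b ^ *c, 12);	/* pslld/psrld/por */
	*a += *b; *d = rotl32(*d ^ *a, 8);	/* pshufb with ROT8 */
	*c += *d; *b = rotl32(*b ^ *c, 7);	/* pslld/psrld/por */
}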
-
-SYM_FUNC_START(chacha_block_xor_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: up to 1 data block output, o
- # %rdx: up to 1 data block input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
- FRAME_BEGIN
-
- # x0..3 = s0..3
- movdqu 0x00(%rdi),%xmm0
- movdqu 0x10(%rdi),%xmm1
- movdqu 0x20(%rdi),%xmm2
- movdqu 0x30(%rdi),%xmm3
- movdqa %xmm0,%xmm8
- movdqa %xmm1,%xmm9
- movdqa %xmm2,%xmm10
- movdqa %xmm3,%xmm11
-
- mov %rcx,%rax
- call chacha_permute
-
- # o0 = i0 ^ (x0 + s0)
- paddd %xmm8,%xmm0
- cmp $0x10,%rax
- jl .Lxorpart
- movdqu 0x00(%rdx),%xmm4
- pxor %xmm4,%xmm0
- movdqu %xmm0,0x00(%rsi)
- # o1 = i1 ^ (x1 + s1)
- paddd %xmm9,%xmm1
- movdqa %xmm1,%xmm0
- cmp $0x20,%rax
- jl .Lxorpart
- movdqu 0x10(%rdx),%xmm0
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x10(%rsi)
- # o2 = i2 ^ (x2 + s2)
- paddd %xmm10,%xmm2
- movdqa %xmm2,%xmm0
- cmp $0x30,%rax
- jl .Lxorpart
- movdqu 0x20(%rdx),%xmm0
- pxor %xmm2,%xmm0
- movdqu %xmm0,0x20(%rsi)
- # o3 = i3 ^ (x3 + s3)
- paddd %xmm11,%xmm3
- movdqa %xmm3,%xmm0
- cmp $0x40,%rax
- jl .Lxorpart
- movdqu 0x30(%rdx),%xmm0
- pxor %xmm3,%xmm0
- movdqu %xmm0,0x30(%rsi)
-
-.Ldone:
- FRAME_END
- RET
-
-.Lxorpart:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea 8(%rsp),%r10
- sub $0x10,%rsp
- and $~31,%rsp
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- pxor 0x00(%rsp),%xmm0
- movdqa %xmm0,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- lea -8(%r10),%rsp
- jmp .Ldone
-
-SYM_FUNC_END(chacha_block_xor_ssse3)
-
-SYM_FUNC_START(hchacha_block_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: output (8 32-bit words)
- # %edx: nrounds
- FRAME_BEGIN
-
- movdqu 0x00(%rdi),%xmm0
- movdqu 0x10(%rdi),%xmm1
- movdqu 0x20(%rdi),%xmm2
- movdqu 0x30(%rdi),%xmm3
-
- mov %edx,%r8d
- call chacha_permute
-
- movdqu %xmm0,0x00(%rsi)
- movdqu %xmm3,0x10(%rsi)
-
- FRAME_END
- RET
-SYM_FUNC_END(hchacha_block_ssse3)
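As the two stores above show, hchacha_block_ssse3 runs the same permutation but emits only the first and last rows of the permuted state and performs no feed-forward addition; this is the HChaCha construction used for XChaCha key derivation. A tiny C sketch of the output selection, hedged and assuming the usual 16-word state layout:

#include <stdint.h>
#include <string.h>

/* What the two movdqu stores above produce: HChaCha keeps only rows 0
 * and 3 of the permuted state and skips the feed-forward addition. */
static void hchacha_extract(const uint32_t permuted[16], uint32_t out[8])
{
	memcpy(&out[0], &permuted[0], 4 * sizeof(uint32_t));	/* xmm0: words 0..3 */
	memcpy(&out[4], &permuted[12], 4 * sizeof(uint32_t));	/* xmm3: words 12..15 */
}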
-
-SYM_FUNC_START(chacha_4block_xor_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
-	# This function encrypts four consecutive ChaCha blocks by loading
-	# the state matrix in SSE registers four times. As we need some
-	# scratch registers, we save the first four registers on the stack.
-	# The algorithm performs each operation on the corresponding word of
-	# each state matrix, hence requires no word shuffling. For the final
-	# XOR step we transpose the matrix by interleaving 32- and then
-	# 64-bit words, which allows us to do the XOR in SSE registers.
-	# 8/16-bit word rotation is done with the slightly better performing
-	# SSSE3 byte shuffling, while 7/12-bit word rotation uses the
-	# traditional shift+OR.
-
- lea 8(%rsp),%r10
- sub $0x80,%rsp
- and $~63,%rsp
- mov %rcx,%rax
-
- # x0..15[0-3] = s0..3[0..3]
- movq 0x00(%rdi),%xmm1
- pshufd $0x00,%xmm1,%xmm0
- pshufd $0x55,%xmm1,%xmm1
- movq 0x08(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- movq 0x10(%rdi),%xmm5
- pshufd $0x00,%xmm5,%xmm4
- pshufd $0x55,%xmm5,%xmm5
- movq 0x18(%rdi),%xmm7
- pshufd $0x00,%xmm7,%xmm6
- pshufd $0x55,%xmm7,%xmm7
- movq 0x20(%rdi),%xmm9
- pshufd $0x00,%xmm9,%xmm8
- pshufd $0x55,%xmm9,%xmm9
- movq 0x28(%rdi),%xmm11
- pshufd $0x00,%xmm11,%xmm10
- pshufd $0x55,%xmm11,%xmm11
- movq 0x30(%rdi),%xmm13
- pshufd $0x00,%xmm13,%xmm12
- pshufd $0x55,%xmm13,%xmm13
- movq 0x38(%rdi),%xmm15
- pshufd $0x00,%xmm15,%xmm14
- pshufd $0x55,%xmm15,%xmm15
- # x0..3 on stack
- movdqa %xmm0,0x00(%rsp)
- movdqa %xmm1,0x10(%rsp)
- movdqa %xmm2,0x20(%rsp)
- movdqa %xmm3,0x30(%rsp)
-
- movdqa CTRINC(%rip),%xmm1
- movdqa ROT8(%rip),%xmm2
- movdqa ROT16(%rip),%xmm3
-
- # x12 += counter values 0-3
- paddd %xmm1,%xmm12
-
-.Ldoubleround4:
- # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm3,%xmm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm3,%xmm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm3,%xmm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm3,%xmm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm4
- por %xmm0,%xmm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm5
- por %xmm0,%xmm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm6
- por %xmm0,%xmm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- paddd %xmm15,%xmm11
- pxor %xmm11,%xmm7
- movdqa %xmm7,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm7
- por %xmm0,%xmm7
-
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm2,%xmm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm2,%xmm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm2,%xmm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm2,%xmm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm4
- por %xmm0,%xmm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm5
- por %xmm0,%xmm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm6
- por %xmm0,%xmm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- paddd %xmm15,%xmm11
- pxor %xmm11,%xmm7
- movdqa %xmm7,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm7
- por %xmm0,%xmm7
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm3,%xmm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm3,%xmm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm3,%xmm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm3,%xmm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- paddd %xmm15,%xmm10
- pxor %xmm10,%xmm5
- movdqa %xmm5,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm5
- por %xmm0,%xmm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- paddd %xmm12,%xmm11
- pxor %xmm11,%xmm6
- movdqa %xmm6,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm6
- por %xmm0,%xmm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- paddd %xmm13,%xmm8
- pxor %xmm8,%xmm7
- movdqa %xmm7,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm7
- por %xmm0,%xmm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- paddd %xmm14,%xmm9
- pxor %xmm9,%xmm4
- movdqa %xmm4,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm4
- por %xmm0,%xmm4
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm2,%xmm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm2,%xmm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm2,%xmm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm2,%xmm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- paddd %xmm15,%xmm10
- pxor %xmm10,%xmm5
- movdqa %xmm5,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm5
- por %xmm0,%xmm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- paddd %xmm12,%xmm11
- pxor %xmm11,%xmm6
- movdqa %xmm6,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm6
- por %xmm0,%xmm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- paddd %xmm13,%xmm8
- pxor %xmm8,%xmm7
- movdqa %xmm7,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm7
- por %xmm0,%xmm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- paddd %xmm14,%xmm9
- pxor %xmm9,%xmm4
- movdqa %xmm4,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm4
- por %xmm0,%xmm4
-
- sub $2,%r8d
- jnz .Ldoubleround4
-
- # x0[0-3] += s0[0]
- # x1[0-3] += s0[1]
- movq 0x00(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd 0x00(%rsp),%xmm2
- movdqa %xmm2,0x00(%rsp)
- paddd 0x10(%rsp),%xmm3
- movdqa %xmm3,0x10(%rsp)
- # x2[0-3] += s0[2]
- # x3[0-3] += s0[3]
- movq 0x08(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd 0x20(%rsp),%xmm2
- movdqa %xmm2,0x20(%rsp)
- paddd 0x30(%rsp),%xmm3
- movdqa %xmm3,0x30(%rsp)
-
- # x4[0-3] += s1[0]
- # x5[0-3] += s1[1]
- movq 0x10(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm4
- paddd %xmm3,%xmm5
- # x6[0-3] += s1[2]
- # x7[0-3] += s1[3]
- movq 0x18(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm6
- paddd %xmm3,%xmm7
-
- # x8[0-3] += s2[0]
- # x9[0-3] += s2[1]
- movq 0x20(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm8
- paddd %xmm3,%xmm9
- # x10[0-3] += s2[2]
- # x11[0-3] += s2[3]
- movq 0x28(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm10
- paddd %xmm3,%xmm11
-
- # x12[0-3] += s3[0]
- # x13[0-3] += s3[1]
- movq 0x30(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm12
- paddd %xmm3,%xmm13
- # x14[0-3] += s3[2]
- # x15[0-3] += s3[3]
- movq 0x38(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm14
- paddd %xmm3,%xmm15
-
- # x12 += counter values 0-3
- paddd %xmm1,%xmm12
-
- # interleave 32-bit words in state n, n+1
- movdqa 0x00(%rsp),%xmm0
- movdqa 0x10(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpckldq %xmm1,%xmm2
- punpckhdq %xmm1,%xmm0
- movdqa %xmm2,0x00(%rsp)
- movdqa %xmm0,0x10(%rsp)
- movdqa 0x20(%rsp),%xmm0
- movdqa 0x30(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpckldq %xmm1,%xmm2
- punpckhdq %xmm1,%xmm0
- movdqa %xmm2,0x20(%rsp)
- movdqa %xmm0,0x30(%rsp)
- movdqa %xmm4,%xmm0
- punpckldq %xmm5,%xmm4
- punpckhdq %xmm5,%xmm0
- movdqa %xmm0,%xmm5
- movdqa %xmm6,%xmm0
- punpckldq %xmm7,%xmm6
- punpckhdq %xmm7,%xmm0
- movdqa %xmm0,%xmm7
- movdqa %xmm8,%xmm0
- punpckldq %xmm9,%xmm8
- punpckhdq %xmm9,%xmm0
- movdqa %xmm0,%xmm9
- movdqa %xmm10,%xmm0
- punpckldq %xmm11,%xmm10
- punpckhdq %xmm11,%xmm0
- movdqa %xmm0,%xmm11
- movdqa %xmm12,%xmm0
- punpckldq %xmm13,%xmm12
- punpckhdq %xmm13,%xmm0
- movdqa %xmm0,%xmm13
- movdqa %xmm14,%xmm0
- punpckldq %xmm15,%xmm14
- punpckhdq %xmm15,%xmm0
- movdqa %xmm0,%xmm15
-
- # interleave 64-bit words in state n, n+2
- movdqa 0x00(%rsp),%xmm0
- movdqa 0x20(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpcklqdq %xmm1,%xmm2
- punpckhqdq %xmm1,%xmm0
- movdqa %xmm2,0x00(%rsp)
- movdqa %xmm0,0x20(%rsp)
- movdqa 0x10(%rsp),%xmm0
- movdqa 0x30(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpcklqdq %xmm1,%xmm2
- punpckhqdq %xmm1,%xmm0
- movdqa %xmm2,0x10(%rsp)
- movdqa %xmm0,0x30(%rsp)
- movdqa %xmm4,%xmm0
- punpcklqdq %xmm6,%xmm4
- punpckhqdq %xmm6,%xmm0
- movdqa %xmm0,%xmm6
- movdqa %xmm5,%xmm0
- punpcklqdq %xmm7,%xmm5
- punpckhqdq %xmm7,%xmm0
- movdqa %xmm0,%xmm7
- movdqa %xmm8,%xmm0
- punpcklqdq %xmm10,%xmm8
- punpckhqdq %xmm10,%xmm0
- movdqa %xmm0,%xmm10
- movdqa %xmm9,%xmm0
- punpcklqdq %xmm11,%xmm9
- punpckhqdq %xmm11,%xmm0
- movdqa %xmm0,%xmm11
- movdqa %xmm12,%xmm0
- punpcklqdq %xmm14,%xmm12
- punpckhqdq %xmm14,%xmm0
- movdqa %xmm0,%xmm14
- movdqa %xmm13,%xmm0
- punpcklqdq %xmm15,%xmm13
- punpckhqdq %xmm15,%xmm0
- movdqa %xmm0,%xmm15
-
- # xor with corresponding input, write to output
- movdqa 0x00(%rsp),%xmm0
- cmp $0x10,%rax
- jl .Lxorpart4
- movdqu 0x00(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x00(%rsi)
-
- movdqu %xmm4,%xmm0
- cmp $0x20,%rax
- jl .Lxorpart4
- movdqu 0x10(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x10(%rsi)
-
- movdqu %xmm8,%xmm0
- cmp $0x30,%rax
- jl .Lxorpart4
- movdqu 0x20(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x20(%rsi)
-
- movdqu %xmm12,%xmm0
- cmp $0x40,%rax
- jl .Lxorpart4
- movdqu 0x30(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x30(%rsi)
-
- movdqa 0x20(%rsp),%xmm0
- cmp $0x50,%rax
- jl .Lxorpart4
- movdqu 0x40(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x40(%rsi)
-
- movdqu %xmm6,%xmm0
- cmp $0x60,%rax
- jl .Lxorpart4
- movdqu 0x50(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x50(%rsi)
-
- movdqu %xmm10,%xmm0
- cmp $0x70,%rax
- jl .Lxorpart4
- movdqu 0x60(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x60(%rsi)
-
- movdqu %xmm14,%xmm0
- cmp $0x80,%rax
- jl .Lxorpart4
- movdqu 0x70(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x70(%rsi)
-
- movdqa 0x10(%rsp),%xmm0
- cmp $0x90,%rax
- jl .Lxorpart4
- movdqu 0x80(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x80(%rsi)
-
- movdqu %xmm5,%xmm0
- cmp $0xa0,%rax
- jl .Lxorpart4
- movdqu 0x90(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x90(%rsi)
-
- movdqu %xmm9,%xmm0
- cmp $0xb0,%rax
- jl .Lxorpart4
- movdqu 0xa0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xa0(%rsi)
-
- movdqu %xmm13,%xmm0
- cmp $0xc0,%rax
- jl .Lxorpart4
- movdqu 0xb0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xb0(%rsi)
-
- movdqa 0x30(%rsp),%xmm0
- cmp $0xd0,%rax
- jl .Lxorpart4
- movdqu 0xc0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xc0(%rsi)
-
- movdqu %xmm7,%xmm0
- cmp $0xe0,%rax
- jl .Lxorpart4
- movdqu 0xd0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xd0(%rsi)
-
- movdqu %xmm11,%xmm0
- cmp $0xf0,%rax
- jl .Lxorpart4
- movdqu 0xe0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xe0(%rsi)
-
- movdqu %xmm15,%xmm0
- cmp $0x100,%rax
- jl .Lxorpart4
- movdqu 0xf0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xf0(%rsi)
-
-.Ldone4:
- lea -8(%r10),%rsp
- RET
-
-.Lxorpart4:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone4
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- pxor 0x00(%rsp),%xmm0
- movdqa %xmm0,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- jmp .Ldone4
-
-SYM_FUNC_END(chacha_4block_xor_ssse3)
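The "interleave 32-bit words" and "interleave 64-bit words" passes above transpose the per-block state words spread across registers so that each register ends up holding one contiguous output block. Below is a small C sketch of the same two-pass transpose on a plain 4x4 matrix; it is illustrative only, and the helper names merely mimic punpckldq/punpckhdq and punpcklqdq/punpckhqdq.

#include <stdint.h>

typedef struct { uint32_t w[4]; } vec4;

static vec4 unpacklo32(vec4 a, vec4 b)	/* punpckldq */
{
	return (vec4){ { a.w[0], b.w[0], a.w[1], b.w[1] } };
}

static vec4 unpackhi32(vec4 a, vec4 b)	/* punpckhdq */
{
	return (vec4){ { a.w[2], b.w[2], a.w[3], b.w[3] } };
}

static vec4 unpacklo64(vec4 a, vec4 b)	/* punpcklqdq */
{
	return (vec4){ { a.w[0], a.w[1], b.w[0], b.w[1] } };
}

static vec4 unpackhi64(vec4 a, vec4 b)	/* punpckhqdq */
{
	return (vec4){ { a.w[2], a.w[3], b.w[2], b.w[3] } };
}

/* Transpose rows r0..r3 so column i of the input becomes row i of the
 * output, exactly like the two interleave passes in the code above. */
static void transpose4x4(vec4 r[4])
{
	vec4 t0 = unpacklo32(r[0], r[1]);
	vec4 t1 = unpackhi32(r[0], r[1]);
	vec4 t2 = unpacklo32(r[2], r[3]);
	vec4 t3 = unpackhi32(r[2], r[3]);

	r[0] = unpacklo64(t0, t2);
	r[1] = unpackhi64(t0, t2);
	r[2] = unpacklo64(t1, t3);
	r[3] = unpackhi64(t1, t3);
}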
diff --git a/arch/x86/lib/crypto/chacha_glue.c b/arch/x86/lib/crypto/chacha_glue.c
deleted file mode 100644
index 10b2c945f541..000000000000
--- a/arch/x86/lib/crypto/chacha_glue.c
+++ /dev/null
@@ -1,196 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * ChaCha and HChaCha functions (x86_64 optimized)
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <asm/simd.h>
-#include <crypto/chacha.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-
-asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void hchacha_block_ssse3(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
-
-asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-
-asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
-
-static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
-{
- len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
- return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
-}
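chacha_advance() returns how far the block counter in state->x[12] must move after one of the multi-block calls consumed a possibly partial tail: the length is capped at maxblocks full blocks and then rounded up to whole blocks. A quick standalone restatement with a few worked values, assuming the usual CHACHA_BLOCK_SIZE of 64 bytes (not kernel code):

#include <assert.h>

#define CHACHA_BLOCK_SIZE 64	/* assumption: standard ChaCha block size */

static unsigned int advance(unsigned int len, unsigned int maxblocks)
{
	if (len > maxblocks * CHACHA_BLOCK_SIZE)
		len = maxblocks * CHACHA_BLOCK_SIZE;
	return (len + CHACHA_BLOCK_SIZE - 1) / CHACHA_BLOCK_SIZE;
}

int main(void)
{
	assert(advance(100, 4) == 2);	/* 100 bytes -> two blocks consumed */
	assert(advance(300, 4) == 4);	/* capped at maxblocks */
	assert(advance(64, 2) == 1);	/* exact block boundary */
	return 0;
}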
-
-static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (static_branch_likely(&chacha_use_avx512vl)) {
- while (bytes >= CHACHA_BLOCK_SIZE * 8) {
- chacha_8block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 8;
- src += CHACHA_BLOCK_SIZE * 8;
- dst += CHACHA_BLOCK_SIZE * 8;
- state->x[12] += 8;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 4) {
- chacha_8block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- state->x[12] += chacha_advance(bytes, 8);
- return;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 2) {
- chacha_4block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- state->x[12] += chacha_advance(bytes, 4);
- return;
- }
- if (bytes) {
- chacha_2block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- state->x[12] += chacha_advance(bytes, 2);
- return;
- }
- }
-
- if (static_branch_likely(&chacha_use_avx2)) {
- while (bytes >= CHACHA_BLOCK_SIZE * 8) {
- chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 8;
- src += CHACHA_BLOCK_SIZE * 8;
- dst += CHACHA_BLOCK_SIZE * 8;
- state->x[12] += 8;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 4) {
- chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
- state->x[12] += chacha_advance(bytes, 8);
- return;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 2) {
- chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
- state->x[12] += chacha_advance(bytes, 4);
- return;
- }
- if (bytes > CHACHA_BLOCK_SIZE) {
- chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
- state->x[12] += chacha_advance(bytes, 2);
- return;
- }
- }
-
- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
- chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 4;
- src += CHACHA_BLOCK_SIZE * 4;
- dst += CHACHA_BLOCK_SIZE * 4;
- state->x[12] += 4;
- }
- if (bytes > CHACHA_BLOCK_SIZE) {
- chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
- state->x[12] += chacha_advance(bytes, 4);
- return;
- }
- if (bytes) {
- chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
- state->x[12]++;
- }
-}
-
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- if (!static_branch_likely(&chacha_use_simd)) {
- hchacha_block_generic(state, out, nrounds);
- } else {
- kernel_fpu_begin();
- hchacha_block_ssse3(state, out, nrounds);
- kernel_fpu_end();
- }
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (!static_branch_likely(&chacha_use_simd) ||
- bytes <= CHACHA_BLOCK_SIZE)
- return chacha_crypt_generic(state, dst, src, bytes, nrounds);
-
- do {
- unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
- kernel_fpu_begin();
- chacha_dosimd(state, dst, src, todo, nrounds);
- kernel_fpu_end();
-
- bytes -= todo;
- src += todo;
- dst += todo;
- } while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- return static_key_enabled(&chacha_use_simd);
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_simd_mod_init(void)
-{
- if (!boot_cpu_has(X86_FEATURE_SSSE3))
- return 0;
-
- static_branch_enable(&chacha_use_simd);
-
- if (boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
- static_branch_enable(&chacha_use_avx2);
-
- if (boot_cpu_has(X86_FEATURE_AVX512VL) &&
- boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
- static_branch_enable(&chacha_use_avx512vl);
- }
- return 0;
-}
-subsys_initcall(chacha_simd_mod_init);
-
-static void __exit chacha_simd_mod_exit(void)
-{
-}
-module_exit(chacha_simd_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)");
diff --git a/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl
deleted file mode 100644
index 501827254fed..000000000000
--- a/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl
+++ /dev/null
@@ -1,4253 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
-# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
-# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for x86_64.
-#
-# March 2015
-#
-# Initial release.
-#
-# December 2016
-#
-# Add AVX512F+VL+BW code path.
-#
-# November 2017
-#
-# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
-# executed even on Knights Landing. Trigger for modification was
-# observation that AVX512 code paths can negatively affect overall
-# Skylake-X system performance. Since we are likely to suppress
-# AVX512F capability flag [at least on Skylake-X], conversion serves
-# as kind of "investment protection". Note that next *lake processor,
-# Cannonlake, has AVX512IFMA code path to execute...
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone,
-# measured with rdtsc at fixed clock frequency.
-#
-# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512
-# P4 4.46/+120% -
-# Core 2 2.41/+90% -
-# Westmere 1.88/+120% -
-# Sandy Bridge 1.39/+140% 1.10
-# Haswell 1.14/+175% 1.11 0.65
-# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35]
-# Silvermont 2.83/+95% -
-# Knights L 3.60/? 1.65 1.10 0.41(***)
-# Goldmont 1.70/+180% -
-# VIA Nano 1.82/+150% -
-# Sledgehammer 1.38/+160% -
-# Bulldozer 2.30/+130% 0.97
-# Ryzen 1.15/+200% 1.08 1.18
-#
-# (*) improvement coefficients relative to clang are more modest and
-# are ~50% on most processors; in both cases we are comparing to
-# __int128 code;
-# (**) an SSE2 implementation was attempted, but among non-AVX processors
-# it was faster than the integer-only code only on older Intel P4 and
-# Core processors, by 30-50% (less so on newer processors), while being
-# slower on contemporary ones, for example almost 2x slower on Atom; as
-# the former are naturally disappearing, SSE2 is deemed unnecessary;
-# (***) strangely enough, performance seems to vary from core to core;
-# the listed result is the best case;
-
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-$kernel=0; $kernel=1 if (!$flavour && !$output);
-
-if (!$kernel) {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
-
- open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
- *STDOUT=*OUT;
-
- if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
- =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
- }
-
- if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
- `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
- $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
- $avx += 1 if ($1==2.11 && $2>=8);
- }
-
- if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
- `ml64 2>&1` =~ /Version ([0-9]+)\./) {
- $avx = ($1>=10) + ($1>=11);
- }
-
- if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
- $avx = ($2>=3.0) + ($2>3.0);
- }
-} else {
- $avx = 4; # The kernel uses ifdefs for this.
-}
-
-sub declare_function() {
- my ($name, $align, $nargs) = @_;
- if($kernel) {
- $code .= "SYM_FUNC_START($name)\n";
- $code .= ".L$name:\n";
- } else {
- $code .= ".globl $name\n";
- $code .= ".type $name,\@function,$nargs\n";
- $code .= ".align $align\n";
- $code .= "$name:\n";
- }
-}
-
-sub declare_typed_function() {
- my ($name, $align, $nargs) = @_;
- if($kernel) {
- $code .= "SYM_TYPED_FUNC_START($name)\n";
- $code .= ".L$name:\n";
- } else {
- $code .= ".globl $name\n";
- $code .= ".type $name,\@function,$nargs\n";
- $code .= ".align $align\n";
- $code .= "$name:\n";
- }
-}
-
-sub end_function() {
- my ($name) = @_;
- if($kernel) {
- $code .= "SYM_FUNC_END($name)\n";
- } else {
- $code .= ".size $name,.-$name\n";
- }
-}
-
-$code.=<<___ if $kernel;
-#include <linux/cfi_types.h>
-___
-
-if ($avx) {
-$code.=<<___ if $kernel;
-.section .rodata
-___
-$code.=<<___;
-.align 64
-.Lconst:
-.Lmask24:
-.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
-.L129:
-.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
-.Lmask26:
-.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lpermd_avx2:
-.long 2,2,2,3,2,0,2,1
-.Lpermd_avx512:
-.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
-
-.L2_44_inp_permd:
-.long 0,1,1,2,2,3,7,7
-.L2_44_inp_shift:
-.quad 0,12,24,64
-.L2_44_mask:
-.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
-.L2_44_shift_rgt:
-.quad 44,44,42,64
-.L2_44_shift_lft:
-.quad 8,8,10,64
-
-.align 64
-.Lx_mask44:
-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.Lx_mask42:
-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-___
-}
-$code.=<<___ if (!$kernel);
-.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-.align 16
-___
-
-my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
-my ($mac,$nonce)=($inp,$len); # *_emit arguments
-my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
-my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
-
-sub poly1305_iteration {
-# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
-# output: $h0-$h2 *= $r0-$r1
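-#
-# In other words this is one step of h = h * r (mod 2^130-5), with h
-# kept in three 64-bit limbs and r in two.  Because the clamped r1 has
-# its low two bits clear, the cross terms involving r1 above 2^128 can
-# be folded down by using 5*r1/4 in their place, so callers pre-compute
-# $s1 = r1 + (r1 >> 2) once.  The final "-4 mask" sequence folds
-# everything at and above 2^130 back in multiplied by 5, leaving h only
-# partially reduced, which is sufficient for the next iteration.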
-$code.=<<___;
- mulq $h0 # h0*r1
- mov %rax,$d2
- mov $r0,%rax
- mov %rdx,$d3
-
- mulq $h0 # h0*r0
- mov %rax,$h0 # future $h0
- mov $r0,%rax
- mov %rdx,$d1
-
- mulq $h1 # h1*r0
- add %rax,$d2
- mov $s1,%rax
- adc %rdx,$d3
-
- mulq $h1 # h1*s1
- mov $h2,$h1 # borrow $h1
- add %rax,$h0
- adc %rdx,$d1
-
- imulq $s1,$h1 # h2*s1
- add $h1,$d2
- mov $d1,$h1
- adc \$0,$d3
-
- imulq $r0,$h2 # h2*r0
- add $d2,$h1
- mov \$-4,%rax # mask value
- adc $h2,$d3
-
- and $d3,%rax # last reduction step
- mov $d3,$h2
- shr \$2,$d3
- and \$3,$h2
- add $d3,%rax
- add %rax,$h0
- adc \$0,$h1
- adc \$0,$h2
-___
-}
-
-########################################################################
-# The layout of the opaque area is as follows.
-#
-# unsigned __int64 h[3]; # current hash value base 2^64
-# unsigned __int64 r[2]; # key value base 2^64
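-#
-# For reference, for each 16-byte block the block loop below performs
-# the usual Poly1305 update
-#
-#     h = (h + m + padbit*2^128) * r  (mod 2^130-5)
-#
-# where m is the 128-bit little-endian block and $padbit is the
-# caller-supplied pad bit (1 for full 16-byte blocks).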
-
-$code.=<<___;
-.text
-___
-$code.=<<___ if (!$kernel);
-.extern OPENSSL_ia32cap_P
-
-.globl poly1305_block_init_arch
-.hidden poly1305_block_init_arch
-.globl poly1305_blocks_x86_64
-.hidden poly1305_blocks_x86_64
-.globl poly1305_emit_x86_64
-.hidden poly1305_emit_x86_64
-___
-&declare_typed_function("poly1305_block_init_arch", 32, 3);
-$code.=<<___;
- xor %eax,%eax
- mov %rax,0($ctx) # initialize hash value
- mov %rax,8($ctx)
- mov %rax,16($ctx)
-
- test $inp,$inp
- je .Lno_key
-___
-$code.=<<___ if (!$kernel);
- lea poly1305_blocks_x86_64(%rip),%r10
- lea poly1305_emit_x86_64(%rip),%r11
-___
-$code.=<<___ if (!$kernel && $avx);
- mov OPENSSL_ia32cap_P+4(%rip),%r9
- lea poly1305_blocks_avx(%rip),%rax
- lea poly1305_emit_avx(%rip),%rcx
- bt \$`60-32`,%r9 # AVX?
- cmovc %rax,%r10
- cmovc %rcx,%r11
-___
-$code.=<<___ if (!$kernel && $avx>1);
- lea poly1305_blocks_avx2(%rip),%rax
- bt \$`5+32`,%r9 # AVX2?
- cmovc %rax,%r10
-___
-$code.=<<___ if (!$kernel && $avx>3);
- mov \$`(1<<31|1<<21|1<<16)`,%rax
- shr \$32,%r9
- and %rax,%r9
- cmp %rax,%r9
- je .Linit_base2_44
-___
-$code.=<<___;
- mov \$0x0ffffffc0fffffff,%rax
- mov \$0x0ffffffc0ffffffc,%rcx
- and 0($inp),%rax
- and 8($inp),%rcx
- mov %rax,24($ctx)
- mov %rcx,32($ctx)
-___
-$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
- mov %r10,0(%rdx)
- mov %r11,8(%rdx)
-___
-$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
- mov %r10d,0(%rdx)
- mov %r11d,4(%rdx)
-___
-$code.=<<___;
- mov \$1,%eax
-.Lno_key:
- RET
-___
-&end_function("poly1305_block_init_arch");
-
-&declare_function("poly1305_blocks_x86_64", 32, 4);
-$code.=<<___;
-.cfi_startproc
-.Lblocks:
- shr \$4,$len
- jz .Lno_data # too short
-
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
- push $ctx
-.cfi_push $ctx
-.Lblocks_body:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2
-
- mov $s1,$r1
- shr \$2,$s1
- mov $r1,%rax
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
- jmp .Loop
-
-.align 32
-.Loop:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
-___
-
- &poly1305_iteration();
-
-$code.=<<___;
- mov $r1,%rax
- dec %r15 # len-=16
- jnz .Loop
-
- mov 0(%rsp),$ctx
-.cfi_restore $ctx
-
- mov $h0,0($ctx) # store hash value
- mov $h1,8($ctx)
- mov $h2,16($ctx)
-
- mov 8(%rsp),%r15
-.cfi_restore %r15
- mov 16(%rsp),%r14
-.cfi_restore %r14
- mov 24(%rsp),%r13
-.cfi_restore %r13
- mov 32(%rsp),%r12
-.cfi_restore %r12
- mov 40(%rsp),%rbx
-.cfi_restore %rbx
- lea 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
-.Lno_data:
-.Lblocks_epilogue:
- RET
-.cfi_endproc
-___
-&end_function("poly1305_blocks_x86_64");
-
-&declare_function("poly1305_emit_x86_64", 32, 3);
-$code.=<<___;
-.Lemit:
- mov 0($ctx),%r8 # load hash value
- mov 8($ctx),%r9
- mov 16($ctx),%r10
-
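-	# The accumulated hash is only partially reduced, but it stays
-	# well below twice the modulus, so at most one subtraction of
-	# 2^130-5 is needed.  Adding 5 and testing bit 130 tells whether
-	# h >= 2^130-5; since only the low 128 bits matter for the tag,
-	# cmovnz simply picks the low limbs of h+5 in that case.  The
-	# nonce is then added modulo 2^128.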
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- RET
-___
-&end_function("poly1305_emit_x86_64");
-if ($avx) {
-
-########################################################################
-# The layout of the opaque area is as follows.
-#
-# unsigned __int32 h[5]; # current hash value base 2^26
-# unsigned __int32 is_base2_26;
-# unsigned __int64 r[2]; # key value base 2^64
-# unsigned __int64 pad;
-# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
-#
-# where r^n are the base 2^26 digits of powers of the multiplier key.
-# There are 5 digits, but the last four are interleaved with multiples
-# of 5, totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
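-#
-# The multiples of 5 are pre-computed because 2^130 = 5 (mod 2^130-5):
-# any product term that would land at limb 5 or higher in the base 2^26
-# schoolbook multiplication is folded down into the low limbs multiplied
-# by 5, and keeping 5*r1..5*r4 in the table avoids recomputing them for
-# every block.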
-
-my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
- map("%xmm$_",(0..15));
-
-$code.=<<___;
-.type __poly1305_block,\@abi-omnipotent
-.align 32
-__poly1305_block:
- push $ctx
-___
- &poly1305_iteration();
-$code.=<<___;
- pop $ctx
- RET
-.size __poly1305_block,.-__poly1305_block
-
-.type __poly1305_init_avx,\@abi-omnipotent
-.align 32
-__poly1305_init_avx:
- push %rbp
- mov %rsp,%rbp
- mov $r0,$h0
- mov $r1,$h1
- xor $h2,$h2
-
- lea 48+64($ctx),$ctx # size optimization
-
- mov $r1,%rax
- call __poly1305_block # r^2
-
- mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
- mov \$0x3ffffff,%edx
- mov $h0,$d1
- and $h0#d,%eax
- mov $r0,$d2
- and $r0#d,%edx
- mov %eax,`16*0+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*0+4-64`($ctx)
- shr \$26,$d2
-
- mov \$0x3ffffff,%eax
- mov \$0x3ffffff,%edx
- and $d1#d,%eax
- and $d2#d,%edx
- mov %eax,`16*1+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*1+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*2+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*2+4-64`($ctx)
- shr \$26,$d2
-
- mov $h1,%rax
- mov $r1,%rdx
- shl \$12,%rax
- shl \$12,%rdx
- or $d1,%rax
- or $d2,%rdx
- and \$0x3ffffff,%eax
- and \$0x3ffffff,%edx
- mov %eax,`16*3+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*3+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*4+0-64`($ctx)
- mov $h1,$d1
- mov %edx,`16*4+4-64`($ctx)
- mov $r1,$d2
-
- mov \$0x3ffffff,%eax
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- shr \$14,$d2
- and $d1#d,%eax
- and $d2#d,%edx
- mov %eax,`16*5+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*5+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*6+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*6+4-64`($ctx)
- shr \$26,$d2
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+0-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d2#d,`16*7+4-64`($ctx)
- lea ($d2,$d2,4),$d2 # *5
- mov $d1#d,`16*8+0-64`($ctx)
- mov $d2#d,`16*8+4-64`($ctx)
-
- mov $r1,%rax
- call __poly1305_block # r^3
-
- mov \$0x3ffffff,%eax # save r^3 base 2^26
- mov $h0,$d1
- and $h0#d,%eax
- shr \$26,$d1
- mov %eax,`16*0+12-64`($ctx)
-
- mov \$0x3ffffff,%edx
- and $d1#d,%edx
- mov %edx,`16*1+12-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*2+12-64`($ctx)
-
- mov $h1,%rax
- shl \$12,%rax
- or $d1,%rax
- and \$0x3ffffff,%eax
- mov %eax,`16*3+12-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov $h1,$d1
- mov %eax,`16*4+12-64`($ctx)
-
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- and $d1#d,%edx
- mov %edx,`16*5+12-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*6+12-64`($ctx)
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+12-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d1#d,`16*8+12-64`($ctx)
-
- mov $r1,%rax
- call __poly1305_block # r^4
-
- mov \$0x3ffffff,%eax # save r^4 base 2^26
- mov $h0,$d1
- and $h0#d,%eax
- shr \$26,$d1
- mov %eax,`16*0+8-64`($ctx)
-
- mov \$0x3ffffff,%edx
- and $d1#d,%edx
- mov %edx,`16*1+8-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*2+8-64`($ctx)
-
- mov $h1,%rax
- shl \$12,%rax
- or $d1,%rax
- and \$0x3ffffff,%eax
- mov %eax,`16*3+8-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov $h1,$d1
- mov %eax,`16*4+8-64`($ctx)
-
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- and $d1#d,%edx
- mov %edx,`16*5+8-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*6+8-64`($ctx)
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+8-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d1#d,`16*8+8-64`($ctx)
-
- lea -48-64($ctx),$ctx # size [de-]optimization
- pop %rbp
- RET
-.size __poly1305_init_avx,.-__poly1305_init_avx
-___
-
-&declare_function("poly1305_blocks_avx", 32, 4);
-$code.=<<___;
-.cfi_startproc
- mov 20($ctx),%r8d # is_base2_26
- cmp \$128,$len
- jae .Lblocks_avx
- test %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx:
- and \$-16,$len
- jz .Lno_data_avx
-
- vzeroupper
-
- test %r8d,%r8d
- jz .Lbase2_64_avx
-
- test \$31,$len
- jz .Leven_avx
-
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lblocks_avx_body:
-
- mov $len,%r15 # reassign $len
-
- mov 0($ctx),$d1 # load hash value
- mov 8($ctx),$d2
- mov 16($ctx),$h2#d
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- ################################# base 2^26 -> base 2^64
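-	# i.e. h[0] + h[1]*2^26 + h[2]*2^52 + h[3]*2^78 + h[4]*2^104 is
-	# repacked into the three 64-bit limbs $h0:$h1:$h2 used by the
-	# scalar code; the result can be slightly above 2^130, hence the
-	# extra reduction below.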
- mov $d1#d,$h0#d
- and \$`-1*(1<<31)`,$d1
- mov $d2,$r1 # borrow $r1
- mov $d2#d,$h1#d
- and \$`-1*(1<<31)`,$d2
-
- shr \$6,$d1
- shl \$52,$r1
- add $d1,$h0
- shr \$12,$h1
- shr \$18,$d2
- add $r1,$h0
- adc $d2,$h1
-
- mov $h2,$d1
- shl \$40,$d1
- shr \$24,$h2
- add $d1,$h1
- adc \$0,$h2 # can be partially reduced...
-
- mov \$-4,$d2 # ... so reduce
- mov $h2,$d1
- and $h2,$d2
- shr \$2,$d1
- and \$3,$h2
- add $d2,$d1 # =*5
- add $d1,$h0
- adc \$0,$h1
- adc \$0,$h2
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
-
- call __poly1305_block
-
- test $padbit,$padbit # if $padbit is zero,
- jz .Lstore_base2_64_avx # store hash in base 2^64 format
-
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$r0
- mov $h1,$r1
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$r0
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $r0,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$r1
- and \$0x3ffffff,$h1 # h[3]
- or $r1,$h2 # h[4]
-
- sub \$16,%r15
- jz .Lstore_base2_26_avx
-
- vmovd %rax#d,$H0
- vmovd %rdx#d,$H1
- vmovd $h0#d,$H2
- vmovd $h1#d,$H3
- vmovd $h2#d,$H4
- jmp .Lproceed_avx
-
-.align 32
-.Lstore_base2_64_avx:
- mov $h0,0($ctx)
- mov $h1,8($ctx)
- mov $h2,16($ctx) # note that is_base2_26 is zeroed
- jmp .Ldone_avx
-
-.align 16
-.Lstore_base2_26_avx:
- mov %rax#d,0($ctx) # store hash value base 2^26
- mov %rdx#d,4($ctx)
- mov $h0#d,8($ctx)
- mov $h1#d,12($ctx)
- mov $h2#d,16($ctx)
-.align 16
-.Ldone_avx:
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lno_data_avx:
-.Lblocks_avx_epilogue:
- RET
-.cfi_endproc
-
-.align 32
-.Lbase2_64_avx:
-.cfi_startproc
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lbase2_64_avx_body:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2#d
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- test \$31,$len
- jz .Linit_avx
-
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
-
-.Linit_avx:
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$d1
- mov $h1,$d2
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$d1
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $d1,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$d2
- and \$0x3ffffff,$h1 # h[3]
- or $d2,$h2 # h[4]
-
- vmovd %rax#d,$H0
- vmovd %rdx#d,$H1
- vmovd $h0#d,$H2
- vmovd $h1#d,$H3
- vmovd $h2#d,$H4
- movl \$1,20($ctx) # set is_base2_26
-
- call __poly1305_init_avx
-
-.Lproceed_avx:
- mov %r15,$len
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lbase2_64_avx_epilogue:
- jmp .Ldo_avx
-.cfi_endproc
-
-.align 32
-.Leven_avx:
-.cfi_startproc
- vmovd 4*0($ctx),$H0 # load hash value
- vmovd 4*1($ctx),$H1
- vmovd 4*2($ctx),$H2
- vmovd 4*3($ctx),$H3
- vmovd 4*4($ctx),$H4
-
-.Ldo_avx:
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- and \$-32,%rsp
- sub \$-8,%rsp
- lea -0x58(%rsp),%r11
- sub \$0x178,%rsp
-___
-$code.=<<___ if ($win64);
- lea -0xf8(%rsp),%r11
- sub \$0x218,%rsp
- vmovdqa %xmm6,0x50(%r11)
- vmovdqa %xmm7,0x60(%r11)
- vmovdqa %xmm8,0x70(%r11)
- vmovdqa %xmm9,0x80(%r11)
- vmovdqa %xmm10,0x90(%r11)
- vmovdqa %xmm11,0xa0(%r11)
- vmovdqa %xmm12,0xb0(%r11)
- vmovdqa %xmm13,0xc0(%r11)
- vmovdqa %xmm14,0xd0(%r11)
- vmovdqa %xmm15,0xe0(%r11)
-.Ldo_avx_body:
-___
-$code.=<<___;
- sub \$64,$len
- lea -32($inp),%rax
- cmovc %rax,$inp
-
- vmovdqu `16*3`($ctx),$D4 # preload r0^2
- lea `16*3+64`($ctx),$ctx # size optimization
- lea .Lconst(%rip),%rcx
-
- ################################################################
- # load input
- vmovdqu 16*2($inp),$T0
- vmovdqu 16*3($inp),$T1
- vmovdqa 64(%rcx),$MASK # .Lmask26
-
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpunpcklqdq $T3,$T2,$T3 # 2:3
-
- vpsrlq \$40,$T4,$T4 # 4
- vpsrlq \$26,$T0,$T1
- vpand $MASK,$T0,$T0 # 0
- vpsrlq \$4,$T3,$T2
- vpand $MASK,$T1,$T1 # 1
- vpsrlq \$30,$T3,$T3
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- jbe .Lskip_loop_avx
-
- # expand and copy pre-calculated table to stack
- vmovdqu `16*1-64`($ctx),$D1
- vmovdqu `16*2-64`($ctx),$D2
- vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
- vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
- vmovdqa $D3,-0x90(%r11)
- vmovdqa $D0,0x00(%rsp)
- vpshufd \$0xEE,$D1,$D4
- vmovdqu `16*3-64`($ctx),$D0
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D4,-0x80(%r11)
- vmovdqa $D1,0x10(%rsp)
- vpshufd \$0xEE,$D2,$D3
- vmovdqu `16*4-64`($ctx),$D1
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D3,-0x70(%r11)
- vmovdqa $D2,0x20(%rsp)
- vpshufd \$0xEE,$D0,$D4
- vmovdqu `16*5-64`($ctx),$D2
- vpshufd \$0x44,$D0,$D0
- vmovdqa $D4,-0x60(%r11)
- vmovdqa $D0,0x30(%rsp)
- vpshufd \$0xEE,$D1,$D3
- vmovdqu `16*6-64`($ctx),$D0
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D3,-0x50(%r11)
- vmovdqa $D1,0x40(%rsp)
- vpshufd \$0xEE,$D2,$D4
- vmovdqu `16*7-64`($ctx),$D1
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D4,-0x40(%r11)
- vmovdqa $D2,0x50(%rsp)
- vpshufd \$0xEE,$D0,$D3
- vmovdqu `16*8-64`($ctx),$D2
- vpshufd \$0x44,$D0,$D0
- vmovdqa $D3,-0x30(%r11)
- vmovdqa $D0,0x60(%rsp)
- vpshufd \$0xEE,$D1,$D4
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D4,-0x20(%r11)
- vmovdqa $D1,0x70(%rsp)
- vpshufd \$0xEE,$D2,$D3
- vmovdqa 0x00(%rsp),$D4 # preload r0^2
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D3,-0x10(%r11)
- vmovdqa $D2,0x80(%rsp)
-
- jmp .Loop_avx
-
-.align 32
-.Loop_avx:
- ################################################################
- # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- # \___________________/
- # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- # \___________________/ \____________________/
- #
- # Note that we start with inp[2:3]*r^2. This is because it
-	# doesn't depend on the reduction in the previous iteration.
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
- # though note that $Tx and $Hx are "reversed" in this section,
- # and $D4 is preloaded with r0^2...
-
- vpmuludq $T0,$D4,$D0 # d0 = h0*r0
- vpmuludq $T1,$D4,$D1 # d1 = h1*r0
- vmovdqa $H2,0x20(%r11) # offload hash
- vpmuludq $T2,$D4,$D2 # d3 = h2*r0
- vmovdqa 0x10(%rsp),$H2 # r1^2
- vpmuludq $T3,$D4,$D3 # d3 = h3*r0
- vpmuludq $T4,$D4,$D4 # d4 = h4*r0
-
- vmovdqa $H0,0x00(%r11) #
- vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
- vmovdqa $H1,0x10(%r11) #
- vpmuludq $T3,$H2,$H1 # h3*r1
- vpaddq $H0,$D0,$D0 # d0 += h4*s1
- vpaddq $H1,$D4,$D4 # d4 += h3*r1
- vmovdqa $H3,0x30(%r11) #
- vpmuludq $T2,$H2,$H0 # h2*r1
- vpmuludq $T1,$H2,$H1 # h1*r1
- vpaddq $H0,$D3,$D3 # d3 += h2*r1
- vmovdqa 0x30(%rsp),$H3 # r2^2
- vpaddq $H1,$D2,$D2 # d2 += h1*r1
- vmovdqa $H4,0x40(%r11) #
- vpmuludq $T0,$H2,$H2 # h0*r1
- vpmuludq $T2,$H3,$H0 # h2*r2
- vpaddq $H2,$D1,$D1 # d1 += h0*r1
-
- vmovdqa 0x40(%rsp),$H4 # s2^2
- vpaddq $H0,$D4,$D4 # d4 += h2*r2
- vpmuludq $T1,$H3,$H1 # h1*r2
- vpmuludq $T0,$H3,$H3 # h0*r2
- vpaddq $H1,$D3,$D3 # d3 += h1*r2
- vmovdqa 0x50(%rsp),$H2 # r3^2
- vpaddq $H3,$D2,$D2 # d2 += h0*r2
- vpmuludq $T4,$H4,$H0 # h4*s2
- vpmuludq $T3,$H4,$H4 # h3*s2
- vpaddq $H0,$D1,$D1 # d1 += h4*s2
- vmovdqa 0x60(%rsp),$H3 # s3^2
- vpaddq $H4,$D0,$D0 # d0 += h3*s2
-
- vmovdqa 0x80(%rsp),$H4 # s4^2
- vpmuludq $T1,$H2,$H1 # h1*r3
- vpmuludq $T0,$H2,$H2 # h0*r3
- vpaddq $H1,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $T4,$H3,$H0 # h4*s3
- vpmuludq $T3,$H3,$H1 # h3*s3
- vpaddq $H0,$D2,$D2 # d2 += h4*s3
- vmovdqu 16*0($inp),$H0 # load input
- vpaddq $H1,$D1,$D1 # d1 += h3*s3
- vpmuludq $T2,$H3,$H3 # h2*s3
- vpmuludq $T2,$H4,$T2 # h2*s4
- vpaddq $H3,$D0,$D0 # d0 += h2*s3
-
- vmovdqu 16*1($inp),$H1 #
- vpaddq $T2,$D1,$D1 # d1 += h2*s4
- vpmuludq $T3,$H4,$T3 # h3*s4
- vpmuludq $T4,$H4,$T4 # h4*s4
- vpsrldq \$6,$H0,$H2 # splat input
- vpaddq $T3,$D2,$D2 # d2 += h3*s4
- vpaddq $T4,$D3,$D3 # d3 += h4*s4
- vpsrldq \$6,$H1,$H3 #
- vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
- vpmuludq $T1,$H4,$T0 # h1*s4
- vpunpckhqdq $H1,$H0,$H4 # 4
- vpaddq $T4,$D4,$D4 # d4 += h0*r4
- vmovdqa -0x90(%r11),$T4 # r0^4
- vpaddq $T0,$D0,$D0 # d0 += h1*s4
-
- vpunpcklqdq $H1,$H0,$H0 # 0:1
- vpunpcklqdq $H3,$H2,$H3 # 2:3
-
- #vpsrlq \$40,$H4,$H4 # 4
- vpsrldq \$`40/8`,$H4,$H4 # 4
- vpsrlq \$26,$H0,$H1
- vpand $MASK,$H0,$H0 # 0
- vpsrlq \$4,$H3,$H2
- vpand $MASK,$H1,$H1 # 1
- vpand 0(%rcx),$H4,$H4 # .Lmask24
- vpsrlq \$30,$H3,$H3
- vpand $MASK,$H2,$H2 # 2
- vpand $MASK,$H3,$H3 # 3
- vpor 32(%rcx),$H4,$H4 # padbit, yes, always
-
- vpaddq 0x00(%r11),$H0,$H0 # add hash value
- vpaddq 0x10(%r11),$H1,$H1
- vpaddq 0x20(%r11),$H2,$H2
- vpaddq 0x30(%r11),$H3,$H3
- vpaddq 0x40(%r11),$H4,$H4
-
- lea 16*2($inp),%rax
- lea 16*4($inp),$inp
- sub \$64,$len
- cmovc %rax,$inp
-
- ################################################################
- # Now we accumulate (inp[0:1]+hash)*r^4
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- vpmuludq $H0,$T4,$T0 # h0*r0
- vpmuludq $H1,$T4,$T1 # h1*r0
- vpaddq $T0,$D0,$D0
- vpaddq $T1,$D1,$D1
- vmovdqa -0x80(%r11),$T2 # r1^4
- vpmuludq $H2,$T4,$T0 # h2*r0
- vpmuludq $H3,$T4,$T1 # h3*r0
- vpaddq $T0,$D2,$D2
- vpaddq $T1,$D3,$D3
- vpmuludq $H4,$T4,$T4 # h4*r0
- vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
- vpaddq $T4,$D4,$D4
-
- vpaddq $T0,$D0,$D0 # d0 += h4*s1
- vpmuludq $H2,$T2,$T1 # h2*r1
- vpmuludq $H3,$T2,$T0 # h3*r1
- vpaddq $T1,$D3,$D3 # d3 += h2*r1
- vmovdqa -0x60(%r11),$T3 # r2^4
- vpaddq $T0,$D4,$D4 # d4 += h3*r1
- vpmuludq $H1,$T2,$T1 # h1*r1
- vpmuludq $H0,$T2,$T2 # h0*r1
- vpaddq $T1,$D2,$D2 # d2 += h1*r1
- vpaddq $T2,$D1,$D1 # d1 += h0*r1
-
- vmovdqa -0x50(%r11),$T4 # s2^4
- vpmuludq $H2,$T3,$T0 # h2*r2
- vpmuludq $H1,$T3,$T1 # h1*r2
- vpaddq $T0,$D4,$D4 # d4 += h2*r2
- vpaddq $T1,$D3,$D3 # d3 += h1*r2
- vmovdqa -0x40(%r11),$T2 # r3^4
- vpmuludq $H0,$T3,$T3 # h0*r2
- vpmuludq $H4,$T4,$T0 # h4*s2
- vpaddq $T3,$D2,$D2 # d2 += h0*r2
- vpaddq $T0,$D1,$D1 # d1 += h4*s2
- vmovdqa -0x30(%r11),$T3 # s3^4
- vpmuludq $H3,$T4,$T4 # h3*s2
- vpmuludq $H1,$T2,$T1 # h1*r3
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
-
- vmovdqa -0x10(%r11),$T4 # s4^4
- vpaddq $T1,$D4,$D4 # d4 += h1*r3
- vpmuludq $H0,$T2,$T2 # h0*r3
- vpmuludq $H4,$T3,$T0 # h4*s3
- vpaddq $T2,$D3,$D3 # d3 += h0*r3
- vpaddq $T0,$D2,$D2 # d2 += h4*s3
- vmovdqu 16*2($inp),$T0 # load input
- vpmuludq $H3,$T3,$T2 # h3*s3
- vpmuludq $H2,$T3,$T3 # h2*s3
- vpaddq $T2,$D1,$D1 # d1 += h3*s3
- vmovdqu 16*3($inp),$T1 #
- vpaddq $T3,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $H2,$T4,$H2 # h2*s4
- vpmuludq $H3,$T4,$H3 # h3*s4
- vpsrldq \$6,$T0,$T2 # splat input
- vpaddq $H2,$D1,$D1 # d1 += h2*s4
- vpmuludq $H4,$T4,$H4 # h4*s4
- vpsrldq \$6,$T1,$T3 #
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
- vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
- vpmuludq $H1,$T4,$H0
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpunpcklqdq $T3,$T2,$T3 # 2:3
-
- #vpsrlq \$40,$T4,$T4 # 4
- vpsrldq \$`40/8`,$T4,$T4 # 4
- vpsrlq \$26,$T0,$T1
- vmovdqa 0x00(%rsp),$D4 # preload r0^2
- vpand $MASK,$T0,$T0 # 0
- vpsrlq \$4,$T3,$T2
- vpand $MASK,$T1,$T1 # 1
- vpand 0(%rcx),$T4,$T4 # .Lmask24
- vpsrlq \$30,$T3,$T3
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- ################################################################
- # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- # and P. Schwabe
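-	# (in short: each lane now holds a sum of five products of
-	# ~26-bit quantities, i.e. roughly 58 bits, well within 64 bits.
-	# One pass of carries h0->h1->...->h4 plus h4->h0 multiplied by
-	# 5, done as x + 4x via the add/shift/add below, brings every
-	# limb back to ~26 bits, which is all the next multiplication
-	# needs.)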
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D0
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D0,$H0,$H0
- vpsllq \$2,$D0,$D0
- vpaddq $D0,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- ja .Loop_avx
-
-.Lskip_loop_avx:
- ################################################################
- # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
- add \$32,$len
- jnz .Long_tail_avx
-
- vpaddq $H2,$T2,$T2
- vpaddq $H0,$T0,$T0
- vpaddq $H1,$T1,$T1
- vpaddq $H3,$T3,$T3
- vpaddq $H4,$T4,$T4
-
-.Long_tail_avx:
- vmovdqa $H2,0x20(%r11)
- vmovdqa $H0,0x00(%r11)
- vmovdqa $H1,0x10(%r11)
- vmovdqa $H3,0x30(%r11)
- vmovdqa $H4,0x40(%r11)
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- vpmuludq $T2,$D4,$D2 # d2 = h2*r0
- vpmuludq $T0,$D4,$D0 # d0 = h0*r0
- vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
- vpmuludq $T1,$D4,$D1 # d1 = h1*r0
- vpmuludq $T3,$D4,$D3 # d3 = h3*r0
- vpmuludq $T4,$D4,$D4 # d4 = h4*r0
-
- vpmuludq $T3,$H2,$H0 # h3*r1
- vpaddq $H0,$D4,$D4 # d4 += h3*r1
- vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
- vpmuludq $T2,$H2,$H1 # h2*r1
- vpaddq $H1,$D3,$D3 # d3 += h2*r1
- vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
- vpmuludq $T1,$H2,$H0 # h1*r1
- vpaddq $H0,$D2,$D2 # d2 += h1*r1
- vpmuludq $T0,$H2,$H2 # h0*r1
- vpaddq $H2,$D1,$D1 # d1 += h0*r1
- vpmuludq $T4,$H3,$H3 # h4*s1
- vpaddq $H3,$D0,$D0 # d0 += h4*s1
-
- vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
- vpmuludq $T2,$H4,$H1 # h2*r2
- vpaddq $H1,$D4,$D4 # d4 += h2*r2
- vpmuludq $T1,$H4,$H0 # h1*r2
- vpaddq $H0,$D3,$D3 # d3 += h1*r2
- vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
- vpmuludq $T0,$H4,$H4 # h0*r2
- vpaddq $H4,$D2,$D2 # d2 += h0*r2
- vpmuludq $T4,$H2,$H1 # h4*s2
- vpaddq $H1,$D1,$D1 # d1 += h4*s2
- vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
- vpmuludq $T3,$H2,$H2 # h3*s2
- vpaddq $H2,$D0,$D0 # d0 += h3*s2
-
- vpmuludq $T1,$H3,$H0 # h1*r3
- vpaddq $H0,$D4,$D4 # d4 += h1*r3
- vpmuludq $T0,$H3,$H3 # h0*r3
- vpaddq $H3,$D3,$D3 # d3 += h0*r3
- vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
- vpmuludq $T4,$H4,$H1 # h4*s3
- vpaddq $H1,$D2,$D2 # d2 += h4*s3
- vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
- vpmuludq $T3,$H4,$H0 # h3*s3
- vpaddq $H0,$D1,$D1 # d1 += h3*s3
- vpmuludq $T2,$H4,$H4 # h2*s3
- vpaddq $H4,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $T0,$H2,$H2 # h0*r4
- vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
- vpmuludq $T4,$H3,$H1 # h4*s4
- vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
- vpmuludq $T3,$H3,$H0 # h3*s4
- vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
- vpmuludq $T2,$H3,$H1 # h2*s4
- vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
- vpmuludq $T1,$H3,$H3 # h1*s4
- vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
-
- jz .Lshort_tail_avx
-
- vmovdqu 16*0($inp),$H0 # load input
- vmovdqu 16*1($inp),$H1
-
- vpsrldq \$6,$H0,$H2 # splat input
- vpsrldq \$6,$H1,$H3
- vpunpckhqdq $H1,$H0,$H4 # 4
- vpunpcklqdq $H1,$H0,$H0 # 0:1
- vpunpcklqdq $H3,$H2,$H3 # 2:3
-
- vpsrlq \$40,$H4,$H4 # 4
- vpsrlq \$26,$H0,$H1
- vpand $MASK,$H0,$H0 # 0
- vpsrlq \$4,$H3,$H2
- vpand $MASK,$H1,$H1 # 1
- vpsrlq \$30,$H3,$H3
- vpand $MASK,$H2,$H2 # 2
- vpand $MASK,$H3,$H3 # 3
- vpor 32(%rcx),$H4,$H4 # padbit, yes, always
-
- vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
- vpaddq 0x00(%r11),$H0,$H0
- vpaddq 0x10(%r11),$H1,$H1
- vpaddq 0x20(%r11),$H2,$H2
- vpaddq 0x30(%r11),$H3,$H3
- vpaddq 0x40(%r11),$H4,$H4
-
- ################################################################
- # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
-
- vpmuludq $H0,$T4,$T0 # h0*r0
- vpaddq $T0,$D0,$D0 # d0 += h0*r0
- vpmuludq $H1,$T4,$T1 # h1*r0
- vpaddq $T1,$D1,$D1 # d1 += h1*r0
- vpmuludq $H2,$T4,$T0 # h2*r0
- vpaddq $T0,$D2,$D2 # d2 += h2*r0
- vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
- vpmuludq $H3,$T4,$T1 # h3*r0
- vpaddq $T1,$D3,$D3 # d3 += h3*r0
- vpmuludq $H4,$T4,$T4 # h4*r0
- vpaddq $T4,$D4,$D4 # d4 += h4*r0
-
- vpmuludq $H3,$T2,$T0 # h3*r1
- vpaddq $T0,$D4,$D4 # d4 += h3*r1
- vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
- vpmuludq $H2,$T2,$T1 # h2*r1
- vpaddq $T1,$D3,$D3 # d3 += h2*r1
- vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
- vpmuludq $H1,$T2,$T0 # h1*r1
- vpaddq $T0,$D2,$D2 # d2 += h1*r1
- vpmuludq $H0,$T2,$T2 # h0*r1
- vpaddq $T2,$D1,$D1 # d1 += h0*r1
- vpmuludq $H4,$T3,$T3 # h4*s1
- vpaddq $T3,$D0,$D0 # d0 += h4*s1
-
- vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
- vpmuludq $H2,$T4,$T1 # h2*r2
- vpaddq $T1,$D4,$D4 # d4 += h2*r2
- vpmuludq $H1,$T4,$T0 # h1*r2
- vpaddq $T0,$D3,$D3 # d3 += h1*r2
- vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
- vpmuludq $H0,$T4,$T4 # h0*r2
- vpaddq $T4,$D2,$D2 # d2 += h0*r2
- vpmuludq $H4,$T2,$T1 # h4*s2
- vpaddq $T1,$D1,$D1 # d1 += h4*s2
- vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
- vpmuludq $H3,$T2,$T2 # h3*s2
- vpaddq $T2,$D0,$D0 # d0 += h3*s2
-
- vpmuludq $H1,$T3,$T0 # h1*r3
- vpaddq $T0,$D4,$D4 # d4 += h1*r3
- vpmuludq $H0,$T3,$T3 # h0*r3
- vpaddq $T3,$D3,$D3 # d3 += h0*r3
- vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
- vpmuludq $H4,$T4,$T1 # h4*s3
- vpaddq $T1,$D2,$D2 # d2 += h4*s3
- vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
- vpmuludq $H3,$T4,$T0 # h3*s3
- vpaddq $T0,$D1,$D1 # d1 += h3*s3
- vpmuludq $H2,$T4,$T4 # h2*s3
- vpaddq $T4,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $H0,$T2,$T2 # h0*r4
- vpaddq $T2,$D4,$D4 # d4 += h0*r4
- vpmuludq $H4,$T3,$T1 # h4*s4
- vpaddq $T1,$D3,$D3 # d3 += h4*s4
- vpmuludq $H3,$T3,$T0 # h3*s4
- vpaddq $T0,$D2,$D2 # d2 += h3*s4
- vpmuludq $H2,$T3,$T1 # h2*s4
- vpaddq $T1,$D1,$D1 # d1 += h2*s4
- vpmuludq $H1,$T3,$T3 # h1*s4
- vpaddq $T3,$D0,$D0 # d0 += h1*s4
-
-.Lshort_tail_avx:
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$D4,$T4
- vpsrldq \$8,$D3,$T3
- vpsrldq \$8,$D1,$T1
- vpsrldq \$8,$D0,$T0
- vpsrldq \$8,$D2,$T2
- vpaddq $T3,$D3,$D3
- vpaddq $T4,$D4,$D4
- vpaddq $T0,$D0,$D0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$D2,$D2
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$D3,$H3
- vpand $MASK,$D3,$D3
- vpaddq $H3,$D4,$D4 # h3 -> h4
-
- vpsrlq \$26,$D0,$H0
- vpand $MASK,$D0,$D0
- vpaddq $H0,$D1,$D1 # h0 -> h1
-
- vpsrlq \$26,$D4,$H4
- vpand $MASK,$D4,$D4
-
- vpsrlq \$26,$D1,$H1
- vpand $MASK,$D1,$D1
- vpaddq $H1,$D2,$D2 # h1 -> h2
-
- vpaddq $H4,$D0,$D0
- vpsllq \$2,$H4,$H4
- vpaddq $H4,$D0,$D0 # h4 -> h0
-
- vpsrlq \$26,$D2,$H2
- vpand $MASK,$D2,$D2
- vpaddq $H2,$D3,$D3 # h2 -> h3
-
- vpsrlq \$26,$D0,$H0
- vpand $MASK,$D0,$D0
- vpaddq $H0,$D1,$D1 # h0 -> h1
-
- vpsrlq \$26,$D3,$H3
- vpand $MASK,$D3,$D3
- vpaddq $H3,$D4,$D4 # h3 -> h4
-
- vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
- vmovd $D1,`4*1-48-64`($ctx)
- vmovd $D2,`4*2-48-64`($ctx)
- vmovd $D3,`4*3-48-64`($ctx)
- vmovd $D4,`4*4-48-64`($ctx)
-___
-$code.=<<___ if ($win64);
- vmovdqa 0x50(%r11),%xmm6
- vmovdqa 0x60(%r11),%xmm7
- vmovdqa 0x70(%r11),%xmm8
- vmovdqa 0x80(%r11),%xmm9
- vmovdqa 0x90(%r11),%xmm10
- vmovdqa 0xa0(%r11),%xmm11
- vmovdqa 0xb0(%r11),%xmm12
- vmovdqa 0xc0(%r11),%xmm13
- vmovdqa 0xd0(%r11),%xmm14
- vmovdqa 0xe0(%r11),%xmm15
- lea 0xf8(%r11),%rsp
-.Ldo_avx_epilogue:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- vzeroupper
- RET
-.cfi_endproc
-___
-&end_function("poly1305_blocks_avx");
-
-&declare_function("poly1305_emit_avx", 32, 3);
-$code.=<<___;
- cmpl \$0,20($ctx) # is_base2_26?
- je .Lemit
-
- mov 0($ctx),%eax # load hash value base 2^26
- mov 4($ctx),%ecx
- mov 8($ctx),%r8d
- mov 12($ctx),%r11d
- mov 16($ctx),%r10d
-
- shl \$26,%rcx # base 2^26 -> base 2^64
- mov %r8,%r9
- shl \$52,%r8
- add %rcx,%rax
- shr \$12,%r9
- add %rax,%r8 # h0
- adc \$0,%r9
-
- shl \$14,%r11
- mov %r10,%rax
- shr \$24,%r10
- add %r11,%r9
- shl \$40,%rax
- add %rax,%r9 # h1
- adc \$0,%r10 # h2
-
- mov %r10,%rax # could be partially reduced, so reduce
- mov %r10,%rcx
- and \$3,%r10
- shr \$2,%rax
- and \$-4,%rcx
- add %rcx,%rax
- add %rax,%r8
- adc \$0,%r9
- adc \$0,%r10
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- RET
-___
-&end_function("poly1305_emit_avx");
-
-if ($avx>1) {
-
-my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
- map("%ymm$_",(0..15));
-my $S4=$MASK;
-
-sub poly1305_blocks_avxN {
- my ($avx512) = @_;
- my $suffix = $avx512 ? "_avx512" : "";
-$code.=<<___;
-.cfi_startproc
- mov 20($ctx),%r8d # is_base2_26
- cmp \$128,$len
- jae .Lblocks_avx2$suffix
- test %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx2$suffix:
- and \$-16,$len
- jz .Lno_data_avx2$suffix
-
- vzeroupper
-
- test %r8d,%r8d
- jz .Lbase2_64_avx2$suffix
-
- test \$63,$len
- jz .Leven_avx2$suffix
-
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lblocks_avx2_body$suffix:
-
- mov $len,%r15 # reassign $len
-
- mov 0($ctx),$d1 # load hash value
- mov 8($ctx),$d2
- mov 16($ctx),$h2#d
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- ################################# base 2^26 -> base 2^64
- mov $d1#d,$h0#d
- and \$`-1*(1<<31)`,$d1
- mov $d2,$r1 # borrow $r1
- mov $d2#d,$h1#d
- and \$`-1*(1<<31)`,$d2
-
- shr \$6,$d1
- shl \$52,$r1
- add $d1,$h0
- shr \$12,$h1
- shr \$18,$d2
- add $r1,$h0
- adc $d2,$h1
-
- mov $h2,$d1
- shl \$40,$d1
- shr \$24,$h2
- add $d1,$h1
- adc \$0,$h2 # can be partially reduced...
-
- mov \$-4,$d2 # ... so reduce
- mov $h2,$d1
- and $h2,$d2
- shr \$2,$d1
- and \$3,$h2
- add $d2,$d1 # =*5
- add $d1,$h0
- adc \$0,$h1
- adc \$0,$h2
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
-.Lbase2_26_pre_avx2$suffix:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
- mov $r1,%rax
-
- test \$63,%r15
- jnz .Lbase2_26_pre_avx2$suffix
-
- test $padbit,$padbit # if $padbit is zero,
- jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
-
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$r0
- mov $h1,$r1
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$r0
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $r0,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$r1
- and \$0x3ffffff,$h1 # h[3]
- or $r1,$h2 # h[4]
-
- test %r15,%r15
- jz .Lstore_base2_26_avx2$suffix
-
- vmovd %rax#d,%x#$H0
- vmovd %rdx#d,%x#$H1
- vmovd $h0#d,%x#$H2
- vmovd $h1#d,%x#$H3
- vmovd $h2#d,%x#$H4
- jmp .Lproceed_avx2$suffix
-
-.align 32
-.Lstore_base2_64_avx2$suffix:
- mov $h0,0($ctx)
- mov $h1,8($ctx)
- mov $h2,16($ctx) # note that is_base2_26 is zeroed
- jmp .Ldone_avx2$suffix
-
-.align 16
-.Lstore_base2_26_avx2$suffix:
- mov %rax#d,0($ctx) # store hash value base 2^26
- mov %rdx#d,4($ctx)
- mov $h0#d,8($ctx)
- mov $h1#d,12($ctx)
- mov $h2#d,16($ctx)
-.align 16
-.Ldone_avx2$suffix:
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lno_data_avx2$suffix:
-.Lblocks_avx2_epilogue$suffix:
- RET
-.cfi_endproc
-
-.align 32
-.Lbase2_64_avx2$suffix:
-.cfi_startproc
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lbase2_64_avx2_body$suffix:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2#d
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- test \$63,$len
- jz .Linit_avx2$suffix
-
-.Lbase2_64_pre_avx2$suffix:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
- mov $r1,%rax
-
- test \$63,%r15
- jnz .Lbase2_64_pre_avx2$suffix
-
-.Linit_avx2$suffix:
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$d1
- mov $h1,$d2
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$d1
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $d1,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$d2
- and \$0x3ffffff,$h1 # h[3]
- or $d2,$h2 # h[4]
-
- vmovd %rax#d,%x#$H0
- vmovd %rdx#d,%x#$H1
- vmovd $h0#d,%x#$H2
- vmovd $h1#d,%x#$H3
- vmovd $h2#d,%x#$H4
- movl \$1,20($ctx) # set is_base2_26
-
- call __poly1305_init_avx
-
-.Lproceed_avx2$suffix:
- mov %r15,$len # restore $len
-___
-$code.=<<___ if (!$kernel);
- mov OPENSSL_ia32cap_P+8(%rip),%r9d
- mov \$`(1<<31|1<<30|1<<16)`,%r11d
-___
-$code.=<<___;
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lbase2_64_avx2_epilogue$suffix:
- jmp .Ldo_avx2$suffix
-.cfi_endproc
-
-.align 32
-.Leven_avx2$suffix:
-.cfi_startproc
-___
-$code.=<<___ if (!$kernel);
- mov OPENSSL_ia32cap_P+8(%rip),%r9d
-___
-$code.=<<___;
- vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
- vmovd 4*1($ctx),%x#$H1
- vmovd 4*2($ctx),%x#$H2
- vmovd 4*3($ctx),%x#$H3
- vmovd 4*4($ctx),%x#$H4
-
-.Ldo_avx2$suffix:
-___
-$code.=<<___ if (!$kernel && $avx>2);
- cmp \$512,$len
- jb .Lskip_avx512
- and %r11d,%r9d
- test \$`1<<16`,%r9d # check for AVX512F
- jnz .Lblocks_avx512
-.Lskip_avx512$suffix:
-___
-$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
- cmp \$512,$len
- jae .Lblocks_avx512
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- sub \$0x128,%rsp
-___
-$code.=<<___ if ($win64);
- lea 8(%rsp),%r10
- sub \$0x1c8,%rsp
- vmovdqa %xmm6,-0xb0(%r10)
- vmovdqa %xmm7,-0xa0(%r10)
- vmovdqa %xmm8,-0x90(%r10)
- vmovdqa %xmm9,-0x80(%r10)
- vmovdqa %xmm10,-0x70(%r10)
- vmovdqa %xmm11,-0x60(%r10)
- vmovdqa %xmm12,-0x50(%r10)
- vmovdqa %xmm13,-0x40(%r10)
- vmovdqa %xmm14,-0x30(%r10)
- vmovdqa %xmm15,-0x20(%r10)
-.Ldo_avx2_body$suffix:
-___
-$code.=<<___;
- lea .Lconst(%rip),%rcx
- lea 48+64($ctx),$ctx # size optimization
- vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
-
- # expand and copy pre-calculated table to stack
- vmovdqu `16*0-64`($ctx),%x#$T2
- and \$-512,%rsp
- vmovdqu `16*1-64`($ctx),%x#$T3
- vmovdqu `16*2-64`($ctx),%x#$T4
- vmovdqu `16*3-64`($ctx),%x#$D0
- vmovdqu `16*4-64`($ctx),%x#$D1
- vmovdqu `16*5-64`($ctx),%x#$D2
- lea 0x90(%rsp),%rax # size optimization
- vmovdqu `16*6-64`($ctx),%x#$D3
- vpermd $T2,$T0,$T2 # 00003412 -> 14243444
- vmovdqu `16*7-64`($ctx),%x#$D4
- vpermd $T3,$T0,$T3
- vmovdqu `16*8-64`($ctx),%x#$MASK
- vpermd $T4,$T0,$T4
- vmovdqa $T2,0x00(%rsp)
- vpermd $D0,$T0,$D0
- vmovdqa $T3,0x20-0x90(%rax)
- vpermd $D1,$T0,$D1
- vmovdqa $T4,0x40-0x90(%rax)
- vpermd $D2,$T0,$D2
- vmovdqa $D0,0x60-0x90(%rax)
- vpermd $D3,$T0,$D3
- vmovdqa $D1,0x80-0x90(%rax)
- vpermd $D4,$T0,$D4
- vmovdqa $D2,0xa0-0x90(%rax)
- vpermd $MASK,$T0,$MASK
- vmovdqa $D3,0xc0-0x90(%rax)
- vmovdqa $D4,0xe0-0x90(%rax)
- vmovdqa $MASK,0x100-0x90(%rax)
- vmovdqa 64(%rcx),$MASK # .Lmask26
-
- ################################################################
- # load input
- vmovdqu 16*0($inp),%x#$T0
- vmovdqu 16*1($inp),%x#$T1
- vinserti128 \$1,16*2($inp),$T0,$T0
- vinserti128 \$1,16*3($inp),$T1,$T1
- lea 16*4($inp),$inp
-
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpunpcklqdq $T3,$T2,$T2 # 2:3
- vpunpcklqdq $T1,$T0,$T0 # 0:1
-
- vpsrlq \$30,$T2,$T3
- vpsrlq \$4,$T2,$T2
- vpsrlq \$26,$T0,$T1
- vpsrlq \$40,$T4,$T4 # 4
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T0,$T0 # 0
- vpand $MASK,$T1,$T1 # 1
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- vpaddq $H2,$T2,$H2 # accumulate input
- sub \$64,$len
- jz .Ltail_avx2$suffix
- jmp .Loop_avx2$suffix
-
-.align 32
-.Loop_avx2$suffix:
- ################################################################
- # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
- # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
- # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
- # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
- # \________/\__________/
- ################################################################
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
- vmovdqa `32*0`(%rsp),$T0 # r0^4
- vpaddq $H1,$T1,$H1
- vmovdqa `32*1`(%rsp),$T1 # r1^4
- vpaddq $H3,$T3,$H3
- vmovdqa `32*3`(%rsp),$T2 # r2^4
- vpaddq $H4,$T4,$H4
- vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
- vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
-	# however, as h2 is "chronologically" the first one available,
-	# the corresponding operations are pulled up, so it's
- #
- # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
- # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
- # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
-
- vpmuludq $H2,$T0,$D2 # d2 = h2*r0
- vpmuludq $H2,$T1,$D3 # d3 = h2*r1
- vpmuludq $H2,$T2,$D4 # d4 = h2*r2
- vpmuludq $H2,$T3,$D0 # d0 = h2*s3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
-
- vpmuludq $H0,$T1,$T4 # h0*r1
- vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
- vpaddq $T4,$D1,$D1 # d1 += h0*r1
- vpaddq $H2,$D2,$D2 # d2 += h1*r1
- vpmuludq $H3,$T1,$T4 # h3*r1
- vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
- vpaddq $T4,$D4,$D4 # d4 += h3*r1
- vpaddq $H2,$D0,$D0 # d0 += h4*s1
- vmovdqa `32*4-0x90`(%rax),$T1 # s2
-
- vpmuludq $H0,$T0,$T4 # h0*r0
- vpmuludq $H1,$T0,$H2 # h1*r0
- vpaddq $T4,$D0,$D0 # d0 += h0*r0
- vpaddq $H2,$D1,$D1 # d1 += h1*r0
- vpmuludq $H3,$T0,$T4 # h3*r0
- vpmuludq $H4,$T0,$H2 # h4*r0
- vmovdqu 16*0($inp),%x#$T0 # load input
- vpaddq $T4,$D3,$D3 # d3 += h3*r0
- vpaddq $H2,$D4,$D4 # d4 += h4*r0
- vinserti128 \$1,16*2($inp),$T0,$T0
-
- vpmuludq $H3,$T1,$T4 # h3*s2
- vpmuludq $H4,$T1,$H2 # h4*s2
- vmovdqu 16*1($inp),%x#$T1
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
- vpaddq $H2,$D1,$D1 # d1 += h4*s2
- vmovdqa `32*5-0x90`(%rax),$H2 # r3
- vpmuludq $H1,$T2,$T4 # h1*r2
- vpmuludq $H0,$T2,$T2 # h0*r2
- vpaddq $T4,$D3,$D3 # d3 += h1*r2
- vpaddq $T2,$D2,$D2 # d2 += h0*r2
- vinserti128 \$1,16*3($inp),$T1,$T1
- lea 16*4($inp),$inp
-
- vpmuludq $H1,$H2,$T4 # h1*r3
- vpmuludq $H0,$H2,$H2 # h0*r3
- vpsrldq \$6,$T0,$T2 # splat input
- vpaddq $T4,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $H3,$T3,$T4 # h3*s3
- vpmuludq $H4,$T3,$H2 # h4*s3
- vpsrldq \$6,$T1,$T3
- vpaddq $T4,$D1,$D1 # d1 += h3*s3
- vpaddq $H2,$D2,$D2 # d2 += h4*s3
- vpunpckhqdq $T1,$T0,$T4 # 4
-
- vpmuludq $H3,$S4,$H3 # h3*s4
- vpmuludq $H4,$S4,$H4 # h4*s4
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
- vpunpcklqdq $T3,$T2,$T3 # 2:3
- vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
- vpmuludq $H1,$S4,$H0 # h1*s4
- vmovdqa 64(%rcx),$MASK # .Lmask26
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- ################################################################
- # lazy reduction (interleaved with tail of input splat)
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$4,$T3,$T2
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpand $MASK,$T2,$T2 # 2
- vpsrlq \$26,$T0,$T1
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpaddq $T2,$H2,$H2 # modulo-scheduled
- vpsrlq \$30,$T3,$T3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$40,$T4,$T4 # 4
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpand $MASK,$T0,$T0 # 0
- vpand $MASK,$T1,$T1 # 1
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- sub \$64,$len
- jnz .Loop_avx2$suffix
-
- .byte 0x66,0x90
-.Ltail_avx2$suffix:
- ################################################################
-	# while the above multiplications were by r^4 in all lanes, in
-	# the last iteration we multiply the least significant lane by
-	# r^4 and the most significant one by r, so this is a copy of
-	# the above except that references to the precomputed table are
-	# displaced by 4...
-
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
- vmovdqu `32*0+4`(%rsp),$T0 # r0^4
- vpaddq $H1,$T1,$H1
- vmovdqu `32*1+4`(%rsp),$T1 # r1^4
- vpaddq $H3,$T3,$H3
- vmovdqu `32*3+4`(%rsp),$T2 # r2^4
- vpaddq $H4,$T4,$H4
- vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
- vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
-
- vpmuludq $H2,$T0,$D2 # d2 = h2*r0
- vpmuludq $H2,$T1,$D3 # d3 = h2*r1
- vpmuludq $H2,$T2,$D4 # d4 = h2*r2
- vpmuludq $H2,$T3,$D0 # d0 = h2*s3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
-
- vpmuludq $H0,$T1,$T4 # h0*r1
- vpmuludq $H1,$T1,$H2 # h1*r1
- vpaddq $T4,$D1,$D1 # d1 += h0*r1
- vpaddq $H2,$D2,$D2 # d2 += h1*r1
- vpmuludq $H3,$T1,$T4 # h3*r1
- vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
- vpaddq $T4,$D4,$D4 # d4 += h3*r1
- vpaddq $H2,$D0,$D0 # d0 += h4*s1
-
- vpmuludq $H0,$T0,$T4 # h0*r0
- vpmuludq $H1,$T0,$H2 # h1*r0
- vpaddq $T4,$D0,$D0 # d0 += h0*r0
- vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
- vpaddq $H2,$D1,$D1 # d1 += h1*r0
- vpmuludq $H3,$T0,$T4 # h3*r0
- vpmuludq $H4,$T0,$H2 # h4*r0
- vpaddq $T4,$D3,$D3 # d3 += h3*r0
- vpaddq $H2,$D4,$D4 # d4 += h4*r0
-
- vpmuludq $H3,$T1,$T4 # h3*s2
- vpmuludq $H4,$T1,$H2 # h4*s2
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
- vpaddq $H2,$D1,$D1 # d1 += h4*s2
- vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
- vpmuludq $H1,$T2,$T4 # h1*r2
- vpmuludq $H0,$T2,$T2 # h0*r2
- vpaddq $T4,$D3,$D3 # d3 += h1*r2
- vpaddq $T2,$D2,$D2 # d2 += h0*r2
-
- vpmuludq $H1,$H2,$T4 # h1*r3
- vpmuludq $H0,$H2,$H2 # h0*r3
- vpaddq $T4,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $H3,$T3,$T4 # h3*s3
- vpmuludq $H4,$T3,$H2 # h4*s3
- vpaddq $T4,$D1,$D1 # d1 += h3*s3
- vpaddq $H2,$D2,$D2 # d2 += h4*s3
-
- vpmuludq $H3,$S4,$H3 # h3*s4
- vpmuludq $H4,$S4,$H4 # h4*s4
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
- vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
- vpmuludq $H1,$S4,$H0 # h1*s4
- vmovdqa 64(%rcx),$MASK # .Lmask26
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$D1,$T1
- vpsrldq \$8,$H2,$T2
- vpsrldq \$8,$H3,$T3
- vpsrldq \$8,$H4,$T4
- vpsrldq \$8,$H0,$T0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$H2,$H2
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
- vpaddq $T0,$H0,$H0
-
- vpermq \$0x2,$H3,$T3
- vpermq \$0x2,$H4,$T4
- vpermq \$0x2,$H0,$T0
- vpermq \$0x2,$D1,$T1
- vpermq \$0x2,$H2,$T2
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$H2,$H2
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
- vmovd %x#$H1,`4*1-48-64`($ctx)
- vmovd %x#$H2,`4*2-48-64`($ctx)
- vmovd %x#$H3,`4*3-48-64`($ctx)
- vmovd %x#$H4,`4*4-48-64`($ctx)
-___
-$code.=<<___ if ($win64);
- vmovdqa -0xb0(%r10),%xmm6
- vmovdqa -0xa0(%r10),%xmm7
- vmovdqa -0x90(%r10),%xmm8
- vmovdqa -0x80(%r10),%xmm9
- vmovdqa -0x70(%r10),%xmm10
- vmovdqa -0x60(%r10),%xmm11
- vmovdqa -0x50(%r10),%xmm12
- vmovdqa -0x40(%r10),%xmm13
- vmovdqa -0x30(%r10),%xmm14
- vmovdqa -0x20(%r10),%xmm15
- lea -8(%r10),%rsp
-.Ldo_avx2_epilogue$suffix:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- vzeroupper
- RET
-.cfi_endproc
-___
-if($avx > 2 && $avx512) {
-my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
-my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
-my $PADBIT="%zmm30";
-
-map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
-map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
-map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
-map(s/%y/%z/,($MASK));
-
-$code.=<<___;
-.cfi_startproc
-.Lblocks_avx512:
- mov \$15,%eax
- kmovw %eax,%k2
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- sub \$0x128,%rsp
-___
-$code.=<<___ if ($win64);
- lea 8(%rsp),%r10
- sub \$0x1c8,%rsp
- vmovdqa %xmm6,-0xb0(%r10)
- vmovdqa %xmm7,-0xa0(%r10)
- vmovdqa %xmm8,-0x90(%r10)
- vmovdqa %xmm9,-0x80(%r10)
- vmovdqa %xmm10,-0x70(%r10)
- vmovdqa %xmm11,-0x60(%r10)
- vmovdqa %xmm12,-0x50(%r10)
- vmovdqa %xmm13,-0x40(%r10)
- vmovdqa %xmm14,-0x30(%r10)
- vmovdqa %xmm15,-0x20(%r10)
-.Ldo_avx512_body:
-___
-$code.=<<___;
- lea .Lconst(%rip),%rcx
- lea 48+64($ctx),$ctx # size optimization
- vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
-
- # expand pre-calculated table
- vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
- and \$-512,%rsp
- vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
- mov \$0x20,%rax
- vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
- vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
- vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
- vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
- vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
- vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
- vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
- vpermd $D0,$T2,$R0 # 00003412 -> 14243444
- vpbroadcastq 64(%rcx),$MASK # .Lmask26
- vpermd $D1,$T2,$R1
- vpermd $T0,$T2,$S1
- vpermd $D2,$T2,$R2
- vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
- vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
- vpermd $T1,$T2,$S2
- vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
- vpsrlq \$32,$R1,$T1
- vpermd $D3,$T2,$R3
- vmovdqa64 $S1,0x40(%rsp){%k2}
- vpermd $T3,$T2,$S3
- vpermd $D4,$T2,$R4
- vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
- vpermd $T4,$T2,$S4
- vmovdqa64 $S2,0x80(%rsp){%k2}
- vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
- vmovdqa64 $S3,0xc0(%rsp){%k2}
- vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
- vmovdqa64 $S4,0x100(%rsp){%k2}
-
- ################################################################
- # calculate 5th through 8th powers of the key
- #
- # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
- # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
- # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
- # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
- # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
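-	#
-	# i.e. r^5..r^8 are obtained by multiplying the already computed
-	# r^1..r^4 by r^4, one power per lane position, with the same
-	# multiply-and-fold formulas that are used for a block.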
-
- vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
- vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
- vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
- vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
- vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
- vpsrlq \$32,$R2,$T2
-
- vpmuludq $T1,$S4,$M0
- vpmuludq $T1,$R0,$M1
- vpmuludq $T1,$R1,$M2
- vpmuludq $T1,$R2,$M3
- vpmuludq $T1,$R3,$M4
- vpsrlq \$32,$R3,$T3
- vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
- vpaddq $M1,$D1,$D1 # d1 += r1'*r0
- vpaddq $M2,$D2,$D2 # d2 += r1'*r1
- vpaddq $M3,$D3,$D3 # d3 += r1'*r2
- vpaddq $M4,$D4,$D4 # d4 += r1'*r3
-
- vpmuludq $T2,$S3,$M0
- vpmuludq $T2,$S4,$M1
- vpmuludq $T2,$R1,$M3
- vpmuludq $T2,$R2,$M4
- vpmuludq $T2,$R0,$M2
- vpsrlq \$32,$R4,$T4
- vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
- vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
- vpaddq $M3,$D3,$D3 # d3 += r2'*r1
- vpaddq $M4,$D4,$D4 # d4 += r2'*r2
- vpaddq $M2,$D2,$D2 # d2 += r2'*r0
-
- vpmuludq $T3,$S2,$M0
- vpmuludq $T3,$R0,$M3
- vpmuludq $T3,$R1,$M4
- vpmuludq $T3,$S3,$M1
- vpmuludq $T3,$S4,$M2
- vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
- vpaddq $M3,$D3,$D3 # d3 += r3'*r0
- vpaddq $M4,$D4,$D4 # d4 += r3'*r1
- vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
- vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
-
- vpmuludq $T4,$S4,$M3
- vpmuludq $T4,$R0,$M4
- vpmuludq $T4,$S1,$M0
- vpmuludq $T4,$S2,$M1
- vpmuludq $T4,$S3,$M2
- vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4
- vpaddq $M4,$D4,$D4 # d4 += r2'*r0
- vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1
- vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2
- vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3
-
- ################################################################
- # load input
- vmovdqu64 16*0($inp),%z#$T3
- vmovdqu64 16*4($inp),%z#$T4
- lea 16*8($inp),$inp
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$D3,$M3
- vpandq $MASK,$D3,$D3
- vpaddq $M3,$D4,$D4 # d3 -> d4
-
- vpsrlq \$26,$D0,$M0
- vpandq $MASK,$D0,$D0
- vpaddq $M0,$D1,$D1 # d0 -> d1
-
- vpsrlq \$26,$D4,$M4
- vpandq $MASK,$D4,$D4
-
- vpsrlq \$26,$D1,$M1
- vpandq $MASK,$D1,$D1
- vpaddq $M1,$D2,$D2 # d1 -> d2
-
- vpaddq $M4,$D0,$D0
- vpsllq \$2,$M4,$M4
- vpaddq $M4,$D0,$D0 # d4 -> d0
-
- vpsrlq \$26,$D2,$M2
- vpandq $MASK,$D2,$D2
- vpaddq $M2,$D3,$D3 # d2 -> d3
-
- vpsrlq \$26,$D0,$M0
- vpandq $MASK,$D0,$D0
- vpaddq $M0,$D1,$D1 # d0 -> d1
-
- vpsrlq \$26,$D3,$M3
- vpandq $MASK,$D3,$D3
- vpaddq $M3,$D4,$D4 # d3 -> d4
-
- ################################################################
- # at this point we have 14243444 in $R0-$S4 and 05060708 in
- # $D0-$D4, ...
-
- vpunpcklqdq $T4,$T3,$T0 # transpose input
- vpunpckhqdq $T4,$T3,$T4
-
-	# ... since the input 64-bit lanes are ordered as 73625140, we
-	# could "vperm" them to 76543210 (here and in each loop
-	# iteration), *or* we can simply go along with that order, hence
-	# the goal for $R0-$S4 is 1858286838784888 ...
-
- vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
- mov \$0x7777,%eax
- kmovw %eax,%k1
-
- vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
- vpermd $R1,$M0,$R1
- vpermd $R2,$M0,$R2
- vpermd $R3,$M0,$R3
- vpermd $R4,$M0,$R4
-
- vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
- vpermd $D1,$M0,${R1}{%k1}
- vpermd $D2,$M0,${R2}{%k1}
- vpermd $D3,$M0,${R3}{%k1}
- vpermd $D4,$M0,${R4}{%k1}
-
- vpslld \$2,$R1,$S1 # *5
- vpslld \$2,$R2,$S2
- vpslld \$2,$R3,$S3
- vpslld \$2,$R4,$S4
- vpaddd $R1,$S1,$S1
- vpaddd $R2,$S2,$S2
- vpaddd $R3,$S3,$S3
- vpaddd $R4,$S4,$S4
-
- vpbroadcastq 32(%rcx),$PADBIT # .L129
-
- vpsrlq \$52,$T0,$T2 # splat input
- vpsllq \$12,$T4,$T3
- vporq $T3,$T2,$T2
- vpsrlq \$26,$T0,$T1
- vpsrlq \$14,$T4,$T3
- vpsrlq \$40,$T4,$T4 # 4
- vpandq $MASK,$T2,$T2 # 2
- vpandq $MASK,$T0,$T0 # 0
- #vpandq $MASK,$T1,$T1 # 1
- #vpandq $MASK,$T3,$T3 # 3
- #vporq $PADBIT,$T4,$T4 # padbit, yes, always
-
- vpaddq $H2,$T2,$H2 # accumulate input
- sub \$192,$len
- jbe .Ltail_avx512
- jmp .Loop_avx512
-
-.align 32
-.Loop_avx512:
- ################################################################
- # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
- # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
- # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
- # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
- # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
- # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
- # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
- # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
- # \________/\___________/
- ################################################################
- #vpaddq $H2,$T2,$H2 # accumulate input
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
-	# however, as h2 is "chronologically" the first one available,
-	# the corresponding operations are pulled up, so it's
- #
- # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
- # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
- # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
- # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
- # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
-
- vpmuludq $H2,$R1,$D3 # d3 = h2*r1
- vpaddq $H0,$T0,$H0
- vpmuludq $H2,$R2,$D4 # d4 = h2*r2
- vpandq $MASK,$T1,$T1 # 1
- vpmuludq $H2,$S3,$D0 # d0 = h2*s3
- vpandq $MASK,$T3,$T3 # 3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
- vporq $PADBIT,$T4,$T4 # padbit, yes, always
- vpmuludq $H2,$R0,$D2 # d2 = h2*r0
- vpaddq $H1,$T1,$H1 # accumulate input
- vpaddq $H3,$T3,$H3
- vpaddq $H4,$T4,$H4
-
- vmovdqu64 16*0($inp),$T3 # load input
- vmovdqu64 16*4($inp),$T4
- lea 16*8($inp),$inp
- vpmuludq $H0,$R3,$M3
- vpmuludq $H0,$R4,$M4
- vpmuludq $H0,$R0,$M0
- vpmuludq $H0,$R1,$M1
- vpaddq $M3,$D3,$D3 # d3 += h0*r3
- vpaddq $M4,$D4,$D4 # d4 += h0*r4
- vpaddq $M0,$D0,$D0 # d0 += h0*r0
- vpaddq $M1,$D1,$D1 # d1 += h0*r1
-
- vpmuludq $H1,$R2,$M3
- vpmuludq $H1,$R3,$M4
- vpmuludq $H1,$S4,$M0
- vpmuludq $H0,$R2,$M2
- vpaddq $M3,$D3,$D3 # d3 += h1*r2
- vpaddq $M4,$D4,$D4 # d4 += h1*r3
- vpaddq $M0,$D0,$D0 # d0 += h1*s4
- vpaddq $M2,$D2,$D2 # d2 += h0*r2
-
- vpunpcklqdq $T4,$T3,$T0 # transpose input
- vpunpckhqdq $T4,$T3,$T4
-
- vpmuludq $H3,$R0,$M3
- vpmuludq $H3,$R1,$M4
- vpmuludq $H1,$R0,$M1
- vpmuludq $H1,$R1,$M2
- vpaddq $M3,$D3,$D3 # d3 += h3*r0
- vpaddq $M4,$D4,$D4 # d4 += h3*r1
- vpaddq $M1,$D1,$D1 # d1 += h1*r0
- vpaddq $M2,$D2,$D2 # d2 += h1*r1
-
- vpmuludq $H4,$S4,$M3
- vpmuludq $H4,$R0,$M4
- vpmuludq $H3,$S2,$M0
- vpmuludq $H3,$S3,$M1
- vpaddq $M3,$D3,$D3 # d3 += h4*s4
- vpmuludq $H3,$S4,$M2
- vpaddq $M4,$D4,$D4 # d4 += h4*r0
- vpaddq $M0,$D0,$D0 # d0 += h3*s2
- vpaddq $M1,$D1,$D1 # d1 += h3*s3
- vpaddq $M2,$D2,$D2 # d2 += h3*s4
-
- vpmuludq $H4,$S1,$M0
- vpmuludq $H4,$S2,$M1
- vpmuludq $H4,$S3,$M2
- vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
-	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
-	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
-
- ################################################################
- # lazy reduction (interleaved with input splat)
-
- vpsrlq \$52,$T0,$T2 # splat input
- vpsllq \$12,$T4,$T3
-
- vpsrlq \$26,$D3,$H3
- vpandq $MASK,$D3,$D3
- vpaddq $H3,$D4,$H4 # h3 -> h4
-
- vporq $T3,$T2,$T2
-
- vpsrlq \$26,$H0,$D0
- vpandq $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpandq $MASK,$T2,$T2 # 2
-
- vpsrlq \$26,$H4,$D4
- vpandq $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpandq $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpaddq $T2,$H2,$H2 # modulo-scheduled
- vpsrlq \$26,$T0,$T1
-
- vpsrlq \$26,$H2,$D2
- vpandq $MASK,$H2,$H2
- vpaddq $D2,$D3,$H3 # h2 -> h3
-
- vpsrlq \$14,$T4,$T3
-
- vpsrlq \$26,$H0,$D0
- vpandq $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$40,$T4,$T4 # 4
-
- vpsrlq \$26,$H3,$D3
- vpandq $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpandq $MASK,$T0,$T0 # 0
- #vpandq $MASK,$T1,$T1 # 1
- #vpandq $MASK,$T3,$T3 # 3
- #vporq $PADBIT,$T4,$T4 # padbit, yes, always
-
- sub \$128,$len
- ja .Loop_avx512
-
-.Ltail_avx512:
- ################################################################
-	# while the above multiplications were by r^8 in all lanes, in the
-	# last iteration we multiply the least significant lane by r^8 and
-	# the most significant one by r, which is why the table gets shifted...
-
- vpsrlq \$32,$R0,$R0 # 0105020603070408
- vpsrlq \$32,$R1,$R1
- vpsrlq \$32,$R2,$R2
- vpsrlq \$32,$S3,$S3
- vpsrlq \$32,$S4,$S4
- vpsrlq \$32,$R3,$R3
- vpsrlq \$32,$R4,$R4
- vpsrlq \$32,$S1,$S1
- vpsrlq \$32,$S2,$S2
-
- ################################################################
- # load either next or last 64 byte of input
- lea ($inp,$len),$inp
-
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
-
- vpmuludq $H2,$R1,$D3 # d3 = h2*r1
- vpmuludq $H2,$R2,$D4 # d4 = h2*r2
- vpmuludq $H2,$S3,$D0 # d0 = h2*s3
- vpandq $MASK,$T1,$T1 # 1
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
- vpandq $MASK,$T3,$T3 # 3
- vpmuludq $H2,$R0,$D2 # d2 = h2*r0
- vporq $PADBIT,$T4,$T4 # padbit, yes, always
- vpaddq $H1,$T1,$H1 # accumulate input
- vpaddq $H3,$T3,$H3
- vpaddq $H4,$T4,$H4
-
- vmovdqu 16*0($inp),%x#$T0
- vpmuludq $H0,$R3,$M3
- vpmuludq $H0,$R4,$M4
- vpmuludq $H0,$R0,$M0
- vpmuludq $H0,$R1,$M1
- vpaddq $M3,$D3,$D3 # d3 += h0*r3
- vpaddq $M4,$D4,$D4 # d4 += h0*r4
- vpaddq $M0,$D0,$D0 # d0 += h0*r0
- vpaddq $M1,$D1,$D1 # d1 += h0*r1
-
- vmovdqu 16*1($inp),%x#$T1
- vpmuludq $H1,$R2,$M3
- vpmuludq $H1,$R3,$M4
- vpmuludq $H1,$S4,$M0
- vpmuludq $H0,$R2,$M2
- vpaddq $M3,$D3,$D3 # d3 += h1*r2
- vpaddq $M4,$D4,$D4 # d4 += h1*r3
- vpaddq $M0,$D0,$D0 # d0 += h1*s4
- vpaddq $M2,$D2,$D2 # d2 += h0*r2
-
- vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
- vpmuludq $H3,$R0,$M3
- vpmuludq $H3,$R1,$M4
- vpmuludq $H1,$R0,$M1
- vpmuludq $H1,$R1,$M2
- vpaddq $M3,$D3,$D3 # d3 += h3*r0
- vpaddq $M4,$D4,$D4 # d4 += h3*r1
- vpaddq $M1,$D1,$D1 # d1 += h1*r0
- vpaddq $M2,$D2,$D2 # d2 += h1*r1
-
- vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
- vpmuludq $H4,$S4,$M3
- vpmuludq $H4,$R0,$M4
- vpmuludq $H3,$S2,$M0
- vpmuludq $H3,$S3,$M1
- vpmuludq $H3,$S4,$M2
- vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
- vpaddq $M4,$D4,$D4 # d4 += h4*r0
- vpaddq $M0,$D0,$D0 # d0 += h3*s2
- vpaddq $M1,$D1,$D1 # d1 += h3*s3
- vpaddq $M2,$D2,$D2 # d2 += h3*s4
-
- vpmuludq $H4,$S1,$M0
- vpmuludq $H4,$S2,$M1
- vpmuludq $H4,$S3,$M2
- vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
-	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
-	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
-
- ################################################################
- # horizontal addition
-
- mov \$1,%eax
- vpermq \$0xb1,$H3,$D3
- vpermq \$0xb1,$D4,$H4
- vpermq \$0xb1,$H0,$D0
- vpermq \$0xb1,$H1,$D1
- vpermq \$0xb1,$H2,$D2
- vpaddq $D3,$H3,$H3
- vpaddq $D4,$H4,$H4
- vpaddq $D0,$H0,$H0
- vpaddq $D1,$H1,$H1
- vpaddq $D2,$H2,$H2
-
- kmovw %eax,%k3
- vpermq \$0x2,$H3,$D3
- vpermq \$0x2,$H4,$D4
- vpermq \$0x2,$H0,$D0
- vpermq \$0x2,$H1,$D1
- vpermq \$0x2,$H2,$D2
- vpaddq $D3,$H3,$H3
- vpaddq $D4,$H4,$H4
- vpaddq $D0,$H0,$H0
- vpaddq $D1,$H1,$H1
- vpaddq $D2,$H2,$H2
-
- vextracti64x4 \$0x1,$H3,%y#$D3
- vextracti64x4 \$0x1,$H4,%y#$D4
- vextracti64x4 \$0x1,$H0,%y#$D0
- vextracti64x4 \$0x1,$H1,%y#$D1
- vextracti64x4 \$0x1,$H2,%y#$D2
- vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
- vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
- vpaddq $D0,$H0,${H0}{%k3}{z}
- vpaddq $D1,$H1,${H1}{%k3}{z}
- vpaddq $D2,$H2,${H2}{%k3}{z}
-___
-map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
-map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
-$code.=<<___;
- ################################################################
- # lazy reduction (interleaved with input splat)
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpunpcklqdq $T3,$T2,$T2 # 2:3
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpsrlq \$30,$T2,$T3
- vpsrlq \$4,$T2,$T2
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpsrlq \$26,$T0,$T1
- vpsrlq \$40,$T4,$T4 # 4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T0,$T0 # 0
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
- vpand $MASK,$T1,$T1 # 1
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
- add \$64,$len
- jnz .Ltail_avx2$suffix
-
- vpsubq $T2,$H2,$H2 # undo input accumulation
- vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
- vmovd %x#$H1,`4*1-48-64`($ctx)
- vmovd %x#$H2,`4*2-48-64`($ctx)
- vmovd %x#$H3,`4*3-48-64`($ctx)
- vmovd %x#$H4,`4*4-48-64`($ctx)
- vzeroall
-___
-$code.=<<___ if ($win64);
- movdqa -0xb0(%r10),%xmm6
- movdqa -0xa0(%r10),%xmm7
- movdqa -0x90(%r10),%xmm8
- movdqa -0x80(%r10),%xmm9
- movdqa -0x70(%r10),%xmm10
- movdqa -0x60(%r10),%xmm11
- movdqa -0x50(%r10),%xmm12
- movdqa -0x40(%r10),%xmm13
- movdqa -0x30(%r10),%xmm14
- movdqa -0x20(%r10),%xmm15
- lea -8(%r10),%rsp
-.Ldo_avx512_epilogue:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- RET
-.cfi_endproc
-___
-
-}
-
-}
-
-&declare_function("poly1305_blocks_avx2", 32, 4);
-poly1305_blocks_avxN(0);
-&end_function("poly1305_blocks_avx2");
-
-#######################################################################
-if ($avx>2) {
-# On entry we have an input length divisible by 64. But since the inner
-# loop processes 128 bytes per iteration, cases where the length is not
-# divisible by 128 are handled by passing the tail 64 bytes to .Ltail_avx2.
-# For this reason the stack layout is kept identical to poly1305_blocks_avx2.
-# If not for this tail, we wouldn't even have to allocate a stack frame...
-
-&declare_function("poly1305_blocks_avx512", 32, 4);
-poly1305_blocks_avxN(1);
-&end_function("poly1305_blocks_avx512");
-
-if (!$kernel && $avx>3) {
-########################################################################
-# VPMADD52 version using 2^44 radix.
-#
-# One can argue that base 2^52 would be more natural. Well, even though
-# some operations would be more natural, one has to recognize a couple of
-# things. Base 2^52 doesn't provide an advantage over base 2^44 if you look
-# at the number of multiply-and-accumulate operations. Secondly, it makes it
-# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
-# reference implementations], which means that more such operations would
-# have to be performed in the inner loop, which in turn makes the critical
-# path longer. In other words, even though base 2^44 reduction might look
-# less elegant, the overall critical path is actually shorter...
-
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int64 h[3]; # current hash value base 2^44
-# unsigned __int64 s[2]; # key value*20 base 2^44
-# unsigned __int64 r[3]; # key value base 2^44
-# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
-# # r^n positions reflect
-# # placement in register, not
-# # memory, R[3] is R[1]*20
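To make the layout above concrete, here is a hedged Python model of the 2^44 radix product (my own illustration, built only from the comments here): h and r are three limbs of 44, 44 and 42 bits, and the s[] entries are 20*r because a partial product at limb position 3 sits at 2^132 = 4 * 2^130, and 4 * 2^130 mod (2^130 - 5) = 20.

    P = (1 << 130) - 5
    M44 = (1 << 44) - 1

    def limbs44(x):
        return x & M44, (x >> 44) & M44, x >> 88      # 44, 44 and 42 bits

    def mul_base2_44(h, r):
        h0, h1, h2 = limbs44(h)
        r0, r1, r2 = limbs44(r)
        s1, s2 = 20 * r1, 20 * r2                     # "key value*20" from the layout
        d0 = h0*r0 + h1*s2 + h2*s1
        d1 = h0*r1 + h1*r0 + h2*s2
        d2 = h0*r2 + h1*r1 + h2*r0
        return d0 + (d1 << 44) + (d2 << 88)

    h = (1 << 129) + 0xdeadbeef
    r = (1 << 101) + 0x1234567
    assert mul_base2_44(h, r) % P == (h * r) % P

Only two of the three r limbs need a *20 companion, matching the two s[] slots in the layout above.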
-
-$code.=<<___;
-.type poly1305_init_base2_44,\@function,3
-.align 32
-poly1305_init_base2_44:
- xor %eax,%eax
- mov %rax,0($ctx) # initialize hash value
- mov %rax,8($ctx)
- mov %rax,16($ctx)
-
-.Linit_base2_44:
- lea poly1305_blocks_vpmadd52(%rip),%r10
- lea poly1305_emit_base2_44(%rip),%r11
-
- mov \$0x0ffffffc0fffffff,%rax
- mov \$0x0ffffffc0ffffffc,%rcx
- and 0($inp),%rax
- mov \$0x00000fffffffffff,%r8
- and 8($inp),%rcx
- mov \$0x00000fffffffffff,%r9
- and %rax,%r8
- shrd \$44,%rcx,%rax
- mov %r8,40($ctx) # r0
- and %r9,%rax
- shr \$24,%rcx
- mov %rax,48($ctx) # r1
- lea (%rax,%rax,4),%rax # *5
- mov %rcx,56($ctx) # r2
- shl \$2,%rax # magic <<2
- lea (%rcx,%rcx,4),%rcx # *5
- shl \$2,%rcx # magic <<2
- mov %rax,24($ctx) # s1
- mov %rcx,32($ctx) # s2
- movq \$-1,64($ctx) # write impossible value
-___
-$code.=<<___ if ($flavour !~ /elf32/);
- mov %r10,0(%rdx)
- mov %r11,8(%rdx)
-___
-$code.=<<___ if ($flavour =~ /elf32/);
- mov %r10d,0(%rdx)
- mov %r11d,4(%rdx)
-___
-$code.=<<___;
- mov \$1,%eax
- RET
-.size poly1305_init_base2_44,.-poly1305_init_base2_44
-___
-{
-my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
-my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
-my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
-
-$code.=<<___;
-.type poly1305_blocks_vpmadd52,\@function,4
-.align 32
-poly1305_blocks_vpmadd52:
- shr \$4,$len
- jz .Lno_data_vpmadd52 # too short
-
- shl \$40,$padbit
- mov 64($ctx),%r8 # peek on power of the key
-
-	# if the powers of the key have not been calculated yet, process up
-	# to 3 blocks with this single-block subroutine, otherwise ensure
-	# that the length is divisible by 2 blocks and pass the rest down to
-	# the next subroutine...
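	A rough Python model of the dispatch just below (purely illustrative): rax starts at 3, is forced to 1 when the input is at least 4 blocks long or the key powers already exist, and is then ANDed with the block count; the result is how many blocks the single-block loop consumes so that the 4x path is left with an even number of blocks.

	    def single_block_count(nblocks, powers_ready):
	        limit = 1 if (nblocks >= 4 or powers_ready) else 3
	        return limit & nblocks

	    assert single_block_count(3, False) == 3   # short input, no powers: all 1x
	    assert single_block_count(5, True)  == 1   # peel one block, leave an even count
	    assert single_block_count(8, True)  == 0   # straight to the 4x code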
-
- mov \$3,%rax
- mov \$1,%r10
- cmp \$4,$len # is input long
- cmovae %r10,%rax
- test %r8,%r8 # is power value impossible?
- cmovns %r10,%rax
-
- and $len,%rax # is input of favourable length?
- jz .Lblocks_vpmadd52_4x
-
- sub %rax,$len
- mov \$7,%r10d
- mov \$1,%r11d
- kmovw %r10d,%k7
- lea .L2_44_inp_permd(%rip),%r10
- kmovw %r11d,%k1
-
- vmovq $padbit,%x#$PAD
- vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
- vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
- vpermq \$0xcf,$PAD,$PAD
- vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
-
- vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
- vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
- vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
- vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
-
- vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
- vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
-
- jmp .Loop_vpmadd52
-
-.align 32
-.Loop_vpmadd52:
- vmovdqu32 0($inp),%x#$T0 # load input as ----3210
- lea 16($inp),$inp
-
- vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
- vpsrlvq $inp_shift,$T0,$T0
- vpandq $reduc_mask,$T0,$T0
- vporq $PAD,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo # accumulate input
-
- vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
- vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
- vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
-
- vpxord $Dlo,$Dlo,$Dlo
- vpxord $Dhi,$Dhi,$Dhi
-
- vpmadd52luq $r2r1r0,$H0,$Dlo
- vpmadd52huq $r2r1r0,$H0,$Dhi
-
- vpmadd52luq $r1r0s2,$H1,$Dlo
- vpmadd52huq $r1r0s2,$H1,$Dhi
-
- vpmadd52luq $r0s2s1,$H2,$Dlo
- vpmadd52huq $r0s2s1,$H2,$Dhi
-
- vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
- vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
- vpandq $reduc_mask,$Dlo,$Dlo
-
- vpaddq $T0,$Dhi,$Dhi
-
- vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
-
- vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
-
- vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
- vpandq $reduc_mask,$Dlo,$Dlo
-
- vpermq \$0b10010011,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo
-
- vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
-
- vpaddq $T0,$Dlo,$Dlo
- vpsllq \$2,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo
-
- dec %rax # len-=16
- jnz .Loop_vpmadd52
-
- vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
-
- test $len,$len
- jnz .Lblocks_vpmadd52_4x
-
-.Lno_data_vpmadd52:
- RET
-.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
-___
-}
-{
-########################################################################
-# As implied by its name, the 4x subroutine processes 4 blocks in parallel
-# (though it even handles 4*n+2 block lengths). It takes up to the 4th key
-# power and works in 256-bit %ymm registers.
-
-my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
-my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
-my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
-
-$code.=<<___;
-.type poly1305_blocks_vpmadd52_4x,\@function,4
-.align 32
-poly1305_blocks_vpmadd52_4x:
- shr \$4,$len
- jz .Lno_data_vpmadd52_4x # too short
-
- shl \$40,$padbit
- mov 64($ctx),%r8 # peek on power of the key
-
-.Lblocks_vpmadd52_4x:
- vpbroadcastq $padbit,$PAD
-
- vmovdqa64 .Lx_mask44(%rip),$mask44
- mov \$5,%eax
- vmovdqa64 .Lx_mask42(%rip),$mask42
- kmovw %eax,%k1 # used in 2x path
-
- test %r8,%r8 # is power value impossible?
- js .Linit_vpmadd52 # if it is, then init R[4]
-
- vmovq 0($ctx),%x#$H0 # load current hash value
- vmovq 8($ctx),%x#$H1
- vmovq 16($ctx),%x#$H2
-
- test \$3,$len # is length 4*n+2?
- jnz .Lblocks_vpmadd52_2x_do
-
-.Lblocks_vpmadd52_4x_do:
- vpbroadcastq 64($ctx),$R0 # load 4th power of the key
- vpbroadcastq 96($ctx),$R1
- vpbroadcastq 128($ctx),$R2
- vpbroadcastq 160($ctx),$S1
-
-.Lblocks_vpmadd52_4x_key_loaded:
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S2,$S2
-
- test \$7,$len # is len 8*n?
- jz .Lblocks_vpmadd52_8x
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*2($inp),$T3
- lea 16*4($inp),$inp
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
-
- # at this point 64-bit lanes are ordered as 3-1-2-0
-
- vpsrlq \$24,$T3,$T2 # splat the data
- vporq $PAD,$T2,$T2
- vpaddq $T2,$H2,$H2 # accumulate input
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- sub \$4,$len
- jz .Ltail_vpmadd52_4x
- jmp .Loop_vpmadd52_4x
- ud2
-
-.align 32
-.Linit_vpmadd52:
- vmovq 24($ctx),%x#$S1 # load key
- vmovq 56($ctx),%x#$H2
- vmovq 32($ctx),%x#$S2
- vmovq 40($ctx),%x#$R0
- vmovq 48($ctx),%x#$R1
-
- vmovdqa $R0,$H0
- vmovdqa $R1,$H1
- vmovdqa $H2,$R2
-
- mov \$2,%eax
-
-.Lmul_init_vpmadd52:
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- dec %eax
- jz .Ldone_init_vpmadd52
-
- vpunpcklqdq $R1,$H1,$R1 # 1,2
- vpbroadcastq %x#$H1,%x#$H1 # 2,2
- vpunpcklqdq $R2,$H2,$R2
- vpbroadcastq %x#$H2,%x#$H2
- vpunpcklqdq $R0,$H0,$R0
- vpbroadcastq %x#$H0,%x#$H0
-
- vpsllq \$2,$R1,$S1 # S1 = R1*5*4
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R1,$S1,$S1
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S1,$S1
- vpsllq \$2,$S2,$S2
-
- jmp .Lmul_init_vpmadd52
- ud2
-
-.align 32
-.Ldone_init_vpmadd52:
- vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
- vinserti128 \$1,%x#$R2,$H2,$R2
- vinserti128 \$1,%x#$R0,$H0,$R0
-
- vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
- vpermq \$0b11011000,$R2,$R2
- vpermq \$0b11011000,$R0,$R0
-
- vpsllq \$2,$R1,$S1 # S1 = R1*5*4
- vpaddq $R1,$S1,$S1
- vpsllq \$2,$S1,$S1
-
- vmovq 0($ctx),%x#$H0 # load current hash value
- vmovq 8($ctx),%x#$H1
- vmovq 16($ctx),%x#$H2
-
- test \$3,$len # is length 4*n+2?
- jnz .Ldone_init_vpmadd52_2x
-
- vmovdqu64 $R0,64($ctx) # save key powers
- vpbroadcastq %x#$R0,$R0 # broadcast 4th power
- vmovdqu64 $R1,96($ctx)
- vpbroadcastq %x#$R1,$R1
- vmovdqu64 $R2,128($ctx)
- vpbroadcastq %x#$R2,$R2
- vmovdqu64 $S1,160($ctx)
- vpbroadcastq %x#$S1,$S1
-
- jmp .Lblocks_vpmadd52_4x_key_loaded
- ud2
-
-.align 32
-.Ldone_init_vpmadd52_2x:
- vmovdqu64 $R0,64($ctx) # save key powers
- vpsrldq \$8,$R0,$R0 # 0-1-0-2
- vmovdqu64 $R1,96($ctx)
- vpsrldq \$8,$R1,$R1
- vmovdqu64 $R2,128($ctx)
- vpsrldq \$8,$R2,$R2
- vmovdqu64 $S1,160($ctx)
- vpsrldq \$8,$S1,$S1
- jmp .Lblocks_vpmadd52_2x_key_loaded
- ud2
-
-.align 32
-.Lblocks_vpmadd52_2x_do:
- vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
- vmovdqu64 160+8($ctx),${S1}{%k1}{z}
- vmovdqu64 64+8($ctx),${R0}{%k1}{z}
- vmovdqu64 96+8($ctx),${R1}{%k1}{z}
-
-.Lblocks_vpmadd52_2x_key_loaded:
- vmovdqu64 16*0($inp),$T2 # load data
- vpxorq $T3,$T3,$T3
- lea 16*2($inp),$inp
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
-
- # at this point 64-bit lanes are ordered as x-1-x-0
-
- vpsrlq \$24,$T3,$T2 # splat the data
- vporq $PAD,$T2,$T2
- vpaddq $T2,$H2,$H2 # accumulate input
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- jmp .Ltail_vpmadd52_2x
- ud2
-
-.align 32
-.Loop_vpmadd52_4x:
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*2($inp),$T3
- lea 16*4($inp),$inp
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # partial reduction (interleaved with data splat)
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpsrlq \$24,$T3,$T2
- vporq $PAD,$T2,$T2
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- sub \$4,$len # len-=64
- jnz .Loop_vpmadd52_4x
-
-.Ltail_vpmadd52_4x:
- vmovdqu64 128($ctx),$R2 # load all key powers
- vmovdqu64 160($ctx),$S1
- vmovdqu64 64($ctx),$R0
- vmovdqu64 96($ctx),$R1
-
-.Ltail_vpmadd52_2x:
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S2,$S2
-
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # horizontal addition
-
- mov \$1,%eax
- kmovw %eax,%k1
- vpsrldq \$8,$D0lo,$T0
- vpsrldq \$8,$D0hi,$H0
- vpsrldq \$8,$D1lo,$T1
- vpsrldq \$8,$D1hi,$H1
- vpaddq $T0,$D0lo,$D0lo
- vpaddq $H0,$D0hi,$D0hi
- vpsrldq \$8,$D2lo,$T2
- vpsrldq \$8,$D2hi,$H2
- vpaddq $T1,$D1lo,$D1lo
- vpaddq $H1,$D1hi,$D1hi
- vpermq \$0x2,$D0lo,$T0
- vpermq \$0x2,$D0hi,$H0
- vpaddq $T2,$D2lo,$D2lo
- vpaddq $H2,$D2hi,$D2hi
-
- vpermq \$0x2,$D1lo,$T1
- vpermq \$0x2,$D1hi,$H1
- vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
- vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
- vpermq \$0x2,$D2lo,$T2
- vpermq \$0x2,$D2hi,$H2
- vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
- vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
- vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
- vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
- # at this point $len is
- # either 4*n+2 or 0...
- sub \$2,$len # len-=32
- ja .Lblocks_vpmadd52_4x_do
-
- vmovq %x#$H0,0($ctx)
- vmovq %x#$H1,8($ctx)
- vmovq %x#$H2,16($ctx)
- vzeroall
-
-.Lno_data_vpmadd52_4x:
- RET
-.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
-___
-}
-{
-########################################################################
-# As implied by its name, the 8x subroutine processes 8 blocks in parallel...
-# This is an intermediate version, as it's used only in cases when the input
-# length is either 8*n, 8*n+1 or 8*n+2...
-
-my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
-my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
-my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
-my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
-
-$code.=<<___;
-.type poly1305_blocks_vpmadd52_8x,\@function,4
-.align 32
-poly1305_blocks_vpmadd52_8x:
- shr \$4,$len
- jz .Lno_data_vpmadd52_8x # too short
-
- shl \$40,$padbit
- mov 64($ctx),%r8 # peek on power of the key
-
- vmovdqa64 .Lx_mask44(%rip),$mask44
- vmovdqa64 .Lx_mask42(%rip),$mask42
-
- test %r8,%r8 # is power value impossible?
- js .Linit_vpmadd52 # if it is, then init R[4]
-
- vmovq 0($ctx),%x#$H0 # load current hash value
- vmovq 8($ctx),%x#$H1
- vmovq 16($ctx),%x#$H2
-
-.Lblocks_vpmadd52_8x:
- ################################################################
-	# first we calculate more key powers
-
- vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
- vmovdqu64 160($ctx),$S1
- vmovdqu64 64($ctx),$R0
- vmovdqu64 96($ctx),$R1
-
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S2,$S2
-
- vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
- vpbroadcastq %x#$R0,$RR0
- vpbroadcastq %x#$R1,$RR1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $RR2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $RR2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $RR2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $RR2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $RR2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $RR2,$R0,$D2hi
-
- vpmadd52luq $RR0,$R0,$D0lo
- vpmadd52huq $RR0,$R0,$D0hi
- vpmadd52luq $RR0,$R1,$D1lo
- vpmadd52huq $RR0,$R1,$D1hi
- vpmadd52luq $RR0,$R2,$D2lo
- vpmadd52huq $RR0,$R2,$D2hi
-
- vpmadd52luq $RR1,$S2,$D0lo
- vpmadd52huq $RR1,$S2,$D0hi
- vpmadd52luq $RR1,$R0,$D1lo
- vpmadd52huq $RR1,$R0,$D1hi
- vpmadd52luq $RR1,$R1,$D2lo
- vpmadd52huq $RR1,$R1,$D2hi
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$RR0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$RR1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$RR2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$RR0,$RR0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$RR0,$RR0
-
- vpsrlq \$44,$RR0,$tmp # additional step
- vpandq $mask44,$RR0,$RR0
-
- vpaddq $tmp,$RR1,$RR1
-
- ################################################################
- # At this point Rx holds 1324 powers, RRx - 5768, and the goal
- # is 15263748, which reflects how data is loaded...
-
- vpunpcklqdq $R2,$RR2,$T2 # 3748
- vpunpckhqdq $R2,$RR2,$R2 # 1526
- vpunpcklqdq $R0,$RR0,$T0
- vpunpckhqdq $R0,$RR0,$R0
- vpunpcklqdq $R1,$RR1,$T1
- vpunpckhqdq $R1,$RR1,$R1
-___
-######## switch to %zmm
-map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
-map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
-map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
-map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
-
-$code.=<<___;
- vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
- vshufi64x2 \$0x44,$R0,$T0,$RR0
- vshufi64x2 \$0x44,$R1,$T1,$RR1
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*4($inp),$T3
- lea 16*8($inp),$inp
-
- vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
- vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
- vpaddq $RR2,$SS2,$SS2
- vpaddq $RR1,$SS1,$SS1
- vpsllq \$2,$SS2,$SS2
- vpsllq \$2,$SS1,$SS1
-
- vpbroadcastq $padbit,$PAD
- vpbroadcastq %x#$mask44,$mask44
- vpbroadcastq %x#$mask42,$mask42
-
- vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
- vpbroadcastq %x#$SS2,$S2
- vpbroadcastq %x#$RR0,$R0
- vpbroadcastq %x#$RR1,$R1
- vpbroadcastq %x#$RR2,$R2
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
-
- # at this point 64-bit lanes are ordered as 73625140
-
- vpsrlq \$24,$T3,$T2 # splat the data
- vporq $PAD,$T2,$T2
- vpaddq $T2,$H2,$H2 # accumulate input
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- sub \$8,$len
- jz .Ltail_vpmadd52_8x
- jmp .Loop_vpmadd52_8x
-
-.align 32
-.Loop_vpmadd52_8x:
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*4($inp),$T3
- lea 16*8($inp),$inp
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # partial reduction (interleaved with data splat)
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpsrlq \$24,$T3,$T2
- vporq $PAD,$T2,$T2
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- sub \$8,$len # len-=128
- jnz .Loop_vpmadd52_8x
-
-.Ltail_vpmadd52_8x:
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$SS1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$SS1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$SS2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$SS2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$RR0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$RR0,$D2hi
-
- vpmadd52luq $H0,$RR0,$D0lo
- vpmadd52huq $H0,$RR0,$D0hi
- vpmadd52luq $H0,$RR1,$D1lo
- vpmadd52huq $H0,$RR1,$D1hi
- vpmadd52luq $H0,$RR2,$D2lo
- vpmadd52huq $H0,$RR2,$D2hi
-
- vpmadd52luq $H1,$SS2,$D0lo
- vpmadd52huq $H1,$SS2,$D0hi
- vpmadd52luq $H1,$RR0,$D1lo
- vpmadd52huq $H1,$RR0,$D1hi
- vpmadd52luq $H1,$RR1,$D2lo
- vpmadd52huq $H1,$RR1,$D2hi
-
- ################################################################
- # horizontal addition
-
- mov \$1,%eax
- kmovw %eax,%k1
- vpsrldq \$8,$D0lo,$T0
- vpsrldq \$8,$D0hi,$H0
- vpsrldq \$8,$D1lo,$T1
- vpsrldq \$8,$D1hi,$H1
- vpaddq $T0,$D0lo,$D0lo
- vpaddq $H0,$D0hi,$D0hi
- vpsrldq \$8,$D2lo,$T2
- vpsrldq \$8,$D2hi,$H2
- vpaddq $T1,$D1lo,$D1lo
- vpaddq $H1,$D1hi,$D1hi
- vpermq \$0x2,$D0lo,$T0
- vpermq \$0x2,$D0hi,$H0
- vpaddq $T2,$D2lo,$D2lo
- vpaddq $H2,$D2hi,$D2hi
-
- vpermq \$0x2,$D1lo,$T1
- vpermq \$0x2,$D1hi,$H1
- vpaddq $T0,$D0lo,$D0lo
- vpaddq $H0,$D0hi,$D0hi
- vpermq \$0x2,$D2lo,$T2
- vpermq \$0x2,$D2hi,$H2
- vpaddq $T1,$D1lo,$D1lo
- vpaddq $H1,$D1hi,$D1hi
- vextracti64x4 \$1,$D0lo,%y#$T0
- vextracti64x4 \$1,$D0hi,%y#$H0
- vpaddq $T2,$D2lo,$D2lo
- vpaddq $H2,$D2hi,$D2hi
-
- vextracti64x4 \$1,$D1lo,%y#$T1
- vextracti64x4 \$1,$D1hi,%y#$H1
- vextracti64x4 \$1,$D2lo,%y#$T2
- vextracti64x4 \$1,$D2hi,%y#$H2
-___
-######## switch back to %ymm
-map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
-map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
-map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
-
-$code.=<<___;
- vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
- vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
- vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
- vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
- vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
- vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- ################################################################
-
- vmovq %x#$H0,0($ctx)
- vmovq %x#$H1,8($ctx)
- vmovq %x#$H2,16($ctx)
- vzeroall
-
-.Lno_data_vpmadd52_8x:
- RET
-.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
-___
-}
-$code.=<<___;
-.type poly1305_emit_base2_44,\@function,3
-.align 32
-poly1305_emit_base2_44:
- mov 0($ctx),%r8 # load hash value
- mov 8($ctx),%r9
- mov 16($ctx),%r10
-
- mov %r9,%rax
- shr \$20,%r9
- shl \$44,%rax
- mov %r10,%rcx
- shr \$40,%r10
- shl \$24,%rcx
-
- add %rax,%r8
- adc %rcx,%r9
- adc \$0,%r10
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- RET
-.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
-___
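For reference, a minimal Python sketch of what poly1305_emit_base2_44 computes, assuming the hash value is already partially reduced (this is an illustration, not the kernel interface):

    def emit_base2_44(h0, h1, h2, nonce):
        h = h0 + (h1 << 44) + (h2 << 88)        # recombine the 44+44+42-bit limbs
        g = h + 5
        if g >> 130:                            # h >= 2^130 - 5, take the reduced value
            h = g                               # bit 130 falls off in the 128-bit mask below
        return (h + nonce) & ((1 << 128) - 1)   # accumulate the 128-bit nonce

    assert emit_base2_44((1 << 44) - 1, 0, 0, 1) == 1 << 44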
-} } }
-}
-
-if (!$kernel)
-{ # chacha20-poly1305 helpers
-my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
- ("%rdi","%rsi","%rdx","%rcx"); # Unix order
-$code.=<<___;
-.globl xor128_encrypt_n_pad
-.type xor128_encrypt_n_pad,\@abi-omnipotent
-.align 16
-xor128_encrypt_n_pad:
- sub $otp,$inp
- sub $otp,$out
- mov $len,%r10 # put len aside
- shr \$4,$len # len / 16
- jz .Ltail_enc
- nop
-.Loop_enc_xmm:
- movdqu ($inp,$otp),%xmm0
- pxor ($otp),%xmm0
- movdqu %xmm0,($out,$otp)
- movdqa %xmm0,($otp)
- lea 16($otp),$otp
- dec $len
- jnz .Loop_enc_xmm
-
- and \$15,%r10 # len % 16
- jz .Ldone_enc
-
-.Ltail_enc:
- mov \$16,$len
- sub %r10,$len
- xor %eax,%eax
-.Loop_enc_byte:
- mov ($inp,$otp),%al
- xor ($otp),%al
- mov %al,($out,$otp)
- mov %al,($otp)
- lea 1($otp),$otp
- dec %r10
- jnz .Loop_enc_byte
-
- xor %eax,%eax
-.Loop_enc_pad:
- mov %al,($otp)
- lea 1($otp),$otp
- dec $len
- jnz .Loop_enc_pad
-
-.Ldone_enc:
- mov $otp,%rax
- RET
-.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
-
-.globl xor128_decrypt_n_pad
-.type xor128_decrypt_n_pad,\@abi-omnipotent
-.align 16
-xor128_decrypt_n_pad:
- sub $otp,$inp
- sub $otp,$out
- mov $len,%r10 # put len aside
- shr \$4,$len # len / 16
- jz .Ltail_dec
- nop
-.Loop_dec_xmm:
- movdqu ($inp,$otp),%xmm0
- movdqa ($otp),%xmm1
- pxor %xmm0,%xmm1
- movdqu %xmm1,($out,$otp)
- movdqa %xmm0,($otp)
- lea 16($otp),$otp
- dec $len
- jnz .Loop_dec_xmm
-
- pxor %xmm1,%xmm1
- and \$15,%r10 # len % 16
- jz .Ldone_dec
-
-.Ltail_dec:
- mov \$16,$len
- sub %r10,$len
- xor %eax,%eax
- xor %r11d,%r11d
-.Loop_dec_byte:
- mov ($inp,$otp),%r11b
- mov ($otp),%al
- xor %r11b,%al
- mov %al,($out,$otp)
- mov %r11b,($otp)
- lea 1($otp),$otp
- dec %r10
- jnz .Loop_dec_byte
-
- xor %eax,%eax
-.Loop_dec_pad:
- mov %al,($otp)
- lea 1($otp),$otp
- dec $len
- jnz .Loop_dec_pad
-
-.Ldone_dec:
- mov $otp,%rax
- RET
-.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
-___
-}
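Functionally, both helpers XOR the data with what is presumably the ChaCha20 key stream held in the $otp buffer and leave the ciphertext, zero-padded to a 16-byte boundary, in that same buffer so it can be fed to Poly1305. A rough Python model of that behaviour (the real routines work in place and return a pointer just past the padding; the names here are illustrative):

    def xor128_crypt_n_pad(keystream, data, encrypt):
        out = bytes(d ^ k for d, k in zip(data, keystream))
        ct = out if encrypt else data                  # the MAC always covers ciphertext
        buf = ct + b"\x00" * (-len(ct) % 16)           # zero-pad to a 16-byte boundary
        return out, buf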
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-# CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern __imp_RtlVirtualUnwind
-.type se_handler,\@abi-omnipotent
-.align 16
-se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lcommon_seh_tail
-
- lea 48(%rax),%rax
-
- mov -8(%rax),%rbx
- mov -16(%rax),%rbp
- mov -24(%rax),%r12
- mov -32(%rax),%r13
- mov -40(%rax),%r14
- mov -48(%rax),%r15
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
-	mov %r15,240($context) # restore context->R15
-
- jmp .Lcommon_seh_tail
-.size se_handler,.-se_handler
-
-.type avx_handler,\@abi-omnipotent
-.align 16
-avx_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lcommon_seh_tail
-
- mov 208($context),%rax # pull context->R11
-
- lea 0x50(%rax),%rsi
- lea 0xf8(%rax),%rax
- lea 512($context),%rdi # &context.Xmm6
- mov \$20,%ecx
- .long 0xa548f3fc # cld; rep movsq
-
-.Lcommon_seh_tail:
- mov 8(%rax),%rdi
- mov 16(%rax),%rsi
- mov %rax,152($context) # restore context->Rsp
- mov %rsi,168($context) # restore context->Rsi
- mov %rdi,176($context) # restore context->Rdi
-
- mov 40($disp),%rdi # disp->ContextRecord
- mov $context,%rsi # context
- mov \$154,%ecx # sizeof(CONTEXT)
- .long 0xa548f3fc # cld; rep movsq
-
- mov $disp,%rsi
- xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
- mov 40(%rsi),%r10 # disp->ContextRecord
- lea 56(%rsi),%r11 # &disp->HandlerData
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
- mov %r10,32(%rsp) # arg5
- mov %r11,40(%rsp) # arg6
- mov %r12,48(%rsp) # arg7
- mov %rcx,56(%rsp) # arg8, (NULL)
- call *__imp_RtlVirtualUnwind(%rip)
-
- mov \$1,%eax # ExceptionContinueSearch
- add \$64,%rsp
- popfq
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- pop %rdi
- pop %rsi
- RET
-.size avx_handler,.-avx_handler
-
-.section .pdata
-.align 4
- .rva .LSEH_begin_poly1305_block_init_arch
- .rva .LSEH_end_poly1305_block_init_arch
- .rva .LSEH_info_poly1305_block_init_arch
-
- .rva .LSEH_begin_poly1305_blocks_x86_64
- .rva .LSEH_end_poly1305_blocks_x86_64
- .rva .LSEH_info_poly1305_blocks_x86_64
-
- .rva .LSEH_begin_poly1305_emit_x86_64
- .rva .LSEH_end_poly1305_emit_x86_64
- .rva .LSEH_info_poly1305_emit_x86_64
-___
-$code.=<<___ if ($avx);
- .rva .LSEH_begin_poly1305_blocks_avx
- .rva .Lbase2_64_avx
- .rva .LSEH_info_poly1305_blocks_avx_1
-
- .rva .Lbase2_64_avx
- .rva .Leven_avx
- .rva .LSEH_info_poly1305_blocks_avx_2
-
- .rva .Leven_avx
- .rva .LSEH_end_poly1305_blocks_avx
- .rva .LSEH_info_poly1305_blocks_avx_3
-
- .rva .LSEH_begin_poly1305_emit_avx
- .rva .LSEH_end_poly1305_emit_avx
- .rva .LSEH_info_poly1305_emit_avx
-___
-$code.=<<___ if ($avx>1);
- .rva .LSEH_begin_poly1305_blocks_avx2
- .rva .Lbase2_64_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_1
-
- .rva .Lbase2_64_avx2
- .rva .Leven_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_2
-
- .rva .Leven_avx2
- .rva .LSEH_end_poly1305_blocks_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_3
-___
-$code.=<<___ if ($avx>2);
- .rva .LSEH_begin_poly1305_blocks_avx512
- .rva .LSEH_end_poly1305_blocks_avx512
- .rva .LSEH_info_poly1305_blocks_avx512
-___
-$code.=<<___;
-.section .xdata
-.align 8
-.LSEH_info_poly1305_block_init_arch:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch
-
-.LSEH_info_poly1305_blocks_x86_64:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_body,.Lblocks_epilogue
-
-.LSEH_info_poly1305_emit_x86_64:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
-___
-$code.=<<___ if ($avx);
-.LSEH_info_poly1305_blocks_avx_1:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx_2:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx_3:
- .byte 9,0,0,0
- .rva avx_handler
- .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_emit_avx:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
-___
-$code.=<<___ if ($avx>1);
-.LSEH_info_poly1305_blocks_avx2_1:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx2_2:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx2_3:
- .byte 9,0,0,0
- .rva avx_handler
- .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
-___
-$code.=<<___ if ($avx>2);
-.LSEH_info_poly1305_blocks_avx512:
- .byte 9,0,0,0
- .rva avx_handler
- .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
-___
-}
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-foreach (split('\n',$code)) {
- s/\`([^\`]*)\`/eval($1)/ge;
- s/%r([a-z]+)#d/%e$1/g;
- s/%r([0-9]+)#d/%r$1d/g;
- s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
-
- if ($kernel) {
- s/(^\.type.*),[0-9]+$/\1/;
- s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
- next if /^\.cfi.*/;
- }
-
- print $_,"\n";
-}
-close STDOUT;
diff --git a/arch/x86/lib/crypto/poly1305_glue.c b/arch/x86/lib/crypto/poly1305_glue.c
deleted file mode 100644
index b7e78a583e07..000000000000
--- a/arch/x86/lib/crypto/poly1305_glue.c
+++ /dev/null
@@ -1,129 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <asm/cpu_device_id.h>
-#include <asm/fpu/api.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-#include <linux/unaligned.h>
-
-struct poly1305_arch_internal {
- union {
- struct {
- u32 h[5];
- u32 is_base2_26;
- };
- u64 hs[3];
- };
- u64 r[2];
- u64 pad;
- struct { u32 r2, r1, r4, r3; } rn[9];
-};
-
-asmlinkage void poly1305_block_init_arch(
- struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx,
- const u8 *inp,
- const size_t len, const u32 padbit);
-asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx,
- u8 mac[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx,
- u8 mac[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx,
- const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx,
- const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx,
- const u8 *inp,
- const size_t len, const u32 padbit);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
-
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp,
- unsigned int len, u32 padbit)
-{
- struct poly1305_arch_internal *ctx =
- container_of(&state->h.h, struct poly1305_arch_internal, h);
-
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
- SZ_4K % POLY1305_BLOCK_SIZE);
-
- if (!static_branch_likely(&poly1305_use_avx)) {
- poly1305_blocks_x86_64(ctx, inp, len, padbit);
- return;
- }
-
- do {
- const unsigned int bytes = min(len, SZ_4K);
-
- kernel_fpu_begin();
- if (static_branch_likely(&poly1305_use_avx512))
- poly1305_blocks_avx512(ctx, inp, bytes, padbit);
- else if (static_branch_likely(&poly1305_use_avx2))
- poly1305_blocks_avx2(ctx, inp, bytes, padbit);
- else
- poly1305_blocks_avx(ctx, inp, bytes, padbit);
- kernel_fpu_end();
-
- len -= bytes;
- inp += bytes;
- } while (len);
-}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-
-void poly1305_emit_arch(const struct poly1305_state *ctx,
- u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4])
-{
- if (!static_branch_likely(&poly1305_use_avx))
- poly1305_emit_x86_64(ctx, mac, nonce);
- else
- poly1305_emit_avx(ctx, mac, nonce);
-}
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
- return static_key_enabled(&poly1305_use_avx);
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init poly1305_simd_mod_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_AVX) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
- static_branch_enable(&poly1305_use_avx);
- if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
- static_branch_enable(&poly1305_use_avx2);
- if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
- /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
- boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X)
- static_branch_enable(&poly1305_use_avx512);
- return 0;
-}
-subsys_initcall(poly1305_simd_mod_init);
-
-static void __exit poly1305_simd_mod_exit(void)
-{
-}
-module_exit(poly1305_simd_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
-MODULE_DESCRIPTION("Poly1305 authenticator");
diff --git a/arch/x86/lib/crypto/sha256-avx-asm.S b/arch/x86/lib/crypto/sha256-avx-asm.S
deleted file mode 100644
index 0d7b2c3e45d9..000000000000
--- a/arch/x86/lib/crypto/sha256-avx-asm.S
+++ /dev/null
@@ -1,499 +0,0 @@
-########################################################################
-# Implement fast SHA-256 with AVX1 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-# This code schedules 1 block at a time, with 4 lanes per block
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-## assume buffers not aligned
-#define VMOVDQ vmovdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-
-.macro MY_ROR p1 p2
- shld $(32-(\p1)), \p2, \p2
-.endm
-
-################################
-
-# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
-# Load xmm with mem and byte swap each dword
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- VMOVDQ \p2, \p1
- vpshufb \p3, \p1, \p1
-.endm
-
-################################
-
-X0 = %xmm4
-X1 = %xmm5
-X2 = %xmm6
-X3 = %xmm7
-
-XTMP0 = %xmm0
-XTMP1 = %xmm1
-XTMP2 = %xmm2
-XTMP3 = %xmm3
-XTMP4 = %xmm8
-XFER = %xmm9
-XTMP5 = %xmm11
-
-SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %xmm13
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-
-SRND = %rsi # clobbers INP
-c = %ecx
-d = %r8d
-e = %edx
-TBL = %r12
-a = %eax
-b = %ebx
-
-f = %r9d
-g = %r10d
-h = %r11d
-
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_XFER_SIZE = 16
-_XMM_SAVE_SIZE = 0
-
-_INP_END = 0
-_INP = _INP_END + _INP_END_SIZE
-_XFER = _INP + _INP_SIZE
-_XMM_SAVE = _XFER + _XFER_SIZE
-STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- ## compute s0 four at a time and s1 two at a time
- ## compute W[-16] + W[-7] 4 at a time
-
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
-	xor	a, y1			# y1 = a ^ (a >> (22-13))
- xor g, y2 # y2 = f^g
- vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- ## compute s0
- vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
-	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpsrld $7, XTMP1, XTMP2
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpslld $(32-7), XTMP1, XTMP3
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7
-	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- vpsrld $18, XTMP1, XTMP2 #
-	xor	a, y1			# y1 = a ^ (a >> (22-13))
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
-	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- vpslld $(32-18), XTMP1, XTMP1
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- vpxor XTMP1, XTMP3, XTMP3 #
- add y0, y2 # y2 = S1 + CH
- add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
-	vpxor	XTMP2, XTMP3, XTMP3	# XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- ## compute low s1
- vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
-	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
-	xor	a, y1			# y1 = a ^ (a >> (22-13))
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
- xor g, y2 # y2 = f^g
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA}
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
-	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- vpxor XTMP3, XTMP2, XTMP2 #
- add y0, y2 # y2 = S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- ## compute high s1
- vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
-	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
-	xor	a, y1			# y1 = a ^ (a >> (22-13))
- xor g, y2 # y2 = f^g
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC}
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
-	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- vpxor XTMP3, XTMP2, XTMP2
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
-	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-## input is [rsp + _XFER + %1 * 4]
-.macro DO_ROUND round
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- xor e, y0 # y0 = e ^ (e >> (25-11))
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
-	xor	a, y1			# y1 = a ^ (a >> (22-13))
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- and e, y2 # y2 = (f^g)&e
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
-	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- add y0, y2 # y2 = S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- offset = \round * 4 + _XFER #
- add offset(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
-.endm
-
-########################################################################
-## void sha256_transform_avx(u32 state[SHA256_STATE_WORDS],
-## const u8 *data, size_t nblocks);
-########################################################################
-.text
-SYM_FUNC_START(sha256_transform_avx)
- ANNOTATE_NOENDBR # since this is called only via static_call
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbp
- movq %rsp, %rbp
-
- subq $STACK_SIZE, %rsp # allocate stack space
- and $~15, %rsp # align stack pointer
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- add INP, NUM_BLKS # pointer to end of data
- mov NUM_BLKS, _INP_END(%rsp)
-
- ## load initial digest
- mov 4*0(CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-.Lloop0:
- lea K256(%rip), TBL
-
- ## byte swap first 16 dwords
- COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
-
- mov INP, _INP(%rsp)
-
- ## schedule 48 input dwords, by doing 3 rounds of 16 each
- mov $3, SRND
-.align 16
-.Lloop1:
- vpaddd (TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 1*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 2*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 3*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- add $4*16, TBL
- FOUR_ROUNDS_AND_SCHED
-
- sub $1, SRND
- jne .Lloop1
-
- mov $2, SRND
-.Lloop2:
- vpaddd (TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- vpaddd 1*16(TBL), X1, XFER
- vmovdqa XFER, _XFER(%rsp)
- add $2*16, TBL
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- vmovdqa X2, X0
- vmovdqa X3, X1
-
- sub $1, SRND
- jne .Lloop2
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- mov _INP(%rsp), INP
- add $64, INP
- cmp _INP_END(%rsp), INP
- jne .Lloop0
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- popq %rbp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- RET
-SYM_FUNC_END(sha256_transform_avx)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
-
-.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
-.align 16
-# shuffle xBxA -> 00BA
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
-.align 16
-# shuffle xDxC -> DC00
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
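The per-round comments in the macros above track the scalar SHA-256 compression step: S1 and CH feed t1, S0 and MAJ feed the new value of a, and the eight working variables then shift by one (the shift that ROTATE_ARGS expresses by renaming assembler symbols). Below is a minimal scalar C sketch of one round so the comment algebra can be checked against the specification; the helper names are illustrative, and kw stands for the precomputed K[t] + W[t] value kept in the _XFER stack slot.

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int n)
{
        return (x >> n) | (x << (32 - n));
}

/* One scalar SHA-256 round, following the S1/CH/S0/MAJ comment flow. */
static void sha256_round(uint32_t s[8], uint32_t kw)
{
        uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

        uint32_t s1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
        uint32_t ch  = ((f ^ g) & e) ^ g;               /* CH */
        uint32_t t1  = h + s1 + ch + kw;                /* h + S1 + CH + k + w */
        uint32_t s0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
        uint32_t maj = ((a | c) & b) | (a & c);         /* MAJ */

        /* the end-of-round shift performed by ROTATE_ARGS */
        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + s0 + maj;
}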
diff --git a/arch/x86/lib/crypto/sha256-avx2-asm.S b/arch/x86/lib/crypto/sha256-avx2-asm.S
deleted file mode 100644
index 25d3380321ec..000000000000
--- a/arch/x86/lib/crypto/sha256-avx2-asm.S
+++ /dev/null
@@ -1,774 +0,0 @@
-########################################################################
-# Implement fast SHA-256 with AVX2 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-# This code schedules 2 blocks at a time, with 4 lanes per block
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-## assume buffers not aligned
-#define VMOVDQ vmovdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-################################
-
-X0 = %ymm4
-X1 = %ymm5
-X2 = %ymm6
-X3 = %ymm7
-
-# XMM versions of above
-XWORD0 = %xmm4
-XWORD1 = %xmm5
-XWORD2 = %xmm6
-XWORD3 = %xmm7
-
-XTMP0 = %ymm0
-XTMP1 = %ymm1
-XTMP2 = %ymm2
-XTMP3 = %ymm3
-XTMP4 = %ymm8
-XFER = %ymm9
-XTMP5 = %ymm11
-
-SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %ymm13
-
-X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-c = %ecx
-d = %r8d
-e = %edx # clobbers NUM_BLKS
-y3 = %esi # clobbers INP
-
-SRND = CTX # SRND is same register as CTX
-
-a = %eax
-b = %ebx
-f = %r9d
-g = %r10d
-h = %r11d
-old_h = %r11d
-
-T1 = %r12d
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
-_XMM_SAVE_SIZE = 0
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_CTX_SIZE = 8
-
-_XFER = 0
-_XMM_SAVE = _XFER + _XFER_SIZE
-_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
-_INP = _INP_END + _INP_END_SIZE
-_CTX = _INP + _INP_SIZE
-STACK_SIZE = _CTX + _CTX_SIZE
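The offsets above lay out the stack frame used by sha256_transform_rorx further down: 512 bytes of precomputed w+k values for the two interleaved blocks, followed by the saved end-of-input, input and context pointers. A hypothetical C view of that frame, for orientation only (the structure name is illustrative):

#include <stdint.h>

/* Hypothetical C view of the stack frame described by the offsets above. */
struct sha256_rorx_frame {
        uint32_t xfer[2 * 64];          /* _XFER: w[t] + K[t] for two blocks */
        const uint8_t *inp_end;         /* _INP_END: last block of the input */
        const uint8_t *inp;             /* _INP: current input pointer       */
        uint32_t *ctx;                  /* _CTX: saved state pointer         */
};                                      /* sizeof() == STACK_SIZE == 536     */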
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
- X_ = X0
- X0 = X1
- X1 = X2
- X2 = X3
- X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
- old_h = h
- TMP_ = h
- h = g
- g = f
- f = e
- e = d
- d = c
- c = b
- b = a
- a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED disp
-################################### RND N + 0 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
-
- addl \disp(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
- vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
-	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
- rorx $6, e, y1 # y1 = (e >> 6) # S1
-
- and e, y2 # y2 = (f^g)&e # CH
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- add h, d # d = k + w + h + d # --
-
- and b, y3 # y3 = (a|c)&b # MAJA
- vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
-
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- vpsrld $7, XTMP1, XTMP2
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
-
- add y0, y2 # y2 = S1 + CH # --
- vpslld $(32-7), XTMP1, XTMP3
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
-
- vpsrld $18, XTMP1, XTMP2
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
-
- ROTATE_ARGS
-
-################################### RND N + 1 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- offset = \disp + 1*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
-
- vpslld $(32-18), XTMP1, XTMP1
- and b, y3 # y3 = (a|c)&b # MAJA
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
-
- vpxor XTMP1, XTMP3, XTMP3
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
- vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
-
-
- ROTATE_ARGS
-
-################################### RND N + 2 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- offset = \disp + 2*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
-
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- or c, y3 # y3 = a|c # MAJA
- mov f, y2 # y2 = f # CH
- xor g, y2 # y2 = f^g # CH
-
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
- and e, y2 # y2 = (f^g)&e # CH
-
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- vpxor XTMP3, XTMP2, XTMP2
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a ,T1 # T1 = (a >> 2) # S0
- vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
- vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
-
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1,h # h = k + w + h + S0 # --
- add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3,h # h = t1 + S0 + MAJ # --
-
-
- ROTATE_ARGS
-
-################################### RND N + 3 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- offset = \disp + 3*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpxor XTMP3, XTMP2, XTMP2
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- add y0, y2 # y2 = S1 + CH # --
-
- vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
-
- vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
-
- add y1, h # h = k + w + h + S0 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-.macro DO_4ROUNDS disp
-################################### RND N + 0 ###########################
-
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- addl \disp(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 1 ###########################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*1 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 2 ##############################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*2 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 3 ###########################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*3 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
-
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3, h # h = t1 + S0 + MAJ # --
-
- ROTATE_ARGS
-
-.endm
-
-########################################################################
-## void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS],
-## const u8 *data, size_t nblocks);
-########################################################################
-.text
-SYM_FUNC_START(sha256_transform_rorx)
- ANNOTATE_NOENDBR # since this is called only via static_call
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
- push %rbp
- mov %rsp, %rbp
-
- subq $STACK_SIZE, %rsp
- and $-32, %rsp # align rsp to 32 byte boundary
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
- mov NUM_BLKS, _INP_END(%rsp)
-
- cmp NUM_BLKS, INP
- je .Lonly_one_block
-
- ## load initial digest
- mov (CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-
- mov CTX, _CTX(%rsp)
-
-.Lloop0:
- ## Load first 16 dwords from two blocks
- VMOVDQ 0*32(INP),XTMP0
- VMOVDQ 1*32(INP),XTMP1
- VMOVDQ 2*32(INP),XTMP2
- VMOVDQ 3*32(INP),XTMP3
-
- ## byte swap data
- vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
- vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
- vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
- vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
-
- ## transpose data into high/low halves
- vperm2i128 $0x20, XTMP2, XTMP0, X0
- vperm2i128 $0x31, XTMP2, XTMP0, X1
- vperm2i128 $0x20, XTMP3, XTMP1, X2
- vperm2i128 $0x31, XTMP3, XTMP1, X3
-
-.Llast_block_enter:
- add $64, INP
- mov INP, _INP(%rsp)
-
-	## schedule 48 input dwords, by doing 3 rounds of 16 each
- xor SRND, SRND
-
-.align 16
-.Lloop1:
- leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 0*32)
-
- leaq K256+1*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 1*32)
-
- leaq K256+2*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 2*32)
-
- leaq K256+3*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 3*32)
-
- add $4*32, SRND
- cmp $3*4*32, SRND
- jb .Lloop1
-
-.Lloop2:
- ## Do last 16 rounds with no scheduling
- leaq K256+0*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- DO_4ROUNDS (_XFER + 0*32)
-
- leaq K256+1*32(%rip), INP
- vpaddd (INP, SRND), X1, XFER
- vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- DO_4ROUNDS (_XFER + 1*32)
- add $2*32, SRND
-
- vmovdqa X2, X0
- vmovdqa X3, X1
-
- cmp $4*4*32, SRND
- jb .Lloop2
-
- mov _CTX(%rsp), CTX
- mov _INP(%rsp), INP
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- cmp _INP_END(%rsp), INP
- ja .Ldone_hash
-
- #### Do second block using previously scheduled results
- xor SRND, SRND
-.align 16
-.Lloop3:
- DO_4ROUNDS (_XFER + 0*32 + 16)
- DO_4ROUNDS (_XFER + 1*32 + 16)
- add $2*32, SRND
- cmp $4*4*32, SRND
- jb .Lloop3
-
- mov _CTX(%rsp), CTX
- mov _INP(%rsp), INP
- add $64, INP
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- cmp _INP_END(%rsp), INP
- jb .Lloop0
- ja .Ldone_hash
-
-.Ldo_last_block:
- VMOVDQ 0*16(INP),XWORD0
- VMOVDQ 1*16(INP),XWORD1
- VMOVDQ 2*16(INP),XWORD2
- VMOVDQ 3*16(INP),XWORD3
-
- vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
- vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
- vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
- vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
-
- jmp .Llast_block_enter
-
-.Lonly_one_block:
-
- ## load initial digest
- mov (4*0)(CTX),a
- mov (4*1)(CTX),b
- mov (4*2)(CTX),c
- mov (4*3)(CTX),d
- mov (4*4)(CTX),e
- mov (4*5)(CTX),f
- mov (4*6)(CTX),g
- mov (4*7)(CTX),h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-
- mov CTX, _CTX(%rsp)
- jmp .Ldo_last_block
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- pop %rbp
-
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- vzeroupper
- RET
-SYM_FUNC_END(sha256_transform_rorx)
-
-.section .rodata.cst512.K256, "aM", @progbits, 512
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
-.align 32
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
-
-# shuffle xBxA -> 00BA
-.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
-.align 32
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-# shuffle xDxC -> DC00
-.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
-.align 32
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
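The XTMP arithmetic in FOUR_ROUNDS_AND_SCHED above is the SHA-256 message schedule computed four words at a time (for both interleaved blocks in the AVX2 case): s0 combines ror 7, ror 18 and a right shift by 3 of W[t-15], while s1 combines ror 17, ror 19 and a right shift by 10 of W[t-2]. A scalar C sketch of the same recurrence (function name illustrative):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int n)
{
        return (x >> n) | (x << (32 - n));
}

/* Scalar form of the message schedule that the vector code interleaves
 * with the rounds: s0 = sigma0(W[t-15]), s1 = sigma1(W[t-2]).
 */
static void sha256_schedule(uint32_t w[64])
{
        for (int t = 16; t < 64; t++) {
                uint32_t s0 = ror32(w[t - 15], 7) ^ ror32(w[t - 15], 18) ^
                              (w[t - 15] >> 3);
                uint32_t s1 = ror32(w[t - 2], 17) ^ ror32(w[t - 2], 19) ^
                              (w[t - 2] >> 10);

                w[t] = w[t - 16] + s0 + w[t - 7] + s1;
        }
}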
diff --git a/arch/x86/lib/crypto/sha256-ni-asm.S b/arch/x86/lib/crypto/sha256-ni-asm.S
deleted file mode 100644
index d3548206cf3d..000000000000
--- a/arch/x86/lib/crypto/sha256-ni-asm.S
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Intel SHA Extensions optimized implementation of a SHA-256 update function
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Sean Gulley <sean.m.gulley@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-#define STATE_PTR %rdi /* 1st arg */
-#define DATA_PTR %rsi /* 2nd arg */
-#define NUM_BLKS %rdx /* 3rd arg */
-
-#define SHA256CONSTANTS %rax
-
-#define MSG %xmm0 /* sha256rnds2 implicit operand */
-#define STATE0 %xmm1
-#define STATE1 %xmm2
-#define MSG0 %xmm3
-#define MSG1 %xmm4
-#define MSG2 %xmm5
-#define MSG3 %xmm6
-#define TMP %xmm7
-
-#define SHUF_MASK %xmm8
-
-#define ABEF_SAVE %xmm9
-#define CDGH_SAVE %xmm10
-
-.macro do_4rounds i, m0, m1, m2, m3
-.if \i < 16
- movdqu \i*4(DATA_PTR), \m0
- pshufb SHUF_MASK, \m0
-.endif
- movdqa (\i-32)*4(SHA256CONSTANTS), MSG
- paddd \m0, MSG
- sha256rnds2 STATE0, STATE1
-.if \i >= 12 && \i < 60
- movdqa \m0, TMP
- palignr $4, \m3, TMP
- paddd TMP, \m1
- sha256msg2 \m0, \m1
-.endif
- punpckhqdq MSG, MSG
- sha256rnds2 STATE1, STATE0
-.if \i >= 4 && \i < 52
- sha256msg1 \m0, \m3
-.endif
-.endm
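The two sha256rnds2 steps in do_4rounds ping-pong between STATE0 (ABEF) and STATE1 (CDGH): the first consumes the low two w+k dwords, then punpckhqdq moves the upper pair down for the second, after which the roles of the two state registers have swapped. A rough user-space intrinsics sketch of just that ping-pong, assuming the standard <immintrin.h> SHA-NI intrinsics rather than the kernel's assembler macros:

#include <immintrin.h>

/* state0 holds ABEF, state1 holds CDGH (see the reordering comments in the
 * function below); wk is W[i..i+3] + K[i..i+3].
 */
static inline void sha_ni_four_rounds(__m128i *state0, __m128i *state1,
                                      __m128i wk)
{
        /* rounds i and i+1: sha256rnds2 uses the low two w+k dwords */
        *state1 = _mm_sha256rnds2_epu32(*state1, *state0, wk);
        /* the punpckhqdq MSG, MSG step: move the upper w+k pair down */
        wk = _mm_shuffle_epi32(wk, 0x0E);
        /* rounds i+2 and i+3 */
        *state0 = _mm_sha256rnds2_epu32(*state0, *state1, wk);
}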
-
-/*
- * Intel SHA Extensions optimized implementation of a SHA-256 block function
- *
- * This function takes a pointer to the current SHA-256 state, a pointer to the
- * input data, and the number of 64-byte blocks to process. Once all blocks
- * have been processed, the state is updated with the new state. This function
- * only processes complete blocks. State initialization, buffering of partial
- * blocks, and digest finalization are expected to be handled elsewhere.
- *
- * void sha256_ni_transform(u32 state[SHA256_STATE_WORDS],
- * const u8 *data, size_t nblocks);
- */
-.text
-SYM_FUNC_START(sha256_ni_transform)
- ANNOTATE_NOENDBR # since this is called only via static_call
-
- shl $6, NUM_BLKS /* convert to bytes */
- jz .Ldone_hash
- add DATA_PTR, NUM_BLKS /* pointer to end of data */
-
- /*
- * load initial hash values
- * Need to reorder these appropriately
- * DCBA, HGFE -> ABEF, CDGH
- */
- movdqu 0*16(STATE_PTR), STATE0 /* DCBA */
- movdqu 1*16(STATE_PTR), STATE1 /* HGFE */
-
- movdqa STATE0, TMP
- punpcklqdq STATE1, STATE0 /* FEBA */
- punpckhqdq TMP, STATE1 /* DCHG */
- pshufd $0x1B, STATE0, STATE0 /* ABEF */
- pshufd $0xB1, STATE1, STATE1 /* CDGH */
-
- movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
- lea K256+32*4(%rip), SHA256CONSTANTS
-
-.Lloop0:
- /* Save hash values for addition after rounds */
- movdqa STATE0, ABEF_SAVE
- movdqa STATE1, CDGH_SAVE
-
-.irp i, 0, 16, 32, 48
- do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3
- do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0
- do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1
- do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2
-.endr
-
- /* Add current hash values with previously saved */
- paddd ABEF_SAVE, STATE0
- paddd CDGH_SAVE, STATE1
-
- /* Increment data pointer and loop if more to process */
- add $64, DATA_PTR
- cmp NUM_BLKS, DATA_PTR
- jne .Lloop0
-
- /* Write hash values back in the correct order */
- movdqa STATE0, TMP
- punpcklqdq STATE1, STATE0 /* GHEF */
- punpckhqdq TMP, STATE1 /* ABCD */
- pshufd $0xB1, STATE0, STATE0 /* HGFE */
- pshufd $0x1B, STATE1, STATE1 /* DCBA */
-
- movdqu STATE1, 0*16(STATE_PTR)
- movdqu STATE0, 1*16(STATE_PTR)
-
-.Ldone_hash:
-
- RET
-SYM_FUNC_END(sha256_ni_transform)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
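As the block comment above says, the transform only consumes whole 64-byte blocks; buffering of partial input and digest finalization are the caller's job. A hypothetical caller-side sketch of that contract follows (the context structure and prototype are illustrative, and an in-kernel caller would additionally need the kernel_fpu_begin()/kernel_fpu_end() bracketing visible in the glue code further down):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define SHA256_BLOCK_SIZE 64

/* Illustrative prototype: the transform takes whole blocks only. */
void sha256_ni_transform(uint32_t state[8], const uint8_t *data,
                         size_t nblocks);

struct sha256_buf_ctx {
        uint32_t state[8];
        uint8_t  buf[SHA256_BLOCK_SIZE];
        size_t   buflen;
};

/* Feed only complete blocks to the transform; keep the tail buffered. */
static void sha256_buf_update(struct sha256_buf_ctx *ctx,
                              const uint8_t *data, size_t len)
{
        if (ctx->buflen) {
                size_t fill = SHA256_BLOCK_SIZE - ctx->buflen;

                if (fill > len)
                        fill = len;
                memcpy(ctx->buf + ctx->buflen, data, fill);
                ctx->buflen += fill;
                data += fill;
                len -= fill;
                if (ctx->buflen == SHA256_BLOCK_SIZE) {
                        sha256_ni_transform(ctx->state, ctx->buf, 1);
                        ctx->buflen = 0;
                }
        }
        if (len >= SHA256_BLOCK_SIZE) {
                sha256_ni_transform(ctx->state, data,
                                    len / SHA256_BLOCK_SIZE);
                data += len - (len % SHA256_BLOCK_SIZE);
                len %= SHA256_BLOCK_SIZE;
        }
        if (len) {
                memcpy(ctx->buf, data, len);
                ctx->buflen = len;
        }
}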
diff --git a/arch/x86/lib/crypto/sha256-ssse3-asm.S b/arch/x86/lib/crypto/sha256-ssse3-asm.S
deleted file mode 100644
index 7f24a4cdcb25..000000000000
--- a/arch/x86/lib/crypto/sha256-ssse3-asm.S
+++ /dev/null
@@ -1,511 +0,0 @@
-########################################################################
-# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-## assume buffers not aligned
-#define MOVDQ movdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-################################
-
-# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
-# Load xmm with mem and byte swap each dword
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- MOVDQ \p2, \p1
- pshufb \p3, \p1
-.endm
-
-################################
-
-X0 = %xmm4
-X1 = %xmm5
-X2 = %xmm6
-X3 = %xmm7
-
-XTMP0 = %xmm0
-XTMP1 = %xmm1
-XTMP2 = %xmm2
-XTMP3 = %xmm3
-XTMP4 = %xmm8
-XFER = %xmm9
-
-SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %xmm12
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-
-SRND = %rsi # clobbers INP
-c = %ecx
-d = %r8d
-e = %edx
-TBL = %r12
-a = %eax
-b = %ebx
-
-f = %r9d
-g = %r10d
-h = %r11d
-
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_XFER_SIZE = 16
-_XMM_SAVE_SIZE = 0
-
-_INP_END = 0
-_INP = _INP_END + _INP_END_SIZE
-_XFER = _INP + _INP_SIZE
-_XMM_SAVE = _XFER + _XFER_SIZE
-STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- ## compute s0 four at a time and s1 two at a time
- ## compute W[-16] + W[-7] 4 at a time
- movdqa X3, XTMP0
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- palignr $4, X2, XTMP0 # XTMP0 = W[-7]
- ror $(22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- movdqa X1, XTMP1
- xor a, y1 # y1 = a ^ (a >> (22-13)
- xor g, y2 # y2 = f^g
- paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- ## compute s0
- palignr $4, X0, XTMP1 # XTMP1 = W[-15]
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH
- movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pslld $(32-7), XTMP1 #
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- psrld $7, XTMP2 #
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- #
- ROTATE_ARGS #
- movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
- ror $(25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(22-13), y1 # y1 = a >> (22-13)
- pslld $(32-18), XTMP3 #
- xor a, y1 # y1 = a ^ (a >> (22-13)
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- psrld $18, XTMP2 #
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- pxor XTMP3, XTMP1
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
- add y0, y2 # y2 = S1 + CH
- add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pxor XTMP4, XTMP1 # XTMP1 = s0
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- ## compute low s1
- pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
-
- ROTATE_ARGS
- movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- ror $(25-11), y0 # y0 = e >> (25-11)
- movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
- xor e, y0 # y0 = e ^ (e >> (25-11))
- ror $(22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13)
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
- xor g, y2 # y2 = f^g
- psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- pxor XTMP3, XTMP2
- add y0, y2 # y2 = S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- ## compute high s1
-	pshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- #
- ROTATE_ARGS #
- movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
- ror $(22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
- xor a, y1 # y1 = a ^ (a >> (22-13)
- xor g, y2 # y2 = f^g
- psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
-	xor e, y0		# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
-	xor a, y1		# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
-	ror $6, y0		# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- pxor XTMP3, XTMP2 #
-	ror $2, y1		# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- pxor XTMP2, X0 # X0 = s1 {xDxC}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
-
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-## input is [rsp + _XFER + %1 * 4]
-.macro DO_ROUND round
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- xor e, y0 # y0 = e ^ (e >> (25-11))
- ror $(22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13)
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- and e, y2 # y2 = (f^g)&e
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- add y0, y2 # y2 = S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- offset = \round * 4 + _XFER
- add offset(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
-.endm
-
-########################################################################
-## void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS],
-## const u8 *data, size_t nblocks);
-########################################################################
-.text
-SYM_FUNC_START(sha256_transform_ssse3)
- ANNOTATE_NOENDBR # since this is called only via static_call
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbp
- mov %rsp, %rbp
-
- subq $STACK_SIZE, %rsp
- and $~15, %rsp
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- add INP, NUM_BLKS
- mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
-
- ## load initial digest
- mov 4*0(CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- movdqa _SHUF_00BA(%rip), SHUF_00BA
- movdqa _SHUF_DC00(%rip), SHUF_DC00
-
-.Lloop0:
- lea K256(%rip), TBL
-
- ## byte swap first 16 dwords
- COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
-
- mov INP, _INP(%rsp)
-
- ## schedule 48 input dwords, by doing 3 rounds of 16 each
- mov $3, SRND
-.align 16
-.Lloop1:
- movdqa (TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 1*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 2*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 3*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- add $4*16, TBL
- FOUR_ROUNDS_AND_SCHED
-
- sub $1, SRND
- jne .Lloop1
-
- mov $2, SRND
-.Lloop2:
- paddd (TBL), X0
- movdqa X0, _XFER(%rsp)
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
- paddd 1*16(TBL), X1
- movdqa X1, _XFER(%rsp)
- add $2*16, TBL
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- movdqa X2, X0
- movdqa X3, X1
-
- sub $1, SRND
- jne .Lloop2
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- mov _INP(%rsp), INP
- add $64, INP
- cmp _INP_END(%rsp), INP
- jne .Lloop0
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- popq %rbp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
-
- RET
-SYM_FUNC_END(sha256_transform_ssse3)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
-
-.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
-.align 16
-# shuffle xBxA -> 00BA
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
-.align 16
-# shuffle xDxC -> DC00
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
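COPY_XMM_AND_BSWAP together with the PSHUFFLE_BYTE_FLIP_MASK constant loads four message dwords and byte-swaps each of them, because SHA-256 message words are big-endian. The same step in scalar C (helper name illustrative):

#include <stdint.h>

/* Scalar equivalent of COPY_XMM_AND_BSWAP: load the 16 big-endian message
 * words of one 64-byte block, swapping each dword into host order.
 */
static void sha256_load_block(const uint8_t *in, uint32_t w[16])
{
        for (int i = 0; i < 16; i++)
                w[i] = ((uint32_t)in[4 * i + 0] << 24) |
                       ((uint32_t)in[4 * i + 1] << 16) |
                       ((uint32_t)in[4 * i + 2] <<  8) |
                        (uint32_t)in[4 * i + 3];
}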
diff --git a/arch/x86/lib/crypto/sha256.c b/arch/x86/lib/crypto/sha256.c
deleted file mode 100644
index 80380f8fdcee..000000000000
--- a/arch/x86/lib/crypto/sha256.c
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized for x86_64
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/fpu/api.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/static_call.h>
-
-asmlinkage void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage void sha256_transform_avx(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage void sha256_ni_transform(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86);
-
-DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3);
-
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- if (static_branch_likely(&have_sha256_x86)) {
- kernel_fpu_begin();
- static_call(sha256_blocks_x86)(state, data, nblocks);
- kernel_fpu_end();
- } else {
- sha256_blocks_generic(state, data, nblocks);
- }
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
-
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- sha256_blocks_generic(state, data, nblocks);
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-bool sha256_is_arch_optimized(void)
-{
- return static_key_enabled(&have_sha256_x86);
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init sha256_x86_mod_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
- static_call_update(sha256_blocks_x86, sha256_ni_transform);
- } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE |
- XFEATURE_MASK_YMM, NULL) &&
- boot_cpu_has(X86_FEATURE_AVX)) {
- if (boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_BMI2))
- static_call_update(sha256_blocks_x86,
- sha256_transform_rorx);
- else
- static_call_update(sha256_blocks_x86,
- sha256_transform_avx);
- } else if (!boot_cpu_has(X86_FEATURE_SSSE3)) {
- return 0;
- }
- static_branch_enable(&have_sha256_x86);
- return 0;
-}
-subsys_initcall(sha256_x86_mod_init);
-
-static void __exit sha256_x86_mod_exit(void)
-{
-}
-module_exit(sha256_x86_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 optimized for x86_64");
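The glue code above picks one transform at boot and dispatches through a static call, so each invocation is a patched direct call rather than an indirect branch guarded by feature checks. A minimal sketch of that pattern with hypothetical names (my_op, impl_generic and impl_fast are made up for illustration; DEFINE_STATIC_CALL, static_call_update, static_call and boot_cpu_has are the real kernel interfaces used above):

#include <linux/init.h>
#include <linux/static_call.h>
#include <linux/types.h>
#include <asm/cpufeature.h>

static void impl_generic(const void *data, size_t len) { /* portable fallback */ }
static void impl_fast(const void *data, size_t len)     { /* AVX2-only variant */ }

DEFINE_STATIC_CALL(my_op, impl_generic);

static int __init my_op_init(void)
{
        if (boot_cpu_has(X86_FEATURE_AVX2))
                static_call_update(my_op, impl_fast);   /* patch call sites once */
        return 0;
}
subsys_initcall(my_op_init);

static void my_op_run(const void *data, size_t len)
{
        static_call(my_op)(data, len);  /* direct call after boot-time patching */
}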
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index bf8dab18be97..2fdc1f1f5adb 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -122,13 +122,12 @@ static bool ex_handler_sgx(const struct exception_table_entry *fixup,
static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
struct pt_regs *regs)
{
- regs->ip = ex_fixup_addr(fixup);
-
WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
(void *)instruction_pointer(regs));
fpu_reset_from_exception_fixup();
- return true;
+
+ return ex_handler_default(fixup, regs);
}
/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index fdb6cab524f0..76e33bd7c556 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -805,7 +805,7 @@ kernel_physical_mapping_change(unsigned long paddr_start,
}
#ifndef CONFIG_NUMA
-static inline void x86_numa_init(void)
+static __always_inline void x86_numa_init(void)
{
memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
}
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index c0c40b67524e..b10d4d131dce 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -38,6 +38,7 @@
#include <asm/desc.h>
#include <asm/sections.h>
#include <asm/set_memory.h>
+#include <asm/bugs.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
@@ -84,7 +85,8 @@ void __init pti_check_boottime_disable(void)
return;
}
- if (cpu_mitigations_off())
+ if (pti_mode == PTI_AUTO &&
+ !cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
pti_mode = PTI_FORCE_OFF;
if (pti_mode == PTI_FORCE_OFF) {
pti_print_if_insecure("disabled on command line.");
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index f8126821a94d..aaa7017416f7 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -5,19 +5,12 @@
* (C) Copyright 2010 Intel Corporation
*/
#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/irq.h>
#include <linux/reboot.h>
-#include <linux/serial_reg.h>
-#include <linux/serial_8250.h>
#include <asm/ce4100.h>
#include <asm/prom.h>
#include <asm/setup.h>
-#include <asm/i8259.h>
#include <asm/io.h>
-#include <asm/io_apic.h>
-#include <asm/emergency-restart.h>
/*
* The CE4100 platform has an internal 8051 Microcontroller which is
@@ -31,94 +24,6 @@ static void ce4100_power_off(void)
outb(0x4, 0xcf9);
}
-#ifdef CONFIG_SERIAL_8250
-
-static unsigned int mem_serial_in(struct uart_port *p, int offset)
-{
- offset = offset << p->regshift;
- return readl(p->membase + offset);
-}
-
-/*
- * The UART Tx interrupts are not set under some conditions and therefore serial
- * transmission hangs. This is a silicon issue and has not been root caused. The
- * workaround for this silicon issue checks UART_LSR_THRE bit and UART_LSR_TEMT
- * bit of LSR register in interrupt handler to see whether at least one of these
- * two bits is set, if so then process the transmit request. If this workaround
- * is not applied, then the serial transmission may hang. This workaround is for
- * errata number 9 in Errata - B step.
-*/
-
-static unsigned int ce4100_mem_serial_in(struct uart_port *p, int offset)
-{
- unsigned int ret, ier, lsr;
-
- if (offset == UART_IIR) {
- offset = offset << p->regshift;
- ret = readl(p->membase + offset);
- if (ret & UART_IIR_NO_INT) {
- /* see if the TX interrupt should have really set */
- ier = mem_serial_in(p, UART_IER);
- /* see if the UART's XMIT interrupt is enabled */
- if (ier & UART_IER_THRI) {
- lsr = mem_serial_in(p, UART_LSR);
- /* now check to see if the UART should be
- generating an interrupt (but isn't) */
- if (lsr & (UART_LSR_THRE | UART_LSR_TEMT))
- ret &= ~UART_IIR_NO_INT;
- }
- }
- } else
- ret = mem_serial_in(p, offset);
- return ret;
-}
-
-static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value)
-{
- offset = offset << p->regshift;
- writel(value, p->membase + offset);
-}
-
-static void ce4100_serial_fixup(int port, struct uart_port *up,
- u32 *capabilities)
-{
-#ifdef CONFIG_EARLY_PRINTK
- /*
- * Override the legacy port configuration that comes from
- * asm/serial.h. Using the ioport driver and then switching to the
- * PCI memory-mapped driver hangs the IOAPIC.
- */
- if (up->iotype != UPIO_MEM32) {
- up->uartclk = 14745600;
- up->mapbase = 0xdffe0200;
- set_fixmap_nocache(FIX_EARLYCON_MEM_BASE,
- up->mapbase & PAGE_MASK);
- up->membase =
- (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE);
- up->membase += up->mapbase & ~PAGE_MASK;
- up->mapbase += port * 0x100;
- up->membase += port * 0x100;
- up->iotype = UPIO_MEM32;
- up->regshift = 2;
- up->irq = 4;
- }
-#endif
- up->iobase = 0;
- up->serial_in = ce4100_mem_serial_in;
- up->serial_out = ce4100_mem_serial_out;
-
- *capabilities |= (1 << 12);
-}
-
-static __init void sdv_serial_fixup(void)
-{
- serial8250_set_isa_configurator(ce4100_serial_fixup);
-}
-
-#else
-static inline void sdv_serial_fixup(void) {};
-#endif
-
static void __init sdv_arch_setup(void)
{
sdv_serial_fixup();
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index e7e8f77f77f8..b4409df2105a 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -216,8 +216,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
* When SEV-ES is active, the GHCB as set by the kernel will be used
* by firmware. Create a 1:1 unencrypted mapping for each GHCB.
*/
- if (sev_es_efi_map_ghcbs(pgd)) {
- pr_err("Failed to create 1:1 mapping for the GHCBs!\n");
+ if (sev_es_efi_map_ghcbs_cas(pgd)) {
+ pr_err("Failed to create 1:1 mapping for the GHCBs and CAs!\n");
return 1;
}
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index ebdfd7b84feb..e0a607a14e7e 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -35,7 +35,7 @@ targets += purgatory.ro purgatory.chk
PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel
PURGATORY_CFLAGS := -mcmodel=small -ffreestanding -fno-zero-initialized-in-bss -g0
PURGATORY_CFLAGS += -fpic -fvisibility=hidden
-PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING
+PURGATORY_CFLAGS += $(DISABLE_KSTACK_ERASE) -DDISABLE_BRANCH_PROFILING
PURGATORY_CFLAGS += -fno-stack-protector
# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
index aea47e793963..655139dd0532 100644
--- a/arch/x86/purgatory/purgatory.c
+++ b/arch/x86/purgatory/purgatory.c
@@ -25,7 +25,7 @@ static int verify_sha256_digest(void)
{
struct kexec_sha_region *ptr, *end;
u8 digest[SHA256_DIGEST_SIZE];
- struct sha256_state sctx;
+ struct sha256_ctx sctx;
sha256_init(&sctx);
end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions);
diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c
index 08cd913cbd4e..8bf15c4aefa9 100644
--- a/arch/x86/tools/insn_decoder_test.c
+++ b/arch/x86/tools/insn_decoder_test.c
@@ -167,7 +167,7 @@ int main(int argc, char **argv)
pr_warn("Decoded and checked %d instructions with %d "
"failures\n", insns, warnings);
else
- fprintf(stdout, "%s: success: Decoded and checked %d"
+ fprintf(stdout, " %s: success: Decoded and checked %d"
" instructions\n", prog, insns);
return 0;
}
diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c
index 213f35f94feb..e743f0ea01ee 100644
--- a/arch/x86/tools/insn_sanity.c
+++ b/arch/x86/tools/insn_sanity.c
@@ -253,9 +253,9 @@ int main(int argc, char **argv)
}
fprintf((errors) ? stderr : stdout,
- "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n",
+ " %s: %s: Decoded and checked %d %s instructions with %d errors (seed:0x%x)\n",
prog,
- (errors) ? "Failure" : "Success",
+ (errors) ? "failure" : "success",
insns,
(input_file) ? "given" : "random",
errors,
diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h
index 56a2f0913e3c..d6208d0fad51 100644
--- a/arch/x86/um/asm/syscall.h
+++ b/arch/x86/um/asm/syscall.h
@@ -9,6 +9,8 @@ typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
+extern const sys_call_ptr_t sys_call_table[];
+
static inline int syscall_get_arch(struct task_struct *task)
{
#ifdef CONFIG_X86_32
diff --git a/arch/x86/um/ptrace.c b/arch/x86/um/ptrace.c
index fae8aabad10f..2635ca2595a3 100644
--- a/arch/x86/um/ptrace.c
+++ b/arch/x86/um/ptrace.c
@@ -236,7 +236,7 @@ static int generic_fpregs_set(struct task_struct *target,
static struct user_regset uml_regsets[] __ro_after_init = {
[REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(struct user_regs_struct) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
@@ -246,7 +246,7 @@ static struct user_regset uml_regsets[] __ro_after_init = {
#ifdef CONFIG_X86_32
/* Old FP registers, they are needed in signal frames */
[REGSET_FP_LEGACY] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_i387_ia32_struct) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
@@ -257,10 +257,10 @@ static struct user_regset uml_regsets[] __ro_after_init = {
#endif
[REGSET_FP] = {
#ifdef CONFIG_X86_32
- .core_note_type = NT_PRXFPREG,
+ USER_REGSET_NOTE_TYPE(PRXFPREG),
.n = sizeof(struct user32_fxsr_struct) / sizeof(long),
#else
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_i387_struct) / sizeof(long),
#endif
.size = sizeof(long),
@@ -270,7 +270,7 @@ static struct user_regset uml_regsets[] __ro_after_init = {
.set = generic_fpregs_set,
},
[REGSET_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
+ USER_REGSET_NOTE_TYPE(X86_XSTATE),
.size = sizeof(long),
.align = sizeof(long),
.active = generic_fpregs_active,
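
The UML regsets switch from open-coded .core_note_type assignments to the USER_REGSET_NOTE_TYPE() helper. As an assumption (not verified against <linux/regset.h>), the macro bundles the NT_* note type, and possibly the matching note name, for the given register set; a hedged sketch of an entry written in that style:

/* Illustrative entry only; the sizes are placeholders, not UML's real
 * values.  USER_REGSET_NOTE_TYPE(PRSTATUS) is assumed to expand to the
 * NT_PRSTATUS note information that .core_note_type used to carry.
 */
static const struct user_regset sketch_regsets[] = {
	[0] = {
		USER_REGSET_NOTE_TYPE(PRSTATUS),
		.n	= sizeof(struct user_regs_struct) / sizeof(long),
		.size	= sizeof(long),
		.align	= sizeof(long),
	},
};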
diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
index 8f7476ff6e95..572ea2d79131 100644
--- a/arch/x86/um/shared/sysdep/ptrace.h
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -44,18 +44,6 @@
#include "ptrace_64.h"
#endif
-struct syscall_args {
- unsigned long args[6];
-};
-
-#define SYSCALL_ARGS(r) ((struct syscall_args) \
- { .args = { UPT_SYSCALL_ARG1(r), \
- UPT_SYSCALL_ARG2(r), \
- UPT_SYSCALL_ARG3(r), \
- UPT_SYSCALL_ARG4(r), \
- UPT_SYSCALL_ARG5(r), \
- UPT_SYSCALL_ARG6(r) } } )
-
extern unsigned long host_fp_size;
struct uml_pt_regs {
diff --git a/arch/x86/um/shared/sysdep/syscalls.h b/arch/x86/um/shared/sysdep/syscalls.h
deleted file mode 100644
index b2060ac707f0..000000000000
--- a/arch/x86/um/shared/sysdep/syscalls.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifdef __i386__
-#include "syscalls_32.h"
-#else
-#include "syscalls_64.h"
-#endif
diff --git a/arch/x86/um/shared/sysdep/syscalls_32.h b/arch/x86/um/shared/sysdep/syscalls_32.h
deleted file mode 100644
index f6e9f84397e7..000000000000
--- a/arch/x86/um/shared/sysdep/syscalls_32.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (C) 2000 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <asm/unistd.h>
-#include <sysdep/ptrace.h>
-
-typedef long syscall_handler_t(struct syscall_args);
-
-extern syscall_handler_t *sys_call_table[];
-
-#define EXECUTE_SYSCALL(syscall, regs) \
- ((*sys_call_table[syscall]))(SYSCALL_ARGS(&regs->regs))
diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h
deleted file mode 100644
index b6b997225841..000000000000
--- a/arch/x86/um/shared/sysdep/syscalls_64.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright 2003 PathScale, Inc.
- *
- * Licensed under the GPL
- */
-
-#ifndef __SYSDEP_X86_64_SYSCALLS_H__
-#define __SYSDEP_X86_64_SYSCALLS_H__
-
-#include <linux/msg.h>
-#include <linux/shm.h>
-
-typedef long syscall_handler_t(long, long, long, long, long, long);
-
-extern syscall_handler_t *sys_call_table[];
-
-#define EXECUTE_SYSCALL(syscall, regs) \
- (((*sys_call_table[syscall]))(UPT_SYSCALL_ARG1(&regs->regs), \
- UPT_SYSCALL_ARG2(&regs->regs), \
- UPT_SYSCALL_ARG3(&regs->regs), \
- UPT_SYSCALL_ARG4(&regs->regs), \
- UPT_SYSCALL_ARG5(&regs->regs), \
- UPT_SYSCALL_ARG6(&regs->regs)))
-
-extern syscall_handler_t sys_modify_ldt;
-extern syscall_handler_t sys_arch_prctl;
-
-#endif
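
With struct syscall_args and the per-arch EXECUTE_SYSCALL wrappers removed, callers go through the sys_call_table declaration now exported from arch/x86/um/asm/syscall.h. A minimal sketch of that direct dispatch, assuming the six syscall arguments have already been pulled out of the registers (the helper name is illustrative):

/* Illustrative only: index the shared table and call through the
 * six-argument sys_call_ptr_t prototype.
 */
static long sketch_dispatch(unsigned int nr, const unsigned long a[6])
{
	return sys_call_table[nr](a[0], a[1], a[2], a[3], a[4], a[5]);
}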
diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c
index cb3f17627d16..1909c2e640b2 100644
--- a/arch/x86/um/tls_32.c
+++ b/arch/x86/um/tls_32.c
@@ -186,7 +186,7 @@ int arch_switch_tls(struct task_struct *to)
/*
* We have no need whatsoever to switch TLS for kernel threads; beyond
* that, that would also result in us calling os_set_thread_area with
- * userspace_pid[cpu] == 0, which gives an error.
+ * task->mm == NULL, which would cause a crash.
*/
if (likely(to->mm))
return load_TLS(O_FORCE, to);