Diffstat (limited to 'lib')
292 files changed, 45788 insertions, 1551 deletions
diff --git a/lib/Kconfig b/lib/Kconfig index 61cce0686b53..c483951b624f 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -136,100 +136,9 @@ config TRACE_MMIO_ACCESS Create tracepoints for MMIO read/write operations. These trace events can be used for logging all MMIO read/write operations. +source "lib/crc/Kconfig" source "lib/crypto/Kconfig" -config CRC_CCITT - tristate "CRC-CCITT functions" - help - This option is provided for the case where no in-kernel-tree - modules require CRC-CCITT functions, but a module built outside - the kernel tree does. Such modules that use library CRC-CCITT - functions require M here. - -config CRC16 - tristate "CRC16 functions" - help - This option is provided for the case where no in-kernel-tree - modules require CRC16 functions, but a module built outside - the kernel tree does. Such modules that use library CRC16 - functions require M here. - -config CRC_T10DIF - tristate "CRC calculation for the T10 Data Integrity Field" - help - This option is only needed if a module that's not in the - kernel tree needs to calculate CRC checks for use with the - SCSI data integrity subsystem. - -config ARCH_HAS_CRC_T10DIF - bool - -config CRC_T10DIF_ARCH - tristate - default CRC_T10DIF if ARCH_HAS_CRC_T10DIF && CRC_OPTIMIZATIONS - -config CRC_ITU_T - tristate "CRC ITU-T V.41 functions" - help - This option is provided for the case where no in-kernel-tree - modules require CRC ITU-T V.41 functions, but a module built outside - the kernel tree does. Such modules that use library CRC ITU-T V.41 - functions require M here. - -config CRC32 - tristate "CRC32/CRC32c functions" - default y - select BITREVERSE - help - This option is provided for the case where no in-kernel-tree - modules require CRC32/CRC32c functions, but a module built outside - the kernel tree does. Such modules that use library CRC32/CRC32c - functions require M here. - -config ARCH_HAS_CRC32 - bool - -config CRC32_ARCH - tristate - default CRC32 if ARCH_HAS_CRC32 && CRC_OPTIMIZATIONS - -config CRC64 - tristate - -config ARCH_HAS_CRC64 - bool - -config CRC64_ARCH - tristate - default CRC64 if ARCH_HAS_CRC64 && CRC_OPTIMIZATIONS - -config CRC4 - tristate - -config CRC7 - tristate - -config LIBCRC32C - tristate - select CRC32 - help - This option just selects CRC32 and is provided for compatibility - purposes until the users are updated to select CRC32 directly. - -config CRC8 - tristate - -config CRC_OPTIMIZATIONS - bool "Enable optimized CRC implementations" if EXPERT - default y - help - Disabling this option reduces code size slightly by disabling the - architecture-optimized implementations of any CRC variants that are - enabled. CRC checksumming performance may get much slower. - - Keep this enabled unless you're really trying to minimize the size of - the kernel. - config XXHASH tristate @@ -721,6 +630,7 @@ config GENERIC_LIB_DEVMEM_IS_ALLOWED config PLDMFW bool + select CRC32 default n config ASN1_ENCODER diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index df9587aa5c5e..dc0e0c6ed075 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -335,12 +335,12 @@ config DEBUG_INFO_COMPRESSED_ZLIB Compress the debug information using zlib. Requires GCC 5.0+ or Clang 5.0+, binutils 2.26+, and zlib. 
- Users of dpkg-deb via scripts/package/builddeb may find an increase in + Users of dpkg-deb via debian/rules may find an increase in size of their debug .deb packages with this config set, due to the debug info being compressed with zlib, then the object files being recompressed with a different compression scheme. But this is still - preferable to setting $KDEB_COMPRESS to "none" which would be even - larger. + preferable to setting KDEB_COMPRESS or DPKG_DEB_COMPRESSOR_TYPE to + "none" which would be even larger. config DEBUG_INFO_COMPRESSED_ZSTD bool "Compress debugging information with zstd" @@ -473,7 +473,6 @@ config READABLE_ASM config HEADERS_INSTALL bool "Install uapi headers to usr/include" - depends on !UML help This option will install uapi headers (headers exported to user-space) into the usr/include directory for use during the kernel build. @@ -2154,18 +2153,12 @@ config ARCH_HAS_KCOV build and run with CONFIG_KCOV. This typically requires disabling instrumentation for some early boot code. -config CC_HAS_SANCOV_TRACE_PC - def_bool $(cc-option,-fsanitize-coverage=trace-pc) - - config KCOV bool "Code coverage for fuzzing" depends on ARCH_HAS_KCOV - depends on CC_HAS_SANCOV_TRACE_PC || GCC_PLUGINS depends on !ARCH_WANTS_NO_INSTR || HAVE_NOINSTR_HACK || \ GCC_VERSION >= 120000 || CC_IS_CLANG select DEBUG_FS - select GCC_PLUGIN_SANCOV if !CC_HAS_SANCOV_TRACE_PC select OBJTOOL if HAVE_NOINSTR_HACK help KCOV exposes kernel code coverage information in a form suitable @@ -2467,6 +2460,15 @@ config SCANF_KUNIT_TEST If unsure, say N. +config SEQ_BUF_KUNIT_TEST + tristate "KUnit test for seq_buf" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This builds unit tests for the seq_buf library. + + If unsure, say N. + config STRING_KUNIT_TEST tristate "KUnit test string functions at runtime" if !KUNIT_ALL_TESTS depends on KUNIT @@ -2513,8 +2515,8 @@ config TEST_IDA tristate "Perform selftest on IDA functions" config TEST_MISC_MINOR - tristate "miscdevice KUnit test" if !KUNIT_ALL_TESTS - depends on KUNIT + bool "miscdevice KUnit test" if !KUNIT_ALL_TESTS + depends on KUNIT=y default KUNIT_ALL_TESTS help Kunit test for miscdevice API, specially its behavior in respect to @@ -2575,8 +2577,7 @@ config TEST_BITOPS config TEST_VMALLOC tristate "Test module for stress/performance analysis of vmalloc allocator" default n - depends on MMU - depends on m + depends on MMU help This builds the "test_vmalloc" module that should be used for stress and performance analysis. So, any new change for vmalloc @@ -2864,6 +2865,14 @@ config OVERFLOW_KUNIT_TEST If unsure, say N. +config RANDSTRUCT_KUNIT_TEST + tristate "Test randstruct structure layout randomization at runtime" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + Builds unit tests for the checking CONFIG_RANDSTRUCT=y, which + randomizes structure layouts. + config STACKINIT_KUNIT_TEST tristate "Test level of stack variable initialization" if !KUNIT_ALL_TESTS depends on KUNIT @@ -2871,9 +2880,7 @@ config STACKINIT_KUNIT_TEST help Test if the kernel is zero-initializing stack variables and padding. Coverage is controlled by compiler flags, - CONFIG_INIT_STACK_ALL_PATTERN, CONFIG_INIT_STACK_ALL_ZERO, - CONFIG_GCC_PLUGIN_STRUCTLEAK, CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF, - or CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL. + CONFIG_INIT_STACK_ALL_PATTERN or CONFIG_INIT_STACK_ALL_ZERO. 
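Note (not part of the diff): CONFIG_SEQ_BUF_KUNIT_TEST above builds KUnit coverage for the seq_buf library. As a rough illustration only, a minimal KUnit case against the standard seq_buf API could look like the sketch below; the actual suite added by this series will differ.

	#include <kunit/test.h>
	#include <linux/seq_buf.h>

	/* Illustrative sketch, not the in-tree seq_buf test. */
	static void seq_buf_printf_example(struct kunit *test)
	{
		DECLARE_SEQ_BUF(s, 32);		/* 32-byte stack-backed seq_buf */

		seq_buf_printf(&s, "%d+%d=%d", 2, 2, 4);

		/* "2+2=4" is five bytes and is NUL-terminated in the buffer */
		KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 5U);
		KUNIT_EXPECT_STREQ(test, s.buffer, "2+2=4");
	}

	static struct kunit_case seq_buf_example_cases[] = {
		KUNIT_CASE(seq_buf_printf_example),
		{}
	};

	static struct kunit_suite seq_buf_example_suite = {
		.name		= "seq_buf_example",
		.test_cases	= seq_buf_example_cases,
	};
	kunit_test_suite(seq_buf_example_suite);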
config FORTIFY_KUNIT_TEST tristate "Test fortified str*() and mem*() function internals at runtime" if !KUNIT_ALL_TESTS @@ -2887,6 +2894,7 @@ config FORTIFY_KUNIT_TEST config LONGEST_SYM_KUNIT_TEST tristate "Test the longest symbol possible" if !KUNIT_ALL_TESTS depends on KUNIT && KPROBES + depends on !PREFIX_SYMBOLS && !CFI_CLANG && !GCOV_KERNEL default KUNIT_ALL_TESTS help Tests the longest symbol possible @@ -2903,27 +2911,6 @@ config HW_BREAKPOINT_KUNIT_TEST If unsure, say N. -config CRC_KUNIT_TEST - tristate "KUnit tests for CRC functions" if !KUNIT_ALL_TESTS - depends on KUNIT - default KUNIT_ALL_TESTS - select CRC7 - select CRC16 - select CRC_T10DIF - select CRC32 - select CRC64 - help - Unit tests for the CRC library functions. - - This is intended to help people writing architecture-specific - optimized versions. If unsure, say N. - -config CRC_BENCHMARK - bool "Benchmark for the CRC functions" - depends on CRC_KUNIT_TEST - help - Include benchmarks in the KUnit test suite for the CRC functions. - config SIPHASH_KUNIT_TEST tristate "Perform selftest on siphash functions" if !KUNIT_ALL_TESTS depends on KUNIT @@ -2984,13 +2971,7 @@ config TEST_DYNAMIC_DEBUG config TEST_KMOD tristate "kmod stress tester" depends on m - depends on NETDEVICES && NET_CORE && INET # for TUN - depends on BLOCK - depends on PAGE_SIZE_LESS_THAN_256KB # for BTRFS select TEST_LKM - select XFS_FS - select TUN - select BTRFS_FS help Test the kernel's module loading mechanism: kmod. kmod implements support to load modules using the Linux kernel's usermode helper. @@ -3233,6 +3214,37 @@ config TEST_OBJPOOL If unsure, say N. +config TEST_KEXEC_HANDOVER + bool "Test for Kexec HandOver" + default n + depends on KEXEC_HANDOVER + help + This option enables test for Kexec HandOver (KHO). + The test consists of two parts: saving kernel data before kexec and + restoring the data after kexec and verifying that it was properly + handed over. This test module creates and saves data on the boot of + the first kernel and restores and verifies the data on the boot of + kexec'ed kernel. + + For detailed documentation about KHO, see Documentation/core-api/kho. + + To run the test run: + + tools/testing/selftests/kho/vmtest.sh -h + + If unsure, say N. + +config RATELIMIT_KUNIT_TEST + tristate "KUnit Test for correctness and stress of ratelimit" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This builds the "test_ratelimit" module that should be used + for correctness verification and concurrent testings of rate + limiting. + + If unsure, say N. 
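Note (not part of the diff): RATELIMIT_KUNIT_TEST above verifies the ratelimit code for correctness under concurrency. For context, the interface being exercised is the long-standing DEFINE_RATELIMIT_STATE()/__ratelimit() API; a hypothetical caller looks like this (illustrative sketch only):

	#include <linux/jiffies.h>
	#include <linux/printk.h>
	#include <linux/ratelimit.h>

	/* Default window: at most DEFAULT_RATELIMIT_BURST (10) events per 5 seconds */
	static DEFINE_RATELIMIT_STATE(example_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	static void example_event(int i)
	{
		/* __ratelimit() returns nonzero while the current burst still allows it */
		if (__ratelimit(&example_rs))
			pr_info("event %d\n", i);
		/* suppressed callers are counted and reported when the window rolls over */
	}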
+ config INT_POW_KUNIT_TEST tristate "Integer exponentiation (int_pow) test" if !KUNIT_ALL_TESTS depends on KUNIT @@ -3291,7 +3303,7 @@ config GCD_KUNIT_TEST config PRIME_NUMBERS_KUNIT_TEST tristate "Prime number generator test" if !KUNIT_ALL_TESTS depends on KUNIT - select PRIME_NUMBERS + depends on PRIME_NUMBERS default KUNIT_ALL_TESTS help This option enables the KUnit test suite for the {is,next}_prime_number diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 4216b3a4ff21..744121178815 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -118,7 +118,8 @@ config UBSAN_UNREACHABLE config UBSAN_INTEGER_WRAP bool "Perform checking for integer arithmetic wrap-around" - default UBSAN + # This is very experimental so drop the next line if you really want it + depends on BROKEN depends on !COMPILE_TEST depends on $(cc-option,-fsanitize-undefined-ignore-overflow-pattern=all) depends on $(cc-option,-fsanitize=signed-integer-overflow) @@ -166,4 +167,13 @@ config TEST_UBSAN This is a test module for UBSAN. It triggers various undefined behavior, and detect it. +config UBSAN_KVM_EL2 + bool "UBSAN for KVM code at EL2" + depends on ARM64 + help + Enable UBSAN when running on ARM64 with KVM in a split mode + (nvhe/hvhe/protected) for the hypervisor code running in EL2. + In this mode, any UBSAN violation in EL2 would panic the kernel + and information similar to UBSAN_TRAP would be printed. + endif # if UBSAN diff --git a/lib/Makefile b/lib/Makefile index f07b24ce1b3f..392ff808c9b9 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -40,7 +40,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ nmi_backtrace.o win_minmax.o memcat_p.o \ - buildid.o objpool.o iomem_copy.o + buildid.o objpool.o iomem_copy.o sys_info.o lib-$(CONFIG_UNION_FIND) += union_find.o lib-$(CONFIG_PRINTK) += dump_stack.o @@ -71,7 +71,6 @@ CFLAGS_test_bitops.o += -Werror obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_IDA) += test_ida.o obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o -CFLAGS_test_ubsan.o += $(call cc-disable-warning, vla) CFLAGS_test_ubsan.o += $(call cc-disable-warning, unused-but-set-variable) UBSAN_SANITIZE_test_ubsan.o := y obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o @@ -103,6 +102,7 @@ obj-$(CONFIG_TEST_HMM) += test_hmm.o obj-$(CONFIG_TEST_FREE_PAGES) += test_free_pages.o obj-$(CONFIG_TEST_REF_TRACKER) += test_ref_tracker.o obj-$(CONFIG_TEST_OBJPOOL) += test_objpool.o +obj-$(CONFIG_TEST_KEXEC_HANDOVER) += test_kho.o obj-$(CONFIG_TEST_FPU) += test_fpu.o test_fpu-y := test_fpu_glue.o test_fpu_impl.o @@ -123,7 +123,7 @@ endif obj-$(CONFIG_DEBUG_INFO_REDUCED) += debug_info.o CFLAGS_debug_info.o += $(call cc-option, -femit-struct-debug-detailed=any) -obj-y += math/ crypto/ tests/ vdso/ +obj-y += math/ crc/ crypto/ tests/ vdso/ obj-$(CONFIG_GENERIC_IOMAP) += iomap.o obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o @@ -149,15 +149,6 @@ obj-$(CONFIG_BITREVERSE) += bitrev.o obj-$(CONFIG_LINEAR_RANGES) += linear_ranges.o obj-$(CONFIG_PACKING) += packing.o obj-$(CONFIG_PACKING_KUNIT_TEST) += packing_test.o -obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o -obj-$(CONFIG_CRC16) += crc16.o -obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o -obj-$(CONFIG_CRC_ITU_T) += crc-itu-t.o -obj-$(CONFIG_CRC32) += crc32.o -obj-$(CONFIG_CRC64) += crc64.o -obj-$(CONFIG_CRC4) += crc4.o -obj-$(CONFIG_CRC7) += crc7.o -obj-$(CONFIG_CRC8) += crc8.o obj-$(CONFIG_XXHASH) += xxhash.o obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o @@ -295,27 
+286,6 @@ obj-$(CONFIG_ASN1_ENCODER) += asn1_encoder.o obj-$(CONFIG_FONT_SUPPORT) += fonts/ -hostprogs := gen_crc32table -hostprogs += gen_crc64table -clean-files := crc32table.h -clean-files += crc64table.h - -$(obj)/crc32.o: $(obj)/crc32table.h - -quiet_cmd_crc32 = GEN $@ - cmd_crc32 = $< > $@ - -$(obj)/crc32table.h: $(obj)/gen_crc32table - $(call cmd,crc32) - -$(obj)/crc64.o: $(obj)/crc64table.h - -quiet_cmd_crc64 = GEN $@ - cmd_crc64 = $< > $@ - -$(obj)/crc64table.h: $(obj)/gen_crc64table - $(call cmd,crc64) - # # Build a fast OID lookip registry from include/linux/oid_registry.h # @@ -338,7 +308,7 @@ obj-$(CONFIG_UBSAN) += ubsan.o UBSAN_SANITIZE_ubsan.o := n KASAN_SANITIZE_ubsan.o := n KCSAN_SANITIZE_ubsan.o := n -CFLAGS_ubsan.o := -fno-stack-protector $(DISABLE_STACKLEAK_PLUGIN) +CFLAGS_ubsan.o := -fno-stack-protector $(DISABLE_KSTACK_ERASE) obj-$(CONFIG_SBITMAP) += sbitmap.o diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 1d893e313614..e9b33848700a 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -10,6 +10,7 @@ #include <linux/seq_buf.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> +#include <linux/kmemleak.h> #define ALLOCINFO_FILE_NAME "allocinfo" #define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag)) @@ -24,8 +25,10 @@ static bool mem_profiling_support; static struct codetag_type *alloc_tag_cttype; +#ifdef CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU DEFINE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); EXPORT_SYMBOL(_shared_alloc_tag); +#endif DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, mem_alloc_profiling_key); @@ -45,21 +48,16 @@ struct allocinfo_private { static void *allocinfo_start(struct seq_file *m, loff_t *pos) { struct allocinfo_private *priv; - struct codetag *ct; loff_t node = *pos; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - m->private = priv; - if (!priv) - return NULL; - - priv->print_header = (node == 0); + priv = (struct allocinfo_private *)m->private; codetag_lock_module_list(alloc_tag_cttype, true); - priv->iter = codetag_get_ct_iter(alloc_tag_cttype); - while ((ct = codetag_next_ct(&priv->iter)) != NULL && node) - node--; - - return ct ? priv : NULL; + if (node == 0) { + priv->print_header = true; + priv->iter = codetag_get_ct_iter(alloc_tag_cttype); + codetag_next_ct(&priv->iter); + } + return priv->iter.ct ? 
priv : NULL; } static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos) @@ -76,12 +74,7 @@ static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos) static void allocinfo_stop(struct seq_file *m, void *arg) { - struct allocinfo_private *priv = (struct allocinfo_private *)m->private; - - if (priv) { - codetag_lock_module_list(alloc_tag_cttype, false); - kfree(priv); - } + codetag_lock_module_list(alloc_tag_cttype, false); } static void print_allocinfo_header(struct seq_buf *buf) @@ -134,6 +127,9 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl struct codetag_bytes n; unsigned int i, nr = 0; + if (IS_ERR_OR_NULL(alloc_tag_cttype)) + return 0; + if (can_sleep) codetag_lock_module_list(alloc_tag_cttype, true); else if (!codetag_trylock_module_list(alloc_tag_cttype)) @@ -244,17 +240,6 @@ static void shutdown_mem_profiling(bool remove_file) mem_profiling_support = false; } -static void __init procfs_init(void) -{ - if (!mem_profiling_support) - return; - - if (!proc_create_seq(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op)) { - pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME); - shutdown_mem_profiling(false); - } -} - void __init alloc_tag_sec_init(void) { struct alloc_tag *last_codetag; @@ -350,18 +335,28 @@ static bool needs_section_mem(struct module *mod, unsigned long size) return size >= sizeof(struct alloc_tag); } -static struct alloc_tag *find_used_tag(struct alloc_tag *from, struct alloc_tag *to) +static bool clean_unused_counters(struct alloc_tag *start_tag, + struct alloc_tag *end_tag) { - while (from <= to) { + struct alloc_tag *tag; + bool ret = true; + + for (tag = start_tag; tag <= end_tag; tag++) { struct alloc_tag_counters counter; - counter = alloc_tag_read(from); - if (counter.bytes) - return from; - from++; + if (!tag->counters) + continue; + + counter = alloc_tag_read(tag); + if (!counter.bytes) { + free_percpu(tag->counters); + tag->counters = NULL; + } else { + ret = false; + } } - return NULL; + return ret; } /* Called with mod_area_mt locked */ @@ -371,12 +366,16 @@ static void clean_unused_module_areas_locked(void) struct module *val; mas_for_each(&mas, val, module_tags.size) { + struct alloc_tag *start_tag; + struct alloc_tag *end_tag; + if (val != &unloaded_mod) continue; /* Release area if all tags are unused */ - if (!find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index), - (struct alloc_tag *)(module_tags.start_addr + mas.last))) + start_tag = (struct alloc_tag *)(module_tags.start_addr + mas.index); + end_tag = (struct alloc_tag *)(module_tags.start_addr + mas.last); + if (clean_unused_counters(start_tag, end_tag)) mas_erase(&mas); } } @@ -422,11 +421,20 @@ static int vm_module_tags_populate(void) unsigned long old_shadow_end = ALIGN(phys_end, MODULE_ALIGN); unsigned long new_shadow_end = ALIGN(new_end, MODULE_ALIGN); unsigned long more_pages; - unsigned long nr; + unsigned long nr = 0; more_pages = ALIGN(new_end - phys_end, PAGE_SIZE) >> PAGE_SHIFT; - nr = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN, - NUMA_NO_NODE, more_pages, next_page); + while (nr < more_pages) { + unsigned long allocated; + + allocated = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN, + NUMA_NO_NODE, more_pages - nr, next_page + nr); + + if (!allocated) + break; + nr += allocated; + } + if (nr < more_pages || vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL, next_page, PAGE_SHIFT) < 0) { @@ -552,7 +560,8 @@ unlock: static void release_module_tags(struct module *mod, bool 
used) { MA_STATE(mas, &mod_area_mt, module_tags.size, module_tags.size); - struct alloc_tag *tag; + struct alloc_tag *start_tag; + struct alloc_tag *end_tag; struct module *val; mas_lock(&mas); @@ -566,15 +575,22 @@ static void release_module_tags(struct module *mod, bool used) if (!used) goto release_area; - /* Find out if the area is used */ - tag = find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index), - (struct alloc_tag *)(module_tags.start_addr + mas.last)); - if (tag) { - struct alloc_tag_counters counter = alloc_tag_read(tag); + start_tag = (struct alloc_tag *)(module_tags.start_addr + mas.index); + end_tag = (struct alloc_tag *)(module_tags.start_addr + mas.last); + if (!clean_unused_counters(start_tag, end_tag)) { + struct alloc_tag *tag; + + for (tag = start_tag; tag <= end_tag; tag++) { + struct alloc_tag_counters counter; - pr_info("%s:%u module %s func:%s has %llu allocated at module unload\n", - tag->ct.filename, tag->ct.lineno, tag->ct.modname, - tag->ct.function, counter.bytes); + if (!tag->counters) + continue; + + counter = alloc_tag_read(tag); + pr_info("%s:%u module %s func:%s has %llu allocated at module unload\n", + tag->ct.filename, tag->ct.lineno, tag->ct.modname, + tag->ct.function, counter.bytes); + } } else { used = false; } @@ -587,6 +603,41 @@ out: mas_unlock(&mas); } +static int load_module(struct module *mod, struct codetag *start, struct codetag *stop) +{ + /* Allocate module alloc_tag percpu counters */ + struct alloc_tag *start_tag; + struct alloc_tag *stop_tag; + struct alloc_tag *tag; + + /* percpu counters for core allocations are already statically allocated */ + if (!mod) + return 0; + + start_tag = ct_to_alloc_tag(start); + stop_tag = ct_to_alloc_tag(stop); + for (tag = start_tag; tag < stop_tag; tag++) { + WARN_ON(tag->counters); + tag->counters = alloc_percpu(struct alloc_tag_counters); + if (!tag->counters) { + while (--tag >= start_tag) { + free_percpu(tag->counters); + tag->counters = NULL; + } + pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s\n", + mod->name); + return -ENOMEM; + } + + /* + * Avoid a kmemleak false positive. The pointer to the counters is stored + * in the alloc_tag section of the module and cannot be directly accessed. 
+ */ + kmemleak_ignore_percpu(tag->counters); + } + return 0; +} + static void replace_module(struct module *mod, struct module *new_mod) { MA_STATE(mas, &mod_area_mt, 0, module_tags.size); @@ -748,24 +799,41 @@ static int __init alloc_tag_init(void) .needs_section_mem = needs_section_mem, .alloc_section_mem = reserve_module_tags, .free_section_mem = release_module_tags, + .module_load = load_module, .module_replaced = replace_module, #endif }; int res; + sysctl_init(); + + if (!mem_profiling_support) { + pr_info("Memory allocation profiling is not supported!\n"); + return 0; + } + + if (!proc_create_seq_private(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op, + sizeof(struct allocinfo_private), NULL)) { + pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME); + shutdown_mem_profiling(false); + return -ENOMEM; + } + res = alloc_mod_tags_mem(); - if (res) + if (res) { + pr_err("Failed to reserve address space for module tags, errno = %d\n", res); + shutdown_mem_profiling(true); return res; + } alloc_tag_cttype = codetag_register_type(&desc); if (IS_ERR(alloc_tag_cttype)) { + pr_err("Allocation tags registration failed, errno = %ld\n", PTR_ERR(alloc_tag_cttype)); free_mod_tags_mem(); + shutdown_mem_profiling(true); return PTR_ERR(alloc_tag_cttype); } - sysctl_init(); - procfs_init(); - return 0; } module_init(alloc_tag_init); diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 13da529e2e72..5738ae286b41 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -518,4 +518,5 @@ error: } EXPORT_SYMBOL_GPL(asn1_ber_decoder); +MODULE_DESCRIPTION("Decoder for ASN.1 BER/DER/CER encoded bytestream"); MODULE_LICENSE("GPL"); diff --git a/lib/codetag.c b/lib/codetag.c index 42aadd6c1454..545911cebd25 100644 --- a/lib/codetag.c +++ b/lib/codetag.c @@ -11,8 +11,14 @@ struct codetag_type { struct list_head link; unsigned int count; struct idr mod_idr; - struct rw_semaphore mod_lock; /* protects mod_idr */ + /* + * protects mod_idr, next_mod_seq, + * iter->mod_seq and cmod->mod_seq + */ + struct rw_semaphore mod_lock; struct codetag_type_desc desc; + /* generates unique sequence number for module load */ + unsigned long next_mod_seq; }; struct codetag_range { @@ -23,6 +29,7 @@ struct codetag_range { struct codetag_module { struct module *mod; struct codetag_range range; + unsigned long mod_seq; }; static DEFINE_MUTEX(codetag_lock); @@ -48,6 +55,7 @@ struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype) .cmod = NULL, .mod_id = 0, .ct = NULL, + .mod_seq = 0, }; return iter; @@ -91,11 +99,13 @@ struct codetag *codetag_next_ct(struct codetag_iterator *iter) if (!cmod) break; - if (cmod != iter->cmod) { + if (!iter->cmod || iter->mod_seq != cmod->mod_seq) { iter->cmod = cmod; + iter->mod_seq = cmod->mod_seq; ct = get_first_module_ct(cmod); - } else + } else { ct = get_next_module_ct(iter); + } if (ct) break; @@ -167,6 +177,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod) { struct codetag_range range; struct codetag_module *cmod; + int mod_id; int err; range = get_section_range(mod, cttype->desc.section); @@ -190,11 +201,21 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod) cmod->range = range; down_write(&cttype->mod_lock); - err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL); - if (err >= 0) { - cttype->count += range_size(cttype, &range); - if (cttype->desc.module_load) - cttype->desc.module_load(cttype, cmod); + cmod->mod_seq = ++cttype->next_mod_seq; + mod_id = idr_alloc(&cttype->mod_idr, cmod, 0, 
0, GFP_KERNEL); + if (mod_id >= 0) { + if (cttype->desc.module_load) { + err = cttype->desc.module_load(mod, range.start, range.stop); + if (!err) + cttype->count += range_size(cttype, &range); + else + idr_remove(&cttype->mod_idr, mod_id); + } else { + cttype->count += range_size(cttype, &range); + err = 0; + } + } else { + err = mod_id; } up_write(&cttype->mod_lock); @@ -295,17 +316,23 @@ void codetag_module_replaced(struct module *mod, struct module *new_mod) mutex_unlock(&codetag_lock); } -void codetag_load_module(struct module *mod) +int codetag_load_module(struct module *mod) { struct codetag_type *cttype; + int ret = 0; if (!mod) - return; + return 0; mutex_lock(&codetag_lock); - list_for_each_entry(cttype, &codetag_types, link) - codetag_module_init(cttype, mod); + list_for_each_entry(cttype, &codetag_types, link) { + ret = codetag_module_init(cttype, mod); + if (ret) + break; + } mutex_unlock(&codetag_lock); + + return ret; } void codetag_unload_module(struct module *mod) @@ -333,7 +360,8 @@ void codetag_unload_module(struct module *mod) } if (found) { if (cttype->desc.module_unload) - cttype->desc.module_unload(cttype, cmod); + cttype->desc.module_unload(cmod->mod, + cmod->range.start, cmod->range.stop); cttype->count -= range_size(cttype, &cmod->range); idr_remove(&cttype->mod_idr, mod_id); diff --git a/lib/crc/.gitignore b/lib/crc/.gitignore new file mode 100644 index 000000000000..a9e48103c9fb --- /dev/null +++ b/lib/crc/.gitignore @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +/crc32table.h +/crc64table.h +/gen_crc32table +/gen_crc64table diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig new file mode 100644 index 000000000000..70e7a6016de3 --- /dev/null +++ b/lib/crc/Kconfig @@ -0,0 +1,119 @@ +# SPDX-License-Identifier: GPL-2.0-only + +# Kconfig for the kernel's cyclic redundancy check (CRC) library code + +config CRC4 + tristate + help + The CRC4 library functions. Select this if your module uses any of + the functions from <linux/crc4.h>. + +config CRC7 + tristate + help + The CRC7 library functions. Select this if your module uses any of + the functions from <linux/crc7.h>. + +config CRC8 + tristate + help + The CRC8 library functions. Select this if your module uses any of + the functions from <linux/crc8.h>. + +config CRC16 + tristate + help + The CRC16 library functions. Select this if your module uses any of + the functions from <linux/crc16.h>. + +config CRC_CCITT + tristate + help + The CRC-CCITT library functions. Select this if your module uses any + of the functions from <linux/crc-ccitt.h>. + +config CRC_ITU_T + tristate + help + The CRC-ITU-T library functions. Select this if your module uses + any of the functions from <linux/crc-itu-t.h>. + +config CRC_T10DIF + tristate + help + The CRC-T10DIF library functions. Select this if your module uses + any of the functions from <linux/crc-t10dif.h>. + +config CRC_T10DIF_ARCH + bool + depends on CRC_T10DIF && CRC_OPTIMIZATIONS + default y if ARM && KERNEL_MODE_NEON + default y if ARM64 && KERNEL_MODE_NEON + default y if PPC64 && ALTIVEC + default y if RISCV && RISCV_ISA_ZBC + default y if X86 + +config CRC32 + tristate + select BITREVERSE + help + The CRC32 library functions. Select this if your module uses any of + the functions from <linux/crc32.h> or <linux/crc32c.h>. 
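Note (not part of the diff): after the move to lib/crc/Kconfig, CRC32 is a hidden tristate, so a consumer enables it with "select CRC32" in its own Kconfig entry and then calls the helpers directly. A minimal, hypothetical caller of <linux/crc32.h>:

	#include <linux/crc32.h>
	#include <linux/types.h>

	/* Illustrative only: standard CRC-32 (IEEE), seeded with all-ones and inverted */
	static u32 example_crc32(const u8 *data, size_t len)
	{
		return crc32_le(~0U, data, len) ^ ~0U;
	}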
+ +config CRC32_ARCH + bool + depends on CRC32 && CRC_OPTIMIZATIONS + default y if ARM && KERNEL_MODE_NEON + default y if ARM64 + default y if LOONGARCH + default y if MIPS && CPU_MIPSR6 + default y if PPC64 && ALTIVEC + default y if RISCV && RISCV_ISA_ZBC + default y if S390 + default y if SPARC64 + default y if X86 + +config CRC64 + tristate + help + The CRC64 library functions. Select this if your module uses any of + the functions from <linux/crc64.h>. + +config CRC64_ARCH + bool + depends on CRC64 && CRC_OPTIMIZATIONS + default y if RISCV && RISCV_ISA_ZBC && 64BIT + default y if X86_64 + +config CRC_OPTIMIZATIONS + bool "Enable optimized CRC implementations" if EXPERT + depends on !UML + default y + help + Disabling this option reduces code size slightly by disabling the + architecture-optimized implementations of any CRC variants that are + enabled. CRC checksumming performance may get much slower. + + Keep this enabled unless you're really trying to minimize the size of + the kernel. + +config CRC_KUNIT_TEST + tristate "KUnit tests for CRC functions" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + select CRC7 + select CRC16 + select CRC_T10DIF + select CRC32 + select CRC64 + help + Unit tests for the CRC library functions. + + This is intended to help people writing architecture-specific + optimized versions. If unsure, say N. + +config CRC_BENCHMARK + bool "Benchmark for the CRC functions" + depends on CRC_KUNIT_TEST + help + Include benchmarks in the KUnit test suite for the CRC functions. diff --git a/lib/crc/Makefile b/lib/crc/Makefile new file mode 100644 index 000000000000..7543ad295ab6 --- /dev/null +++ b/lib/crc/Makefile @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: GPL-2.0-only + +# Makefile for the kernel's cyclic redundancy check (CRC) library code + +obj-$(CONFIG_CRC4) += crc4.o +obj-$(CONFIG_CRC7) += crc7.o +obj-$(CONFIG_CRC8) += crc8.o +obj-$(CONFIG_CRC16) += crc16.o +obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o +obj-$(CONFIG_CRC_ITU_T) += crc-itu-t.o + +obj-$(CONFIG_CRC_T10DIF) += crc-t10dif.o +crc-t10dif-y := crc-t10dif-main.o +ifeq ($(CONFIG_CRC_T10DIF_ARCH),y) +CFLAGS_crc-t10dif-main.o += -I$(src)/$(SRCARCH) +crc-t10dif-$(CONFIG_ARM) += arm/crc-t10dif-core.o +crc-t10dif-$(CONFIG_ARM64) += arm64/crc-t10dif-core.o +crc-t10dif-$(CONFIG_PPC) += powerpc/crct10dif-vpmsum_asm.o +crc-t10dif-$(CONFIG_RISCV) += riscv/crc16_msb.o +crc-t10dif-$(CONFIG_X86) += x86/crc16-msb-pclmul.o +endif + +obj-$(CONFIG_CRC32) += crc32.o +crc32-y := crc32-main.o +ifeq ($(CONFIG_CRC32_ARCH),y) +CFLAGS_crc32-main.o += -I$(src)/$(SRCARCH) +crc32-$(CONFIG_ARM) += arm/crc32-core.o +crc32-$(CONFIG_ARM64) += arm64/crc32-core.o +crc32-$(CONFIG_PPC) += powerpc/crc32c-vpmsum_asm.o +crc32-$(CONFIG_RISCV) += riscv/crc32_lsb.o riscv/crc32_msb.o +crc32-$(CONFIG_S390) += s390/crc32le-vx.o s390/crc32be-vx.o +crc32-$(CONFIG_SPARC) += sparc/crc32c_asm.o +crc32-$(CONFIG_X86) += x86/crc32-pclmul.o +crc32-$(CONFIG_X86_64) += x86/crc32c-3way.o +endif + +obj-$(CONFIG_CRC64) += crc64.o +crc64-y := crc64-main.o +ifeq ($(CONFIG_CRC64_ARCH),y) +CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH) +crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o +crc64-$(CONFIG_X86) += x86/crc64-pclmul.o +endif + +obj-y += tests/ + +hostprogs := gen_crc32table gen_crc64table +clean-files := crc32table.h crc64table.h + +$(obj)/crc32-main.o: $(obj)/crc32table.h +$(obj)/crc64-main.o: $(obj)/crc64table.h + +quiet_cmd_crc32 = GEN $@ + cmd_crc32 = $< > $@ + +quiet_cmd_crc64 = GEN $@ + cmd_crc64 = $< > $@ + +$(obj)/crc32table.h: 
$(obj)/gen_crc32table + $(call cmd,crc32) + +$(obj)/crc64table.h: $(obj)/gen_crc64table + $(call cmd,crc64) diff --git a/lib/crc/arm/crc-t10dif-core.S b/lib/crc/arm/crc-t10dif-core.S new file mode 100644 index 000000000000..2bbf2df9c1e2 --- /dev/null +++ b/lib/crc/arm/crc-t10dif-core.S @@ -0,0 +1,468 @@ +// +// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions +// +// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> +// Copyright (C) 2019 Google LLC <ebiggers@google.com> +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License version 2 as +// published by the Free Software Foundation. +// + +// Derived from the x86 version: +// +// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +// +// Copyright (c) 2013, Intel Corporation +// +// Authors: +// Erdinc Ozturk <erdinc.ozturk@intel.com> +// Vinodh Gopal <vinodh.gopal@intel.com> +// James Guilford <james.guilford@intel.com> +// Tim Chen <tim.c.chen@linux.intel.com> +// +// This software is available to you under a choice of one of two +// licenses. You may choose to be licensed under the terms of the GNU +// General Public License (GPL) Version 2, available from the file +// COPYING in the main directory of this source tree, or the +// OpenIB.org BSD license below: +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// * Neither the name of the Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// +// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Reference paper titled "Fast CRC Computation for Generic +// Polynomials Using PCLMULQDQ Instruction" +// URL: http://www.intel.com/content/dam/www/public/us/en/documents +// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +// + +#include <linux/linkage.h> +#include <asm/assembler.h> + +#ifdef CONFIG_CPU_ENDIAN_BE8 +#define CPU_LE(code...) +#else +#define CPU_LE(code...) 
code +#endif + + .text + .arch armv8-a + .fpu crypto-neon-fp-armv8 + + init_crc .req r0 + buf .req r1 + len .req r2 + + fold_consts_ptr .req ip + + q0l .req d0 + q0h .req d1 + q1l .req d2 + q1h .req d3 + q2l .req d4 + q2h .req d5 + q3l .req d6 + q3h .req d7 + q4l .req d8 + q4h .req d9 + q5l .req d10 + q5h .req d11 + q6l .req d12 + q6h .req d13 + q7l .req d14 + q7h .req d15 + q8l .req d16 + q8h .req d17 + q9l .req d18 + q9h .req d19 + q10l .req d20 + q10h .req d21 + q11l .req d22 + q11h .req d23 + q12l .req d24 + q12h .req d25 + + FOLD_CONSTS .req q10 + FOLD_CONST_L .req q10l + FOLD_CONST_H .req q10h + + /* + * Pairwise long polynomial multiplication of two 16-bit values + * + * { w0, w1 }, { y0, y1 } + * + * by two 64-bit values + * + * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 } + * + * where each vector element is a byte, ordered from least to most + * significant. The resulting 80-bit vectors are XOR'ed together. + * + * This can be implemented using 8x8 long polynomial multiplication, by + * reorganizing the input so that each pairwise 8x8 multiplication + * produces one of the terms from the decomposition below, and + * combining the results of each rank and shifting them into place. + * + * Rank + * 0 w0*x0 ^ | y0*z0 ^ + * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^ + * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^ + * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^ + * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^ + * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^ + * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^ + * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^ + * 8 w1*x7 << 64 | y1*z7 << 64 + * + * The inputs can be reorganized into + * + * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 } + * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 } + * + * and after performing 8x8->16 bit long polynomial multiplication of + * each of the halves of the first vector with those of the second one, + * we obtain the following four vectors of 16-bit elements: + * + * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 } + * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 } + * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 } + * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 } + * + * Results b and c can be XORed together, as the vector elements have + * matching ranks. Then, the final XOR can be pulled forward, and + * applied between the halves of each of the remaining three vectors, + * which are then shifted into place, and XORed together to produce the + * final 80-bit result. + */ + .macro pmull16x64_p8, v16, v64 + vext.8 q11, \v64, \v64, #1 + vld1.64 {q12}, [r4, :128] + vuzp.8 q11, \v64 + vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24 + vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25 + bl __pmull16x64_p8 + veor \v64, q12, q14 + .endm + +__pmull16x64_p8: + vmull.p8 q13, d23, d24 + vmull.p8 q14, d23, d25 + vmull.p8 q15, d22, d24 + vmull.p8 q12, d22, d25 + + veor q14, q14, q15 + veor d24, d24, d25 + veor d26, d26, d27 + veor d28, d28, d29 + vmov.i32 d25, #0 + vmov.i32 d29, #0 + vext.8 q12, q12, q12, #14 + vext.8 q14, q14, q14, #15 + veor d24, d24, d26 + bx lr +ENDPROC(__pmull16x64_p8) + + .macro pmull16x64_p64, v16, v64 + vmull.p64 q11, \v64\()l, \v16\()_L + vmull.p64 \v64, \v64\()h, \v16\()_H + veor \v64, \v64, q11 + .endm + + // Fold reg1, reg2 into the next 32 data bytes, storing the result back + // into reg1, reg2. 
+ .macro fold_32_bytes, reg1, reg2, p + vld1.64 {q8-q9}, [buf]! + + pmull16x64_\p FOLD_CONST, \reg1 + pmull16x64_\p FOLD_CONST, \reg2 + +CPU_LE( vrev64.8 q8, q8 ) +CPU_LE( vrev64.8 q9, q9 ) + vswp q8l, q8h + vswp q9l, q9h + + veor.8 \reg1, \reg1, q8 + veor.8 \reg2, \reg2, q9 + .endm + + // Fold src_reg into dst_reg, optionally loading the next fold constants + .macro fold_16_bytes, src_reg, dst_reg, p, load_next_consts + pmull16x64_\p FOLD_CONST, \src_reg + .ifnb \load_next_consts + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + .endif + veor.8 \dst_reg, \dst_reg, \src_reg + .endm + + .macro crct10dif, p + // For sizes less than 256 bytes, we can't fold 128 bytes at a time. + cmp len, #256 + blt .Lless_than_256_bytes\@ + + mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts + + // Load the first 128 data bytes. Byte swapping is necessary to make + // the bit order match the polynomial coefficient order. + vld1.64 {q0-q1}, [buf]! + vld1.64 {q2-q3}, [buf]! + vld1.64 {q4-q5}, [buf]! + vld1.64 {q6-q7}, [buf]! +CPU_LE( vrev64.8 q0, q0 ) +CPU_LE( vrev64.8 q1, q1 ) +CPU_LE( vrev64.8 q2, q2 ) +CPU_LE( vrev64.8 q3, q3 ) +CPU_LE( vrev64.8 q4, q4 ) +CPU_LE( vrev64.8 q5, q5 ) +CPU_LE( vrev64.8 q6, q6 ) +CPU_LE( vrev64.8 q7, q7 ) + vswp q0l, q0h + vswp q1l, q1h + vswp q2l, q2h + vswp q3l, q3h + vswp q4l, q4h + vswp q5l, q5h + vswp q6l, q6h + vswp q7l, q7h + + // XOR the first 16 data *bits* with the initial CRC value. + vmov.i8 q8h, #0 + vmov.u16 q8h[3], init_crc + veor q0h, q0h, q8h + + // Load the constants for folding across 128 bytes. + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + + // Subtract 128 for the 128 data bytes just consumed. Subtract another + // 128 to simplify the termination condition of the following loop. + sub len, len, #256 + + // While >= 128 data bytes remain (not counting q0-q7), fold the 128 + // bytes q0-q7 into them, storing the result back into q0-q7. +.Lfold_128_bytes_loop\@: + fold_32_bytes q0, q1, \p + fold_32_bytes q2, q3, \p + fold_32_bytes q4, q5, \p + fold_32_bytes q6, q7, \p + subs len, len, #128 + bge .Lfold_128_bytes_loop\@ + + // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7. + + // Fold across 64 bytes. + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + fold_16_bytes q0, q4, \p + fold_16_bytes q1, q5, \p + fold_16_bytes q2, q6, \p + fold_16_bytes q3, q7, \p, 1 + // Fold across 32 bytes. + fold_16_bytes q4, q6, \p + fold_16_bytes q5, q7, \p, 1 + // Fold across 16 bytes. + fold_16_bytes q6, q7, \p + + // Add 128 to get the correct number of data bytes remaining in 0...127 + // (not counting q7), following the previous extra subtraction by 128. + // Then subtract 16 to simplify the termination condition of the + // following loop. + adds len, len, #(128-16) + + // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7 + // into them, storing the result back into q7. + blt .Lfold_16_bytes_loop_done\@ +.Lfold_16_bytes_loop\@: + pmull16x64_\p FOLD_CONST, q7 + vld1.64 {q0}, [buf]! +CPU_LE( vrev64.8 q0, q0 ) + vswp q0l, q0h + veor.8 q7, q7, q0 + subs len, len, #16 + bge .Lfold_16_bytes_loop\@ + +.Lfold_16_bytes_loop_done\@: + // Add 16 to get the correct number of data bytes remaining in 0...15 + // (not counting q7), following the previous extra subtraction by 16. + adds len, len, #16 + beq .Lreduce_final_16_bytes\@ + +.Lhandle_partial_segment\@: + // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first + // 16 bytes are in q7 and the rest are the remaining data in 'buf'. 
To + // do this without needing a fold constant for each possible 'len', + // redivide the bytes into a first chunk of 'len' bytes and a second + // chunk of 16 bytes, then fold the first chunk into the second. + + // q0 = last 16 original data bytes + add buf, buf, len + sub buf, buf, #16 + vld1.64 {q0}, [buf] +CPU_LE( vrev64.8 q0, q0 ) + vswp q0l, q0h + + // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes. + mov_l r1, .Lbyteshift_table + 16 + sub r1, r1, len + vld1.8 {q2}, [r1] + vtbl.8 q1l, {q7l-q7h}, q2l + vtbl.8 q1h, {q7l-q7h}, q2h + + // q3 = first chunk: q7 right-shifted by '16-len' bytes. + vmov.i8 q3, #0x80 + veor.8 q2, q2, q3 + vtbl.8 q3l, {q7l-q7h}, q2l + vtbl.8 q3h, {q7l-q7h}, q2h + + // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes. + vshr.s8 q2, q2, #7 + + // q2 = second chunk: 'len' bytes from q0 (low-order bytes), + // then '16-len' bytes from q1 (high-order bytes). + vbsl.8 q2, q1, q0 + + // Fold the first chunk into the second chunk, storing the result in q7. + pmull16x64_\p FOLD_CONST, q3 + veor.8 q7, q3, q2 + b .Lreduce_final_16_bytes\@ + +.Lless_than_256_bytes\@: + // Checksumming a buffer of length 16...255 bytes + + mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts + + // Load the first 16 data bytes. + vld1.64 {q7}, [buf]! +CPU_LE( vrev64.8 q7, q7 ) + vswp q7l, q7h + + // XOR the first 16 data *bits* with the initial CRC value. + vmov.i8 q0h, #0 + vmov.u16 q0h[3], init_crc + veor.8 q7h, q7h, q0h + + // Load the fold-across-16-bytes constants. + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + + cmp len, #16 + beq .Lreduce_final_16_bytes\@ // len == 16 + subs len, len, #32 + addlt len, len, #16 + blt .Lhandle_partial_segment\@ // 17 <= len <= 31 + b .Lfold_16_bytes_loop\@ // 32 <= len <= 255 + +.Lreduce_final_16_bytes\@: + .endm + +// +// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len); +// +// Assumes len >= 16. +// +ENTRY(crc_t10dif_pmull64) + crct10dif p64 + + // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC. + + // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + + // Fold the high 64 bits into the low 64 bits, while also multiplying by + // x^64. This produces a 128-bit value congruent to x^64 * M(x) and + // whose low 48 bits are 0. + vmull.p64 q0, q7h, FOLD_CONST_H // high bits * x^48 * (x^80 mod G(x)) + veor.8 q0h, q0h, q7l // + low bits * x^64 + + // Fold the high 32 bits into the low 96 bits. This produces a 96-bit + // value congruent to x^64 * M(x) and whose low 48 bits are 0. + vmov.i8 q1, #0 + vmov s4, s3 // extract high 32 bits + vmov s3, s5 // zero high 32 bits + vmull.p64 q1, q1l, FOLD_CONST_L // high 32 bits * x^48 * (x^48 mod G(x)) + veor.8 q0, q0, q1 // + low bits + + // Load G(x) and floor(x^48 / G(x)). + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128] + + // Use Barrett reduction to compute the final CRC value. + vmull.p64 q1, q0h, FOLD_CONST_H // high 32 bits * floor(x^48 / G(x)) + vshr.u64 q1l, q1l, #32 // /= x^32 + vmull.p64 q1, q1l, FOLD_CONST_L // *= G(x) + vshr.u64 q0l, q0l, #48 + veor.8 q0l, q0l, q1l // + low 16 nonzero bits + // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0. 
+ + vmov.u16 r0, q0l[0] + bx lr +ENDPROC(crc_t10dif_pmull64) + +ENTRY(crc_t10dif_pmull8) + push {r4, lr} + mov_l r4, .L16x64perm + + crct10dif p8 + +CPU_LE( vrev64.8 q7, q7 ) + vswp q7l, q7h + vst1.64 {q7}, [r3, :128] + pop {r4, pc} +ENDPROC(crc_t10dif_pmull8) + + .section ".rodata", "a" + .align 4 + +// Fold constants precomputed from the polynomial 0x18bb7 +// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 +.Lfold_across_128_bytes_consts: + .quad 0x0000000000006123 // x^(8*128) mod G(x) + .quad 0x0000000000002295 // x^(8*128+64) mod G(x) +// .Lfold_across_64_bytes_consts: + .quad 0x0000000000001069 // x^(4*128) mod G(x) + .quad 0x000000000000dd31 // x^(4*128+64) mod G(x) +// .Lfold_across_32_bytes_consts: + .quad 0x000000000000857d // x^(2*128) mod G(x) + .quad 0x0000000000007acc // x^(2*128+64) mod G(x) +.Lfold_across_16_bytes_consts: + .quad 0x000000000000a010 // x^(1*128) mod G(x) + .quad 0x0000000000001faa // x^(1*128+64) mod G(x) +// .Lfinal_fold_consts: + .quad 0x1368000000000000 // x^48 * (x^48 mod G(x)) + .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x)) +// .Lbarrett_reduction_consts: + .quad 0x0000000000018bb7 // G(x) + .quad 0x00000001f65a57f8 // floor(x^48 / G(x)) + +// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - +// len] is the index vector to shift left by 'len' bytes, and is also {0x80, +// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes. +.Lbyteshift_table: + .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 + .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 + +.L16x64perm: + .quad 0x808080800000000, 0x909090901010101 diff --git a/lib/crc/arm/crc-t10dif.h b/lib/crc/arm/crc-t10dif.h new file mode 100644 index 000000000000..2edf7e9681d0 --- /dev/null +++ b/lib/crc/arm/crc-t10dif.h @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <crypto/internal/simd.h> + +#include <asm/neon.h> +#include <asm/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); + +#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U + +asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len); +asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len, + u8 out[16]); + +static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) +{ + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { + if (static_branch_likely(&have_pmull)) { + if (crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc_t10dif_pmull64(crc, data, length); + kernel_neon_end(); + return crc; + } + } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && + static_branch_likely(&have_neon) && + crypto_simd_usable()) { + u8 buf[16] __aligned(16); + + kernel_neon_begin(); + crc_t10dif_pmull8(crc, data, length, buf); + kernel_neon_end(); + + return crc_t10dif_generic(0, buf, sizeof(buf)); + } + } + return crc_t10dif_generic(crc, data, length); +} + +#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch +static inline void crc_t10dif_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) { + static_branch_enable(&have_neon); + if (elf_hwcap2 & HWCAP2_PMULL) + static_branch_enable(&have_pmull); + } +} diff --git a/lib/crc/arm/crc32-core.S b/lib/crc/arm/crc32-core.S new file mode 100644 index 
000000000000..6f674f30c70b --- /dev/null +++ b/lib/crc/arm/crc32-core.S @@ -0,0 +1,306 @@ +/* + * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 + * calculation. + * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) + * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found + * at: + * https://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2B: Instruction Set Reference, N-Z + * + * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com> + * Alexander Boyko <Alexander_Boyko@xyratex.com> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .align 6 + .arch armv8-a + .arch_extension crc + .fpu crypto-neon-fp-armv8 + +.Lcrc32_constants: + /* + * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 + * #define CONSTANT_R1 0x154442bd4LL + * + * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 + * #define CONSTANT_R2 0x1c6e41596LL + */ + .quad 0x0000000154442bd4 + .quad 0x00000001c6e41596 + + /* + * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 + * #define CONSTANT_R3 0x1751997d0LL + * + * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e + * #define CONSTANT_R4 0x0ccaa009eLL + */ + .quad 0x00000001751997d0 + .quad 0x00000000ccaa009e + + /* + * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 + * #define CONSTANT_R5 0x163cd6124LL + */ + .quad 0x0000000163cd6124 + .quad 0x00000000FFFFFFFF + + /* + * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL + * + * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` + * = 0x1F7011641LL + * #define CONSTANT_RU 0x1F7011641LL + */ + .quad 0x00000001DB710641 + .quad 0x00000001F7011641 + +.Lcrc32c_constants: + .quad 0x00000000740eef02 + .quad 0x000000009e4addf8 + .quad 0x00000000f20c0dfe + .quad 0x000000014cd00bd6 + .quad 0x00000000dd45aab8 + .quad 0x00000000FFFFFFFF + .quad 0x0000000105ec76f0 + .quad 0x00000000dea713f1 + + dCONSTANTl .req d0 + dCONSTANTh .req d1 + qCONSTANT .req q0 + + BUF .req r0 + LEN .req r1 + CRC .req r2 + + qzr .req q9 + + /** + * Calculate crc32 + * BUF - buffer + * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63 + * CRC - initial crc32 + * return %eax crc32 + * uint crc32_pmull_le(unsigned char const 
*buffer, + * size_t len, uint crc32) + */ +SYM_FUNC_START(crc32_pmull_le) + adr r3, .Lcrc32_constants + b 0f +SYM_FUNC_END(crc32_pmull_le) + +SYM_FUNC_START(crc32c_pmull_le) + adr r3, .Lcrc32c_constants + +0: bic LEN, LEN, #15 + vld1.8 {q1-q2}, [BUF, :128]! + vld1.8 {q3-q4}, [BUF, :128]! + vmov.i8 qzr, #0 + vmov.i8 qCONSTANT, #0 + vmov.32 dCONSTANTl[0], CRC + veor.8 d2, d2, dCONSTANTl + sub LEN, LEN, #0x40 + cmp LEN, #0x40 + blt less_64 + + vld1.64 {qCONSTANT}, [r3] + +loop_64: /* 64 bytes Full cache line folding */ + sub LEN, LEN, #0x40 + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q6, d5, dCONSTANTh + vmull.p64 q7, d7, dCONSTANTh + vmull.p64 q8, d9, dCONSTANTh + + vmull.p64 q1, d2, dCONSTANTl + vmull.p64 q2, d4, dCONSTANTl + vmull.p64 q3, d6, dCONSTANTl + vmull.p64 q4, d8, dCONSTANTl + + veor.8 q1, q1, q5 + vld1.8 {q5}, [BUF, :128]! + veor.8 q2, q2, q6 + vld1.8 {q6}, [BUF, :128]! + veor.8 q3, q3, q7 + vld1.8 {q7}, [BUF, :128]! + veor.8 q4, q4, q8 + vld1.8 {q8}, [BUF, :128]! + + veor.8 q1, q1, q5 + veor.8 q2, q2, q6 + veor.8 q3, q3, q7 + veor.8 q4, q4, q8 + + cmp LEN, #0x40 + bge loop_64 + +less_64: /* Folding cache line into 128bit */ + vldr dCONSTANTl, [r3, #16] + vldr dCONSTANTh, [r3, #24] + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q2 + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q3 + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q4 + + teq LEN, #0 + beq fold_64 + +loop_16: /* Folding rest buffer into 128bit */ + subs LEN, LEN, #0x10 + + vld1.8 {q2}, [BUF, :128]! + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q2 + + bne loop_16 + +fold_64: + /* perform the last 64 bit fold, also adds 32 zeroes + * to the input stream */ + vmull.p64 q2, d2, dCONSTANTh + vext.8 q1, q1, qzr, #8 + veor.8 q1, q1, q2 + + /* final 32-bit fold */ + vldr dCONSTANTl, [r3, #32] + vldr d6, [r3, #40] + vmov.i8 d7, #0 + + vext.8 q2, q1, qzr, #4 + vand.8 d2, d2, d6 + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q2 + + /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ + vldr dCONSTANTl, [r3, #48] + vldr dCONSTANTh, [r3, #56] + + vand.8 q2, q1, q3 + vext.8 q2, qzr, q2, #8 + vmull.p64 q2, d5, dCONSTANTh + vand.8 q2, q2, q3 + vmull.p64 q2, d4, dCONSTANTl + veor.8 q1, q1, q2 + vmov r0, s5 + + bx lr +SYM_FUNC_END(crc32c_pmull_le) + + .macro __crc32, c + subs ip, r2, #8 + bmi .Ltail\c + + tst r1, #3 + bne .Lunaligned\c + + teq ip, #0 +.Laligned8\c: + ldrd r2, r3, [r1], #8 +ARM_BE8(rev r2, r2 ) +ARM_BE8(rev r3, r3 ) + crc32\c\()w r0, r0, r2 + crc32\c\()w r0, r0, r3 + bxeq lr + subs ip, ip, #8 + bpl .Laligned8\c + +.Ltail\c: + tst ip, #4 + beq 2f + ldr r3, [r1], #4 +ARM_BE8(rev r3, r3 ) + crc32\c\()w r0, r0, r3 + +2: tst ip, #2 + beq 1f + ldrh r3, [r1], #2 +ARM_BE8(rev16 r3, r3 ) + crc32\c\()h r0, r0, r3 + +1: tst ip, #1 + bxeq lr + ldrb r3, [r1] + crc32\c\()b r0, r0, r3 + bx lr + +.Lunaligned\c: + tst r1, #1 + beq 2f + ldrb r3, [r1], #1 + subs r2, r2, #1 + crc32\c\()b r0, r0, r3 + + tst r1, #2 + beq 0f +2: ldrh r3, [r1], #2 + subs r2, r2, #2 +ARM_BE8(rev16 r3, r3 ) + crc32\c\()h r0, r0, r3 + +0: subs ip, r2, #8 + bpl .Laligned8\c + b .Ltail\c + .endm + + .align 5 +SYM_FUNC_START(crc32_armv8_le) + __crc32 +SYM_FUNC_END(crc32_armv8_le) + + .align 5 +SYM_FUNC_START(crc32c_armv8_le) + __crc32 c +SYM_FUNC_END(crc32c_armv8_le) diff --git a/lib/crc/arm/crc32.h b/lib/crc/arm/crc32.h new file mode 
100644 index 000000000000..018007e162a2 --- /dev/null +++ b/lib/crc/arm/crc32.h @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/cpufeature.h> + +#include <crypto/internal/simd.h> + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); + +#define PMULL_MIN_LEN 64 /* min size of buffer for pmull functions */ + +asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc); +asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len); + +asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc); +asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len); + +static inline u32 crc32_le_scalar(u32 crc, const u8 *p, size_t len) +{ + if (static_branch_likely(&have_crc32)) + return crc32_armv8_le(crc, p, len); + return crc32_le_base(crc, p, len); +} + +static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (len >= PMULL_MIN_LEN + 15 && + static_branch_likely(&have_pmull) && crypto_simd_usable()) { + size_t n = -(uintptr_t)p & 15; + + /* align p to 16-byte boundary */ + if (n) { + crc = crc32_le_scalar(crc, p, n); + p += n; + len -= n; + } + n = round_down(len, 16); + kernel_neon_begin(); + crc = crc32_pmull_le(p, n, crc); + kernel_neon_end(); + p += n; + len -= n; + } + return crc32_le_scalar(crc, p, len); +} + +static inline u32 crc32c_scalar(u32 crc, const u8 *p, size_t len) +{ + if (static_branch_likely(&have_crc32)) + return crc32c_armv8_le(crc, p, len); + return crc32c_base(crc, p, len); +} + +static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + if (len >= PMULL_MIN_LEN + 15 && + static_branch_likely(&have_pmull) && crypto_simd_usable()) { + size_t n = -(uintptr_t)p & 15; + + /* align p to 16-byte boundary */ + if (n) { + crc = crc32c_scalar(crc, p, n); + p += n; + len -= n; + } + n = round_down(len, 16); + kernel_neon_begin(); + crc = crc32c_pmull_le(p, n, crc); + kernel_neon_end(); + p += n; + len -= n; + } + return crc32c_scalar(crc, p, len); +} + +#define crc32_be_arch crc32_be_base /* not implemented on this arch */ + +#define crc32_mod_init_arch crc32_mod_init_arch +static inline void crc32_mod_init_arch(void) +{ + if (elf_hwcap2 & HWCAP2_CRC32) + static_branch_enable(&have_crc32); + if (elf_hwcap2 & HWCAP2_PMULL) + static_branch_enable(&have_pmull); +} + +static inline u32 crc32_optimizations_arch(void) +{ + if (elf_hwcap2 & (HWCAP2_CRC32 | HWCAP2_PMULL)) + return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION; + return 0; +} diff --git a/lib/crc/arm64/crc-t10dif-core.S b/lib/crc/arm64/crc-t10dif-core.S new file mode 100644 index 000000000000..87dd6d46224d --- /dev/null +++ b/lib/crc/arm64/crc-t10dif-core.S @@ -0,0 +1,469 @@ +// +// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions +// +// Copyright (C) 2016 Linaro Ltd +// Copyright (C) 2019-2024 Google LLC +// +// Authors: Ard Biesheuvel <ardb@google.com> +// Eric Biggers <ebiggers@google.com> +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License version 2 as +// published by the Free Software Foundation. 
+// + +// Derived from the x86 version: +// +// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +// +// Copyright (c) 2013, Intel Corporation +// +// Authors: +// Erdinc Ozturk <erdinc.ozturk@intel.com> +// Vinodh Gopal <vinodh.gopal@intel.com> +// James Guilford <james.guilford@intel.com> +// Tim Chen <tim.c.chen@linux.intel.com> +// +// This software is available to you under a choice of one of two +// licenses. You may choose to be licensed under the terms of the GNU +// General Public License (GPL) Version 2, available from the file +// COPYING in the main directory of this source tree, or the +// OpenIB.org BSD license below: +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// * Neither the name of the Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// +// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Reference paper titled "Fast CRC Computation for Generic +// Polynomials Using PCLMULQDQ Instruction" +// URL: http://www.intel.com/content/dam/www/public/us/en/documents +// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +// + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a+crypto + + init_crc .req w0 + buf .req x1 + len .req x2 + fold_consts_ptr .req x5 + + fold_consts .req v10 + + t3 .req v17 + t4 .req v18 + t5 .req v19 + t6 .req v20 + t7 .req v21 + t8 .req v22 + + perm .req v27 + + .macro pmull16x64_p64, a16, b64, c64 + pmull2 \c64\().1q, \a16\().2d, \b64\().2d + pmull \b64\().1q, \a16\().1d, \b64\().1d + .endm + + /* + * Pairwise long polynomial multiplication of two 16-bit values + * + * { w0, w1 }, { y0, y1 } + * + * by two 64-bit values + * + * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 } + * + * where each vector element is a byte, ordered from least to most + * significant. + * + * This can be implemented using 8x8 long polynomial multiplication, by + * reorganizing the input so that each pairwise 8x8 multiplication + * produces one of the terms from the decomposition below, and + * combining the results of each rank and shifting them into place. 
+ * + * Rank + * 0 w0*x0 ^ | y0*z0 ^ + * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^ + * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^ + * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^ + * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^ + * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^ + * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^ + * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^ + * 8 w1*x7 << 64 | y1*z7 << 64 + * + * The inputs can be reorganized into + * + * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 } + * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 } + * + * and after performing 8x8->16 bit long polynomial multiplication of + * each of the halves of the first vector with those of the second one, + * we obtain the following four vectors of 16-bit elements: + * + * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 } + * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 } + * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 } + * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 } + * + * Results b and c can be XORed together, as the vector elements have + * matching ranks. Then, the final XOR (*) can be pulled forward, and + * applied between the halves of each of the remaining three vectors, + * which are then shifted into place, and combined to produce two + * 80-bit results. + * + * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent + * to the 64x64 bit one above, but XOR'ing the outputs together will + * produce the expected result, and this is sufficient in the context of + * this algorithm. + */ + .macro pmull16x64_p8, a16, b64, c64 + ext t7.16b, \b64\().16b, \b64\().16b, #1 + tbl t5.16b, {\a16\().16b}, perm.16b + uzp1 t7.16b, \b64\().16b, t7.16b + bl __pmull_p8_16x64 + ext \b64\().16b, t4.16b, t4.16b, #15 + eor \c64\().16b, t8.16b, t5.16b + .endm + +SYM_FUNC_START_LOCAL(__pmull_p8_16x64) + ext t6.16b, t5.16b, t5.16b, #8 + + pmull t3.8h, t7.8b, t5.8b + pmull t4.8h, t7.8b, t6.8b + pmull2 t5.8h, t7.16b, t5.16b + pmull2 t6.8h, t7.16b, t6.16b + + ext t8.16b, t3.16b, t3.16b, #8 + eor t4.16b, t4.16b, t6.16b + ext t7.16b, t5.16b, t5.16b, #8 + ext t6.16b, t4.16b, t4.16b, #8 + eor t8.8b, t8.8b, t3.8b + eor t5.8b, t5.8b, t7.8b + eor t4.8b, t4.8b, t6.8b + ext t5.16b, t5.16b, t5.16b, #14 + ret +SYM_FUNC_END(__pmull_p8_16x64) + + + // Fold reg1, reg2 into the next 32 data bytes, storing the result back + // into reg1, reg2. + .macro fold_32_bytes, p, reg1, reg2 + ldp q11, q12, [buf], #0x20 + + pmull16x64_\p fold_consts, \reg1, v8 + +CPU_LE( rev64 v11.16b, v11.16b ) +CPU_LE( rev64 v12.16b, v12.16b ) + + pmull16x64_\p fold_consts, \reg2, v9 + +CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) +CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) + + eor \reg1\().16b, \reg1\().16b, v8.16b + eor \reg2\().16b, \reg2\().16b, v9.16b + eor \reg1\().16b, \reg1\().16b, v11.16b + eor \reg2\().16b, \reg2\().16b, v12.16b + .endm + + // Fold src_reg into dst_reg, optionally loading the next fold constants + .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts + pmull16x64_\p fold_consts, \src_reg, v8 + .ifnb \load_next_consts + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + .endif + eor \dst_reg\().16b, \dst_reg\().16b, v8.16b + eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b + .endm + + .macro crc_t10dif_pmull, p + + // For sizes less than 256 bytes, we can't fold 128 bytes at a time. 
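Aside: everything the folding machinery above rests on carry-less (polynomial) multiplication, which is what pmull and pmull2 compute: long multiplication with every addition replaced by XOR, i.e. multiplication of polynomials over GF(2). A short C model of a 64x64 to 128-bit carry-less multiply, returned as high/low halves:

#include <stdint.h>

struct u128 { uint64_t hi, lo; };

/* Carry-less multiply: for each set bit i of b, XOR in (a << i). */
static struct u128 clmul64(uint64_t a, uint64_t b)
{
	struct u128 r = { 0, 0 };

	for (int i = 0; i < 64; i++) {
		if (b & ((uint64_t)1 << i)) {
			r.lo ^= a << i;
			if (i)
				r.hi ^= a >> (64 - i);
		}
	}
	return r;
}

Because there are no carries, the product distributes over XOR; that is why the 8x8 partial products in pmull16x64_p8 can be recombined rank by rank, and why the fold macros may XOR freshly loaded data directly into the multiplied accumulators.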
+ cmp len, #256 + b.lt .Lless_than_256_bytes_\@ + + adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts + + // Load the first 128 data bytes. Byte swapping is necessary to make + // the bit order match the polynomial coefficient order. + ldp q0, q1, [buf] + ldp q2, q3, [buf, #0x20] + ldp q4, q5, [buf, #0x40] + ldp q6, q7, [buf, #0x60] + add buf, buf, #0x80 +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( rev64 v1.16b, v1.16b ) +CPU_LE( rev64 v2.16b, v2.16b ) +CPU_LE( rev64 v3.16b, v3.16b ) +CPU_LE( rev64 v4.16b, v4.16b ) +CPU_LE( rev64 v5.16b, v5.16b ) +CPU_LE( rev64 v6.16b, v6.16b ) +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) +CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) +CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 ) +CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 ) +CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 ) +CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 ) +CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + + // XOR the first 16 data *bits* with the initial CRC value. + movi v8.16b, #0 + mov v8.h[7], init_crc + eor v0.16b, v0.16b, v8.16b + + // Load the constants for folding across 128 bytes. + ld1 {fold_consts.2d}, [fold_consts_ptr] + + // Subtract 128 for the 128 data bytes just consumed. Subtract another + // 128 to simplify the termination condition of the following loop. + sub len, len, #256 + + // While >= 128 data bytes remain (not counting v0-v7), fold the 128 + // bytes v0-v7 into them, storing the result back into v0-v7. +.Lfold_128_bytes_loop_\@: + fold_32_bytes \p, v0, v1 + fold_32_bytes \p, v2, v3 + fold_32_bytes \p, v4, v5 + fold_32_bytes \p, v6, v7 + + subs len, len, #128 + b.ge .Lfold_128_bytes_loop_\@ + + // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7. + + // Fold across 64 bytes. + add fold_consts_ptr, fold_consts_ptr, #16 + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + fold_16_bytes \p, v0, v4 + fold_16_bytes \p, v1, v5 + fold_16_bytes \p, v2, v6 + fold_16_bytes \p, v3, v7, 1 + // Fold across 32 bytes. + fold_16_bytes \p, v4, v6 + fold_16_bytes \p, v5, v7, 1 + // Fold across 16 bytes. + fold_16_bytes \p, v6, v7 + + // Add 128 to get the correct number of data bytes remaining in 0...127 + // (not counting v7), following the previous extra subtraction by 128. + // Then subtract 16 to simplify the termination condition of the + // following loop. + adds len, len, #(128-16) + + // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7 + // into them, storing the result back into v7. + b.lt .Lfold_16_bytes_loop_done_\@ +.Lfold_16_bytes_loop_\@: + pmull16x64_\p fold_consts, v7, v8 + eor v7.16b, v7.16b, v8.16b + ldr q0, [buf], #16 +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + eor v7.16b, v7.16b, v0.16b + subs len, len, #16 + b.ge .Lfold_16_bytes_loop_\@ + +.Lfold_16_bytes_loop_done_\@: + // Add 16 to get the correct number of data bytes remaining in 0...15 + // (not counting v7), following the previous extra subtraction by 16. + adds len, len, #16 + b.eq .Lreduce_final_16_bytes_\@ + +.Lhandle_partial_segment_\@: + // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first + // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To + // do this without needing a fold constant for each possible 'len', + // redivide the bytes into a first chunk of 'len' bytes and a second + // chunk of 16 bytes, then fold the first chunk into the second. 
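Aside: whichever path is taken, the value being computed is the plain CRC-16 defined by G(x) = 0x18BB7 (see the constants at the end of this file). A bit-at-a-time reference in C is useful for cross-checking the vector code on small inputs; crc_t10dif_ref(0, buf, len) should agree with the library's crc_t10dif_update(0, buf, len):

#include <stddef.h>
#include <stdint.h>

/* Bit-at-a-time CRC-T10DIF: polynomial 0x18BB7, init 0, no reflection. */
static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= (uint16_t)(*p++) << 8;
		for (int i = 0; i < 8; i++)
			crc = (crc & 0x8000) ? (crc << 1) ^ 0x8BB7
					     : (crc << 1);
	}
	return crc;
}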
+ + // v0 = last 16 original data bytes + add buf, buf, len + ldr q0, [buf, #-16] +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + + // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes. + adr_l x4, .Lbyteshift_table + 16 + sub x4, x4, len + ld1 {v2.16b}, [x4] + tbl v1.16b, {v7.16b}, v2.16b + + // v3 = first chunk: v7 right-shifted by '16-len' bytes. + movi v3.16b, #0x80 + eor v2.16b, v2.16b, v3.16b + tbl v3.16b, {v7.16b}, v2.16b + + // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes. + sshr v2.16b, v2.16b, #7 + + // v2 = second chunk: 'len' bytes from v0 (low-order bytes), + // then '16-len' bytes from v1 (high-order bytes). + bsl v2.16b, v1.16b, v0.16b + + // Fold the first chunk into the second chunk, storing the result in v7. + pmull16x64_\p fold_consts, v3, v0 + eor v7.16b, v3.16b, v0.16b + eor v7.16b, v7.16b, v2.16b + b .Lreduce_final_16_bytes_\@ + +.Lless_than_256_bytes_\@: + // Checksumming a buffer of length 16...255 bytes + + adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts + + // Load the first 16 data bytes. + ldr q7, [buf], #0x10 +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + + // XOR the first 16 data *bits* with the initial CRC value. + movi v0.16b, #0 + mov v0.h[7], init_crc + eor v7.16b, v7.16b, v0.16b + + // Load the fold-across-16-bytes constants. + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + + cmp len, #16 + b.eq .Lreduce_final_16_bytes_\@ // len == 16 + subs len, len, #32 + b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255 + add len, len, #16 + b .Lhandle_partial_segment_\@ // 17 <= len <= 31 + +.Lreduce_final_16_bytes_\@: + .endm + +// +// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len); +// +// Assumes len >= 16. +// +SYM_FUNC_START(crc_t10dif_pmull_p8) + frame_push 1 + + // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 } + movi perm.4h, #8, lsl #8 + orr perm.2s, #1, lsl #16 + orr perm.2s, #1, lsl #24 + zip1 perm.16b, perm.16b, perm.16b + zip1 perm.16b, perm.16b, perm.16b + + crc_t10dif_pmull p8 + +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + str q7, [x3] + + frame_pop + ret +SYM_FUNC_END(crc_t10dif_pmull_p8) + + .align 5 +// +// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); +// +// Assumes len >= 16. +// +SYM_FUNC_START(crc_t10dif_pmull_p64) + crc_t10dif_pmull p64 + + // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC. + + movi v2.16b, #0 // init zero register + + // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + + // Fold the high 64 bits into the low 64 bits, while also multiplying by + // x^64. This produces a 128-bit value congruent to x^64 * M(x) and + // whose low 48 bits are 0. + ext v0.16b, v2.16b, v7.16b, #8 + pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x)) + eor v0.16b, v0.16b, v7.16b // + low bits * x^64 + + // Fold the high 32 bits into the low 96 bits. This produces a 96-bit + // value congruent to x^64 * M(x) and whose low 48 bits are 0. + ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits + mov v0.s[3], v2.s[0] // zero high 32 bits + pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x)) + eor v0.16b, v0.16b, v1.16b // + low bits + + // Load G(x) and floor(x^48 / G(x)). + ld1 {fold_consts.2d}, [fold_consts_ptr] + + // Use Barrett reduction to compute the final CRC value. 
+ pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x)) + ushr v1.2d, v1.2d, #32 // /= x^32 + pmull v1.1q, v1.1d, fold_consts.1d // *= G(x) + ushr v0.2d, v0.2d, #48 + eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits + // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. + + umov w0, v0.h[0] + ret +SYM_FUNC_END(crc_t10dif_pmull_p64) + + .section ".rodata", "a" + .align 4 + +// Fold constants precomputed from the polynomial 0x18bb7 +// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 +.Lfold_across_128_bytes_consts: + .quad 0x0000000000006123 // x^(8*128) mod G(x) + .quad 0x0000000000002295 // x^(8*128+64) mod G(x) +// .Lfold_across_64_bytes_consts: + .quad 0x0000000000001069 // x^(4*128) mod G(x) + .quad 0x000000000000dd31 // x^(4*128+64) mod G(x) +// .Lfold_across_32_bytes_consts: + .quad 0x000000000000857d // x^(2*128) mod G(x) + .quad 0x0000000000007acc // x^(2*128+64) mod G(x) +.Lfold_across_16_bytes_consts: + .quad 0x000000000000a010 // x^(1*128) mod G(x) + .quad 0x0000000000001faa // x^(1*128+64) mod G(x) +// .Lfinal_fold_consts: + .quad 0x1368000000000000 // x^48 * (x^48 mod G(x)) + .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x)) +// .Lbarrett_reduction_consts: + .quad 0x0000000000018bb7 // G(x) + .quad 0x00000001f65a57f8 // floor(x^48 / G(x)) + +// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - +// len] is the index vector to shift left by 'len' bytes, and is also {0x80, +// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes. +.Lbyteshift_table: + .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 + .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 diff --git a/lib/crc/arm64/crc-t10dif.h b/lib/crc/arm64/crc-t10dif.h new file mode 100644 index 000000000000..c4521a7f1ee9 --- /dev/null +++ b/lib/crc/arm64/crc-t10dif.h @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/cpufeature.h> + +#include <crypto/internal/simd.h> + +#include <asm/neon.h> +#include <asm/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); + +#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U + +asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len, + u8 out[16]); +asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); + +static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) +{ + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { + if (static_branch_likely(&have_pmull)) { + if (crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc_t10dif_pmull_p64(crc, data, length); + kernel_neon_end(); + return crc; + } + } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && + static_branch_likely(&have_asimd) && + crypto_simd_usable()) { + u8 buf[16]; + + kernel_neon_begin(); + crc_t10dif_pmull_p8(crc, data, length, buf); + kernel_neon_end(); + + return crc_t10dif_generic(0, buf, sizeof(buf)); + } + } + return crc_t10dif_generic(crc, data, length); +} + +#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch +static inline void crc_t10dif_mod_init_arch(void) +{ + if (cpu_have_named_feature(ASIMD)) { + static_branch_enable(&have_asimd); + if (cpu_have_named_feature(PMULL)) + static_branch_enable(&have_pmull); + } 
+} diff --git a/lib/crc/arm64/crc32-core.S b/lib/crc/arm64/crc32-core.S new file mode 100644 index 000000000000..68825317460f --- /dev/null +++ b/lib/crc/arm64/crc32-core.S @@ -0,0 +1,362 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions + * + * Copyright (C) 2016 - 2018 Linaro Ltd. + * Copyright (C) 2024 Google LLC + * + * Author: Ard Biesheuvel <ardb@kernel.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .cpu generic+crc+crypto + + .macro bitle, reg + .endm + + .macro bitbe, reg + rbit \reg, \reg + .endm + + .macro bytele, reg + .endm + + .macro bytebe, reg + rbit \reg, \reg + lsr \reg, \reg, #24 + .endm + + .macro hwordle, reg +CPU_BE( rev16 \reg, \reg ) + .endm + + .macro hwordbe, reg +CPU_LE( rev \reg, \reg ) + rbit \reg, \reg +CPU_BE( lsr \reg, \reg, #16 ) + .endm + + .macro le, regs:vararg + .irp r, \regs +CPU_BE( rev \r, \r ) + .endr + .endm + + .macro be, regs:vararg + .irp r, \regs +CPU_LE( rev \r, \r ) + .endr + .irp r, \regs + rbit \r, \r + .endr + .endm + + .macro __crc32, c, order=le + bit\order w0 + cmp x2, #16 + b.lt 8f // less than 16 bytes + + and x7, x2, #0x1f + and x2, x2, #~0x1f + cbz x7, 32f // multiple of 32 bytes + + and x8, x7, #0xf + ldp x3, x4, [x1] + add x8, x8, x1 + add x1, x1, x7 + ldp x5, x6, [x8] + \order x3, x4, x5, x6 + + tst x7, #8 + crc32\c\()x w8, w0, x3 + csel x3, x3, x4, eq + csel w0, w0, w8, eq + tst x7, #4 + lsr x4, x3, #32 + crc32\c\()w w8, w0, w3 + csel x3, x3, x4, eq + csel w0, w0, w8, eq + tst x7, #2 + lsr w4, w3, #16 + crc32\c\()h w8, w0, w3 + csel w3, w3, w4, eq + csel w0, w0, w8, eq + tst x7, #1 + crc32\c\()b w8, w0, w3 + csel w0, w0, w8, eq + tst x7, #16 + crc32\c\()x w8, w0, x5 + crc32\c\()x w8, w8, x6 + csel w0, w0, w8, eq + cbz x2, 0f + +32: ldp x3, x4, [x1], #32 + sub x2, x2, #32 + ldp x5, x6, [x1, #-16] + \order x3, x4, x5, x6 + crc32\c\()x w0, w0, x3 + crc32\c\()x w0, w0, x4 + crc32\c\()x w0, w0, x5 + crc32\c\()x w0, w0, x6 + cbnz x2, 32b +0: bit\order w0 + ret + +8: tbz x2, #3, 4f + ldr x3, [x1], #8 + \order x3 + crc32\c\()x w0, w0, x3 +4: tbz x2, #2, 2f + ldr w3, [x1], #4 + \order w3 + crc32\c\()w w0, w0, w3 +2: tbz x2, #1, 1f + ldrh w3, [x1], #2 + hword\order w3 + crc32\c\()h w0, w0, w3 +1: tbz x2, #0, 0f + ldrb w3, [x1] + byte\order w3 + crc32\c\()b w0, w0, w3 +0: bit\order w0 + ret + .endm + + .align 5 +SYM_FUNC_START(crc32_le_arm64) + __crc32 +SYM_FUNC_END(crc32_le_arm64) + + .align 5 +SYM_FUNC_START(crc32c_le_arm64) + __crc32 c +SYM_FUNC_END(crc32c_le_arm64) + + .align 5 +SYM_FUNC_START(crc32_be_arm64) + __crc32 order=be +SYM_FUNC_END(crc32_be_arm64) + + in .req x1 + len .req x2 + + /* + * w0: input CRC at entry, output CRC at exit + * x1: pointer to input buffer + * x2: length of input in bytes + */ + .macro crc4way, insn, table, order=le + bit\order w0 + lsr len, len, #6 // len := # of 64-byte blocks + + /* Process up to 64 blocks of 64 bytes at a time */ +.La\@: mov x3, #64 + cmp len, #64 + csel x3, x3, len, hi // x3 := min(len, 64) + sub len, len, x3 + + /* Divide the input into 4 contiguous blocks */ + add x4, x3, x3, lsl #1 // x4 := 3 * x3 + add x7, in, x3, lsl #4 // x7 := in + 16 * x3 + add x8, in, x3, lsl #5 // x8 := in + 32 * x3 + add x9, in, x4, lsl #4 // x9 := in + 16 * x4 + + /* Load the folding coefficients from the lookup table */ + adr_l x5, \table - 12 // entry 0 omitted + add x5, x5, x4, lsl #2 // x5 += 12 * x3 + ldp s0, s1, [x5] + ldr s2, [x5, #8] + + /* Zero init partial CRCs for this iteration */ + mov w4, wzr + mov 
w5, wzr + mov w6, wzr + mov x17, xzr + +.Lb\@: sub x3, x3, #1 + \insn w6, w6, x17 + ldp x10, x11, [in], #16 + ldp x12, x13, [x7], #16 + ldp x14, x15, [x8], #16 + ldp x16, x17, [x9], #16 + + \order x10, x11, x12, x13, x14, x15, x16, x17 + + /* Apply the CRC transform to 4 16-byte blocks in parallel */ + \insn w0, w0, x10 + \insn w4, w4, x12 + \insn w5, w5, x14 + \insn w6, w6, x16 + \insn w0, w0, x11 + \insn w4, w4, x13 + \insn w5, w5, x15 + cbnz x3, .Lb\@ + + /* Combine the 4 partial results into w0 */ + mov v3.d[0], x0 + mov v4.d[0], x4 + mov v5.d[0], x5 + pmull v0.1q, v0.1d, v3.1d + pmull v1.1q, v1.1d, v4.1d + pmull v2.1q, v2.1d, v5.1d + eor v0.8b, v0.8b, v1.8b + eor v0.8b, v0.8b, v2.8b + mov x5, v0.d[0] + eor x5, x5, x17 + \insn w0, w6, x5 + + mov in, x9 + cbnz len, .La\@ + + bit\order w0 + ret + .endm + + .align 5 +SYM_FUNC_START(crc32c_le_arm64_4way) + crc4way crc32cx, .L0 +SYM_FUNC_END(crc32c_le_arm64_4way) + + .align 5 +SYM_FUNC_START(crc32_le_arm64_4way) + crc4way crc32x, .L1 +SYM_FUNC_END(crc32_le_arm64_4way) + + .align 5 +SYM_FUNC_START(crc32_be_arm64_4way) + crc4way crc32x, .L1, be +SYM_FUNC_END(crc32_be_arm64_4way) + + .section .rodata, "a", %progbits + .align 6 +.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27 + .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e + .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b + .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8 + .long 0x299847d5, 0x878a92a7, 0x39d3b296 + .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53 + .long 0xa60ce07b, 0x83348832, 0x47db8317 + .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092 + .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0 + .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7 + .long 0xf285651c, 0xce7f39f4, 0xdaece73e + .long 0x271d9844, 0xd270f1a2, 0xab7aff2a + .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385 + .long 0xcec3662e, 0x1b03397f, 0x83348832 + .long 0x8227bb8a, 0xb3e32c28, 0x299847d5 + .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86 + .long 0xf6076544, 0x10746f3c, 0x18b33a4e + .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b + .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7 + .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b + .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b + .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4 + .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56 + .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2 + .long 0xa90fd27a, 0x0167d312, 0xc619809d + .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d + .long 0x4597456a, 0x98d8d9cb, 0x65863b64 + .long 0xc9c8b782, 0x68bce87a, 0x1b03397f + .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd + .long 0x2342001e, 0x3771e98f, 0xb3e32c28 + .long 0xe8b6368b, 0x2178513a, 0x064f7f26 + .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c + .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c + .long 0x02ee03b2, 0xff0dba97, 0x10746f3c + .long 0x135c83fd, 0xf872e54c, 0xc7a68855 + .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844 + .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c + .long 0xded288f8, 0xb3af077a, 0x93a5f730 + .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c + .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2 + .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203 + .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e + .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb + .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a + .long 0x8e1450f7, 0x2342001e, 0x8227bb8a + .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768 + .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35 + .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c + .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b + .long 0xd6c3a807, 0x2664fd8b, 0x0167d312 + .long 0x1d31175f, 0x02ee03b2, 0xf6076544 + .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a + .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf + .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb + .long 0x2e7d11a7, 0x17f27698, 
0x49c3cc9c + .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a + .long 0x8a074012, 0xded288f8, 0x57a3d037 + .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b + .long 0x3be3c09b, 0x6353c1cc, 0x42d98888 + .long 0x465a4eee, 0xf48642e9, 0x3771e98f + .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9 + .long 0xa52f58ec, 0x9a5ede41, 0x2178513a + .long 0x47972100, 0x45cddf4e, 0xe0ac139e + .long 0x359674f7, 0xa51b6135, 0x170076fa + +.L1: .long 0xaf449247, 0x81256527, 0xccaa009e + .long 0x57c54819, 0x1d9513d7, 0x81256527 + .long 0x3f41287a, 0x57c54819, 0xaf449247 + .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7 + .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394 + .long 0x71d54a59, 0xf5e48c85, 0x57c54819 + .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed + .long 0xd31343ea, 0xe95c1271, 0x910eeec1 + .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a + .long 0x9ee62949, 0xcec97417, 0x9026d5b1 + .long 0xa55d1514, 0xf183c71b, 0xd1df2327 + .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85 + .long 0x9d842b80, 0xeea395c4, 0x3c656ced + .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd + .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd + .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271 + .long 0xef82aa68, 0xdb3935ea, 0xb918a347 + .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59 + .long 0x99cce860, 0x356d209f, 0xff6f2fc2 + .long 0xd8af8e46, 0xc352f6de, 0xcec97417 + .long 0xf1996890, 0xd8110ff1, 0x1c63267b + .long 0x631bc508, 0xe95c7216, 0xf183c71b + .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0 + .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea + .long 0x7a92fffb, 0xf7003835, 0x4470ac44 + .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4 + .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee + .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40 + .long 0x60290934, 0x81b6f443, 0x6d40f445 + .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949 + .long 0xdcf5088a, 0x9dbdc100, 0x145575d5 + .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d + .long 0x255b139e, 0x631bc508, 0xa55d1514 + .long 0xd784eaa8, 0xce26786c, 0xdb3935ea + .long 0x6d2c864a, 0x8068c345, 0x2586d334 + .long 0x02072e24, 0xdb3839f3, 0x21aa2b26 + .long 0x06689b0a, 0x5efd72f5, 0xe0575528 + .long 0x1e52f5ea, 0x4117915b, 0x356d209f + .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80 + .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de + .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c + .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1 + .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0 + .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216 + .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356 + .long 0x0c540e7b, 0x753c81ff, 0x8e031a19 + .long 0x9924c781, 0xb9220208, 0x3edcde65 + .long 0x3954de39, 0x1753ab84, 0x1d6708a0 + .long 0xf32238b5, 0xbec81497, 0x9e70b943 + .long 0xbbd2cd2c, 0x0925d861, 0xf7003835 + .long 0xcc401304, 0xd784eaa8, 0xef82aa68 + .long 0x4987e684, 0x6044fbb0, 0x00eba0c8 + .long 0x3aa11427, 0x18fe3b4a, 0x87441142 + .long 0x297aad60, 0x02072e24, 0xd14bcc9b + .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a + .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8 + .long 0x25b8822a, 0x1e52f5ea, 0x99cce860 + .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443 + .long 0x5690aa32, 0xa91fdefb, 0x688a110e + .long 0x1357a093, 0x3796455c, 0xd8af8e46 + .long 0x798fdd33, 0xaaa18a37, 0x357b9517 + .long 0xc2815395, 0x54d42691, 0x9dbdc100 + .long 0x21cfc0f7, 0x28ae0976, 0xf1996890 + .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6 diff --git a/lib/crc/arm64/crc32.h b/lib/crc/arm64/crc32.h new file mode 100644 index 000000000000..6e5dec45f05d --- /dev/null +++ b/lib/crc/arm64/crc32.h @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <asm/alternative.h> +#include <asm/cpufeature.h> +#include <asm/neon.h> +#include <asm/simd.h> + +#include <crypto/internal/simd.h> + +// The minimum input length to 
consider the 4-way interleaved code path +static const size_t min_len = 1024; + +asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len); + +asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len); + +static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32_le_base(crc, p, len); + + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32_le_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + + return crc32_le_arm64(crc, p, len); +} + +static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32c_base(crc, p, len); + + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32c_le_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + + return crc32c_le_arm64(crc, p, len); +} + +static inline u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32_be_base(crc, p, len); + + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32_be_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + + return crc32_be_arm64(crc, p, len); +} + +static inline u32 crc32_optimizations_arch(void) +{ + if (alternative_has_cap_likely(ARM64_HAS_CRC32)) + return CRC32_LE_OPTIMIZATION | + CRC32_BE_OPTIMIZATION | + CRC32C_OPTIMIZATION; + return 0; +} diff --git a/lib/crc-ccitt.c b/lib/crc/crc-ccitt.c index 9cddf35d3b66..f8692c3de101 100644 --- a/lib/crc-ccitt.c +++ b/lib/crc/crc-ccitt.c @@ -1,11 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only -/* - * linux/lib/crc-ccitt.c - */ -#include <linux/types.h> -#include <linux/module.h> #include <linux/crc-ccitt.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/types.h> /* * This mysterious table is just the CRC of each possible byte. It can be diff --git a/lib/crc-itu-t.c b/lib/crc/crc-itu-t.c index 1d26a1647da5..6e413a290f54 100644 --- a/lib/crc-itu-t.c +++ b/lib/crc/crc-itu-t.c @@ -3,9 +3,10 @@ * crc-itu-t.c */ -#include <linux/types.h> -#include <linux/module.h> #include <linux/crc-itu-t.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/types.h> /* CRC table for the CRC ITU-T V.41 0x1021 (x^16 + x^12 + x^5 + 1) */ const u16 crc_itu_t_table[256] = { diff --git a/lib/crc-t10dif.c b/lib/crc/crc-t10dif-main.c index 311c2ab829f1..08dde238e89f 100644 --- a/lib/crc-t10dif.c +++ b/lib/crc/crc-t10dif-main.c @@ -6,9 +6,10 @@ * Written by Martin K. 
Petersen <martin.petersen@oracle.com> */ -#include <linux/types.h> -#include <linux/module.h> #include <linux/crc-t10dif.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/types.h> /* * Table generated using the following polynomial: @@ -50,16 +51,39 @@ static const u16 t10_dif_crc_table[256] = { 0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3 }; -u16 crc_t10dif_generic(u16 crc, const u8 *p, size_t len) +static inline u16 __maybe_unused +crc_t10dif_generic(u16 crc, const u8 *p, size_t len) { - size_t i; + while (len--) + crc = (crc << 8) ^ t10_dif_crc_table[(crc >> 8) ^ *p++]; + return crc; +} - for (i = 0; i < len; i++) - crc = (crc << 8) ^ t10_dif_crc_table[(crc >> 8) ^ p[i]]; +#ifdef CONFIG_CRC_T10DIF_ARCH +#include "crc-t10dif.h" /* $(SRCARCH)/crc-t10dif.h */ +#else +#define crc_t10dif_arch crc_t10dif_generic +#endif - return crc; +u16 crc_t10dif_update(u16 crc, const u8 *p, size_t len) +{ + return crc_t10dif_arch(crc, p, len); +} +EXPORT_SYMBOL(crc_t10dif_update); + +#ifdef crc_t10dif_mod_init_arch +static int __init crc_t10dif_mod_init(void) +{ + crc_t10dif_mod_init_arch(); + return 0; +} +subsys_initcall(crc_t10dif_mod_init); + +static void __exit crc_t10dif_mod_exit(void) +{ } -EXPORT_SYMBOL(crc_t10dif_generic); +module_exit(crc_t10dif_mod_exit); +#endif -MODULE_DESCRIPTION("T10 DIF CRC calculation"); +MODULE_DESCRIPTION("CRC-T10DIF library functions"); MODULE_LICENSE("GPL"); diff --git a/lib/crc16.c b/lib/crc/crc16.c index 5c3a803c01e0..931660a8cbaa 100644 --- a/lib/crc16.c +++ b/lib/crc/crc16.c @@ -3,12 +3,13 @@ * crc16.c */ -#include <linux/types.h> -#include <linux/module.h> #include <linux/crc16.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/types.h> /** CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */ -u16 const crc16_table[256] = { +static const u16 crc16_table[256] = { 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, @@ -42,20 +43,19 @@ u16 const crc16_table[256] = { 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 }; -EXPORT_SYMBOL(crc16_table); /** * crc16 - compute the CRC-16 for the data buffer * @crc: previous CRC value - * @buffer: data pointer + * @p: data pointer * @len: number of bytes in the buffer * * Returns the updated CRC value. */ -u16 crc16(u16 crc, u8 const *buffer, size_t len) +u16 crc16(u16 crc, const u8 *p, size_t len) { while (len--) - crc = crc16_byte(crc, *buffer++); + crc = (crc >> 8) ^ crc16_table[(crc & 0xff) ^ *p++]; return crc; } EXPORT_SYMBOL(crc16); diff --git a/lib/crc/crc32-main.c b/lib/crc/crc32-main.c new file mode 100644 index 000000000000..fbb90c9006e5 --- /dev/null +++ b/lib/crc/crc32-main.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin + * cleaned up code to current version of sparse and added the slicing-by-8 + * algorithm to the closely similar existing slicing-by-4 algorithm. + * + * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com> + * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks! + * Code was from the public domain, copyright abandoned. Code was + * subsequently included in the kernel, thus was re-licensed under the + * GNU GPL v2. 
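Aside: crc-t10dif-main.c above, and crc32-main.c and crc64-main.c below, all take the same shape: the table-driven code becomes a static inline *_generic() helper, an optional per-architecture header (included as "$(SRCARCH)/<name>.h" when the corresponding CONFIG_*_ARCH symbol is set) defines *_arch() and, if needed, a *_mod_init_arch() hook, and the exported function is a one-line wrapper around *_arch(). A condensed sketch of the pattern with hypothetical "mycrc" names:

#include <linux/export.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/types.h>

/* Generic fallback; __maybe_unused because an arch header may never call it. */
static inline u16 __maybe_unused mycrc_generic(u16 crc, const u8 *p, size_t len)
{
	while (len--)
		crc ^= *p++;	/* placeholder update step, not a real CRC */
	return crc;
}

#ifdef CONFIG_MYCRC_ARCH
#include "mycrc.h"	/* $(SRCARCH)/mycrc.h supplies mycrc_arch() */
#else
#define mycrc_arch mycrc_generic
#endif

u16 mycrc_update(u16 crc, const u8 *p, size_t len)
{
	return mycrc_arch(crc, p, len);
}
EXPORT_SYMBOL(mycrc_update);

#ifdef mycrc_mod_init_arch
static int __init mycrc_mod_init(void)
{
	mycrc_mod_init_arch();	/* typically: probe CPU features, flip static keys */
	return 0;
}
subsys_initcall(mycrc_mod_init);
#endif

MODULE_DESCRIPTION("example of the lib/crc dispatch pattern");
MODULE_LICENSE("GPL");

Keeping the generic helpers static inline and __maybe_unused lets the compiler drop them entirely on architectures whose header never falls back to them.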
+ * + * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com> + * Same crc32 function was used in 5 other places in the kernel. + * I made one version, and deleted the others. + * There are various incantations of crc32(). Some use a seed of 0 or ~0. + * Some xor at the end with ~0. The generic crc32() function takes + * seed as an argument, and doesn't xor at the end. Then individual + * users can do whatever they need. + * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0. + * fs/jffs2 uses seed 0, doesn't xor with ~0. + * fs/partitions/efi.c uses seed ~0, xor's with ~0. + */ + +/* see: Documentation/staging/crc32.rst for a description of algorithms */ + +#include <linux/crc32.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/types.h> + +#include "crc32table.h" + +static inline u32 __maybe_unused +crc32_le_base(u32 crc, const u8 *p, size_t len) +{ + while (len--) + crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++]; + return crc; +} + +static inline u32 __maybe_unused +crc32_be_base(u32 crc, const u8 *p, size_t len) +{ + while (len--) + crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++]; + return crc; +} + +static inline u32 __maybe_unused +crc32c_base(u32 crc, const u8 *p, size_t len) +{ + while (len--) + crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++]; + return crc; +} + +#ifdef CONFIG_CRC32_ARCH +#include "crc32.h" /* $(SRCARCH)/crc32.h */ + +u32 crc32_optimizations(void) +{ + return crc32_optimizations_arch(); +} +EXPORT_SYMBOL(crc32_optimizations); +#else +#define crc32_le_arch crc32_le_base +#define crc32_be_arch crc32_be_base +#define crc32c_arch crc32c_base +#endif + +u32 crc32_le(u32 crc, const void *p, size_t len) +{ + return crc32_le_arch(crc, p, len); +} +EXPORT_SYMBOL(crc32_le); + +u32 crc32_be(u32 crc, const void *p, size_t len) +{ + return crc32_be_arch(crc, p, len); +} +EXPORT_SYMBOL(crc32_be); + +u32 crc32c(u32 crc, const void *p, size_t len) +{ + return crc32c_arch(crc, p, len); +} +EXPORT_SYMBOL(crc32c); + +#ifdef crc32_mod_init_arch +static int __init crc32_mod_init(void) +{ + crc32_mod_init_arch(); + return 0; +} +subsys_initcall(crc32_mod_init); + +static void __exit crc32_mod_exit(void) +{ +} +module_exit(crc32_mod_exit); +#endif + +MODULE_DESCRIPTION("CRC32 library functions"); +MODULE_LICENSE("GPL"); diff --git a/lib/crc4.c b/lib/crc/crc4.c index e7e1779c67d9..8e83fbe60bdc 100644 --- a/lib/crc4.c +++ b/lib/crc/crc4.c @@ -4,6 +4,7 @@ */ #include <linux/crc4.h> +#include <linux/export.h> #include <linux/module.h> static const uint8_t crc4_tab[] = { diff --git a/lib/crc64.c b/lib/crc/crc64-main.c index 5b1b17057f0a..1337036010fe 100644 --- a/lib/crc64.c +++ b/lib/crc/crc64-main.c @@ -33,26 +33,61 @@ * Author: Coly Li <colyli@suse.de> */ +#include <linux/crc64.h> +#include <linux/export.h> #include <linux/module.h> #include <linux/types.h> -#include <linux/crc64.h> -#include "crc64table.h" -MODULE_DESCRIPTION("CRC64 calculations"); -MODULE_LICENSE("GPL v2"); +#include "crc64table.h" -u64 crc64_be_generic(u64 crc, const u8 *p, size_t len) +static inline u64 __maybe_unused +crc64_be_generic(u64 crc, const u8 *p, size_t len) { while (len--) crc = (crc << 8) ^ crc64table[(crc >> 56) ^ *p++]; return crc; } -EXPORT_SYMBOL_GPL(crc64_be_generic); -u64 crc64_nvme_generic(u64 crc, const u8 *p, size_t len) +static inline u64 __maybe_unused +crc64_nvme_generic(u64 crc, const u8 *p, size_t len) { while (len--) crc = (crc >> 8) ^ crc64nvmetable[(crc & 0xff) ^ *p++]; return crc; } -EXPORT_SYMBOL_GPL(crc64_nvme_generic); + +#ifdef CONFIG_CRC64_ARCH 
+#include "crc64.h" /* $(SRCARCH)/crc64.h */ +#else +#define crc64_be_arch crc64_be_generic +#define crc64_nvme_arch crc64_nvme_generic +#endif + +u64 crc64_be(u64 crc, const void *p, size_t len) +{ + return crc64_be_arch(crc, p, len); +} +EXPORT_SYMBOL_GPL(crc64_be); + +u64 crc64_nvme(u64 crc, const void *p, size_t len) +{ + return ~crc64_nvme_arch(~crc, p, len); +} +EXPORT_SYMBOL_GPL(crc64_nvme); + +#ifdef crc64_mod_init_arch +static int __init crc64_mod_init(void) +{ + crc64_mod_init_arch(); + return 0; +} +subsys_initcall(crc64_mod_init); + +static void __exit crc64_mod_exit(void) +{ +} +module_exit(crc64_mod_exit); +#endif + +MODULE_DESCRIPTION("CRC64 library functions"); +MODULE_LICENSE("GPL"); diff --git a/lib/crc7.c b/lib/crc/crc7.c index 8dd991cc6114..46b95d7ac6ce 100644 --- a/lib/crc7.c +++ b/lib/crc/crc7.c @@ -3,9 +3,10 @@ * crc7.c */ -#include <linux/types.h> -#include <linux/module.h> #include <linux/crc7.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/types.h> /* * Table for CRC-7 (polynomial x^7 + x^3 + 1). diff --git a/lib/crc8.c b/lib/crc/crc8.c index 1ad8e501d9b6..329c52158c45 100644 --- a/lib/crc8.c +++ b/lib/crc/crc8.c @@ -16,8 +16,9 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> #include <linux/crc8.h> +#include <linux/export.h> +#include <linux/module.h> #include <linux/printk.h> /** diff --git a/lib/gen_crc32table.c b/lib/crc/gen_crc32table.c index 6d03425b849e..9a7f31658e35 100644 --- a/lib/gen_crc32table.c +++ b/lib/crc/gen_crc32table.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <stdio.h> -#include "../include/linux/crc32poly.h" -#include "../include/generated/autoconf.h" +#include "../../include/linux/crc32poly.h" +#include "../../include/generated/autoconf.h" #include <inttypes.h> static uint32_t crc32table_le[256]; diff --git a/lib/gen_crc64table.c b/lib/crc/gen_crc64table.c index e05a4230a0a0..f2be9f62bab7 100644 --- a/lib/gen_crc64table.c +++ b/lib/crc/gen_crc64table.c @@ -1,14 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Generate lookup table for the table-driven CRC64 calculation. - * - * gen_crc64table is executed in kernel build time and generates - * lib/crc64table.h. This header is included by lib/crc64.c for - * the table-driven CRC64 calculation. - * - * See lib/crc64.c for more information about which specification - * and polynomial arithmetic that gen_crc64table.c follows to - * generate the lookup table. + * This host program runs at kernel build time and generates the lookup tables + * used by the generic CRC64 code. * * Copyright 2018 SUSE Linux. * Author: Coly Li <colyli@suse.de> diff --git a/lib/crc/loongarch/crc32.h b/lib/crc/loongarch/crc32.h new file mode 100644 index 000000000000..6de5c96594af --- /dev/null +++ b/lib/crc/loongarch/crc32.h @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CRC32 and CRC32C using LoongArch crc* instructions + * + * Module based on mips/crypto/crc32-mips.c + * + * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org> + * Copyright (C) 2018 MIPS Tech, LLC + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <asm/cpu-features.h> +#include <linux/unaligned.h> + +#define _CRC32(crc, value, size, type) \ +do { \ + __asm__ __volatile__( \ + #type ".w." 
#size ".w" " %0, %1, %0\n\t"\ + : "+r" (crc) \ + : "r" (value) \ + : "memory"); \ +} while (0) + +#define CRC32(crc, value, size) _CRC32(crc, value, size, crc) +#define CRC32C(crc, value, size) _CRC32(crc, value, size, crcc) + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); + +static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (!static_branch_likely(&have_crc32)) + return crc32_le_base(crc, p, len); + + while (len >= sizeof(u64)) { + u64 value = get_unaligned_le64(p); + + CRC32(crc, value, d); + p += sizeof(u64); + len -= sizeof(u64); + } + + if (len & sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32(crc, value, w); + p += sizeof(u32); + } + + if (len & sizeof(u16)) { + u16 value = get_unaligned_le16(p); + + CRC32(crc, value, h); + p += sizeof(u16); + } + + if (len & sizeof(u8)) { + u8 value = *p++; + + CRC32(crc, value, b); + } + + return crc; +} + +static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + if (!static_branch_likely(&have_crc32)) + return crc32c_base(crc, p, len); + + while (len >= sizeof(u64)) { + u64 value = get_unaligned_le64(p); + + CRC32C(crc, value, d); + p += sizeof(u64); + len -= sizeof(u64); + } + + if (len & sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32C(crc, value, w); + p += sizeof(u32); + } + + if (len & sizeof(u16)) { + u16 value = get_unaligned_le16(p); + + CRC32C(crc, value, h); + p += sizeof(u16); + } + + if (len & sizeof(u8)) { + u8 value = *p++; + + CRC32C(crc, value, b); + } + + return crc; +} + +#define crc32_be_arch crc32_be_base /* not implemented on this arch */ + +#define crc32_mod_init_arch crc32_mod_init_arch +static inline void crc32_mod_init_arch(void) +{ + if (cpu_has_crc32) + static_branch_enable(&have_crc32); +} + +static inline u32 crc32_optimizations_arch(void) +{ + if (static_key_enabled(&have_crc32)) + return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION; + return 0; +} diff --git a/lib/crc/mips/crc32.h b/lib/crc/mips/crc32.h new file mode 100644 index 000000000000..11cb272c63a6 --- /dev/null +++ b/lib/crc/mips/crc32.h @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * crc32-mips.c - CRC32 and CRC32C using optional MIPSr6 instructions + * + * Module based on arm64/crypto/crc32-arm.c + * + * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org> + * Copyright (C) 2018 MIPS Tech, LLC + */ + +#include <linux/cpufeature.h> +#include <asm/mipsregs.h> +#include <linux/unaligned.h> + +#ifndef TOOLCHAIN_SUPPORTS_CRC +#define _ASM_SET_CRC(OP, SZ, TYPE) \ +_ASM_MACRO_3R(OP, rt, rs, rt2, \ + ".ifnc \\rt, \\rt2\n\t" \ + ".error \"invalid operands \\\"" #OP " \\rt,\\rs,\\rt2\\\"\"\n\t" \ + ".endif\n\t" \ + _ASM_INSN_IF_MIPS(0x7c00000f | (__rt << 16) | (__rs << 21) | \ + ((SZ) << 6) | ((TYPE) << 8)) \ + _ASM_INSN32_IF_MM(0x00000030 | (__rs << 16) | (__rt << 21) | \ + ((SZ) << 14) | ((TYPE) << 3))) +#define _ASM_UNSET_CRC(op, SZ, TYPE) ".purgem " #op "\n\t" +#else /* !TOOLCHAIN_SUPPORTS_CRC */ +#define _ASM_SET_CRC(op, SZ, TYPE) ".set\tcrc\n\t" +#define _ASM_UNSET_CRC(op, SZ, TYPE) +#endif + +#define __CRC32(crc, value, op, SZ, TYPE) \ +do { \ + __asm__ __volatile__( \ + ".set push\n\t" \ + _ASM_SET_CRC(op, SZ, TYPE) \ + #op " %0, %1, %0\n\t" \ + _ASM_UNSET_CRC(op, SZ, TYPE) \ + ".set pop" \ + : "+r" (crc) \ + : "r" (value)); \ +} while (0) + +#define _CRC32_crc32b(crc, value) __CRC32(crc, value, crc32b, 0, 0) +#define _CRC32_crc32h(crc, value) __CRC32(crc, value, crc32h, 1, 0) +#define _CRC32_crc32w(crc, value) __CRC32(crc, value, crc32w, 2, 0) +#define 
_CRC32_crc32d(crc, value) __CRC32(crc, value, crc32d, 3, 0) +#define _CRC32_crc32cb(crc, value) __CRC32(crc, value, crc32cb, 0, 1) +#define _CRC32_crc32ch(crc, value) __CRC32(crc, value, crc32ch, 1, 1) +#define _CRC32_crc32cw(crc, value) __CRC32(crc, value, crc32cw, 2, 1) +#define _CRC32_crc32cd(crc, value) __CRC32(crc, value, crc32cd, 3, 1) + +#define _CRC32(crc, value, size, op) \ + _CRC32_##op##size(crc, value) + +#define CRC32(crc, value, size) \ + _CRC32(crc, value, size, crc32) + +#define CRC32C(crc, value, size) \ + _CRC32(crc, value, size, crc32c) + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); + +static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (!static_branch_likely(&have_crc32)) + return crc32_le_base(crc, p, len); + + if (IS_ENABLED(CONFIG_64BIT)) { + for (; len >= sizeof(u64); p += sizeof(u64), len -= sizeof(u64)) { + u64 value = get_unaligned_le64(p); + + CRC32(crc, value, d); + } + + if (len & sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32(crc, value, w); + p += sizeof(u32); + } + } else { + for (; len >= sizeof(u32); len -= sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32(crc, value, w); + p += sizeof(u32); + } + } + + if (len & sizeof(u16)) { + u16 value = get_unaligned_le16(p); + + CRC32(crc, value, h); + p += sizeof(u16); + } + + if (len & sizeof(u8)) { + u8 value = *p++; + + CRC32(crc, value, b); + } + + return crc; +} + +static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + if (!static_branch_likely(&have_crc32)) + return crc32c_base(crc, p, len); + + if (IS_ENABLED(CONFIG_64BIT)) { + for (; len >= sizeof(u64); p += sizeof(u64), len -= sizeof(u64)) { + u64 value = get_unaligned_le64(p); + + CRC32C(crc, value, d); + } + + if (len & sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32C(crc, value, w); + p += sizeof(u32); + } + } else { + for (; len >= sizeof(u32); len -= sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32C(crc, value, w); + p += sizeof(u32); + } + } + + if (len & sizeof(u16)) { + u16 value = get_unaligned_le16(p); + + CRC32C(crc, value, h); + p += sizeof(u16); + } + + if (len & sizeof(u8)) { + u8 value = *p++; + + CRC32C(crc, value, b); + } + return crc; +} + +#define crc32_be_arch crc32_be_base /* not implemented on this arch */ + +#define crc32_mod_init_arch crc32_mod_init_arch +static inline void crc32_mod_init_arch(void) +{ + if (cpu_have_feature(cpu_feature(MIPS_CRC32))) + static_branch_enable(&have_crc32); +} + +static inline u32 crc32_optimizations_arch(void) +{ + if (static_key_enabled(&have_crc32)) + return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION; + return 0; +} diff --git a/lib/crc/powerpc/crc-t10dif.h b/lib/crc/powerpc/crc-t10dif.h new file mode 100644 index 000000000000..59e16804a6ea --- /dev/null +++ b/lib/crc/powerpc/crc-t10dif.h @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Calculate a CRC T10-DIF with vpmsum acceleration + * + * Copyright 2017, Daniel Axtens, IBM Corporation. 
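Aside: the LoongArch and MIPS headers above walk the buffer in descending power-of-two chunks: on 64-bit, 8-byte words while at least 8 bytes remain, then at most one 4-, 2- and 1-byte step, each value loaded with get_unaligned_le*() and handed to the matching CRC instruction. The same skeleton in portable C, with a byte-at-a-time CRC32C standing in for the crc32c{b,h,w,d} instructions:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Byte-at-a-time CRC32C (poly 0x82F63B78) stand-in for the hardware ops. */
static uint32_t step_bytes(uint32_t crc, uint64_t v, int nbytes)
{
	for (int i = 0; i < nbytes; i++) {
		crc ^= (uint8_t)(v >> (8 * i));
		for (int b = 0; b < 8; b++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1));
	}
	return crc;
}

static uint64_t load_le(const uint8_t *p, int nbytes)
{
	uint64_t v = 0;

	memcpy(&v, p, nbytes);	/* assumes a little-endian host, for brevity */
	return v;
}

static uint32_t crc32c_chunked(uint32_t crc, const uint8_t *p, size_t len)
{
	for (; len >= 8; p += 8, len -= 8)
		crc = step_bytes(crc, load_le(p, 8), 8);	/* crc32cd */
	if (len & 4) {
		crc = step_bytes(crc, load_le(p, 4), 4);	/* crc32cw */
		p += 4;
	}
	if (len & 2) {
		crc = step_bytes(crc, load_le(p, 2), 2);	/* crc32ch */
		p += 2;
	}
	if (len & 1)
		crc = step_bytes(crc, *p, 1);			/* crc32cb */
	return crc;
}

Testing len & 4, len & 2 and len & 1 is enough because fewer than 8 bytes remain after the word loop, so each smaller chunk size can occur at most once.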
+ * [based on crc32c-vpmsum_glue.c] + */ + +#include <asm/switch_to.h> +#include <crypto/internal/simd.h> +#include <linux/cpufeature.h> +#include <linux/jump_label.h> +#include <linux/preempt.h> +#include <linux/uaccess.h> + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN-1) + +#define VECTOR_BREAKPOINT 64 + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto); + +u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len); + +static inline u16 crc_t10dif_arch(u16 crci, const u8 *p, size_t len) +{ + unsigned int prealign; + unsigned int tail; + u32 crc = crci; + + if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || + !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable()) + return crc_t10dif_generic(crc, p, len); + + if ((unsigned long)p & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); + crc = crc_t10dif_generic(crc, p, prealign); + len -= prealign; + p += prealign; + } + + if (len & ~VMX_ALIGN_MASK) { + crc <<= 16; + preempt_disable(); + pagefault_disable(); + enable_kernel_altivec(); + crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); + disable_kernel_altivec(); + pagefault_enable(); + preempt_enable(); + crc >>= 16; + } + + tail = len & VMX_ALIGN_MASK; + if (tail) { + p += len & ~VMX_ALIGN_MASK; + crc = crc_t10dif_generic(crc, p, tail); + } + + return crc & 0xffff; +} + +#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch +static inline void crc_t10dif_mod_init_arch(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_207S) && + (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO)) + static_branch_enable(&have_vec_crypto); +} diff --git a/lib/crc/powerpc/crc-vpmsum-template.S b/lib/crc/powerpc/crc-vpmsum-template.S new file mode 100644 index 000000000000..b0f87f595b26 --- /dev/null +++ b/lib/crc/powerpc/crc-vpmsum-template.S @@ -0,0 +1,746 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Core of the accelerated CRC algorithm. + * In your file, define the constants and CRC_FUNCTION_NAME + * Then include this file. + * + * Calculate the checksum of data that is 16 byte aligned and a multiple of + * 16 bytes. + * + * The first step is to reduce it to 1024 bits. We do this in 8 parallel + * chunks in order to mask the latency of the vpmsum instructions. If we + * have more than 32 kB of data to checksum we repeat this step multiple + * times, passing in the previous 1024 bits. + * + * The next step is to reduce the 1024 bits to 64 bits. This step adds + * 32 bits of 0s to the end - this matches what a CRC does. We just + * calculate constants that land the data in this 32 bits. + * + * We then use fixed point Barrett reduction to compute a mod n over GF(2) + * for n = CRC using POWER8 instructions. We use x = 32. 
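Aside: the Barrett step described in this header can be modelled compactly in C. With mu = floor(x^32 / G(x)) precomputed, the quotient of a 32-bit polynomial value by a degree-16 G(x) takes one carry-less multiply and two shifts, and the remainder one more multiply and an XOR; in GF(2) the result is exact, so no correction step is needed. A scaled-down sketch using the CRC-T10DIF polynomial 0x18BB7 (the vector code performs the same computation at degree 32, with the extra scaling folded into its constants), cross-checked against a bit-by-bit reduction:

#include <assert.h>
#include <stdint.h>

/* Carry-less multiply; operands here are small enough that 64 bits suffice. */
static uint64_t clmul(uint64_t a, uint64_t b)
{
	uint64_t r = 0;

	for (int i = 0; i < 64; i++)
		if ((b >> i) & 1)
			r ^= a << i;
	return r;
}

/* mu = floor(x^32 / G(x)): schoolbook polynomial division over GF(2). */
static uint64_t mu_for(uint32_t G)
{
	uint64_t rem = 0, q = 0;

	for (int i = 32; i >= 0; i--) {		/* dividend is exactly x^32 */
		rem = (rem << 1) | (i == 32);
		q <<= 1;
		if (rem & (1ull << 16)) {	/* degree reached deg(G) = 16 */
			rem ^= G;
			q |= 1;
		}
	}
	return q;
}

/* Barrett: reduce a 32-bit polynomial value modulo the degree-16 G(x). */
static uint16_t barrett_mod(uint32_t a, uint32_t G, uint64_t mu)
{
	uint64_t q = clmul(a >> 16, mu) >> 16;	/* exact quotient in GF(2) */
	return (uint16_t)(a ^ clmul(q, G));	/* low 16 bits = remainder */
}

/* Bit-by-bit reference reduction, for cross-checking. */
static uint16_t slow_mod(uint32_t a, uint32_t G)
{
	uint64_t v = a;

	for (int i = 31; i >= 16; i--)
		if (v & (1ull << i))
			v ^= (uint64_t)G << (i - 16);
	return (uint16_t)v;
}

int main(void)
{
	const uint32_t G = 0x18BB7;		/* CRC-T10DIF polynomial */
	const uint64_t mu = mu_for(G);

	for (uint64_t a = 0; a <= 0xFFFFFFFFull; a += 65537)
		assert(barrett_mod((uint32_t)a, G, mu) == slow_mod((uint32_t)a, G));
	return 0;
}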
+ * + * https://en.wikipedia.org/wiki/Barrett_reduction + * + * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM +*/ + +#include <asm/ppc_asm.h> +#include <asm/ppc-opcode.h> + +#define MAX_SIZE 32768 + + .text + +#if defined(__BIG_ENDIAN__) && defined(REFLECT) +#define BYTESWAP_DATA +#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) +#define BYTESWAP_DATA +#else +#undef BYTESWAP_DATA +#endif + +#define off16 r25 +#define off32 r26 +#define off48 r27 +#define off64 r28 +#define off80 r29 +#define off96 r30 +#define off112 r31 + +#define const1 v24 +#define const2 v25 + +#define byteswap v26 +#define mask_32bit v27 +#define mask_64bit v28 +#define zeroes v29 + +#ifdef BYTESWAP_DATA +#define VPERM(A, B, C, D) vperm A, B, C, D +#else +#define VPERM(A, B, C, D) +#endif + +/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */ +FUNC_START(CRC_FUNCTION_NAME) + std r31,-8(r1) + std r30,-16(r1) + std r29,-24(r1) + std r28,-32(r1) + std r27,-40(r1) + std r26,-48(r1) + std r25,-56(r1) + + li off16,16 + li off32,32 + li off48,48 + li off64,64 + li off80,80 + li off96,96 + li off112,112 + li r0,0 + + /* Enough room for saving 10 non volatile VMX registers */ + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + stvx v20,0,r6 + stvx v21,off16,r6 + stvx v22,off32,r6 + stvx v23,off48,r6 + stvx v24,off64,r6 + stvx v25,off80,r6 + stvx v26,off96,r6 + stvx v27,off112,r6 + stvx v28,0,r7 + stvx v29,off16,r7 + + mr r10,r3 + + vxor zeroes,zeroes,zeroes + vspltisw v0,-1 + + vsldoi mask_32bit,zeroes,v0,4 + vsldoi mask_64bit,zeroes,v0,8 + + /* Get the initial value into v8 */ + vxor v8,v8,v8 + MTVRD(v8, R3) +#ifdef REFLECT + vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */ +#else + vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */ +#endif + +#ifdef BYTESWAP_DATA + LOAD_REG_ADDR(r3, .byteswap_constant) + lvx byteswap,0,r3 + addi r3,r3,16 +#endif + + cmpdi r5,256 + blt .Lshort + + rldicr r6,r5,0,56 + + /* Checksum in blocks of MAX_SIZE */ +1: lis r7,MAX_SIZE@h + ori r7,r7,MAX_SIZE@l + mr r9,r7 + cmpd r6,r7 + bgt 2f + mr r7,r6 +2: subf r6,r7,r6 + + /* our main loop does 128 bytes at a time */ + srdi r7,r7,7 + + /* + * Work out the offset into the constants table to start at. Each + * constant is 16 bytes, and it is used against 128 bytes of input + * data - 128 / 16 = 8 + */ + sldi r8,r7,4 + srdi r9,r9,3 + subf r8,r8,r9 + + /* We reduce our final 128 bytes in a separate step */ + addi r7,r7,-1 + mtctr r7 + + LOAD_REG_ADDR(r3, .constants) + + /* Find the start of our constants */ + add r3,r3,r8 + + /* zero v0-v7 which will contain our checksums */ + vxor v0,v0,v0 + vxor v1,v1,v1 + vxor v2,v2,v2 + vxor v3,v3,v3 + vxor v4,v4,v4 + vxor v5,v5,v5 + vxor v6,v6,v6 + vxor v7,v7,v7 + + lvx const1,0,r3 + + /* + * If we are looping back to consume more data we use the values + * already in v16-v23. 
+ */ + cmpdi r0,1 + beq 2f + + /* First warm up pass */ + lvx v16,0,r4 + lvx v17,off16,r4 + VPERM(v16,v16,v16,byteswap) + VPERM(v17,v17,v17,byteswap) + lvx v18,off32,r4 + lvx v19,off48,r4 + VPERM(v18,v18,v18,byteswap) + VPERM(v19,v19,v19,byteswap) + lvx v20,off64,r4 + lvx v21,off80,r4 + VPERM(v20,v20,v20,byteswap) + VPERM(v21,v21,v21,byteswap) + lvx v22,off96,r4 + lvx v23,off112,r4 + VPERM(v22,v22,v22,byteswap) + VPERM(v23,v23,v23,byteswap) + addi r4,r4,8*16 + + /* xor in initial value */ + vxor v16,v16,v8 + +2: bdz .Lfirst_warm_up_done + + addi r3,r3,16 + lvx const2,0,r3 + + /* Second warm up pass */ + VPMSUMD(v8,v16,const1) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + VPMSUMD(v9,v17,const1) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + VPMSUMD(v10,v18,const1) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + VPMSUMD(v11,v19,const1) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + ori r2,r2,0 + + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdz .Lfirst_cool_down + + /* + * main loop. We modulo schedule it such that it takes three iterations + * to complete - first iteration load, second iteration vpmsum, third + * iteration xor. + */ + .balign 16 +4: lvx const1,0,r3 + addi r3,r3,16 + ori r2,r2,0 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const2) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const2) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const2) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const2) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + lvx const2,0,r3 + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdnz 4b + +.Lfirst_cool_down: + /* First cool down pass */ + lvx const1,0,r3 + addi r3,r3,16 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const1) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const1) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const1) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const1) + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + ori r2,r2,0 + +.Lsecond_cool_down: + /* Second cool down pass */ + vxor v0,v0,v8 + vxor v1,v1,v9 + vxor v2,v2,v10 + vxor v3,v3,v11 + vxor v4,v4,v12 + vxor v5,v5,v13 + vxor v6,v6,v14 + vxor v7,v7,v15 + +#ifdef REFLECT + /* + * vpmsumd produces a 96 bit result in the least significant bits + * of the register. Since we are bit reflected we have to shift it + * left 32 bits so it occupies the least significant bits in the + * bit reflected domain. 
+ */ + vsldoi v0,v0,zeroes,4 + vsldoi v1,v1,zeroes,4 + vsldoi v2,v2,zeroes,4 + vsldoi v3,v3,zeroes,4 + vsldoi v4,v4,zeroes,4 + vsldoi v5,v5,zeroes,4 + vsldoi v6,v6,zeroes,4 + vsldoi v7,v7,zeroes,4 +#endif + + /* xor with last 1024 bits */ + lvx v8,0,r4 + lvx v9,off16,r4 + VPERM(v8,v8,v8,byteswap) + VPERM(v9,v9,v9,byteswap) + lvx v10,off32,r4 + lvx v11,off48,r4 + VPERM(v10,v10,v10,byteswap) + VPERM(v11,v11,v11,byteswap) + lvx v12,off64,r4 + lvx v13,off80,r4 + VPERM(v12,v12,v12,byteswap) + VPERM(v13,v13,v13,byteswap) + lvx v14,off96,r4 + lvx v15,off112,r4 + VPERM(v14,v14,v14,byteswap) + VPERM(v15,v15,v15,byteswap) + + addi r4,r4,8*16 + + vxor v16,v0,v8 + vxor v17,v1,v9 + vxor v18,v2,v10 + vxor v19,v3,v11 + vxor v20,v4,v12 + vxor v21,v5,v13 + vxor v22,v6,v14 + vxor v23,v7,v15 + + li r0,1 + cmpdi r6,0 + addi r6,r6,128 + bne 1b + + /* Work out how many bytes we have left */ + andi. r5,r5,127 + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,128 + add r3,r3,r6 + + /* How many 16 byte chunks are in the tail */ + srdi r7,r5,4 + mtctr r7 + + /* + * Reduce the previously calculated 1024 bits to 64 bits, shifting + * 32 bits to include the trailing 32 bits of zeros + */ + lvx v0,0,r3 + lvx v1,off16,r3 + lvx v2,off32,r3 + lvx v3,off48,r3 + lvx v4,off64,r3 + lvx v5,off80,r3 + lvx v6,off96,r3 + lvx v7,off112,r3 + addi r3,r3,8*16 + + VPMSUMW(v0,v16,v0) + VPMSUMW(v1,v17,v1) + VPMSUMW(v2,v18,v2) + VPMSUMW(v3,v19,v3) + VPMSUMW(v4,v20,v4) + VPMSUMW(v5,v21,v5) + VPMSUMW(v6,v22,v6) + VPMSUMW(v7,v23,v7) + + /* Now reduce the tail (0 - 112 bytes) */ + cmpdi r7,0 + beq 1f + + lvx v16,0,r4 + lvx v17,0,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off16,r4 + lvx v17,off16,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off32,r4 + lvx v17,off32,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off48,r4 + lvx v17,off48,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off64,r4 + lvx v17,off64,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off80,r4 + lvx v17,off80,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off96,r4 + lvx v17,off96,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + + /* Now xor all the parallel chunks together */ +1: vxor v0,v0,v1 + vxor v2,v2,v3 + vxor v4,v4,v5 + vxor v6,v6,v7 + + vxor v0,v0,v2 + vxor v4,v4,v6 + + vxor v0,v0,v4 + +.Lbarrett_reduction: + /* Barrett constants */ + LOAD_REG_ADDR(r3, .barrett_constants) + + lvx const1,0,r3 + lvx const2,off16,r3 + + vsldoi v1,v0,v0,8 + vxor v0,v0,v1 /* xor two 64 bit results together */ + +#ifdef REFLECT + /* shift left one bit */ + vspltisb v1,1 + vsl v0,v0,v1 +#endif + + vand v0,v0,mask_64bit +#ifndef REFLECT + /* + * Now for the Barrett reduction algorithm. The idea is to calculate q, + * the multiple of our polynomial that we need to subtract. By + * doing the computation 2x bits higher (ie 64 bits) and shifting the + * result back down 2x bits, we round down to the nearest multiple. + */ + VPMSUMD(v1,v0,const1) /* ma */ + vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Get the result into r3. 
We need to shift it left 8 bytes: + * V0 [ 0 1 2 X ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */ +#else + /* + * The reflected version of Barrett reduction. Instead of bit + * reflecting our data (which is expensive to do), we bit reflect our + * constants and our algorithm, which means the intermediate data in + * our vector registers goes from 0-63 instead of 63-0. We can reflect + * the algorithm because we don't carry in mod 2 arithmetic. + */ + vand v1,v0,mask_32bit /* bottom 32 bits of a */ + VPMSUMD(v1,v1,const1) /* ma */ + vand v1,v1,mask_32bit /* bottom 32bits of ma */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Since we are bit reflected, the result (ie the low 32 bits) is in + * the high 32 bits. We just need to shift it left 4 bytes + * V0 [ 0 1 X 3 ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */ +#endif + + /* Get it into r3 */ + MFVRD(R3, v0) + +.Lout: + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + lvx v20,0,r6 + lvx v21,off16,r6 + lvx v22,off32,r6 + lvx v23,off48,r6 + lvx v24,off64,r6 + lvx v25,off80,r6 + lvx v26,off96,r6 + lvx v27,off112,r6 + lvx v28,0,r7 + lvx v29,off16,r7 + + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + ld r26,-48(r1) + ld r25,-56(r1) + + blr + +.Lfirst_warm_up_done: + lvx const1,0,r3 + addi r3,r3,16 + + VPMSUMD(v8,v16,const1) + VPMSUMD(v9,v17,const1) + VPMSUMD(v10,v18,const1) + VPMSUMD(v11,v19,const1) + VPMSUMD(v12,v20,const1) + VPMSUMD(v13,v21,const1) + VPMSUMD(v14,v22,const1) + VPMSUMD(v15,v23,const1) + + b .Lsecond_cool_down + +.Lshort: + cmpdi r5,0 + beq .Lzero + + LOAD_REG_ADDR(r3, .short_constants) + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,256 + add r3,r3,r6 + + /* How many 16 byte chunks? 
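+	 * (len is below 256 on this path, so at most fifteen chunks,
+	 * each consuming one 16-byte entry of the .short_constants table)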
*/ + srdi r7,r5,4 + mtctr r7 + + vxor v19,v19,v19 + vxor v20,v20,v20 + + lvx v0,0,r4 + lvx v16,0,r3 + VPERM(v0,v0,v16,byteswap) + vxor v0,v0,v8 /* xor in initial value */ + VPMSUMW(v0,v0,v16) + bdz .Lv0 + + lvx v1,off16,r4 + lvx v17,off16,r3 + VPERM(v1,v1,v17,byteswap) + VPMSUMW(v1,v1,v17) + bdz .Lv1 + + lvx v2,off32,r4 + lvx v16,off32,r3 + VPERM(v2,v2,v16,byteswap) + VPMSUMW(v2,v2,v16) + bdz .Lv2 + + lvx v3,off48,r4 + lvx v17,off48,r3 + VPERM(v3,v3,v17,byteswap) + VPMSUMW(v3,v3,v17) + bdz .Lv3 + + lvx v4,off64,r4 + lvx v16,off64,r3 + VPERM(v4,v4,v16,byteswap) + VPMSUMW(v4,v4,v16) + bdz .Lv4 + + lvx v5,off80,r4 + lvx v17,off80,r3 + VPERM(v5,v5,v17,byteswap) + VPMSUMW(v5,v5,v17) + bdz .Lv5 + + lvx v6,off96,r4 + lvx v16,off96,r3 + VPERM(v6,v6,v16,byteswap) + VPMSUMW(v6,v6,v16) + bdz .Lv6 + + lvx v7,off112,r4 + lvx v17,off112,r3 + VPERM(v7,v7,v17,byteswap) + VPMSUMW(v7,v7,v17) + bdz .Lv7 + + addi r3,r3,128 + addi r4,r4,128 + + lvx v8,0,r4 + lvx v16,0,r3 + VPERM(v8,v8,v16,byteswap) + VPMSUMW(v8,v8,v16) + bdz .Lv8 + + lvx v9,off16,r4 + lvx v17,off16,r3 + VPERM(v9,v9,v17,byteswap) + VPMSUMW(v9,v9,v17) + bdz .Lv9 + + lvx v10,off32,r4 + lvx v16,off32,r3 + VPERM(v10,v10,v16,byteswap) + VPMSUMW(v10,v10,v16) + bdz .Lv10 + + lvx v11,off48,r4 + lvx v17,off48,r3 + VPERM(v11,v11,v17,byteswap) + VPMSUMW(v11,v11,v17) + bdz .Lv11 + + lvx v12,off64,r4 + lvx v16,off64,r3 + VPERM(v12,v12,v16,byteswap) + VPMSUMW(v12,v12,v16) + bdz .Lv12 + + lvx v13,off80,r4 + lvx v17,off80,r3 + VPERM(v13,v13,v17,byteswap) + VPMSUMW(v13,v13,v17) + bdz .Lv13 + + lvx v14,off96,r4 + lvx v16,off96,r3 + VPERM(v14,v14,v16,byteswap) + VPMSUMW(v14,v14,v16) + bdz .Lv14 + + lvx v15,off112,r4 + lvx v17,off112,r3 + VPERM(v15,v15,v17,byteswap) + VPMSUMW(v15,v15,v17) + +.Lv15: vxor v19,v19,v15 +.Lv14: vxor v20,v20,v14 +.Lv13: vxor v19,v19,v13 +.Lv12: vxor v20,v20,v12 +.Lv11: vxor v19,v19,v11 +.Lv10: vxor v20,v20,v10 +.Lv9: vxor v19,v19,v9 +.Lv8: vxor v20,v20,v8 +.Lv7: vxor v19,v19,v7 +.Lv6: vxor v20,v20,v6 +.Lv5: vxor v19,v19,v5 +.Lv4: vxor v20,v20,v4 +.Lv3: vxor v19,v19,v3 +.Lv2: vxor v20,v20,v2 +.Lv1: vxor v19,v19,v1 +.Lv0: vxor v20,v20,v0 + + vxor v0,v19,v20 + + b .Lbarrett_reduction + +.Lzero: + mr r3,r10 + b .Lout + +FUNC_END(CRC_FUNCTION_NAME) diff --git a/lib/crc/powerpc/crc32.h b/lib/crc/powerpc/crc32.h new file mode 100644 index 000000000000..811cc2e6ed24 --- /dev/null +++ b/lib/crc/powerpc/crc32.h @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <asm/switch_to.h> +#include <crypto/internal/simd.h> +#include <linux/cpufeature.h> +#include <linux/jump_label.h> +#include <linux/preempt.h> +#include <linux/uaccess.h> + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN-1) + +#define VECTOR_BREAKPOINT 512 + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto); + +#define crc32_le_arch crc32_le_base /* not implemented on this arch */ +#define crc32_be_arch crc32_be_base /* not implemented on this arch */ + +u32 __crc32c_vpmsum(u32 crc, const u8 *p, size_t len); + +static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + unsigned int prealign; + unsigned int tail; + + if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || + !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable()) + return crc32c_base(crc, p, len); + + if ((unsigned long)p & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); + crc = crc32c_base(crc, p, prealign); + len -= prealign; + p += prealign; + } + + if (len & ~VMX_ALIGN_MASK) { + preempt_disable(); + pagefault_disable(); + 
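+		/*
+		 * enable_kernel_altivec() makes the VMX unit usable by kernel
+		 * code (saving any live user vector state first); preemption
+		 * and page faults stay disabled so that state cannot be
+		 * switched out while __crc32c_vpmsum() is running.
+		 */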
enable_kernel_altivec(); + crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); + disable_kernel_altivec(); + pagefault_enable(); + preempt_enable(); + } + + tail = len & VMX_ALIGN_MASK; + if (tail) { + p += len & ~VMX_ALIGN_MASK; + crc = crc32c_base(crc, p, tail); + } + + return crc; +} + +#define crc32_mod_init_arch crc32_mod_init_arch +static inline void crc32_mod_init_arch(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_207S) && + (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO)) + static_branch_enable(&have_vec_crypto); +} + +static inline u32 crc32_optimizations_arch(void) +{ + if (static_key_enabled(&have_vec_crypto)) + return CRC32C_OPTIMIZATION; + return 0; +} diff --git a/lib/crc/powerpc/crc32c-vpmsum_asm.S b/lib/crc/powerpc/crc32c-vpmsum_asm.S new file mode 100644 index 000000000000..1b35c55cce0a --- /dev/null +++ b/lib/crc/powerpc/crc32c-vpmsum_asm.S @@ -0,0 +1,842 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Calculate a crc32c with vpmsum acceleration + * + * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM + */ + .section .rodata +.balign 16 + +.byteswap_constant: + /* byte reverse permute constant */ + .octa 0x0F0E0D0C0B0A09080706050403020100 + +.constants: + + /* Reduce 262144 kbits to 1024 bits */ + /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ + .octa 0x00000000b6ca9e20000000009c37c408 + + /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */ + .octa 0x00000000350249a800000001b51df26c + + /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ + .octa 0x00000001862dac54000000000724b9d0 + + /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ + .octa 0x00000001d87fb48c00000001c00532fe + + /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ + .octa 0x00000001f39b699e00000000f05a9362 + + /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ + .octa 0x0000000101da11b400000001e1007970 + + /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */ + .octa 0x00000001cab571e000000000a57366ee + + /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ + .octa 0x00000000c7020cfe0000000192011284 + + /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ + .octa 0x00000000cdaed1ae0000000162716d9a + + /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ + .octa 0x00000001e804effc00000000cd97ecde + + /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ + .octa 0x0000000077c3ea3a0000000058812bc0 + + /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */ + .octa 0x0000000068df31b40000000088b8c12e + + /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ + .octa 0x00000000b059b6c200000001230b234c + + /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ + .octa 0x0000000145fb8ed800000001120b416e + + /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ + .octa 0x00000000cbc0916800000001974aecb0 + + /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ + .octa 0x000000005ceeedc2000000008ee3f226 + + /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */ + .octa 0x0000000047d74e8600000001089aba9a + + /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ + .octa 0x00000001407e9e220000000065113872 + + /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ + .octa 0x00000001da967bda000000005c07ec10 + + /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ + .octa 0x000000006c8983680000000187590924 + + /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ + .octa 0x00000000f2d14c9800000000e35da7c6 + + /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ + .octa 0x00000001993c6ad4000000000415855a + + /* x^238592 mod p(x)` << 1, x^238656 mod 
p(x)` << 1 */ + .octa 0x000000014683d1ac0000000073617758 + + /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ + .octa 0x00000001a7c93e6c0000000176021d28 + + /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ + .octa 0x000000010211e90a00000001c358fd0a + + /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ + .octa 0x000000001119403e00000001ff7a2c18 + + /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ + .octa 0x000000001c3261aa00000000f2d9f7e4 + + /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ + .octa 0x000000014e37a634000000016cf1f9c8 + + /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ + .octa 0x0000000073786c0c000000010af9279a + + /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ + .octa 0x000000011dc037f80000000004f101e8 + + /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ + .octa 0x0000000031433dfc0000000070bcf184 + + /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ + .octa 0x000000009cde8348000000000a8de642 + + /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ + .octa 0x0000000038d3c2a60000000062ea130c + + /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ + .octa 0x000000011b25f26000000001eb31cbb2 + + /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ + .octa 0x000000001629e6f00000000170783448 + + /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ + .octa 0x0000000160838b4c00000001a684b4c6 + + /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */ + .octa 0x000000007a44011c00000000253ca5b4 + + /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ + .octa 0x00000000226f417a0000000057b4b1e2 + + /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ + .octa 0x0000000045eb2eb400000000b6bd084c + + /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ + .octa 0x000000014459d70c0000000123c2d592 + + /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */ + .octa 0x00000001d406ed8200000000159dafce + + /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ + .octa 0x0000000160c8e1a80000000127e1a64e + + /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ + .octa 0x0000000027ba80980000000056860754 + + /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ + .octa 0x000000006d92d01800000001e661aae8 + + /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ + .octa 0x000000012ed7e3f200000000f82c6166 + + /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */ + .octa 0x000000002dc8778800000000c4f9c7ae + + /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */ + .octa 0x0000000018240bb80000000074203d20 + + /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ + .octa 0x000000001ad381580000000198173052 + + /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ + .octa 0x00000001396b78f200000001ce8aba54 + + /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ + .octa 0x000000011a68133400000001850d5d94 + + /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */ + .octa 0x000000012104732e00000001d609239c + + /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ + .octa 0x00000000a140d90c000000001595f048 + + /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ + .octa 0x00000001b7215eda0000000042ccee08 + + /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */ + .octa 0x00000001aaf1df3c000000010a389d74 + + /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ + .octa 0x0000000029d15b8a000000012a840da6 + + /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ + .octa 0x00000000f1a96922000000001d181c0c + + /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ + .octa 0x00000001ac80d03c0000000068b7d1f6 + + /* x^202752 mod p(x)` << 
1, x^202816 mod p(x)` << 1 */ + .octa 0x000000000f11d56a000000005b0f14fc + + /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ + .octa 0x00000001f1c022a20000000179e9e730 + + /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ + .octa 0x0000000173d00ae200000001ce1368d6 + + /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ + .octa 0x00000001d4ffe4ac0000000112c3a84c + + /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ + .octa 0x000000016edc5ae400000000de940fee + + /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ + .octa 0x00000001f1a0214000000000fe896b7e + + /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ + .octa 0x00000000ca0b28a000000001f797431c + + /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ + .octa 0x00000001928e30a20000000053e989ba + + /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ + .octa 0x0000000097b1b002000000003920cd16 + + /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */ + .octa 0x00000000b15bf90600000001e6f579b8 + + /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ + .octa 0x00000000411c5d52000000007493cb0a + + /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ + .octa 0x00000001c36f330000000001bdd376d8 + + /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ + .octa 0x00000001119227e0000000016badfee6 + + /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */ + .octa 0x00000000114d47020000000071de5c58 + + /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ + .octa 0x00000000458b5b9800000000453f317c + + /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ + .octa 0x000000012e31fb8e0000000121675cce + + /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ + .octa 0x000000005cf619d800000001f409ee92 + + /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ + .octa 0x0000000063f4d8b200000000f36b9c88 + + /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ + .octa 0x000000004138dc8a0000000036b398f4 + + /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ + .octa 0x00000001d29ee8e000000001748f9adc + + /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ + .octa 0x000000006a08ace800000001be94ec00 + + /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ + .octa 0x0000000127d4201000000000b74370d6 + + /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */ + .octa 0x0000000019d76b6200000001174d0b98 + + /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ + .octa 0x00000001b1471f6e00000000befc06a4 + + /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ + .octa 0x00000001f64c19cc00000001ae125288 + + /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ + .octa 0x00000000003c0ea00000000095c19b34 + + /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ + .octa 0x000000014d73abf600000001a78496f2 + + /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ + .octa 0x00000001620eb84400000001ac5390a0 + + /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */ + .octa 0x0000000147655048000000002a80ed6e + + /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ + .octa 0x0000000067b5077e00000001fa9b0128 + + /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ + .octa 0x0000000010ffe20600000001ea94929e + + /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ + .octa 0x000000000fee8f1e0000000125f4305c + + /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */ + .octa 0x00000001da26fbae00000001471e2002 + + /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */ + .octa 0x00000001b3a8bd880000000132d2253a + + /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ + .octa 0x00000000e8f3898e00000000f26b3592 + + /* 
x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ + .octa 0x00000000b0d0d28c00000000bc8b67b0 + + /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ + .octa 0x0000000030f2a798000000013a826ef2 + + /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ + .octa 0x000000000fba10020000000081482c84 + + /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ + .octa 0x00000000bdb9bd7200000000e77307c2 + + /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ + .octa 0x0000000075d3bf5a00000000d4a07ec8 + + /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ + .octa 0x00000000ef1f98a00000000017102100 + + /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ + .octa 0x00000000689c760200000000db406486 + + /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ + .octa 0x000000016d5fa5fe0000000192db7f88 + + /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ + .octa 0x00000001d0d2b9ca000000018bf67b1e + + /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */ + .octa 0x0000000041e7b470000000007c09163e + + /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ + .octa 0x00000001cbb6495e000000000adac060 + + /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */ + .octa 0x000000010052a0b000000000bd8316ae + + /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ + .octa 0x00000001d8effb5c000000019f09ab54 + + /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ + .octa 0x00000001d969853c0000000125155542 + + /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ + .octa 0x00000000523ccce2000000018fdb5882 + + /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ + .octa 0x000000001e2436bc00000000e794b3f4 + + /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ + .octa 0x00000000ddd1c3a2000000016f9bb022 + + /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */ + .octa 0x0000000019fcfe3800000000290c9978 + + /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ + .octa 0x00000001ce95db640000000083c0f350 + + /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ + .octa 0x00000000af5828060000000173ea6628 + + /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ + .octa 0x00000001006388f600000001c8b4e00a + + /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ + .octa 0x0000000179eca00a00000000de95d6aa + + /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ + .octa 0x0000000122410a6a000000010b7f7248 + + /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ + .octa 0x000000004288e87c00000001326e3a06 + + /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ + .octa 0x000000016c5490da00000000bb62c2e6 + + /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ + .octa 0x00000000d1c71f6e0000000156a4b2c2 + + /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */ + .octa 0x00000001b4ce08a6000000011dfe763a + + /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ + .octa 0x00000001466ba60c000000007bcca8e2 + + /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */ + .octa 0x00000001f6c488a40000000186118faa + + /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ + .octa 0x000000013bfb06820000000111a65a88 + + /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ + .octa 0x00000000690e9e54000000003565e1c4 + + /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ + .octa 0x00000000281346b6000000012ed02a82 + + /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ + .octa 0x000000015646402400000000c486ecfc + + /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ + .octa 0x000000016063a8dc0000000001b951b2 + + /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ + .octa 
0x0000000116a663620000000048143916 + + /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */ + .octa 0x000000017e8aa4d200000001dc2ae124 + + /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */ + .octa 0x00000001728eb10c00000001416c58d6 + + /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ + .octa 0x00000001b08fd7fa00000000a479744a + + /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ + .octa 0x00000001092a16e80000000096ca3a26 + + /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ + .octa 0x00000000a505637c00000000ff223d4e + + /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ + .octa 0x00000000d94869b2000000010e84da42 + + /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ + .octa 0x00000001c8b203ae00000001b61ba3d0 + + /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ + .octa 0x000000005704aea000000000680f2de8 + + /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ + .octa 0x000000012e295fa2000000008772a9a8 + + /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */ + .octa 0x000000011d0908bc0000000155f295bc + + /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ + .octa 0x0000000193ed97ea00000000595f9282 + + /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ + .octa 0x000000013a0f1c520000000164b1c25a + + /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ + .octa 0x000000010c2c40c000000000fbd67c50 + + /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */ + .octa 0x00000000ff6fac3e0000000096076268 + + /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ + .octa 0x000000017b3609c000000001d288e4cc + + /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */ + .octa 0x0000000088c8c92200000001eaac1bdc + + /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */ + .octa 0x00000001751baae600000001f1ea39e2 + + /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ + .octa 0x000000010795297200000001eb6506fc + + /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ + .octa 0x0000000162b00abe000000010f806ffe + + /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ + .octa 0x000000000d7b404c000000010408481e + + /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ + .octa 0x00000000763b13d40000000188260534 + + /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */ + .octa 0x00000000f6dc22d80000000058fc73e0 + + /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ + .octa 0x000000007daae06000000000391c59b8 + + /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ + .octa 0x000000013359ab7c000000018b638400 + + /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ + .octa 0x000000008add438a000000011738f5c4 + + /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ + .octa 0x00000001edbefdea000000008cf7c6da + + /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ + .octa 0x000000004104e0f800000001ef97fb16 + + /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ + .octa 0x00000000b48a82220000000102130e20 + + /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */ + .octa 0x00000001bcb4684400000000db968898 + + /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ + .octa 0x000000013293ce0a00000000b5047b5e + + /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */ + .octa 0x00000001710d0844000000010b90fdb2 + + /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ + .octa 0x0000000117907f6e000000004834a32e + + /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ + .octa 0x0000000087ddf93e0000000059c8f2b0 + + /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ + .octa 0x000000005970e9b00000000122cec508 + + /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ 
+ .octa 0x0000000185b2b7d0000000000a330cda + + /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ + .octa 0x00000001dcee0efc000000014a47148c + + /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ + .octa 0x0000000030da27220000000042c61cb8 + + /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */ + .octa 0x000000012f925a180000000012fe6960 + + /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ + .octa 0x00000000dd2e357c00000000dbda2c20 + + /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ + .octa 0x00000000071c80de000000011122410c + + /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */ + .octa 0x000000011513140a00000000977b2070 + + /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ + .octa 0x00000001df876e8e000000014050438e + + /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ + .octa 0x000000015f81d6ce0000000147c840e8 + + /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ + .octa 0x000000019dd94dbe00000001cc7c88ce + + /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */ + .octa 0x00000001373d206e00000001476b35a4 + + /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ + .octa 0x00000000668ccade000000013d52d508 + + /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ + .octa 0x00000001b192d268000000008e4be32e + + /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ + .octa 0x00000000e30f3a7800000000024120fe + + /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ + .octa 0x000000010ef1f7bc00000000ddecddb4 + + /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ + .octa 0x00000001f5ac738000000000d4d403bc + + /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */ + .octa 0x000000011822ea7000000001734b89aa + + /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ + .octa 0x00000000c3a33848000000010e7a58d6 + + /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ + .octa 0x00000001bd151c2400000001f9f04e9c + + /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ + .octa 0x0000000056002d7600000000b692225e + + /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */ + .octa 0x000000014657c4f4000000019b8d3f3e + + /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ + .octa 0x0000000113742d7c00000001a874f11e + + /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ + .octa 0x000000019c5920ba000000010d5a4254 + + /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ + .octa 0x000000005216d2d600000000bbb2f5d6 + + /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ + .octa 0x0000000136f5ad8a0000000179cc0e36 + + /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ + .octa 0x000000018b07beb600000001dca1da4a + + /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */ + .octa 0x00000000db1e93b000000000feb1a192 + + /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ + .octa 0x000000000b96fa3a00000000d1eeedd6 + + /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ + .octa 0x00000001d9968af0000000008fad9bb4 + + /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ + .octa 0x000000000e4a77a200000001884938e4 + + /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ + .octa 0x00000000508c2ac800000001bc2e9bc0 + + /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ + .octa 0x0000000021572a8000000001f9658a68 + + /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ + .octa 0x00000001b859daf2000000001b9224fc + + /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ + .octa 0x000000016f7884740000000055b2fb84 + + /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ + .octa 0x00000001b438810e000000018b090348 + + /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ + .octa 0x0000000095ddc6f2000000011ccbd5ea + + /* 
x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */ + .octa 0x00000001d977c20c0000000007ae47f8 + + /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ + .octa 0x00000000ebedb99a0000000172acbec0 + + /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ + .octa 0x00000001df9e9e9200000001c6e3ff20 + + /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ + .octa 0x00000001a4a3f95200000000e1b38744 + + /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */ + .octa 0x00000000e2f5122000000000791585b2 + + /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */ + .octa 0x000000004aa01f3e00000000ac53b894 + + /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ + .octa 0x00000000b3e90a5800000001ed5f2cf4 + + /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ + .octa 0x000000000c9ca2aa00000001df48b2e0 + + /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ + .octa 0x000000015168231600000000049c1c62 + + /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ + .octa 0x0000000036fce78c000000017c460c12 + + /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */ + .octa 0x000000009037dc10000000015be4da7e + + /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */ + .octa 0x00000000d3298582000000010f38f668 + + /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ + .octa 0x00000001b42e8ad60000000039f40a00 + + /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ + .octa 0x00000000142a983800000000bd4c10c4 + + /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */ + .octa 0x0000000109c7f1900000000042db1d98 + + /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ + .octa 0x0000000056ff931000000001c905bae6 + + /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ + .octa 0x00000001594513aa00000000069d40ea + + /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ + .octa 0x00000001e3b5b1e8000000008e4fbad0 + + /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ + .octa 0x000000011dd5fc080000000047bedd46 + + /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ + .octa 0x00000001675f0cc20000000026396bf8 + + /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */ + .octa 0x00000000d1c8dd4400000000379beb92 + + /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ + .octa 0x0000000115ebd3d8000000000abae54a + + /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ + .octa 0x00000001ecbd0dac0000000007e6a128 + + /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */ + .octa 0x00000000cdf67af2000000000ade29d2 + + /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ + .octa 0x000000004c01ff4c00000000f974c45c + + /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */ + .octa 0x00000000f2d8657e00000000e77ac60a + + /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ + .octa 0x000000006bae74c40000000145895816 + + /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ + .octa 0x0000000152af8aa00000000038e362be + + /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ + .octa 0x0000000004663802000000007f991a64 + + /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */ + .octa 0x00000001ab2f5afc00000000fa366d3a + + /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */ + .octa 0x0000000074a4ebd400000001a2bb34f0 + + /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ + .octa 0x00000001d7ab3a4c0000000028a9981e + + /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ + .octa 0x00000001a8da60c600000001dbc672be + + /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ + .octa 0x000000013cf6382000000000b04d77f6 + + /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ + .octa 0x00000000bec12e1e0000000124400d96 + + /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ + 
.octa 0x00000001c6368010000000014ca4b414 + + /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ + .octa 0x00000001e6e78758000000012fe2c938 + + /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ + .octa 0x000000008d7f2b3c00000001faed01e6 + + /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ + .octa 0x000000016b4a156e000000007e80ecfe + + /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ + .octa 0x00000001c63cfeb60000000098daee94 + + /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */ + .octa 0x000000015f902670000000010a04edea + + /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ + .octa 0x00000001cd5de11e00000001c00b4524 + + /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */ + .octa 0x000000001acaec540000000170296550 + + /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ + .octa 0x000000002bd0ca780000000181afaa48 + + /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ + .octa 0x0000000032d63d5c0000000185a31ffa + + /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */ + .octa 0x000000001c6d4e4c000000002469f608 + + /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ + .octa 0x0000000106a60b92000000006980102a + + /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */ + .octa 0x00000000d3855e120000000111ea9ca8 + + /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ + .octa 0x00000000e312563600000001bd1d29ce + + /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ + .octa 0x000000009e8f7ea400000001b34b9580 + + /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */ + .octa 0x00000001c82e562c000000003076054e + + /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ + .octa 0x00000000ca9f09ce000000012a608ea4 + + /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ + .octa 0x00000000c63764e600000000784d05fe + + /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ + .octa 0x0000000168d2e49e000000016ef0d82a + + /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ + .octa 0x00000000e986c1480000000075bda454 + + /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ + .octa 0x00000000cfb65894000000003dc0a1c4 + + /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ + .octa 0x0000000111cadee400000000e9a5d8be + + /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ + .octa 0x0000000171fb63ce00000001609bc4b4 + +.short_constants: + + /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */ + /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */ + .octa 0x7fec2963e5bf80485cf015c388e56f72 + + /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */ + .octa 0x38e888d4844752a9963a18920246e2e6 + + /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */ + .octa 0x42316c00730206ad419a441956993a31 + + /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */ + .octa 0x543d5c543e65ddf9924752ba2b830011 + + /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */ + .octa 0x78e87aaf56767c9255bd7f9518e4a304 + + /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */ + .octa 0x8f68fcec1903da7f6d76739fe0553f1e + + /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */ + .octa 0x3f4840246791d588c133722b1fe0b5c3 + + /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */ + .octa 0x34c96751b04de25a64b67ee0e55ef1f3 + + /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */ + .octa 0x156c8e180b4a395b069db049b8fdb1e7 + + /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */ + .octa 
0xe0b99ccbe661f7bea11bfaf3c9e90b9e + + /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */ + .octa 0x041d37768cd75659817cdc5119b29a35 + + /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */ + .octa 0x3a0777818cfaa9651ce9d94b36c41f1c + + /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */ + .octa 0x0e148e8252377a554f256efcb82be955 + + /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */ + .octa 0x9c25531d19e65ddeec1631edb2dea967 + + /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */ + .octa 0x790606ff9957c0a65d27e147510ac59a + + /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */ + .octa 0x82f63b786ea2d55ca66805eb18b8ea18 + + +.barrett_constants: + /* 33 bit reflected Barrett constant m - (4^32)/n */ + .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */ + /* 33 bit reflected Barrett constant n */ + .octa 0x00000000000000000000000105ec76f1 + +#define CRC_FUNCTION_NAME __crc32c_vpmsum +#define REFLECT +#include "crc-vpmsum-template.S" diff --git a/lib/crc/powerpc/crct10dif-vpmsum_asm.S b/lib/crc/powerpc/crct10dif-vpmsum_asm.S new file mode 100644 index 000000000000..47a6266d89a8 --- /dev/null +++ b/lib/crc/powerpc/crct10dif-vpmsum_asm.S @@ -0,0 +1,845 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Calculate a CRC T10DIF with vpmsum acceleration + * + * Constants generated by crc32-vpmsum, available at + * https://github.com/antonblanchard/crc32-vpmsum + * + * crc32-vpmsum is + * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM + */ + .section .rodata +.balign 16 + +.byteswap_constant: + /* byte reverse permute constant */ + .octa 0x0F0E0D0C0B0A09080706050403020100 + +.constants: + + /* Reduce 262144 kbits to 1024 bits */ + /* x^261184 mod p(x), x^261120 mod p(x) */ + .octa 0x0000000056d300000000000052550000 + + /* x^260160 mod p(x), x^260096 mod p(x) */ + .octa 0x00000000ee67000000000000a1e40000 + + /* x^259136 mod p(x), x^259072 mod p(x) */ + .octa 0x0000000060830000000000004ad10000 + + /* x^258112 mod p(x), x^258048 mod p(x) */ + .octa 0x000000008cfe0000000000009ab40000 + + /* x^257088 mod p(x), x^257024 mod p(x) */ + .octa 0x000000003e93000000000000fdb50000 + + /* x^256064 mod p(x), x^256000 mod p(x) */ + .octa 0x000000003c2000000000000045480000 + + /* x^255040 mod p(x), x^254976 mod p(x) */ + .octa 0x00000000b1fc0000000000008d690000 + + /* x^254016 mod p(x), x^253952 mod p(x) */ + .octa 0x00000000f82b00000000000024ad0000 + + /* x^252992 mod p(x), x^252928 mod p(x) */ + .octa 0x0000000044420000000000009f1a0000 + + /* x^251968 mod p(x), x^251904 mod p(x) */ + .octa 0x00000000e88c00000000000066ec0000 + + /* x^250944 mod p(x), x^250880 mod p(x) */ + .octa 0x00000000385c000000000000c87d0000 + + /* x^249920 mod p(x), x^249856 mod p(x) */ + .octa 0x000000003227000000000000c8ff0000 + + /* x^248896 mod p(x), x^248832 mod p(x) */ + .octa 0x00000000a9a900000000000033440000 + + /* x^247872 mod p(x), x^247808 mod p(x) */ + .octa 0x00000000abaa00000000000066eb0000 + + /* x^246848 mod p(x), x^246784 mod p(x) */ + .octa 0x000000001ac3000000000000c4ef0000 + + /* x^245824 mod p(x), x^245760 mod p(x) */ + .octa 0x0000000063f000000000000056f30000 + + /* x^244800 mod p(x), x^244736 mod p(x) */ + .octa 0x0000000032cc00000000000002050000 + + /* x^243776 mod p(x), x^243712 mod p(x) */ + .octa 0x00000000f8b5000000000000568e0000 + + /* x^242752 mod p(x), x^242688 mod p(x) */ + .octa 0x000000008db100000000000064290000 + + /* x^241728 mod p(x), x^241664 mod p(x) 
*/ + .octa 0x0000000059ca0000000000006b660000 + + /* x^240704 mod p(x), x^240640 mod p(x) */ + .octa 0x000000005f5c00000000000018f80000 + + /* x^239680 mod p(x), x^239616 mod p(x) */ + .octa 0x0000000061af000000000000b6090000 + + /* x^238656 mod p(x), x^238592 mod p(x) */ + .octa 0x00000000e29e000000000000099a0000 + + /* x^237632 mod p(x), x^237568 mod p(x) */ + .octa 0x000000000975000000000000a8360000 + + /* x^236608 mod p(x), x^236544 mod p(x) */ + .octa 0x0000000043900000000000004f570000 + + /* x^235584 mod p(x), x^235520 mod p(x) */ + .octa 0x00000000f9cd000000000000134c0000 + + /* x^234560 mod p(x), x^234496 mod p(x) */ + .octa 0x000000007c29000000000000ec380000 + + /* x^233536 mod p(x), x^233472 mod p(x) */ + .octa 0x000000004c6a000000000000b0d10000 + + /* x^232512 mod p(x), x^232448 mod p(x) */ + .octa 0x00000000e7290000000000007d3e0000 + + /* x^231488 mod p(x), x^231424 mod p(x) */ + .octa 0x00000000f1ab000000000000f0b20000 + + /* x^230464 mod p(x), x^230400 mod p(x) */ + .octa 0x0000000039db0000000000009c270000 + + /* x^229440 mod p(x), x^229376 mod p(x) */ + .octa 0x000000005e2800000000000092890000 + + /* x^228416 mod p(x), x^228352 mod p(x) */ + .octa 0x00000000d44e000000000000d5ee0000 + + /* x^227392 mod p(x), x^227328 mod p(x) */ + .octa 0x00000000cd0a00000000000041f50000 + + /* x^226368 mod p(x), x^226304 mod p(x) */ + .octa 0x00000000c5b400000000000010520000 + + /* x^225344 mod p(x), x^225280 mod p(x) */ + .octa 0x00000000fd2100000000000042170000 + + /* x^224320 mod p(x), x^224256 mod p(x) */ + .octa 0x000000002f2500000000000095c20000 + + /* x^223296 mod p(x), x^223232 mod p(x) */ + .octa 0x000000001b0100000000000001ce0000 + + /* x^222272 mod p(x), x^222208 mod p(x) */ + .octa 0x000000000d430000000000002aca0000 + + /* x^221248 mod p(x), x^221184 mod p(x) */ + .octa 0x0000000030a6000000000000385e0000 + + /* x^220224 mod p(x), x^220160 mod p(x) */ + .octa 0x00000000e37b0000000000006f7a0000 + + /* x^219200 mod p(x), x^219136 mod p(x) */ + .octa 0x00000000873600000000000024320000 + + /* x^218176 mod p(x), x^218112 mod p(x) */ + .octa 0x00000000e9fb000000000000bd9c0000 + + /* x^217152 mod p(x), x^217088 mod p(x) */ + .octa 0x000000003b9500000000000054bc0000 + + /* x^216128 mod p(x), x^216064 mod p(x) */ + .octa 0x00000000133e000000000000a4660000 + + /* x^215104 mod p(x), x^215040 mod p(x) */ + .octa 0x00000000784500000000000079930000 + + /* x^214080 mod p(x), x^214016 mod p(x) */ + .octa 0x00000000b9800000000000001bb80000 + + /* x^213056 mod p(x), x^212992 mod p(x) */ + .octa 0x00000000687600000000000024400000 + + /* x^212032 mod p(x), x^211968 mod p(x) */ + .octa 0x00000000aff300000000000029e10000 + + /* x^211008 mod p(x), x^210944 mod p(x) */ + .octa 0x0000000024b50000000000005ded0000 + + /* x^209984 mod p(x), x^209920 mod p(x) */ + .octa 0x0000000017e8000000000000b12e0000 + + /* x^208960 mod p(x), x^208896 mod p(x) */ + .octa 0x00000000128400000000000026d20000 + + /* x^207936 mod p(x), x^207872 mod p(x) */ + .octa 0x000000002115000000000000a32a0000 + + /* x^206912 mod p(x), x^206848 mod p(x) */ + .octa 0x000000009595000000000000a1210000 + + /* x^205888 mod p(x), x^205824 mod p(x) */ + .octa 0x00000000281e000000000000ee8b0000 + + /* x^204864 mod p(x), x^204800 mod p(x) */ + .octa 0x0000000006010000000000003d0d0000 + + /* x^203840 mod p(x), x^203776 mod p(x) */ + .octa 0x00000000e2b600000000000034e90000 + + /* x^202816 mod p(x), x^202752 mod p(x) */ + .octa 0x000000001bd40000000000004cdb0000 + + /* x^201792 mod p(x), x^201728 mod p(x) */ + .octa 
0x00000000df2800000000000030e90000 + + /* x^200768 mod p(x), x^200704 mod p(x) */ + .octa 0x0000000049c200000000000042590000 + + /* x^199744 mod p(x), x^199680 mod p(x) */ + .octa 0x000000009b97000000000000df950000 + + /* x^198720 mod p(x), x^198656 mod p(x) */ + .octa 0x000000006184000000000000da7b0000 + + /* x^197696 mod p(x), x^197632 mod p(x) */ + .octa 0x00000000461700000000000012510000 + + /* x^196672 mod p(x), x^196608 mod p(x) */ + .octa 0x000000009b40000000000000f37e0000 + + /* x^195648 mod p(x), x^195584 mod p(x) */ + .octa 0x00000000eeb2000000000000ecf10000 + + /* x^194624 mod p(x), x^194560 mod p(x) */ + .octa 0x00000000b2e800000000000050f20000 + + /* x^193600 mod p(x), x^193536 mod p(x) */ + .octa 0x00000000f59a000000000000e0b30000 + + /* x^192576 mod p(x), x^192512 mod p(x) */ + .octa 0x00000000467f0000000000004d5a0000 + + /* x^191552 mod p(x), x^191488 mod p(x) */ + .octa 0x00000000da92000000000000bb010000 + + /* x^190528 mod p(x), x^190464 mod p(x) */ + .octa 0x000000001e1000000000000022a40000 + + /* x^189504 mod p(x), x^189440 mod p(x) */ + .octa 0x0000000058fe000000000000836f0000 + + /* x^188480 mod p(x), x^188416 mod p(x) */ + .octa 0x00000000b9ce000000000000d78d0000 + + /* x^187456 mod p(x), x^187392 mod p(x) */ + .octa 0x0000000022210000000000004f8d0000 + + /* x^186432 mod p(x), x^186368 mod p(x) */ + .octa 0x00000000744600000000000033760000 + + /* x^185408 mod p(x), x^185344 mod p(x) */ + .octa 0x000000001c2e000000000000a1e50000 + + /* x^184384 mod p(x), x^184320 mod p(x) */ + .octa 0x00000000dcc8000000000000a1a40000 + + /* x^183360 mod p(x), x^183296 mod p(x) */ + .octa 0x00000000910f00000000000019a20000 + + /* x^182336 mod p(x), x^182272 mod p(x) */ + .octa 0x0000000055d5000000000000f6ae0000 + + /* x^181312 mod p(x), x^181248 mod p(x) */ + .octa 0x00000000c8ba000000000000a7ac0000 + + /* x^180288 mod p(x), x^180224 mod p(x) */ + .octa 0x0000000031f8000000000000eea20000 + + /* x^179264 mod p(x), x^179200 mod p(x) */ + .octa 0x000000001966000000000000c4d90000 + + /* x^178240 mod p(x), x^178176 mod p(x) */ + .octa 0x00000000b9810000000000002b470000 + + /* x^177216 mod p(x), x^177152 mod p(x) */ + .octa 0x000000008303000000000000f7cf0000 + + /* x^176192 mod p(x), x^176128 mod p(x) */ + .octa 0x000000002ce500000000000035b30000 + + /* x^175168 mod p(x), x^175104 mod p(x) */ + .octa 0x000000002fae0000000000000c7c0000 + + /* x^174144 mod p(x), x^174080 mod p(x) */ + .octa 0x00000000f50c0000000000009edf0000 + + /* x^173120 mod p(x), x^173056 mod p(x) */ + .octa 0x00000000714f00000000000004cd0000 + + /* x^172096 mod p(x), x^172032 mod p(x) */ + .octa 0x00000000c161000000000000541b0000 + + /* x^171072 mod p(x), x^171008 mod p(x) */ + .octa 0x0000000021c8000000000000e2700000 + + /* x^170048 mod p(x), x^169984 mod p(x) */ + .octa 0x00000000b93d00000000000009a60000 + + /* x^169024 mod p(x), x^168960 mod p(x) */ + .octa 0x00000000fbcf000000000000761c0000 + + /* x^168000 mod p(x), x^167936 mod p(x) */ + .octa 0x0000000026350000000000009db30000 + + /* x^166976 mod p(x), x^166912 mod p(x) */ + .octa 0x00000000b64f0000000000003e9f0000 + + /* x^165952 mod p(x), x^165888 mod p(x) */ + .octa 0x00000000bd0e00000000000078590000 + + /* x^164928 mod p(x), x^164864 mod p(x) */ + .octa 0x00000000d9360000000000008bc80000 + + /* x^163904 mod p(x), x^163840 mod p(x) */ + .octa 0x000000002f140000000000008c9f0000 + + /* x^162880 mod p(x), x^162816 mod p(x) */ + .octa 0x000000006a270000000000006af70000 + + /* x^161856 mod p(x), x^161792 mod p(x) */ + .octa 0x000000006685000000000000e5210000 + + /* 
x^160832 mod p(x), x^160768 mod p(x) */ + .octa 0x0000000062da00000000000008290000 + + /* x^159808 mod p(x), x^159744 mod p(x) */ + .octa 0x00000000bb4b000000000000e4d00000 + + /* x^158784 mod p(x), x^158720 mod p(x) */ + .octa 0x00000000d2490000000000004ae10000 + + /* x^157760 mod p(x), x^157696 mod p(x) */ + .octa 0x00000000c85b00000000000000e70000 + + /* x^156736 mod p(x), x^156672 mod p(x) */ + .octa 0x00000000c37a00000000000015650000 + + /* x^155712 mod p(x), x^155648 mod p(x) */ + .octa 0x0000000018530000000000001c2f0000 + + /* x^154688 mod p(x), x^154624 mod p(x) */ + .octa 0x00000000b46600000000000037bd0000 + + /* x^153664 mod p(x), x^153600 mod p(x) */ + .octa 0x00000000439b00000000000012190000 + + /* x^152640 mod p(x), x^152576 mod p(x) */ + .octa 0x00000000b1260000000000005ece0000 + + /* x^151616 mod p(x), x^151552 mod p(x) */ + .octa 0x00000000d8110000000000002a5e0000 + + /* x^150592 mod p(x), x^150528 mod p(x) */ + .octa 0x00000000099f00000000000052330000 + + /* x^149568 mod p(x), x^149504 mod p(x) */ + .octa 0x00000000f9f9000000000000f9120000 + + /* x^148544 mod p(x), x^148480 mod p(x) */ + .octa 0x000000005cc00000000000000ddc0000 + + /* x^147520 mod p(x), x^147456 mod p(x) */ + .octa 0x00000000343b00000000000012200000 + + /* x^146496 mod p(x), x^146432 mod p(x) */ + .octa 0x000000009222000000000000d12b0000 + + /* x^145472 mod p(x), x^145408 mod p(x) */ + .octa 0x00000000d781000000000000eb2d0000 + + /* x^144448 mod p(x), x^144384 mod p(x) */ + .octa 0x000000000bf400000000000058970000 + + /* x^143424 mod p(x), x^143360 mod p(x) */ + .octa 0x00000000094200000000000013690000 + + /* x^142400 mod p(x), x^142336 mod p(x) */ + .octa 0x00000000d55100000000000051950000 + + /* x^141376 mod p(x), x^141312 mod p(x) */ + .octa 0x000000008f11000000000000954b0000 + + /* x^140352 mod p(x), x^140288 mod p(x) */ + .octa 0x00000000140f000000000000b29e0000 + + /* x^139328 mod p(x), x^139264 mod p(x) */ + .octa 0x00000000c6db000000000000db5d0000 + + /* x^138304 mod p(x), x^138240 mod p(x) */ + .octa 0x00000000715b000000000000dfaf0000 + + /* x^137280 mod p(x), x^137216 mod p(x) */ + .octa 0x000000000dea000000000000e3b60000 + + /* x^136256 mod p(x), x^136192 mod p(x) */ + .octa 0x000000006f94000000000000ddaf0000 + + /* x^135232 mod p(x), x^135168 mod p(x) */ + .octa 0x0000000024e1000000000000e4f70000 + + /* x^134208 mod p(x), x^134144 mod p(x) */ + .octa 0x000000008810000000000000aa110000 + + /* x^133184 mod p(x), x^133120 mod p(x) */ + .octa 0x0000000030c2000000000000a8e60000 + + /* x^132160 mod p(x), x^132096 mod p(x) */ + .octa 0x00000000e6d0000000000000ccf30000 + + /* x^131136 mod p(x), x^131072 mod p(x) */ + .octa 0x000000004da000000000000079bf0000 + + /* x^130112 mod p(x), x^130048 mod p(x) */ + .octa 0x000000007759000000000000b3a30000 + + /* x^129088 mod p(x), x^129024 mod p(x) */ + .octa 0x00000000597400000000000028790000 + + /* x^128064 mod p(x), x^128000 mod p(x) */ + .octa 0x000000007acd000000000000b5820000 + + /* x^127040 mod p(x), x^126976 mod p(x) */ + .octa 0x00000000e6e400000000000026ad0000 + + /* x^126016 mod p(x), x^125952 mod p(x) */ + .octa 0x000000006d49000000000000985b0000 + + /* x^124992 mod p(x), x^124928 mod p(x) */ + .octa 0x000000000f0800000000000011520000 + + /* x^123968 mod p(x), x^123904 mod p(x) */ + .octa 0x000000002c7f000000000000846c0000 + + /* x^122944 mod p(x), x^122880 mod p(x) */ + .octa 0x000000005ce7000000000000ae1d0000 + + /* x^121920 mod p(x), x^121856 mod p(x) */ + .octa 0x00000000d4cb000000000000e21d0000 + + /* x^120896 mod p(x), x^120832 mod p(x) */ + 
.octa 0x000000003a2300000000000019bb0000 + + /* x^119872 mod p(x), x^119808 mod p(x) */ + .octa 0x000000000e1700000000000095290000 + + /* x^118848 mod p(x), x^118784 mod p(x) */ + .octa 0x000000006e6400000000000050d20000 + + /* x^117824 mod p(x), x^117760 mod p(x) */ + .octa 0x000000008d5c0000000000000cd10000 + + /* x^116800 mod p(x), x^116736 mod p(x) */ + .octa 0x00000000ef310000000000007b570000 + + /* x^115776 mod p(x), x^115712 mod p(x) */ + .octa 0x00000000645d00000000000053d60000 + + /* x^114752 mod p(x), x^114688 mod p(x) */ + .octa 0x0000000018fc00000000000077510000 + + /* x^113728 mod p(x), x^113664 mod p(x) */ + .octa 0x000000000cb3000000000000a7b70000 + + /* x^112704 mod p(x), x^112640 mod p(x) */ + .octa 0x00000000991b000000000000d0780000 + + /* x^111680 mod p(x), x^111616 mod p(x) */ + .octa 0x00000000845a000000000000be3c0000 + + /* x^110656 mod p(x), x^110592 mod p(x) */ + .octa 0x00000000d3a9000000000000df020000 + + /* x^109632 mod p(x), x^109568 mod p(x) */ + .octa 0x0000000017d7000000000000063e0000 + + /* x^108608 mod p(x), x^108544 mod p(x) */ + .octa 0x000000007a860000000000008ab40000 + + /* x^107584 mod p(x), x^107520 mod p(x) */ + .octa 0x00000000fd7c000000000000c7bd0000 + + /* x^106560 mod p(x), x^106496 mod p(x) */ + .octa 0x00000000a56b000000000000efd60000 + + /* x^105536 mod p(x), x^105472 mod p(x) */ + .octa 0x0000000010e400000000000071380000 + + /* x^104512 mod p(x), x^104448 mod p(x) */ + .octa 0x00000000994500000000000004d30000 + + /* x^103488 mod p(x), x^103424 mod p(x) */ + .octa 0x00000000b83c0000000000003b0e0000 + + /* x^102464 mod p(x), x^102400 mod p(x) */ + .octa 0x00000000d6c10000000000008b020000 + + /* x^101440 mod p(x), x^101376 mod p(x) */ + .octa 0x000000009efc000000000000da940000 + + /* x^100416 mod p(x), x^100352 mod p(x) */ + .octa 0x000000005e87000000000000f9f70000 + + /* x^99392 mod p(x), x^99328 mod p(x) */ + .octa 0x000000006c9b00000000000045e40000 + + /* x^98368 mod p(x), x^98304 mod p(x) */ + .octa 0x00000000178a00000000000083940000 + + /* x^97344 mod p(x), x^97280 mod p(x) */ + .octa 0x00000000f0c8000000000000f0a00000 + + /* x^96320 mod p(x), x^96256 mod p(x) */ + .octa 0x00000000f699000000000000b74b0000 + + /* x^95296 mod p(x), x^95232 mod p(x) */ + .octa 0x00000000316d000000000000c1cf0000 + + /* x^94272 mod p(x), x^94208 mod p(x) */ + .octa 0x00000000987e00000000000072680000 + + /* x^93248 mod p(x), x^93184 mod p(x) */ + .octa 0x00000000acff000000000000e0ab0000 + + /* x^92224 mod p(x), x^92160 mod p(x) */ + .octa 0x00000000a1f6000000000000c5a80000 + + /* x^91200 mod p(x), x^91136 mod p(x) */ + .octa 0x0000000061bd000000000000cf690000 + + /* x^90176 mod p(x), x^90112 mod p(x) */ + .octa 0x00000000c9f2000000000000cbcc0000 + + /* x^89152 mod p(x), x^89088 mod p(x) */ + .octa 0x000000005a33000000000000de050000 + + /* x^88128 mod p(x), x^88064 mod p(x) */ + .octa 0x00000000e416000000000000ccd70000 + + /* x^87104 mod p(x), x^87040 mod p(x) */ + .octa 0x0000000058930000000000002f670000 + + /* x^86080 mod p(x), x^86016 mod p(x) */ + .octa 0x00000000a9d3000000000000152f0000 + + /* x^85056 mod p(x), x^84992 mod p(x) */ + .octa 0x00000000c114000000000000ecc20000 + + /* x^84032 mod p(x), x^83968 mod p(x) */ + .octa 0x00000000b9270000000000007c890000 + + /* x^83008 mod p(x), x^82944 mod p(x) */ + .octa 0x000000002e6000000000000006ee0000 + + /* x^81984 mod p(x), x^81920 mod p(x) */ + .octa 0x00000000dfc600000000000009100000 + + /* x^80960 mod p(x), x^80896 mod p(x) */ + .octa 0x000000004911000000000000ad4e0000 + + /* x^79936 mod p(x), x^79872 mod p(x) 
*/ + .octa 0x00000000ae1b000000000000b04d0000 + + /* x^78912 mod p(x), x^78848 mod p(x) */ + .octa 0x0000000005fa000000000000e9900000 + + /* x^77888 mod p(x), x^77824 mod p(x) */ + .octa 0x0000000004a1000000000000cc6f0000 + + /* x^76864 mod p(x), x^76800 mod p(x) */ + .octa 0x00000000af73000000000000ed110000 + + /* x^75840 mod p(x), x^75776 mod p(x) */ + .octa 0x0000000082530000000000008f7e0000 + + /* x^74816 mod p(x), x^74752 mod p(x) */ + .octa 0x00000000cfdc000000000000594f0000 + + /* x^73792 mod p(x), x^73728 mod p(x) */ + .octa 0x00000000a6b6000000000000a8750000 + + /* x^72768 mod p(x), x^72704 mod p(x) */ + .octa 0x00000000fd76000000000000aa0c0000 + + /* x^71744 mod p(x), x^71680 mod p(x) */ + .octa 0x0000000006f500000000000071db0000 + + /* x^70720 mod p(x), x^70656 mod p(x) */ + .octa 0x0000000037ca000000000000ab0c0000 + + /* x^69696 mod p(x), x^69632 mod p(x) */ + .octa 0x00000000d7ab000000000000b7a00000 + + /* x^68672 mod p(x), x^68608 mod p(x) */ + .octa 0x00000000440800000000000090d30000 + + /* x^67648 mod p(x), x^67584 mod p(x) */ + .octa 0x00000000186100000000000054730000 + + /* x^66624 mod p(x), x^66560 mod p(x) */ + .octa 0x000000007368000000000000a3a20000 + + /* x^65600 mod p(x), x^65536 mod p(x) */ + .octa 0x0000000026d0000000000000f9040000 + + /* x^64576 mod p(x), x^64512 mod p(x) */ + .octa 0x00000000fe770000000000009c0a0000 + + /* x^63552 mod p(x), x^63488 mod p(x) */ + .octa 0x000000002cba000000000000d1e70000 + + /* x^62528 mod p(x), x^62464 mod p(x) */ + .octa 0x00000000f8bd0000000000005ac10000 + + /* x^61504 mod p(x), x^61440 mod p(x) */ + .octa 0x000000007372000000000000d68d0000 + + /* x^60480 mod p(x), x^60416 mod p(x) */ + .octa 0x00000000f37f00000000000089f60000 + + /* x^59456 mod p(x), x^59392 mod p(x) */ + .octa 0x00000000078400000000000008a90000 + + /* x^58432 mod p(x), x^58368 mod p(x) */ + .octa 0x00000000d3e400000000000042360000 + + /* x^57408 mod p(x), x^57344 mod p(x) */ + .octa 0x00000000eba800000000000092d50000 + + /* x^56384 mod p(x), x^56320 mod p(x) */ + .octa 0x00000000afbe000000000000b4d50000 + + /* x^55360 mod p(x), x^55296 mod p(x) */ + .octa 0x00000000d8ca000000000000c9060000 + + /* x^54336 mod p(x), x^54272 mod p(x) */ + .octa 0x00000000c2d00000000000008f4f0000 + + /* x^53312 mod p(x), x^53248 mod p(x) */ + .octa 0x00000000373200000000000028690000 + + /* x^52288 mod p(x), x^52224 mod p(x) */ + .octa 0x0000000046ae000000000000c3b30000 + + /* x^51264 mod p(x), x^51200 mod p(x) */ + .octa 0x00000000b243000000000000f8700000 + + /* x^50240 mod p(x), x^50176 mod p(x) */ + .octa 0x00000000f7f500000000000029eb0000 + + /* x^49216 mod p(x), x^49152 mod p(x) */ + .octa 0x000000000c7e000000000000fe730000 + + /* x^48192 mod p(x), x^48128 mod p(x) */ + .octa 0x00000000c38200000000000096000000 + + /* x^47168 mod p(x), x^47104 mod p(x) */ + .octa 0x000000008956000000000000683c0000 + + /* x^46144 mod p(x), x^46080 mod p(x) */ + .octa 0x00000000422d0000000000005f1e0000 + + /* x^45120 mod p(x), x^45056 mod p(x) */ + .octa 0x00000000ac0f0000000000006f810000 + + /* x^44096 mod p(x), x^44032 mod p(x) */ + .octa 0x00000000ce30000000000000031f0000 + + /* x^43072 mod p(x), x^43008 mod p(x) */ + .octa 0x000000003d43000000000000455a0000 + + /* x^42048 mod p(x), x^41984 mod p(x) */ + .octa 0x000000007ebe000000000000a6050000 + + /* x^41024 mod p(x), x^40960 mod p(x) */ + .octa 0x00000000976e00000000000077eb0000 + + /* x^40000 mod p(x), x^39936 mod p(x) */ + .octa 0x000000000872000000000000389c0000 + + /* x^38976 mod p(x), x^38912 mod p(x) */ + .octa 
0x000000008979000000000000c7b20000 + + /* x^37952 mod p(x), x^37888 mod p(x) */ + .octa 0x000000005c1e0000000000001d870000 + + /* x^36928 mod p(x), x^36864 mod p(x) */ + .octa 0x00000000aebb00000000000045810000 + + /* x^35904 mod p(x), x^35840 mod p(x) */ + .octa 0x000000004f7e0000000000006d4a0000 + + /* x^34880 mod p(x), x^34816 mod p(x) */ + .octa 0x00000000ea98000000000000b9200000 + + /* x^33856 mod p(x), x^33792 mod p(x) */ + .octa 0x00000000f39600000000000022f20000 + + /* x^32832 mod p(x), x^32768 mod p(x) */ + .octa 0x000000000bc500000000000041ca0000 + + /* x^31808 mod p(x), x^31744 mod p(x) */ + .octa 0x00000000786400000000000078500000 + + /* x^30784 mod p(x), x^30720 mod p(x) */ + .octa 0x00000000be970000000000009e7e0000 + + /* x^29760 mod p(x), x^29696 mod p(x) */ + .octa 0x00000000dd6d000000000000a53c0000 + + /* x^28736 mod p(x), x^28672 mod p(x) */ + .octa 0x000000004c3f00000000000039340000 + + /* x^27712 mod p(x), x^27648 mod p(x) */ + .octa 0x0000000093a4000000000000b58e0000 + + /* x^26688 mod p(x), x^26624 mod p(x) */ + .octa 0x0000000050fb00000000000062d40000 + + /* x^25664 mod p(x), x^25600 mod p(x) */ + .octa 0x00000000f505000000000000a26f0000 + + /* x^24640 mod p(x), x^24576 mod p(x) */ + .octa 0x0000000064f900000000000065e60000 + + /* x^23616 mod p(x), x^23552 mod p(x) */ + .octa 0x00000000e8c2000000000000aad90000 + + /* x^22592 mod p(x), x^22528 mod p(x) */ + .octa 0x00000000720b000000000000a3b00000 + + /* x^21568 mod p(x), x^21504 mod p(x) */ + .octa 0x00000000e992000000000000d2680000 + + /* x^20544 mod p(x), x^20480 mod p(x) */ + .octa 0x000000009132000000000000cf4c0000 + + /* x^19520 mod p(x), x^19456 mod p(x) */ + .octa 0x00000000608a00000000000076610000 + + /* x^18496 mod p(x), x^18432 mod p(x) */ + .octa 0x000000009948000000000000fb9f0000 + + /* x^17472 mod p(x), x^17408 mod p(x) */ + .octa 0x00000000173000000000000003770000 + + /* x^16448 mod p(x), x^16384 mod p(x) */ + .octa 0x000000006fe300000000000004880000 + + /* x^15424 mod p(x), x^15360 mod p(x) */ + .octa 0x00000000e15300000000000056a70000 + + /* x^14400 mod p(x), x^14336 mod p(x) */ + .octa 0x0000000092d60000000000009dfd0000 + + /* x^13376 mod p(x), x^13312 mod p(x) */ + .octa 0x0000000002fd00000000000074c80000 + + /* x^12352 mod p(x), x^12288 mod p(x) */ + .octa 0x00000000c78b000000000000a3ec0000 + + /* x^11328 mod p(x), x^11264 mod p(x) */ + .octa 0x000000009262000000000000b3530000 + + /* x^10304 mod p(x), x^10240 mod p(x) */ + .octa 0x0000000084f200000000000047bf0000 + + /* x^9280 mod p(x), x^9216 mod p(x) */ + .octa 0x0000000067ee000000000000e97c0000 + + /* x^8256 mod p(x), x^8192 mod p(x) */ + .octa 0x00000000535b00000000000091e10000 + + /* x^7232 mod p(x), x^7168 mod p(x) */ + .octa 0x000000007ebb00000000000055060000 + + /* x^6208 mod p(x), x^6144 mod p(x) */ + .octa 0x00000000c6a1000000000000fd360000 + + /* x^5184 mod p(x), x^5120 mod p(x) */ + .octa 0x000000001be500000000000055860000 + + /* x^4160 mod p(x), x^4096 mod p(x) */ + .octa 0x00000000ae0e0000000000005bd00000 + + /* x^3136 mod p(x), x^3072 mod p(x) */ + .octa 0x0000000022040000000000008db20000 + + /* x^2112 mod p(x), x^2048 mod p(x) */ + .octa 0x00000000c9eb000000000000efe20000 + + /* x^1088 mod p(x), x^1024 mod p(x) */ + .octa 0x0000000039b400000000000051d10000 + +.short_constants: + + /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */ + /* x^2048 mod p(x), x^2016 mod p(x), x^1984 mod p(x), x^1952 mod p(x) */ + .octa 0xefe20000dccf00009440000033590000 + + /* x^1920 mod p(x), x^1888 mod 
p(x), x^1856 mod p(x), x^1824 mod p(x) */ + .octa 0xee6300002f3f000062180000e0ed0000 + + /* x^1792 mod p(x), x^1760 mod p(x), x^1728 mod p(x), x^1696 mod p(x) */ + .octa 0xcf5f000017ef0000ccbe000023d30000 + + /* x^1664 mod p(x), x^1632 mod p(x), x^1600 mod p(x), x^1568 mod p(x) */ + .octa 0x6d0c0000a30e00000920000042630000 + + /* x^1536 mod p(x), x^1504 mod p(x), x^1472 mod p(x), x^1440 mod p(x) */ + .octa 0x21d30000932b0000a7a00000efcc0000 + + /* x^1408 mod p(x), x^1376 mod p(x), x^1344 mod p(x), x^1312 mod p(x) */ + .octa 0x10be00000b310000666f00000d1c0000 + + /* x^1280 mod p(x), x^1248 mod p(x), x^1216 mod p(x), x^1184 mod p(x) */ + .octa 0x1f240000ce9e0000caad0000589e0000 + + /* x^1152 mod p(x), x^1120 mod p(x), x^1088 mod p(x), x^1056 mod p(x) */ + .octa 0x29610000d02b000039b400007cf50000 + + /* x^1024 mod p(x), x^992 mod p(x), x^960 mod p(x), x^928 mod p(x) */ + .octa 0x51d100009d9d00003c0e0000bfd60000 + + /* x^896 mod p(x), x^864 mod p(x), x^832 mod p(x), x^800 mod p(x) */ + .octa 0xda390000ceae000013830000713c0000 + + /* x^768 mod p(x), x^736 mod p(x), x^704 mod p(x), x^672 mod p(x) */ + .octa 0xb67800001e16000085c0000080a60000 + + /* x^640 mod p(x), x^608 mod p(x), x^576 mod p(x), x^544 mod p(x) */ + .octa 0x0db40000f7f90000371d0000e6580000 + + /* x^512 mod p(x), x^480 mod p(x), x^448 mod p(x), x^416 mod p(x) */ + .octa 0x87e70000044c0000aadb0000a4970000 + + /* x^384 mod p(x), x^352 mod p(x), x^320 mod p(x), x^288 mod p(x) */ + .octa 0x1f990000ad180000d8b30000e7b50000 + + /* x^256 mod p(x), x^224 mod p(x), x^192 mod p(x), x^160 mod p(x) */ + .octa 0xbe6c00006ee300004c1a000006df0000 + + /* x^128 mod p(x), x^96 mod p(x), x^64 mod p(x), x^32 mod p(x) */ + .octa 0xfb0b00002d560000136800008bb70000 + + +.barrett_constants: + /* Barrett constant m - (4^32)/n */ + .octa 0x000000000000000000000001f65a57f8 /* x^64 div p(x) */ + /* Barrett constant n */ + .octa 0x0000000000000000000000018bb70000 + +#define CRC_FUNCTION_NAME __crct10dif_vpmsum +#include "crc-vpmsum-template.S" diff --git a/lib/crc/riscv/crc-clmul-consts.h b/lib/crc/riscv/crc-clmul-consts.h new file mode 100644 index 000000000000..8d73449235ef --- /dev/null +++ b/lib/crc/riscv/crc-clmul-consts.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * CRC constants generated by: + * + * ./scripts/gen-crc-consts.py riscv_clmul crc16_msb_0x8bb7,crc32_msb_0x04c11db7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5 + * + * Do not edit manually. 
+ */ + +struct crc_clmul_consts { + unsigned long fold_across_2_longs_const_hi; + unsigned long fold_across_2_longs_const_lo; + unsigned long barrett_reduction_const_1; + unsigned long barrett_reduction_const_2; +}; + +/* + * Constants generated for most-significant-bit-first CRC-16 using + * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 + */ +static const struct crc_clmul_consts crc16_msb_0x8bb7_consts __maybe_unused = { +#ifdef CONFIG_64BIT + .fold_across_2_longs_const_hi = 0x0000000000001faa, /* x^192 mod G */ + .fold_across_2_longs_const_lo = 0x000000000000a010, /* x^128 mod G */ + .barrett_reduction_const_1 = 0xfb2d2bfc0e99d245, /* floor(x^79 / G) */ + .barrett_reduction_const_2 = 0x0000000000008bb7, /* G - x^16 */ +#else + .fold_across_2_longs_const_hi = 0x00005890, /* x^96 mod G */ + .fold_across_2_longs_const_lo = 0x0000f249, /* x^64 mod G */ + .barrett_reduction_const_1 = 0xfb2d2bfc, /* floor(x^47 / G) */ + .barrett_reduction_const_2 = 0x00008bb7, /* G - x^16 */ +#endif +}; + +/* + * Constants generated for most-significant-bit-first CRC-32 using + * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 + + * x^5 + x^4 + x^2 + x^1 + x^0 + */ +static const struct crc_clmul_consts crc32_msb_0x04c11db7_consts __maybe_unused = { +#ifdef CONFIG_64BIT + .fold_across_2_longs_const_hi = 0x00000000c5b9cd4c, /* x^192 mod G */ + .fold_across_2_longs_const_lo = 0x00000000e8a45605, /* x^128 mod G */ + .barrett_reduction_const_1 = 0x826880efa40da72d, /* floor(x^95 / G) */ + .barrett_reduction_const_2 = 0x0000000004c11db7, /* G - x^32 */ +#else + .fold_across_2_longs_const_hi = 0xf200aa66, /* x^96 mod G */ + .fold_across_2_longs_const_lo = 0x490d678d, /* x^64 mod G */ + .barrett_reduction_const_1 = 0x826880ef, /* floor(x^63 / G) */ + .barrett_reduction_const_2 = 0x04c11db7, /* G - x^32 */ +#endif +}; + +/* + * Constants generated for least-significant-bit-first CRC-32 using + * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 + + * x^5 + x^4 + x^2 + x^1 + x^0 + */ +static const struct crc_clmul_consts crc32_lsb_0xedb88320_consts __maybe_unused = { +#ifdef CONFIG_64BIT + .fold_across_2_longs_const_hi = 0x65673b4600000000, /* x^191 mod G */ + .fold_across_2_longs_const_lo = 0x9ba54c6f00000000, /* x^127 mod G */ + .barrett_reduction_const_1 = 0xb4e5b025f7011641, /* floor(x^95 / G) */ + .barrett_reduction_const_2 = 0x00000000edb88320, /* (G - x^32) * x^32 */ +#else + .fold_across_2_longs_const_hi = 0xccaa009e, /* x^95 mod G */ + .fold_across_2_longs_const_lo = 0xb8bc6765, /* x^63 mod G */ + .barrett_reduction_const_1 = 0xf7011641, /* floor(x^63 / G) */ + .barrett_reduction_const_2 = 0xedb88320, /* (G - x^32) * x^0 */ +#endif +}; + +/* + * Constants generated for least-significant-bit-first CRC-32 using + * G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 + + * x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0 + */ +static const struct crc_clmul_consts crc32_lsb_0x82f63b78_consts __maybe_unused = { +#ifdef CONFIG_64BIT + .fold_across_2_longs_const_hi = 0x3743f7bd00000000, /* x^191 mod G */ + .fold_across_2_longs_const_lo = 0x3171d43000000000, /* x^127 mod G */ + .barrett_reduction_const_1 = 0x4869ec38dea713f1, /* floor(x^95 / G) */ + .barrett_reduction_const_2 = 0x0000000082f63b78, /* (G - x^32) * x^32 */ +#else + .fold_across_2_longs_const_hi = 0x493c7d27, /* x^95 mod G */ + .fold_across_2_longs_const_lo = 0xdd45aab8, /* x^63 mod G */ + .barrett_reduction_const_1 = 0xdea713f1, /* floor(x^63 / G) */ + 
.barrett_reduction_const_2 = 0x82f63b78, /* (G - x^32) * x^0 */ +#endif +}; + +/* + * Constants generated for most-significant-bit-first CRC-64 using + * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + + * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + + * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + + * x^7 + x^4 + x^1 + x^0 + */ +#ifdef CONFIG_64BIT +static const struct crc_clmul_consts crc64_msb_0x42f0e1eba9ea3693_consts __maybe_unused = { + .fold_across_2_longs_const_hi = 0x4eb938a7d257740e, /* x^192 mod G */ + .fold_across_2_longs_const_lo = 0x05f5c3c7eb52fab6, /* x^128 mod G */ + .barrett_reduction_const_1 = 0xabc694e836627c39, /* floor(x^127 / G) */ + .barrett_reduction_const_2 = 0x42f0e1eba9ea3693, /* G - x^64 */ +}; +#endif + +/* + * Constants generated for least-significant-bit-first CRC-64 using + * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 + + * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 + + * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 + + * x^4 + x^3 + x^0 + */ +#ifdef CONFIG_64BIT +static const struct crc_clmul_consts crc64_lsb_0x9a6c9329ac4bc9b5_consts __maybe_unused = { + .fold_across_2_longs_const_hi = 0xeadc41fd2ba3d420, /* x^191 mod G */ + .fold_across_2_longs_const_lo = 0x21e9761e252621ac, /* x^127 mod G */ + .barrett_reduction_const_1 = 0x27ecfa329aef9f77, /* floor(x^127 / G) */ + .barrett_reduction_const_2 = 0x9a6c9329ac4bc9b5, /* (G - x^64) * x^0 */ +}; +#endif diff --git a/lib/crc/riscv/crc-clmul-template.h b/lib/crc/riscv/crc-clmul-template.h new file mode 100644 index 000000000000..77187e7f1762 --- /dev/null +++ b/lib/crc/riscv/crc-clmul-template.h @@ -0,0 +1,265 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright 2025 Google LLC */ + +/* + * This file is a "template" that generates a CRC function optimized using the + * RISC-V Zbc (scalar carryless multiplication) extension. The includer of this + * file must define the following parameters to specify the type of CRC: + * + * crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC + * LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural + * mapping between bits and polynomial coefficients + * 1 for a lsb (least-significant-bit) first CRC, i.e. reflected + * mapping between bits and polynomial coefficients + */ + +#include <asm/byteorder.h> +#include <linux/minmax.h> + +#define CRC_BITS (8 * sizeof(crc_t)) /* a.k.a. 'n' */ + +static inline unsigned long clmul(unsigned long a, unsigned long b) +{ + unsigned long res; + + asm(".option push\n" + ".option arch,+zbc\n" + "clmul %0, %1, %2\n" + ".option pop\n" + : "=r" (res) : "r" (a), "r" (b)); + return res; +} + +static inline unsigned long clmulh(unsigned long a, unsigned long b) +{ + unsigned long res; + + asm(".option push\n" + ".option arch,+zbc\n" + "clmulh %0, %1, %2\n" + ".option pop\n" + : "=r" (res) : "r" (a), "r" (b)); + return res; +} + +static inline unsigned long clmulr(unsigned long a, unsigned long b) +{ + unsigned long res; + + asm(".option push\n" + ".option arch,+zbc\n" + "clmulr %0, %1, %2\n" + ".option pop\n" + : "=r" (res) : "r" (a), "r" (b)); + return res; +} + +/* + * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a + * polynomial whose bit order matches the CRC's bit order. 
+ */ +#ifdef CONFIG_64BIT +# if LSB_CRC +# define crc_load_long(x) le64_to_cpup(x) +# else +# define crc_load_long(x) be64_to_cpup(x) +# endif +#else +# if LSB_CRC +# define crc_load_long(x) le32_to_cpup(x) +# else +# define crc_load_long(x) be32_to_cpup(x) +# endif +#endif + +/* XOR @crc into the end of @msgpoly that represents the high-order terms. */ +static inline unsigned long +crc_clmul_prep(crc_t crc, unsigned long msgpoly) +{ +#if LSB_CRC + return msgpoly ^ crc; +#else + return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS)); +#endif +} + +/* + * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it + * modulo the generator polynomial G. This gives the CRC of @msgpoly. + */ +static inline crc_t +crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts) +{ + unsigned long tmp; + + /* + * First step of Barrett reduction with integrated multiplication by + * x^n: calculate floor((msgpoly * x^n) / G). This is the value by + * which G needs to be multiplied to cancel out the x^n and higher terms + * of msgpoly * x^n. Do it using the following formula: + * + * msb-first: + * floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1)) + * lsb-first: + * floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG) + * + * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G), + * which fits a long exactly. Using any lower power of x there would + * not carry enough precision through the calculation, while using any + * higher power of x would require extra instructions to handle a wider + * multiplication. In the msb-first case, using this power of x results + * in needing a floored division by x^(BITS_PER_LONG-1), which matches + * what clmulr produces. In the lsb-first case, a factor of x gets + * implicitly introduced by each carryless multiplication (shown as + * '* x' above), and the floored division instead needs to be by + * x^BITS_PER_LONG which matches what clmul produces. + */ +#if LSB_CRC + tmp = clmul(msgpoly, consts->barrett_reduction_const_1); +#else + tmp = clmulr(msgpoly, consts->barrett_reduction_const_1); +#endif + + /* + * Second step of Barrett reduction: + * + * crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G)) + * + * This reduces (msgpoly * x^n) modulo G by adding the appropriate + * multiple of G to it. The result uses only the x^0..x^(n-1) terms. + * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those + * terms in the first place, it is more efficient to do the equivalent: + * + * crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n + * + * In the lsb-first case further modify it to the following which avoids + * a shift, as the crc ends up in the physically low n bits from clmulr: + * + * product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x + * crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n + * + * barrett_reduction_const_2 contains the constant multiplier (G - x^n) + * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above. The + * cast of the result to crc_t is essential, as it applies the mod x^n! + */ +#if LSB_CRC + return clmulr(tmp, consts->barrett_reduction_const_2); +#else + return clmul(tmp, consts->barrett_reduction_const_2); +#endif +} + +/* Update @crc with the data from @msgpoly. 
*/ +static inline crc_t +crc_clmul_update_long(crc_t crc, unsigned long msgpoly, + const struct crc_clmul_consts *consts) +{ + return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts); +} + +/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */ +static inline crc_t +crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len, + const struct crc_clmul_consts *consts) +{ + unsigned long msgpoly; + size_t i; + +#if LSB_CRC + msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8); + for (i = 1; i < len; i++) + msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8)); +#else + msgpoly = p[0]; + for (i = 1; i < len; i++) + msgpoly = (msgpoly << 8) ^ p[i]; +#endif + + if (len >= sizeof(crc_t)) { + #if LSB_CRC + msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len); + #else + msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS); + #endif + return crc_clmul_long(msgpoly, consts); + } +#if LSB_CRC + msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len); + return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len)); +#else + msgpoly ^= crc >> (CRC_BITS - 8*len); + return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len)); +#endif +} + +static inline crc_t +crc_clmul(crc_t crc, const void *p, size_t len, + const struct crc_clmul_consts *consts) +{ + size_t align; + + /* This implementation assumes that the CRC fits in an unsigned long. */ + BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long)); + + /* If the buffer is not long-aligned, align it. */ + align = (unsigned long)p % sizeof(unsigned long); + if (align && len) { + align = min(sizeof(unsigned long) - align, len); + crc = crc_clmul_update_partial(crc, p, align, consts); + p += align; + len -= align; + } + + if (len >= 4 * sizeof(unsigned long)) { + unsigned long m0, m1; + + m0 = crc_clmul_prep(crc, crc_load_long(p)); + m1 = crc_load_long(p + sizeof(unsigned long)); + p += 2 * sizeof(unsigned long); + len -= 2 * sizeof(unsigned long); + /* + * Main loop. Each iteration starts with a message polynomial + * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two + * more longs of data to form x^(3*BITS_PER_LONG)*m0 + + * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then + * "folds" that back into a congruent (modulo G) value that uses + * just m0 and m1 again. This is done by multiplying m0 by the + * precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by + * the precomputed constant (x^(2*BITS_PER_LONG) mod G), then + * adding the results to m2 and m3 as appropriate. Each such + * multiplication produces a result twice the length of a long, + * which in RISC-V is two instructions clmul and clmulh. + * + * This could be changed to fold across more than 2 longs at a + * time if there is a CPU that can take advantage of it. + */ + do { + unsigned long p0, p1, p2, p3; + + p0 = clmulh(m0, consts->fold_across_2_longs_const_hi); + p1 = clmul(m0, consts->fold_across_2_longs_const_hi); + p2 = clmulh(m1, consts->fold_across_2_longs_const_lo); + p3 = clmul(m1, consts->fold_across_2_longs_const_lo); + m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p); + m1 = (LSB_CRC ? 
p0 ^ p2 : p1 ^ p3) ^ + crc_load_long(p + sizeof(unsigned long)); + + p += 2 * sizeof(unsigned long); + len -= 2 * sizeof(unsigned long); + } while (len >= 2 * sizeof(unsigned long)); + + crc = crc_clmul_long(m0, consts); + crc = crc_clmul_update_long(crc, m1, consts); + } + + while (len >= sizeof(unsigned long)) { + crc = crc_clmul_update_long(crc, crc_load_long(p), consts); + p += sizeof(unsigned long); + len -= sizeof(unsigned long); + } + + if (len) + crc = crc_clmul_update_partial(crc, p, len, consts); + + return crc; +} diff --git a/lib/crc/riscv/crc-clmul.h b/lib/crc/riscv/crc-clmul.h new file mode 100644 index 000000000000..dd1736245815 --- /dev/null +++ b/lib/crc/riscv/crc-clmul.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright 2025 Google LLC */ + +#ifndef _RISCV_CRC_CLMUL_H +#define _RISCV_CRC_CLMUL_H + +#include <linux/types.h> +#include "crc-clmul-consts.h" + +u16 crc16_msb_clmul(u16 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts); +u32 crc32_msb_clmul(u32 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts); +u32 crc32_lsb_clmul(u32 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts); +#ifdef CONFIG_64BIT +u64 crc64_msb_clmul(u64 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts); +u64 crc64_lsb_clmul(u64 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts); +#endif + +#endif /* _RISCV_CRC_CLMUL_H */ diff --git a/lib/crc/riscv/crc-t10dif.h b/lib/crc/riscv/crc-t10dif.h new file mode 100644 index 000000000000..cd6136cbfda1 --- /dev/null +++ b/lib/crc/riscv/crc-t10dif.h @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized CRC-T10DIF function + * + * Copyright 2025 Google LLC + */ + +#include <asm/hwcap.h> +#include <asm/alternative-macros.h> + +#include "crc-clmul.h" + +static inline u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) +{ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC)) + return crc16_msb_clmul(crc, p, len, &crc16_msb_0x8bb7_consts); + return crc_t10dif_generic(crc, p, len); +} diff --git a/lib/crc/riscv/crc16_msb.c b/lib/crc/riscv/crc16_msb.c new file mode 100644 index 000000000000..554d295e95f5 --- /dev/null +++ b/lib/crc/riscv/crc16_msb.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized most-significant-bit-first CRC16 + * + * Copyright 2025 Google LLC + */ + +#include "crc-clmul.h" + +typedef u16 crc_t; +#define LSB_CRC 0 +#include "crc-clmul-template.h" + +u16 crc16_msb_clmul(u16 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts) +{ + return crc_clmul(crc, p, len, consts); +} diff --git a/lib/crc/riscv/crc32.h b/lib/crc/riscv/crc32.h new file mode 100644 index 000000000000..3ec6eee98afa --- /dev/null +++ b/lib/crc/riscv/crc32.h @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized CRC32 functions + * + * Copyright 2025 Google LLC + */ + +#include <asm/hwcap.h> +#include <asm/alternative-macros.h> + +#include "crc-clmul.h" + +static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC)) + return crc32_lsb_clmul(crc, p, len, + &crc32_lsb_0xedb88320_consts); + return crc32_le_base(crc, p, len); +} + +static inline u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) +{ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC)) + return crc32_msb_clmul(crc, p, len, + &crc32_msb_0x04c11db7_consts); + return crc32_be_base(crc, p, len); +} + 
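For reference, the least-significant-bit-first CRC-32 that the Zbc-accelerated path above must agree with can be computed one bit at a time. The following is a minimal standalone sketch of that reference computation (reflected polynomial 0xEDB88320), not the kernel's crc32_le_base() implementation:

```c
#include <stddef.h>
#include <stdint.h>

/*
 * Bit-at-a-time reference for lsb-first CRC-32.  An accelerated path such
 * as crc32_lsb_clmul() must return the same value for the same
 * (crc, buf, len) inputs; like the library functions, it does not invert
 * the CRC at the beginning or end.
 */
static uint32_t crc32_le_reference(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
	}
	return crc;
}
```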
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC)) + return crc32_lsb_clmul(crc, p, len, + &crc32_lsb_0x82f63b78_consts); + return crc32c_base(crc, p, len); +} + +static inline u32 crc32_optimizations_arch(void) +{ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC)) + return CRC32_LE_OPTIMIZATION | + CRC32_BE_OPTIMIZATION | + CRC32C_OPTIMIZATION; + return 0; +} diff --git a/lib/crc/riscv/crc32_lsb.c b/lib/crc/riscv/crc32_lsb.c new file mode 100644 index 000000000000..72fd67e7470c --- /dev/null +++ b/lib/crc/riscv/crc32_lsb.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized least-significant-bit-first CRC32 + * + * Copyright 2025 Google LLC + */ + +#include "crc-clmul.h" + +typedef u32 crc_t; +#define LSB_CRC 1 +#include "crc-clmul-template.h" + +u32 crc32_lsb_clmul(u32 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts) +{ + return crc_clmul(crc, p, len, consts); +} diff --git a/lib/crc/riscv/crc32_msb.c b/lib/crc/riscv/crc32_msb.c new file mode 100644 index 000000000000..fdbeaccc369f --- /dev/null +++ b/lib/crc/riscv/crc32_msb.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized most-significant-bit-first CRC32 + * + * Copyright 2025 Google LLC + */ + +#include "crc-clmul.h" + +typedef u32 crc_t; +#define LSB_CRC 0 +#include "crc-clmul-template.h" + +u32 crc32_msb_clmul(u32 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts) +{ + return crc_clmul(crc, p, len, consts); +} diff --git a/lib/crc/riscv/crc64.h b/lib/crc/riscv/crc64.h new file mode 100644 index 000000000000..a1b7873fde57 --- /dev/null +++ b/lib/crc/riscv/crc64.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized CRC64 functions + * + * Copyright 2025 Google LLC + */ + +#include <asm/hwcap.h> +#include <asm/alternative-macros.h> + +#include "crc-clmul.h" + +static inline u64 crc64_be_arch(u64 crc, const u8 *p, size_t len) +{ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC)) + return crc64_msb_clmul(crc, p, len, + &crc64_msb_0x42f0e1eba9ea3693_consts); + return crc64_be_generic(crc, p, len); +} + +static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) +{ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC)) + return crc64_lsb_clmul(crc, p, len, + &crc64_lsb_0x9a6c9329ac4bc9b5_consts); + return crc64_nvme_generic(crc, p, len); +} diff --git a/lib/crc/riscv/crc64_lsb.c b/lib/crc/riscv/crc64_lsb.c new file mode 100644 index 000000000000..c5371bb85d90 --- /dev/null +++ b/lib/crc/riscv/crc64_lsb.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized least-significant-bit-first CRC64 + * + * Copyright 2025 Google LLC + */ + +#include "crc-clmul.h" + +typedef u64 crc_t; +#define LSB_CRC 1 +#include "crc-clmul-template.h" + +u64 crc64_lsb_clmul(u64 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts) +{ + return crc_clmul(crc, p, len, consts); +} diff --git a/lib/crc/riscv/crc64_msb.c b/lib/crc/riscv/crc64_msb.c new file mode 100644 index 000000000000..1925d1dbe225 --- /dev/null +++ b/lib/crc/riscv/crc64_msb.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized most-significant-bit-first CRC64 + * + * Copyright 2025 Google LLC + */ + +#include "crc-clmul.h" + +typedef u64 crc_t; +#define LSB_CRC 0 +#include "crc-clmul-template.h" + +u64 crc64_msb_clmul(u64 crc, const void *p, size_t len, + const struct 
crc_clmul_consts *consts) +{ + return crc_clmul(crc, p, len, consts); +} diff --git a/lib/crc/s390/crc32-vx.h b/lib/crc/s390/crc32-vx.h new file mode 100644 index 000000000000..652c96e1a822 --- /dev/null +++ b/lib/crc/s390/crc32-vx.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _CRC32_VX_S390_H +#define _CRC32_VX_S390_H + +#include <linux/types.h> + +u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size); +u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); +u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); + +#endif /* _CRC32_VX_S390_H */ diff --git a/lib/crc/s390/crc32.h b/lib/crc/s390/crc32.h new file mode 100644 index 000000000000..59c8983d428b --- /dev/null +++ b/lib/crc/s390/crc32.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CRC-32 implemented with the z/Architecture Vector Extension Facility. + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#include <linux/cpufeature.h> +#include <asm/fpu.h> +#include "crc32-vx.h" + +#define VX_MIN_LEN 64 +#define VX_ALIGNMENT 16L +#define VX_ALIGN_MASK (VX_ALIGNMENT - 1) + +/* + * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension + * + * Creates a function to perform a particular CRC-32 computation. Depending + * on the message buffer, the hardware-accelerated or software implementation + * is used. Note that the message buffer is aligned to improve fetch + * operations of VECTOR LOAD MULTIPLE instructions. + */ +#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \ + static inline u32 ___fname(u32 crc, const u8 *data, size_t datalen) \ + { \ + unsigned long prealign, aligned, remaining; \ + DECLARE_KERNEL_FPU_ONSTACK16(vxstate); \ + \ + if (datalen < VX_MIN_LEN + VX_ALIGN_MASK || !cpu_has_vx()) \ + return ___crc32_sw(crc, data, datalen); \ + \ + if ((unsigned long)data & VX_ALIGN_MASK) { \ + prealign = VX_ALIGNMENT - \ + ((unsigned long)data & VX_ALIGN_MASK); \ + datalen -= prealign; \ + crc = ___crc32_sw(crc, data, prealign); \ + data = (void *)((unsigned long)data + prealign); \ + } \ + \ + aligned = datalen & ~VX_ALIGN_MASK; \ + remaining = datalen & VX_ALIGN_MASK; \ + \ + kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW); \ + crc = ___crc32_vx(crc, data, aligned); \ + kernel_fpu_end(&vxstate, KERNEL_VXR_LOW); \ + \ + if (remaining) \ + crc = ___crc32_sw(crc, data + aligned, remaining); \ + \ + return crc; \ + } + +DEFINE_CRC32_VX(crc32_le_arch, crc32_le_vgfm_16, crc32_le_base) +DEFINE_CRC32_VX(crc32_be_arch, crc32_be_vgfm_16, crc32_be_base) +DEFINE_CRC32_VX(crc32c_arch, crc32c_le_vgfm_16, crc32c_base) + +static inline u32 crc32_optimizations_arch(void) +{ + if (cpu_has_vx()) { + return CRC32_LE_OPTIMIZATION | + CRC32_BE_OPTIMIZATION | + CRC32C_OPTIMIZATION; + } + return 0; +} diff --git a/lib/crc/s390/crc32be-vx.c b/lib/crc/s390/crc32be-vx.c new file mode 100644 index 000000000000..fed7c9c70d05 --- /dev/null +++ b/lib/crc/s390/crc32be-vx.c @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hardware-accelerated CRC-32 variants for Linux on z Systems + * + * Use the z/Architecture Vector Extension Facility to accelerate the + * computing of CRC-32 checksums. + * + * This CRC-32 implementation algorithm processes the most-significant + * bit first (BE). + * + * Copyright IBM Corp. 
2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#include <linux/types.h> +#include <asm/fpu.h> +#include "crc32-vx.h" + +/* Vector register range containing CRC-32 constants */ +#define CONST_R1R2 9 +#define CONST_R3R4 10 +#define CONST_R5 11 +#define CONST_R6 12 +#define CONST_RU_POLY 13 +#define CONST_CRC_POLY 14 + +/* + * The CRC-32 constant block contains reduction constants to fold and + * process particular chunks of the input data stream in parallel. + * + * For the CRC-32 variants, the constants are precomputed according to + * these definitions: + * + * R1 = x4*128+64 mod P(x) + * R2 = x4*128 mod P(x) + * R3 = x128+64 mod P(x) + * R4 = x128 mod P(x) + * R5 = x96 mod P(x) + * R6 = x64 mod P(x) + * + * Barret reduction constant, u, is defined as floor(x**64 / P(x)). + * + * where P(x) is the polynomial in the normal domain and the P'(x) is the + * polynomial in the reversed (bitreflected) domain. + * + * Note that the constant definitions below are extended in order to compute + * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. + * The rightmost doubleword can be 0 to prevent contribution to the result or + * can be multiplied by 1 to perform an XOR without the need for a separate + * VECTOR EXCLUSIVE OR instruction. + * + * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: + * + * P(x) = 0x04C11DB7 + * P'(x) = 0xEDB88320 + */ + +static unsigned long constants_CRC_32_BE[] = { + 0x08833794c, 0x0e6228b11, /* R1, R2 */ + 0x0c5b9cd4c, 0x0e8a45605, /* R3, R4 */ + 0x0f200aa66, 1UL << 32, /* R5, x32 */ + 0x0490d678d, 1, /* R6, 1 */ + 0x104d101df, 0, /* u */ + 0x104C11DB7, 0, /* P(x) */ +}; + +/** + * crc32_be_vgfm_16 - Compute CRC-32 (BE variant) with vector registers + * @crc: Initial CRC value, typically ~0. + * @buf: Input buffer pointer, performance might be improved if the + * buffer is on a doubleword boundary. + * @size: Size of the buffer, must be 64 bytes or greater. + * + * Register usage: + * V0: Initial CRC value and intermediate constants and results. + * V1..V4: Data for CRC computation. + * V5..V8: Next data chunks that are fetched from the input buffer. + * V9..V14: CRC-32 constants. + */ +u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size) +{ + /* Load CRC-32 constants */ + fpu_vlm(CONST_R1R2, CONST_CRC_POLY, &constants_CRC_32_BE); + fpu_vzero(0); + + /* Load the initial CRC value into the leftmost word of V0. */ + fpu_vlvgf(0, crc, 0); + + /* Load a 64-byte data chunk and XOR with CRC */ + fpu_vlm(1, 4, buf); + fpu_vx(1, 0, 1); + buf += 64; + size -= 64; + + while (size >= 64) { + /* Load the next 64-byte data chunk into V5 to V8 */ + fpu_vlm(5, 8, buf); + + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the reduction constants in V0. The intermediate result is + * then folded (accumulated) with the next data chunk in V5 and + * stored in V1. Repeat this step for the register contents + * in V2, V3, and V4 respectively. 
+ */ + fpu_vgfmag(1, CONST_R1R2, 1, 5); + fpu_vgfmag(2, CONST_R1R2, 2, 6); + fpu_vgfmag(3, CONST_R1R2, 3, 7); + fpu_vgfmag(4, CONST_R1R2, 4, 8); + buf += 64; + size -= 64; + } + + /* Fold V1 to V4 into a single 128-bit value in V1 */ + fpu_vgfmag(1, CONST_R3R4, 1, 2); + fpu_vgfmag(1, CONST_R3R4, 1, 3); + fpu_vgfmag(1, CONST_R3R4, 1, 4); + + while (size >= 16) { + fpu_vl(2, buf); + fpu_vgfmag(1, CONST_R3R4, 1, 2); + buf += 16; + size -= 16; + } + + /* + * The R5 constant is used to fold a 128-bit value into an 96-bit value + * that is XORed with the next 96-bit input data chunk. To use a single + * VGFMG instruction, multiply the rightmost 64-bit with x^32 (1<<32) to + * form an intermediate 96-bit value (with appended zeros) which is then + * XORed with the intermediate reduction result. + */ + fpu_vgfmg(1, CONST_R5, 1); + + /* + * Further reduce the remaining 96-bit value to a 64-bit value using a + * single VGFMG, the rightmost doubleword is multiplied with 0x1. The + * intermediate result is then XORed with the product of the leftmost + * doubleword with R6. The result is a 64-bit value and is subject to + * the Barret reduction. + */ + fpu_vgfmg(1, CONST_R6, 1); + + /* + * The input values to the Barret reduction are the degree-63 polynomial + * in V1 (R(x)), degree-32 generator polynomial, and the reduction + * constant u. The Barret reduction result is the CRC value of R(x) mod + * P(x). + * + * The Barret reduction algorithm is defined as: + * + * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + * 3. C(x) = R(x) XOR T2(x) mod x^32 + * + * Note: To compensate the division by x^32, use the vector unpack + * instruction to move the leftmost word into the leftmost doubleword + * of the vector register. The rightmost doubleword is multiplied + * with zero to not contribute to the intermediate results. + */ + + /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ + fpu_vupllf(2, 1); + fpu_vgfmg(2, CONST_RU_POLY, 2); + + /* + * Compute the GF(2) product of the CRC polynomial in VO with T1(x) in + * V2 and XOR the intermediate result, T2(x), with the value in V1. + * The final result is in the rightmost word of V2. + */ + fpu_vupllf(2, 2); + fpu_vgfmag(2, CONST_CRC_POLY, 2, 1); + return fpu_vlgvf(2, 3); +} diff --git a/lib/crc/s390/crc32le-vx.c b/lib/crc/s390/crc32le-vx.c new file mode 100644 index 000000000000..2f629f394df7 --- /dev/null +++ b/lib/crc/s390/crc32le-vx.c @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hardware-accelerated CRC-32 variants for Linux on z Systems + * + * Use the z/Architecture Vector Extension Facility to accelerate the + * computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet + * and Castagnoli. + * + * This CRC-32 implementation algorithm is bitreflected and processes + * the least-significant bit first (Little-Endian). + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#include <linux/types.h> +#include <asm/fpu.h> +#include "crc32-vx.h" + +/* Vector register range containing CRC-32 constants */ +#define CONST_PERM_LE2BE 9 +#define CONST_R2R1 10 +#define CONST_R4R3 11 +#define CONST_R5 12 +#define CONST_RU_POLY 13 +#define CONST_CRC_POLY 14 + +/* + * The CRC-32 constant block contains reduction constants to fold and + * process particular chunks of the input data stream in parallel. 
+ * + * For the CRC-32 variants, the constants are precomputed according to + * these definitions: + * + * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1 + * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1 + * R3 = [(x128+32 mod P'(x) << 32)]' << 1 + * R4 = [(x128-32 mod P'(x) << 32)]' << 1 + * R5 = [(x64 mod P'(x) << 32)]' << 1 + * R6 = [(x32 mod P'(x) << 32)]' << 1 + * + * The bitreflected Barret reduction constant, u', is defined as + * the bit reversal of floor(x**64 / P(x)). + * + * where P(x) is the polynomial in the normal domain and the P'(x) is the + * polynomial in the reversed (bitreflected) domain. + * + * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: + * + * P(x) = 0x04C11DB7 + * P'(x) = 0xEDB88320 + * + * CRC-32C (Castagnoli) polynomials: + * + * P(x) = 0x1EDC6F41 + * P'(x) = 0x82F63B78 + */ + +static unsigned long constants_CRC_32_LE[] = { + 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */ + 0x1c6e41596, 0x154442bd4, /* R2, R1 */ + 0x0ccaa009e, 0x1751997d0, /* R4, R3 */ + 0x0, 0x163cd6124, /* R5 */ + 0x0, 0x1f7011641, /* u' */ + 0x0, 0x1db710641 /* P'(x) << 1 */ +}; + +static unsigned long constants_CRC_32C_LE[] = { + 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */ + 0x09e4addf8, 0x740eef02, /* R2, R1 */ + 0x14cd00bd6, 0xf20c0dfe, /* R4, R3 */ + 0x0, 0x0dd45aab8, /* R5 */ + 0x0, 0x0dea713f1, /* u' */ + 0x0, 0x105ec76f0 /* P'(x) << 1 */ +}; + +/** + * crc32_le_vgfm_generic - Compute CRC-32 (LE variant) with vector registers + * @crc: Initial CRC value, typically ~0. + * @buf: Input buffer pointer, performance might be improved if the + * buffer is on a doubleword boundary. + * @size: Size of the buffer, must be 64 bytes or greater. + * @constants: CRC-32 constant pool base pointer. + * + * Register usage: + * V0: Initial CRC value and intermediate constants and results. + * V1..V4: Data for CRC computation. + * V5..V8: Next data chunks that are fetched from the input buffer. + * V9: Constant for BE->LE conversion and shift operations + * V10..V14: CRC-32 constants. + */ +static u32 crc32_le_vgfm_generic(u32 crc, unsigned char const *buf, size_t size, unsigned long *constants) +{ + /* Load CRC-32 constants */ + fpu_vlm(CONST_PERM_LE2BE, CONST_CRC_POLY, constants); + + /* + * Load the initial CRC value. + * + * The CRC value is loaded into the rightmost word of the + * vector register and is later XORed with the LSB portion + * of the loaded input data. + */ + fpu_vzero(0); /* Clear V0 */ + fpu_vlvgf(0, crc, 3); /* Load CRC into rightmost word */ + + /* Load a 64-byte data chunk and XOR with CRC */ + fpu_vlm(1, 4, buf); + fpu_vperm(1, 1, 1, CONST_PERM_LE2BE); + fpu_vperm(2, 2, 2, CONST_PERM_LE2BE); + fpu_vperm(3, 3, 3, CONST_PERM_LE2BE); + fpu_vperm(4, 4, 4, CONST_PERM_LE2BE); + + fpu_vx(1, 0, 1); /* V1 ^= CRC */ + buf += 64; + size -= 64; + + while (size >= 64) { + fpu_vlm(5, 8, buf); + fpu_vperm(5, 5, 5, CONST_PERM_LE2BE); + fpu_vperm(6, 6, 6, CONST_PERM_LE2BE); + fpu_vperm(7, 7, 7, CONST_PERM_LE2BE); + fpu_vperm(8, 8, 8, CONST_PERM_LE2BE); + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the R1 and R2 reduction constants in V0. The intermediate + * result is then folded (accumulated) with the next data chunk + * in V5 and stored in V1. Repeat this step for the register + * contents in V2, V3, and V4 respectively. 
+ */ + fpu_vgfmag(1, CONST_R2R1, 1, 5); + fpu_vgfmag(2, CONST_R2R1, 2, 6); + fpu_vgfmag(3, CONST_R2R1, 3, 7); + fpu_vgfmag(4, CONST_R2R1, 4, 8); + buf += 64; + size -= 64; + } + + /* + * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3 + * and R4 and accumulating the next 128-bit chunk until a single 128-bit + * value remains. + */ + fpu_vgfmag(1, CONST_R4R3, 1, 2); + fpu_vgfmag(1, CONST_R4R3, 1, 3); + fpu_vgfmag(1, CONST_R4R3, 1, 4); + + while (size >= 16) { + fpu_vl(2, buf); + fpu_vperm(2, 2, 2, CONST_PERM_LE2BE); + fpu_vgfmag(1, CONST_R4R3, 1, 2); + buf += 16; + size -= 16; + } + + /* + * Set up a vector register for byte shifts. The shift value must + * be loaded in bits 1-4 in byte element 7 of a vector register. + * Shift by 8 bytes: 0x40 + * Shift by 4 bytes: 0x20 + */ + fpu_vleib(9, 0x40, 7); + + /* + * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes + * to move R4 into the rightmost doubleword and set the leftmost + * doubleword to 0x1. + */ + fpu_vsrlb(0, CONST_R4R3, 9); + fpu_vleig(0, 1, 0); + + /* + * Compute GF(2) product of V1 and V0. The rightmost doubleword + * of V1 is multiplied with R4. The leftmost doubleword of V1 is + * multiplied by 0x1 and is then XORed with rightmost product. + * Implicitly, the intermediate leftmost product becomes padded + */ + fpu_vgfmg(1, 0, 1); + + /* + * Now do the final 32-bit fold by multiplying the rightmost word + * in V1 with R5 and XOR the result with the remaining bits in V1. + * + * To achieve this by a single VGFMAG, right shift V1 by a word + * and store the result in V2 which is then accumulated. Use the + * vector unpack instruction to load the rightmost half of the + * doubleword into the rightmost doubleword element of V1; the other + * half is loaded in the leftmost doubleword. + * The vector register with CONST_R5 contains the R5 constant in the + * rightmost doubleword and the leftmost doubleword is zero to ignore + * the leftmost product of V1. + */ + fpu_vleib(9, 0x20, 7); /* Shift by words */ + fpu_vsrlb(2, 1, 9); /* Store remaining bits in V2 */ + fpu_vupllf(1, 1); /* Split rightmost doubleword */ + fpu_vgfmag(1, CONST_R5, 1, 2); /* V1 = (V1 * R5) XOR V2 */ + + /* + * Apply a Barret reduction to compute the final 32-bit CRC value. + * + * The input values to the Barret reduction are the degree-63 polynomial + * in V1 (R(x)), degree-32 generator polynomial, and the reduction + * constant u. The Barret reduction result is the CRC value of R(x) mod + * P(x). + * + * The Barret reduction algorithm is defined as: + * + * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + * 3. C(x) = R(x) XOR T2(x) mod x^32 + * + * Note: The leftmost doubleword of vector register containing + * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product + * is zero and does not contribute to the final result. + */ + + /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ + fpu_vupllf(2, 1); + fpu_vgfmg(2, CONST_RU_POLY, 2); + + /* + * Compute the GF(2) product of the CRC polynomial with T1(x) in + * V2 and XOR the intermediate result, T2(x), with the value in V1. + * The final result is stored in word element 2 of V2. 
+ */ + fpu_vupllf(2, 2); + fpu_vgfmag(2, CONST_CRC_POLY, 2, 1); + + return fpu_vlgvf(2, 2); +} + +u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size) +{ + return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32_LE[0]); +} + +u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size) +{ + return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32C_LE[0]); +} diff --git a/lib/crc/sparc/crc32.h b/lib/crc/sparc/crc32.h new file mode 100644 index 000000000000..60f2765ac015 --- /dev/null +++ b/lib/crc/sparc/crc32.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* CRC32c (Castagnoli), sparc64 crc32c opcode accelerated + * + * This is based largely upon arch/x86/crypto/crc32c-intel.c + * + * Copyright (C) 2008 Intel Corporation + * Authors: Austin Zhang <austin_zhang@linux.intel.com> + * Kent Liu <kent.liu@intel.com> + */ + +#include <asm/pstate.h> +#include <asm/elf.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32c_opcode); + +#define crc32_le_arch crc32_le_base /* not implemented on this arch */ +#define crc32_be_arch crc32_be_base /* not implemented on this arch */ + +void crc32c_sparc64(u32 *crcp, const u64 *data, size_t len); + +static inline u32 crc32c_arch(u32 crc, const u8 *data, size_t len) +{ + size_t n = -(uintptr_t)data & 7; + + if (!static_branch_likely(&have_crc32c_opcode)) + return crc32c_base(crc, data, len); + + if (n) { + /* Data isn't 8-byte aligned. Align it. */ + n = min(n, len); + crc = crc32c_base(crc, data, n); + data += n; + len -= n; + } + n = len & ~7U; + if (n) { + crc32c_sparc64(&crc, (const u64 *)data, n); + data += n; + len -= n; + } + if (len) + crc = crc32c_base(crc, data, len); + return crc; +} + +#define crc32_mod_init_arch crc32_mod_init_arch +static inline void crc32_mod_init_arch(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_CRC32C)) + return; + + static_branch_enable(&have_crc32c_opcode); + pr_info("Using sparc64 crc32c opcode optimized CRC32C implementation\n"); +} + +static inline u32 crc32_optimizations_arch(void) +{ + if (static_key_enabled(&have_crc32c_opcode)) + return CRC32C_OPTIMIZATION; + return 0; +} diff --git a/lib/crc/sparc/crc32c_asm.S b/lib/crc/sparc/crc32c_asm.S new file mode 100644 index 000000000000..4db873850f44 --- /dev/null +++ b/lib/crc/sparc/crc32c_asm.S @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/opcodes.h> +#include <asm/visasm.h> +#include <asm/asi.h> + +ENTRY(crc32c_sparc64) + /* %o0=crc32p, %o1=data_ptr, %o2=len */ + VISEntryHalf + lda [%o0] ASI_PL, %f1 +1: ldd [%o1], %f2 + CRC32C(0,2,0) + subcc %o2, 8, %o2 + bne,pt %icc, 1b + add %o1, 0x8, %o1 + sta %f1, [%o0] ASI_PL + VISExitHalf +2: retl + nop +ENDPROC(crc32c_sparc64) diff --git a/lib/crc/tests/Makefile b/lib/crc/tests/Makefile new file mode 100644 index 000000000000..65f63c318ef5 --- /dev/null +++ b/lib/crc/tests/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o diff --git a/lib/tests/crc_kunit.c b/lib/crc/tests/crc_kunit.c index 585c48b65cef..f08d985d8860 100644 --- a/lib/tests/crc_kunit.c +++ b/lib/crc/tests/crc_kunit.c @@ -36,14 +36,12 @@ static size_t test_buflen; * can fit any CRC up to CRC-64. The CRC is passed in, and is expected * to be returned in, the least significant bits of the u64. The * function is expected to *not* invert the CRC at the beginning and end. 
- * @combine_func: Optional function to combine two CRCs. */ struct crc_variant { int bits; bool le; u64 poly; u64 (*func)(u64 crc, const u8 *p, size_t len); - u64 (*combine_func)(u64 crc1, u64 crc2, size_t len2); }; static u32 rand32(void) @@ -144,7 +142,7 @@ static size_t generate_random_length(size_t max_length) } /* Test that v->func gives the same CRCs as a reference implementation. */ -static void crc_main_test(struct kunit *test, const struct crc_variant *v) +static void crc_test(struct kunit *test, const struct crc_variant *v) { size_t i; @@ -188,35 +186,6 @@ static void crc_main_test(struct kunit *test, const struct crc_variant *v) } } -/* Test that CRC(concat(A, B)) == combine_CRCs(CRC(A), CRC(B), len(B)). */ -static void crc_combine_test(struct kunit *test, const struct crc_variant *v) -{ - int i; - - for (i = 0; i < 100; i++) { - u64 init_crc = generate_random_initial_crc(v); - size_t len1 = generate_random_length(CRC_KUNIT_MAX_LEN); - size_t len2 = generate_random_length(CRC_KUNIT_MAX_LEN - len1); - u64 crc1, crc2, expected_crc, actual_crc; - - prandom_bytes_state(&rng, test_buffer, len1 + len2); - crc1 = v->func(init_crc, test_buffer, len1); - crc2 = v->func(0, &test_buffer[len1], len2); - expected_crc = v->func(init_crc, test_buffer, len1 + len2); - actual_crc = v->combine_func(crc1, crc2, len2); - KUNIT_EXPECT_EQ_MSG(test, expected_crc, actual_crc, - "CRC combination gave wrong result with len1=%zu len2=%zu\n", - len1, len2); - } -} - -static void crc_test(struct kunit *test, const struct crc_variant *v) -{ - crc_main_test(test, v); - if (v->combine_func) - crc_combine_test(test, v); -} - static __always_inline void crc_benchmark(struct kunit *test, u64 (*crc_func)(u64 crc, const u8 *p, size_t len)) @@ -337,17 +306,11 @@ static u64 crc32_le_wrapper(u64 crc, const u8 *p, size_t len) return crc32_le(crc, p, len); } -static u64 crc32_le_combine_wrapper(u64 crc1, u64 crc2, size_t len2) -{ - return crc32_le_combine(crc1, crc2, len2); -} - static const struct crc_variant crc_variant_crc32_le = { .bits = 32, .le = true, .poly = 0xedb88320, .func = crc32_le_wrapper, - .combine_func = crc32_le_combine_wrapper, }; static void crc32_le_test(struct kunit *test) @@ -391,17 +354,11 @@ static u64 crc32c_wrapper(u64 crc, const u8 *p, size_t len) return crc32c(crc, p, len); } -static u64 crc32c_combine_wrapper(u64 crc1, u64 crc2, size_t len2) -{ - return crc32c_combine(crc1, crc2, len2); -} - static const struct crc_variant crc_variant_crc32c = { .bits = 32, .le = true, .poly = 0x82f63b78, .func = crc32c_wrapper, - .combine_func = crc32c_combine_wrapper, }; static void crc32c_test(struct kunit *test) diff --git a/lib/crc/x86/crc-pclmul-consts.h b/lib/crc/x86/crc-pclmul-consts.h new file mode 100644 index 000000000000..6ae94158fca2 --- /dev/null +++ b/lib/crc/x86/crc-pclmul-consts.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * CRC constants generated by: + * + * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5 + * + * Do not edit manually. 
+ */ + +/* + * CRC folding constants generated for most-significant-bit-first CRC-16 using + * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 + */ +static const struct { + u8 bswap_mask[16]; + u64 fold_across_2048_bits_consts[2]; + u64 fold_across_1024_bits_consts[2]; + u64 fold_across_512_bits_consts[2]; + u64 fold_across_256_bits_consts[2]; + u64 fold_across_128_bits_consts[2]; + u8 shuf_table[48]; + u64 barrett_reduction_consts[2]; +} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = { + .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, + .fold_across_2048_bits_consts = { + 0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */ + 0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */ + }, + .fold_across_1024_bits_consts = { + 0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */ + 0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */ + }, + .fold_across_512_bits_consts = { + 0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */ + 0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */ + }, + .fold_across_256_bits_consts = { + 0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */ + 0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */ + }, + .fold_across_128_bits_consts = { + 0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */ + 0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */ + }, + .shuf_table = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + }, + .barrett_reduction_consts = { + 0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */ + 0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */ + }, +}; + +/* + * CRC folding constants generated for least-significant-bit-first CRC-32 using + * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 + + * x^5 + x^4 + x^2 + x^1 + x^0 + */ +static const struct { + u64 fold_across_2048_bits_consts[2]; + u64 fold_across_1024_bits_consts[2]; + u64 fold_across_512_bits_consts[2]; + u64 fold_across_256_bits_consts[2]; + u64 fold_across_128_bits_consts[2]; + u8 shuf_table[48]; + u64 barrett_reduction_consts[2]; +} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = { + .fold_across_2048_bits_consts = { + 0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */ + 0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */ + }, + .fold_across_1024_bits_consts = { + 0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */ + 0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */ + }, + .fold_across_512_bits_consts = { + 0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */ + 0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */ + }, + .fold_across_256_bits_consts = { + 0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */ + 0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */ + }, + .fold_across_128_bits_consts = { + 0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */ + 0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */ + }, + .shuf_table = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + }, + .barrett_reduction_consts = { + 0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */ + 0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */ + }, +}; + +/* + * CRC folding 
constants generated for least-significant-bit-first CRC-32 using + * G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 + + * x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0 + */ +static const struct { + u64 fold_across_2048_bits_consts[2]; + u64 fold_across_1024_bits_consts[2]; + u64 fold_across_512_bits_consts[2]; + u64 fold_across_256_bits_consts[2]; + u64 fold_across_128_bits_consts[2]; + u8 shuf_table[48]; + u64 barrett_reduction_consts[2]; +} crc32_lsb_0x82f63b78_consts ____cacheline_aligned __maybe_unused = { + .fold_across_2048_bits_consts = { + 0x00000000dcb17aa4, /* HI64_TERMS: (x^2079 mod G) * x^32 */ + 0x00000000b9e02b86, /* LO64_TERMS: (x^2015 mod G) * x^32 */ + }, + .fold_across_1024_bits_consts = { + 0x000000006992cea2, /* HI64_TERMS: (x^1055 mod G) * x^32 */ + 0x000000000d3b6092, /* LO64_TERMS: (x^991 mod G) * x^32 */ + }, + .fold_across_512_bits_consts = { + 0x00000000740eef02, /* HI64_TERMS: (x^543 mod G) * x^32 */ + 0x000000009e4addf8, /* LO64_TERMS: (x^479 mod G) * x^32 */ + }, + .fold_across_256_bits_consts = { + 0x000000003da6d0cb, /* HI64_TERMS: (x^287 mod G) * x^32 */ + 0x00000000ba4fc28e, /* LO64_TERMS: (x^223 mod G) * x^32 */ + }, + .fold_across_128_bits_consts = { + 0x00000000f20c0dfe, /* HI64_TERMS: (x^159 mod G) * x^32 */ + 0x00000000493c7d27, /* LO64_TERMS: (x^95 mod G) * x^32 */ + }, + .shuf_table = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + }, + .barrett_reduction_consts = { + 0x4869ec38dea713f1, /* HI64_TERMS: floor(x^95 / G) */ + 0x0000000105ec76f0, /* LO64_TERMS: (G - x^32) * x^31 */ + }, +}; + +/* + * CRC folding constants generated for most-significant-bit-first CRC-64 using + * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + + * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + + * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + + * x^7 + x^4 + x^1 + x^0 + */ +static const struct { + u8 bswap_mask[16]; + u64 fold_across_2048_bits_consts[2]; + u64 fold_across_1024_bits_consts[2]; + u64 fold_across_512_bits_consts[2]; + u64 fold_across_256_bits_consts[2]; + u64 fold_across_128_bits_consts[2]; + u8 shuf_table[48]; + u64 barrett_reduction_consts[2]; +} crc64_msb_0x42f0e1eba9ea3693_consts ____cacheline_aligned __maybe_unused = { + .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, + .fold_across_2048_bits_consts = { + 0x7f52691a60ddc70d, /* LO64_TERMS: (x^2048 mod G) * x^0 */ + 0x7036b0389f6a0c82, /* HI64_TERMS: (x^2112 mod G) * x^0 */ + }, + .fold_across_1024_bits_consts = { + 0x05cf79dea9ac37d6, /* LO64_TERMS: (x^1024 mod G) * x^0 */ + 0x001067e571d7d5c2, /* HI64_TERMS: (x^1088 mod G) * x^0 */ + }, + .fold_across_512_bits_consts = { + 0x5f6843ca540df020, /* LO64_TERMS: (x^512 mod G) * x^0 */ + 0xddf4b6981205b83f, /* HI64_TERMS: (x^576 mod G) * x^0 */ + }, + .fold_across_256_bits_consts = { + 0x571bee0a227ef92b, /* LO64_TERMS: (x^256 mod G) * x^0 */ + 0x44bef2a201b5200c, /* HI64_TERMS: (x^320 mod G) * x^0 */ + }, + .fold_across_128_bits_consts = { + 0x05f5c3c7eb52fab6, /* LO64_TERMS: (x^128 mod G) * x^0 */ + 0x4eb938a7d257740e, /* HI64_TERMS: (x^192 mod G) * x^0 */ + }, + .shuf_table = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + }, + 
.barrett_reduction_consts = { + 0x42f0e1eba9ea3693, /* LO64_TERMS: (G - x^64) * x^0 */ + 0x578d29d06cc4f872, /* HI64_TERMS: (floor(x^127 / G) * x) - x^64 */ + }, +}; + +/* + * CRC folding constants generated for least-significant-bit-first CRC-64 using + * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 + + * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 + + * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 + + * x^4 + x^3 + x^0 + */ +static const struct { + u64 fold_across_2048_bits_consts[2]; + u64 fold_across_1024_bits_consts[2]; + u64 fold_across_512_bits_consts[2]; + u64 fold_across_256_bits_consts[2]; + u64 fold_across_128_bits_consts[2]; + u8 shuf_table[48]; + u64 barrett_reduction_consts[2]; +} crc64_lsb_0x9a6c9329ac4bc9b5_consts ____cacheline_aligned __maybe_unused = { + .fold_across_2048_bits_consts = { + 0x37ccd3e14069cabc, /* HI64_TERMS: (x^2111 mod G) * x^0 */ + 0xa043808c0f782663, /* LO64_TERMS: (x^2047 mod G) * x^0 */ + }, + .fold_across_1024_bits_consts = { + 0xa1ca681e733f9c40, /* HI64_TERMS: (x^1087 mod G) * x^0 */ + 0x5f852fb61e8d92dc, /* LO64_TERMS: (x^1023 mod G) * x^0 */ + }, + .fold_across_512_bits_consts = { + 0x0c32cdb31e18a84a, /* HI64_TERMS: (x^575 mod G) * x^0 */ + 0x62242240ace5045a, /* LO64_TERMS: (x^511 mod G) * x^0 */ + }, + .fold_across_256_bits_consts = { + 0xb0bc2e589204f500, /* HI64_TERMS: (x^319 mod G) * x^0 */ + 0xe1e0bb9d45d7a44c, /* LO64_TERMS: (x^255 mod G) * x^0 */ + }, + .fold_across_128_bits_consts = { + 0xeadc41fd2ba3d420, /* HI64_TERMS: (x^191 mod G) * x^0 */ + 0x21e9761e252621ac, /* LO64_TERMS: (x^127 mod G) * x^0 */ + }, + .shuf_table = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + }, + .barrett_reduction_consts = { + 0x27ecfa329aef9f77, /* HI64_TERMS: floor(x^127 / G) */ + 0x34d926535897936a, /* LO64_TERMS: (G - x^64 - x^0) / x */ + }, +}; diff --git a/lib/crc/x86/crc-pclmul-template.S b/lib/crc/x86/crc-pclmul-template.S new file mode 100644 index 000000000000..a02f7dc8053e --- /dev/null +++ b/lib/crc/x86/crc-pclmul-template.S @@ -0,0 +1,575 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +// +// Template to generate [V]PCLMULQDQ-based CRC functions for x86 +// +// Copyright 2025 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> + +#include <linux/linkage.h> +#include <linux/objtool.h> + +// Offsets within the generated constants table +.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only +.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next +.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next +.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next +.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next +.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0 +.set OFFSETOF_SHUF_TABLE, 1*16 +.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16 + +// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the +// corresponding non-VEX instruction plus any needed moves. The supported +// instruction formats are: +// +// - Two-arg [src, dst], where the non-VEX format is the same. +// - Three-arg [src1, src2, dst] where the non-VEX format is +// [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too. 
+// +// \insn gives the instruction without a "v" prefix and including any immediate +// argument if needed to make the instruction follow one of the above formats. +// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to +// it first; this is needed when \arg1 is an unaligned mem operand. +.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp +.if AVX_LEVEL == 0 + // VEX not allowed. Emulate it. + .ifnb \arg3 // Three-arg [src1, src2, dst] + .ifc "\arg2", "\arg3" // src2 == dst? + .ifnb \unaligned_mem_tmp + movdqu \arg1, \unaligned_mem_tmp + \insn \unaligned_mem_tmp, \arg3 + .else + \insn \arg1, \arg3 + .endif + .else // src2 != dst + .ifc "\arg1", "\arg3" + .error "Can't have src1 == dst when src2 != dst" + .endif + .ifnb \unaligned_mem_tmp + movdqu \arg1, \unaligned_mem_tmp + movdqa \arg2, \arg3 + \insn \unaligned_mem_tmp, \arg3 + .else + movdqa \arg2, \arg3 + \insn \arg1, \arg3 + .endif + .endif + .else // Two-arg [src, dst] + .ifnb \unaligned_mem_tmp + movdqu \arg1, \unaligned_mem_tmp + \insn \unaligned_mem_tmp, \arg2 + .else + \insn \arg1, \arg2 + .endif + .endif +.else + // VEX is allowed. Emit the desired instruction directly. + .ifnb \arg3 + v\insn \arg1, \arg2, \arg3 + .else + v\insn \arg1, \arg2 + .endif +.endif +.endm + +// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector +// register of length VL. +.macro _vbroadcast src, dst +.if VL == 16 + _cond_vex movdqa, \src, \dst +.elseif VL == 32 + vbroadcasti128 \src, \dst +.else + vbroadcasti32x4 \src, \dst +.endif +.endm + +// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC +// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane. +.macro _load_data vl, src, bswap_mask, dst +.if \vl < 64 + _cond_vex movdqu, "\src", \dst +.else + vmovdqu8 \src, \dst +.endif +.if !LSB_CRC + _cond_vex pshufb, \bswap_mask, \dst, \dst +.endif +.endm + +.macro _prepare_v0 vl, v0, v1, bswap_mask +.if LSB_CRC + .if \vl < 64 + _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1 + .else + vpxorq (BUF), \v0, \v0 + .endif +.else + _load_data \vl, (BUF), \bswap_mask, \v1 + .if \vl < 64 + _cond_vex pxor, \v1, \v0, \v0 + .else + vpxorq \v1, \v0, \v0 + .endif +.endif +.endm + +// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for +// msb-first order or the physically high qword for lsb-first order +#define LO64_TERMS 0 + +// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high +// qword for msb-first order or the physically low qword for lsb-first order +#define HI64_TERMS 1 + +// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given +// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst. +.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst + _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \ + \src1, \src2, \dst +.endm + +// Fold \acc into \data and store the result back into \acc. \data can be an +// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no +// byte-reflection is needed; otherwise it must be a vector register. \consts +// is a vector register containing the needed fold constants, and \tmp is a +// temporary vector register. All arguments must be the same length. 
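+//
+// Roughly, in C-style pseudocode (a sketch only -- clmul() here denotes a
+// 64 x 64 -> 128-bit carryless multiplication, not an existing helper):
+//
+//	acc = clmul(consts.HI64_TERMS, acc.HI64_TERMS) ^
+//	      clmul(consts.LO64_TERMS, acc.LO64_TERMS) ^ data;
+//
+// i.e. the two 64-bit halves of each 128-bit lane of \acc are multiplied by
+// the corresponding fold constants, and the products are XORed with \data.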
+.macro _fold_vec acc, data, consts, tmp + _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp + _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc +.if AVX_LEVEL <= 2 + _cond_vex pxor, \data, \tmp, \tmp + _cond_vex pxor, \tmp, \acc, \acc +.else + vpternlogq $0x96, \data, \tmp, \acc +.endif +.endm + +// Fold \acc into \data and store the result back into \acc. \data is an +// unaligned mem operand, \consts is a vector register containing the needed +// fold constants, \bswap_mask is a vector register containing the +// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are +// temporary vector registers. All arguments must have length \vl. +.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2 +.if AVX_LEVEL == 0 || !LSB_CRC + _load_data \vl, \data, \bswap_mask, \tmp1 + _fold_vec \acc, \tmp1, \consts, \tmp2 +.else + _fold_vec \acc, \data, \consts, \tmp1 +.endif +.endm + +// Load the constants for folding across 2**i vectors of length VL at a time +// into all 128-bit lanes of the vector register CONSTS. +.macro _load_vec_folding_consts i + _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \ + CONSTS +.endm + +// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store +// the result back into \v0. If the remaining length mod \vl is nonzero, also +// fold \vl data bytes from BUF. For both operations the fold distance is \vl. +// \consts must be a register of length \vl containing the fold constants. +.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2 + _fold_vec \v0, \v1, \consts, \tmp1 + test $\vl, LEN8 + jz .Lfold_vec_final_done\@ + _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2 + add $\vl, BUF +.Lfold_vec_final_done\@: +.endm + +// This macro generates the body of a CRC function with the following prototype: +// +// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts); +// +// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it. +// |buf| is the data to checksum. |len| is the data length in bytes, which must +// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts +// field of the constants struct that was generated for the chosen CRC variant. +// +// Moving onto the macro parameters, \n is the number of bits in the CRC, e.g. +// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If +// the file is compiled in i386 mode, then the maximum supported value is 32. +// +// \lsb_crc is 1 if the CRC processes the least significant bit of each byte +// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0 +// if the CRC processes the most significant bit of each byte first, i.e. maps +// bit0 to x^0, bit1 to x^1, bit7 to x^7. +// +// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64. +// +// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or +// 512 for AVX512. +// +// If \vl == 16 && \avx_level == 0, the generated code requires: +// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.) +// +// If \vl == 32 && \avx_level == 2, the generated code requires: +// VPCLMULQDQ && AVX2. +// +// If \vl == 64 && \avx_level == 512, the generated code requires: +// VPCLMULQDQ && AVX512BW && AVX512VL. +// +// Other \vl and \avx_level combinations are either not supported or not useful. 
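+//
+// As an example of how this macro gets used (see DEFINE_CRC_PCLMUL_FUNCS at
+// the end of this file and crc32-pclmul.S): the SSE function for the
+// lsb-first CRC-32 variants is generated by an instantiation equivalent to
+//
+//	_crc_pclmul	n=32, lsb_crc=1, vl=16, avx_level=0
+//
+// and is declared on the C side (crc-pclmul-template.h) as
+//
+//	u32 crc32_lsb_pclmul_sse(u32 crc, const u8 *p, size_t len,
+//				 const void *consts_ptr);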
+.macro _crc_pclmul n, lsb_crc, vl, avx_level + .set LSB_CRC, \lsb_crc + .set VL, \vl + .set AVX_LEVEL, \avx_level + + // Define aliases for the xmm, ymm, or zmm registers according to VL. +.irp i, 0,1,2,3,4,5,6,7 + .if VL == 16 + .set V\i, %xmm\i + .set LOG2_VL, 4 + .elseif VL == 32 + .set V\i, %ymm\i + .set LOG2_VL, 5 + .elseif VL == 64 + .set V\i, %zmm\i + .set LOG2_VL, 6 + .else + .error "Unsupported vector length" + .endif +.endr + // Define aliases for the function parameters. + // Note: when crc_t is shorter than u32, zero-extension to 32 bits is + // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed + // when crc_t is shorter than u64. +#ifdef __x86_64__ +.if \n <= 32 + .set CRC, %edi +.else + .set CRC, %rdi +.endif + .set BUF, %rsi + .set LEN, %rdx + .set LEN32, %edx + .set LEN8, %dl + .set CONSTS_PTR, %rcx +#else + // 32-bit support, assuming -mregparm=3 and not including support for + // CRC-64 (which would use both eax and edx to pass the crc parameter). + .set CRC, %eax + .set BUF, %edx + .set LEN, %ecx + .set LEN32, %ecx + .set LEN8, %cl + .set CONSTS_PTR, %ebx // Passed on stack +#endif + + // Define aliases for some local variables. V0-V5 are used without + // aliases (for accumulators, data, temporary values, etc). Staying + // within the first 8 vector registers keeps the code 32-bit SSE + // compatible and reduces the size of 64-bit SSE code slightly. + .set BSWAP_MASK, V6 + .set BSWAP_MASK_YMM, %ymm6 + .set BSWAP_MASK_XMM, %xmm6 + .set CONSTS, V7 + .set CONSTS_YMM, %ymm7 + .set CONSTS_XMM, %xmm7 + + // Use ANNOTATE_NOENDBR to suppress an objtool warning, since the + // functions generated by this macro are called only by static_call. + ANNOTATE_NOENDBR + +#ifdef __i386__ + push CONSTS_PTR + mov 8(%esp), CONSTS_PTR +#endif + + // Create a 128-bit vector that contains the initial CRC in the end + // representing the high-order polynomial coefficients, and the rest 0. + // If the CRC is msb-first, also load the byte-reflection table. +.if \n <= 32 + _cond_vex movd, CRC, %xmm0 +.else + _cond_vex movq, CRC, %xmm0 +.endif +.if !LSB_CRC + _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0 + _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK +.endif + + // Load the first vector of data and XOR the initial CRC into the + // appropriate end of the first 128-bit lane of data. If LEN < VL, then + // use a short vector and jump ahead to the final reduction. (LEN >= 16 + // is guaranteed here but not necessarily LEN >= VL.) +.if VL >= 32 + cmp $VL, LEN + jae .Lat_least_1vec\@ + .if VL == 64 + cmp $32, LEN32 + jb .Lless_than_32bytes\@ + _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM + add $32, BUF + jmp .Lreduce_256bits_to_128bits\@ +.Lless_than_32bytes\@: + .endif + _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM + add $16, BUF + vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM + jmp .Lcheck_for_partial_block\@ +.Lat_least_1vec\@: +.endif + _prepare_v0 VL, V0, V1, BSWAP_MASK + + // Handle VL <= LEN < 4*VL. + cmp $4*VL-1, LEN + ja .Lat_least_4vecs\@ + add $VL, BUF + // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector. + // If VL==16 then load fold_across_128_bits_consts first, as the final + // reduction depends on it and it won't be loaded anywhere else. + cmp $2*VL-1, LEN32 +.if VL == 16 + _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM +.endif + jbe .Lreduce_1vec_to_128bits\@ + // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to + // the reduction from 2 vectors. 
+ _load_data VL, (BUF), BSWAP_MASK, V1 + add $VL, BUF + jmp .Lreduce_2vecs_to_1\@ + +.Lat_least_4vecs\@: + // Load 3 more vectors of data. + _load_data VL, 1*VL(BUF), BSWAP_MASK, V1 + _load_data VL, 2*VL(BUF), BSWAP_MASK, V2 + _load_data VL, 3*VL(BUF), BSWAP_MASK, V3 + sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32 + add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32 + + // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next + // 4 vectors of data and write the result back to V0-V3. + cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32 + jbe .Lreduce_4vecs_to_2\@ + _load_vec_folding_consts 2 +.Lfold_4vecs_loop\@: + _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 + _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 + _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 + _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 + sub $-4*VL, BUF + add $-4*VL, LEN + cmp $4*VL-1, LEN + ja .Lfold_4vecs_loop\@ + + // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold + // two more vectors of data from BUF, if at least that much remains. +.Lreduce_4vecs_to_2\@: + _load_vec_folding_consts 1 + _fold_vec V0, V2, CONSTS, V4 + _fold_vec V1, V3, CONSTS, V4 + test $2*VL, LEN8 + jz .Lreduce_2vecs_to_1\@ + _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 + _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 + sub $-2*VL, BUF + + // Fold V0 into V1 and write the result back to V0. Then fold one more + // vector of data from BUF, if at least that much remains. +.Lreduce_2vecs_to_1\@: + _load_vec_folding_consts 0 + _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5 + +.Lreduce_1vec_to_128bits\@: +.if VL == 64 + // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of + // data from BUF, if at least that much remains. + vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM + vextracti64x4 $1, %zmm0, %ymm1 + _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5 +.Lreduce_256bits_to_128bits\@: +.endif +.if VL >= 32 + // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of + // data from BUF, if at least that much remains. + vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM + vextracti128 $1, %ymm0, %xmm1 + _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5 +.Lcheck_for_partial_block\@: +.endif + and $15, LEN32 + jz .Lreduce_128bits_to_crc\@ + + // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now + // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0 + // and B is the polynomial of the remaining LEN data bytes. To reduce + // this to 128 bits without needing fold constants for each possible + // LEN, rearrange this expression into C1*(x^128) + C2, where + // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128. + // Then fold C1 into C2, which is just another fold across 128 bits. + +.if !LSB_CRC || AVX_LEVEL == 0 + // Load the last 16 data bytes. Note that originally LEN was >= 16. + _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2 +.endif // Else will use vpblendvb mem operand later. +.if !LSB_CRC + neg LEN // Needed for indexing shuf_table +.endif + + // tmp = A*x^(8*LEN) mod x^128 + // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1] + // i.e. right-shift by LEN bytes. + // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN] + // i.e. left-shift by LEN bytes. 
+ _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3 + _cond_vex pshufb, %xmm3, %xmm0, %xmm1 + + // C1 = floor(A / x^(128 - 8*LEN)) + // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1] + // i.e. left-shift by 16-LEN bytes. + // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1] + // i.e. right-shift by 16-LEN bytes. + _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \ + %xmm0, %xmm0, unaligned_mem_tmp=%xmm4 + + // C2 = tmp + B. This is just a blend of tmp with the last 16 data + // bytes (reflected if msb-first). The blend mask is the shuffle table + // that was used to create tmp. 0 selects tmp, and 1 last16databytes. +.if AVX_LEVEL == 0 + movdqa %xmm0, %xmm4 + movdqa %xmm3, %xmm0 + pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand + movdqa %xmm4, %xmm0 +.elseif LSB_CRC + vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1 +.else + vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 +.endif + + // Fold C1 into C2 and store the 128-bit result in %xmm0. + _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4 + +.Lreduce_128bits_to_crc\@: + // Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit + // polynomial stored in %xmm0 (using either lsb-first or msb-first bit + // order according to LSB_CRC), and G is the CRC's generator polynomial. + + // First, multiply %xmm0 by x^n and reduce the result to 64+n bits: + // + // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) + + // x^n * (%xmm0 mod x^64) + // + // Store t0 * x^(64-n) in %xmm0. I.e., actually do: + // + // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) + + // x^64 * (%xmm0 mod x^64) + // + // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned + // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily + // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the + // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case + // (considering the extra factor of x that gets implicitly introduced by + // each pclmulqdq when using lsb-first order), is identical to the + // constant that was used earlier for folding the LO64_TERMS across 128 + // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM. + _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1 +.if LSB_CRC + _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64) +.else + _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64) +.endif + _cond_vex pxor, %xmm1, %xmm0, %xmm0 + // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n). + // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n). + + // First step of Barrett reduction: Compute floor(t0 / G). This is the + // polynomial by which G needs to be multiplied to cancel out the x^n + // and higher terms of t0, i.e. to reduce t0 mod G. First do: + // + // t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n) + // + // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in + // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest + // value that makes enough precision be carried through the calculation. + // + // The '* x' makes it so the result is floor(t1 / x^64) rather than + // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it + // can be extracted much more easily in the next step. In the lsb-first + // case the '* x' happens implicitly. 
In the msb-first case it must be + // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the + // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and + // the multiplication by the x^64 term is handled using a pxor. The + // pxor causes the low 64 terms of t1 to be wrong, but they are unused. + _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM + _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1 +.if !LSB_CRC + _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n) +.endif + // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G). + + // Second step of Barrett reduction: Cancel out the x^n and higher terms + // of t0 by subtracting the needed multiple of G. This gives the CRC: + // + // crc := t0 - (G * floor(t0 / G)) + // + // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do: + // + // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n) + // + // Furthermore, since the resulting CRC is n-bit, if mod x^n is + // explicitly applied to it then the x^n term of G makes no difference + // in the result and can be omitted. This helps keep the constant + // multiplier in 64 bits in most cases. This gives the following: + // + // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G)) + // crc := (%xmm0 / x^(64-n)) mod x^n + // + // In the lsb-first case, each pclmulqdq implicitly introduces + // an extra factor of x, so in that case the constant that needs to be + // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63. + // For lsb-first CRCs where n=64, the extra factor of x cannot be as + // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to + // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC + // polynomials have nonzero x^n and x^0 terms.) It works out as: the + // CRC has be XORed with the physically low qword of %xmm1, representing + // floor(t0 / G). The most efficient way to do that is to move it to + // the physically high qword and use a ternlog to combine the two XORs. +.if LSB_CRC && \n == 64 + _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2 + _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1 + .if AVX_LEVEL <= 2 + _cond_vex pxor, %xmm2, %xmm0, %xmm0 + _cond_vex pxor, %xmm1, %xmm0, %xmm0 + .else + vpternlogq $0x96, %xmm2, %xmm1, %xmm0 + .endif + _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64 +.else + _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1 + _cond_vex pxor, %xmm1, %xmm0, %xmm0 + .if \n == 8 + _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8 + .elseif \n == 16 + _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16 + .elseif \n == 32 + _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32 + .else // \n == 64 && !LSB_CRC + _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64 + .endif +.endif + +.if VL > 16 + vzeroupper // Needed when ymm or zmm registers may have been used. 
+.endif +#ifdef __i386__ + pop CONSTS_PTR +#endif + RET +.endm + +#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \ +SYM_FUNC_START(prefix##_pclmul_sse); \ + _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \ +SYM_FUNC_END(prefix##_pclmul_sse); \ + \ +SYM_FUNC_START(prefix##_vpclmul_avx2); \ + _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \ +SYM_FUNC_END(prefix##_vpclmul_avx2); \ + \ +SYM_FUNC_START(prefix##_vpclmul_avx512); \ + _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \ +SYM_FUNC_END(prefix##_vpclmul_avx512); diff --git a/lib/crc/x86/crc-pclmul-template.h b/lib/crc/x86/crc-pclmul-template.h new file mode 100644 index 000000000000..35c950d7010c --- /dev/null +++ b/lib/crc/x86/crc-pclmul-template.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are + * instantiated by crc-pclmul-template.S + * + * Copyright 2025 Google LLC + * + * Author: Eric Biggers <ebiggers@google.com> + */ +#ifndef _CRC_PCLMUL_TEMPLATE_H +#define _CRC_PCLMUL_TEMPLATE_H + +#include <asm/cpufeatures.h> +#include <asm/simd.h> +#include <crypto/internal/simd.h> +#include <linux/static_call.h> +#include "crc-pclmul-consts.h" + +#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \ +crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \ + const void *consts_ptr); \ +crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \ + const void *consts_ptr); \ +crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len, \ + const void *consts_ptr); \ +DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse) + +static inline bool have_vpclmul(void) +{ + return boot_cpu_has(X86_FEATURE_VPCLMULQDQ) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL); +} + +static inline bool have_avx512(void) +{ + return boot_cpu_has(X86_FEATURE_AVX512BW) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + !boot_cpu_has(X86_FEATURE_PREFER_YMM) && + cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL); +} + +/* + * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16 + * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD. + * + * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions. + * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(), + * varying by CPU and factors such as which parts of the "FPU" state userspace + * has touched, which could result in a larger cutoff being better. Indeed, a + * larger cutoff is usually better for a *single* message. However, the + * overhead of the FPU section gets amortized if multiple FPU sections get + * executed before returning to userspace, since the XSAVE and XRSTOR occur only + * once. Considering that and the fact that the [V]PCLMULQDQ code is lighter on + * the dcache than the table-based code is, a 16-byte cutoff seems to work well. 
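+ *
+ * As an illustration of the intended usage pattern (this mirrors the
+ * crc-t10dif.h user of this header), a call site looks like:
+ *
+ *	static inline u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
+ *	{
+ *		CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts,
+ *			   have_pclmulqdq);
+ *		return crc_t10dif_generic(crc, p, len);
+ *	}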
+ */ +#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \ +do { \ + if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \ + crypto_simd_usable()) { \ + const void *consts_ptr; \ + \ + consts_ptr = (consts).fold_across_128_bits_consts; \ + kernel_fpu_begin(); \ + crc = static_call(prefix##_pclmul)((crc), (p), (len), \ + consts_ptr); \ + kernel_fpu_end(); \ + return crc; \ + } \ +} while (0) + +#endif /* _CRC_PCLMUL_TEMPLATE_H */ diff --git a/lib/crc/x86/crc-t10dif.h b/lib/crc/x86/crc-t10dif.h new file mode 100644 index 000000000000..2a02a3026f3f --- /dev/null +++ b/lib/crc/x86/crc-t10dif.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * CRC-T10DIF using [V]PCLMULQDQ instructions + * + * Copyright 2024 Google LLC + */ + +#include "crc-pclmul-template.h" + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); + +DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16); + +static inline u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) +{ + CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts, + have_pclmulqdq); + return crc_t10dif_generic(crc, p, len); +} + +#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch +static inline void crc_t10dif_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { + static_branch_enable(&have_pclmulqdq); + if (have_vpclmul()) { + if (have_avx512()) + static_call_update(crc16_msb_pclmul, + crc16_msb_vpclmul_avx512); + else + static_call_update(crc16_msb_pclmul, + crc16_msb_vpclmul_avx2); + } + } +} diff --git a/lib/crc/x86/crc16-msb-pclmul.S b/lib/crc/x86/crc16-msb-pclmul.S new file mode 100644 index 000000000000..e9fe248093a8 --- /dev/null +++ b/lib/crc/x86/crc16-msb-pclmul.S @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +// Copyright 2025 Google LLC + +#include "crc-pclmul-template.S" + +DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0) diff --git a/lib/crc/x86/crc32-pclmul.S b/lib/crc/x86/crc32-pclmul.S new file mode 100644 index 000000000000..f20f40fb0172 --- /dev/null +++ b/lib/crc/x86/crc32-pclmul.S @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +// Copyright 2025 Google LLC + +#include "crc-pclmul-template.S" + +DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1) diff --git a/lib/crc/x86/crc32.h b/lib/crc/x86/crc32.h new file mode 100644 index 000000000000..cea2c96d08d0 --- /dev/null +++ b/lib/crc/x86/crc32.h @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * x86-optimized CRC32 functions + * + * Copyright (C) 2008 Intel Corporation + * Copyright 2012 Xyratex Technology Limited + * Copyright 2024 Google LLC + */ + +#include "crc-pclmul-template.h" + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512); + +DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32); + +static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts, + have_pclmulqdq); + return crc32_le_base(crc, p, len); +} + +#ifdef CONFIG_X86_64 +#define CRC32_INST "crc32q %1, %q0" +#else +#define CRC32_INST "crc32l %1, %0" +#endif + +/* + * Use carryless multiply version of crc32c when buffer size is >= 512 to + * account for FPU state save/restore overhead. 
+ */ +#define CRC32C_PCLMUL_BREAKEVEN 512 + +asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); + +static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + size_t num_longs; + + if (!static_branch_likely(&have_crc32)) + return crc32c_base(crc, p, len); + + if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN && + static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { + /* + * Long length, the vector registers are usable, and the CPU is + * 64-bit and supports both CRC32 and PCLMULQDQ instructions. + * It is worthwhile to divide the data into multiple streams, + * CRC them independently, and combine them using PCLMULQDQ. + * crc32c_x86_3way() does this using 3 streams, which is the + * most that x86_64 CPUs have traditionally been capable of. + * + * However, due to improved VPCLMULQDQ performance on newer + * CPUs, use crc32_lsb_vpclmul_avx512() instead of + * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a + * "good" implementation of AVX-512. + * + * Future work: the optimal strategy on Zen 3--5 is actually to + * use both crc32q and VPCLMULQDQ in parallel. Unfortunately, + * different numbers of streams and vector lengths are optimal + * on each CPU microarchitecture, making it challenging to take + * advantage of this. (Zen 5 even supports 7 parallel crc32q, a + * major upgrade.) For now, just choose between + * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512(). The latter + * is needed anyway for crc32_le(), so we just reuse it here. + */ + kernel_fpu_begin(); + if (static_branch_likely(&have_vpclmul_avx512)) + crc = crc32_lsb_vpclmul_avx512(crc, p, len, + crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts); + else + crc = crc32c_x86_3way(crc, p, len); + kernel_fpu_end(); + return crc; + } + + /* + * Short length, XMM registers unusable, or the CPU is 32-bit; but the + * CPU supports CRC32 instructions. Just issue a single stream of CRC32 + * instructions inline. While this doesn't use the CPU's CRC32 + * throughput very well, it avoids the need to combine streams. Stream + * combination would be inefficient here. 
+ */ + + for (num_longs = len / sizeof(unsigned long); + num_longs != 0; num_longs--, p += sizeof(unsigned long)) + asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p)); + + if (sizeof(unsigned long) > 4 && (len & 4)) { + asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p)); + p += 4; + } + if (len & 2) { + asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p)); + p += 2; + } + if (len & 1) + asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p)); + + return crc; +} + +#define crc32_be_arch crc32_be_base /* not implemented on this arch */ + +#define crc32_mod_init_arch crc32_mod_init_arch +static inline void crc32_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_XMM4_2)) + static_branch_enable(&have_crc32); + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { + static_branch_enable(&have_pclmulqdq); + if (have_vpclmul()) { + if (have_avx512()) { + static_call_update(crc32_lsb_pclmul, + crc32_lsb_vpclmul_avx512); + static_branch_enable(&have_vpclmul_avx512); + } else { + static_call_update(crc32_lsb_pclmul, + crc32_lsb_vpclmul_avx2); + } + } + } +} + +static inline u32 crc32_optimizations_arch(void) +{ + u32 optimizations = 0; + + if (static_key_enabled(&have_crc32)) + optimizations |= CRC32C_OPTIMIZATION; + if (static_key_enabled(&have_pclmulqdq)) + optimizations |= CRC32_LE_OPTIMIZATION; + return optimizations; +} diff --git a/lib/crc/x86/crc32c-3way.S b/lib/crc/x86/crc32c-3way.S new file mode 100644 index 000000000000..9b8770503bbc --- /dev/null +++ b/lib/crc/x86/crc32c-3way.S @@ -0,0 +1,360 @@ +/* + * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) + * + * The white papers on CRC32C calculations with PCLMULQDQ instruction can be + * downloaded from: + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf + * + * Copyright (C) 2012 Intel Corporation. + * Copyright 2024 Google LLC + * + * Authors: + * Wajdi Feghali <wajdi.k.feghali@intel.com> + * James Guilford <james.guilford@intel.com> + * David Cote <david.m.cote@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/linkage.h> + +## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +# Define threshold below which buffers are considered "small" and routed to +# regular CRC code that does not interleave the CRC instructions. +#define SMALL_SIZE 200 + +# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); + +.text +SYM_FUNC_START(crc32c_x86_3way) +#define crc0 %edi +#define crc0_q %rdi +#define bufp %rsi +#define bufp_d %esi +#define len %rdx +#define len_dw %edx +#define n_misaligned %ecx /* overlaps chunk_bytes! */ +#define n_misaligned_q %rcx +#define chunk_bytes %ecx /* overlaps n_misaligned! */ +#define chunk_bytes_q %rcx +#define crc1 %r8 +#define crc2 %r9 + + cmp $SMALL_SIZE, len + jb .Lsmall + + ################################################################ + ## 1) ALIGN: + ################################################################ + mov bufp_d, n_misaligned + neg n_misaligned + and $7, n_misaligned # calculate the misalignment amount of + # the address + je .Laligned # Skip if aligned + + # Process 1 <= n_misaligned <= 7 bytes individually in order to align + # the remaining data to an 8-byte boundary. +.Ldo_align: + movq (bufp), %rax + add n_misaligned_q, bufp + sub n_misaligned_q, len +.Lalign_loop: + crc32b %al, crc0 # compute crc32 of 1-byte + shr $8, %rax # get next byte + dec n_misaligned + jne .Lalign_loop +.Laligned: + + ################################################################ + ## 2) PROCESS BLOCK: + ################################################################ + + cmp $128*24, len + jae .Lfull_block + +.Lpartial_block: + # Compute floor(len / 24) to get num qwords to process from each lane. + imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24) + shr $16, %eax + jmp .Lcrc_3lanes + +.Lfull_block: + # Processing 128 qwords from each lane. + mov $128, %eax + + ################################################################ + ## 3) CRC each of three lanes: + ################################################################ + +.Lcrc_3lanes: + xor crc1,crc1 + xor crc2,crc2 + mov %eax, chunk_bytes + shl $3, chunk_bytes # num bytes to process from each lane + sub $5, %eax # 4 for 4x_loop, 1 for special last iter + jl .Lcrc_3lanes_4x_done + + # Unroll the loop by a factor of 4 to reduce the overhead of the loop + # bookkeeping instructions, which can compete with crc32q for the ALUs. 
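+	# Each iteration of the unrolled loop below consumes 32 bytes from each
+	# of the three lanes (4 x crc32q of 8 bytes per lane), 96 bytes in total.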
+.Lcrc_3lanes_4x_loop: + crc32q (bufp), crc0_q + crc32q (bufp,chunk_bytes_q), crc1 + crc32q (bufp,chunk_bytes_q,2), crc2 + crc32q 8(bufp), crc0_q + crc32q 8(bufp,chunk_bytes_q), crc1 + crc32q 8(bufp,chunk_bytes_q,2), crc2 + crc32q 16(bufp), crc0_q + crc32q 16(bufp,chunk_bytes_q), crc1 + crc32q 16(bufp,chunk_bytes_q,2), crc2 + crc32q 24(bufp), crc0_q + crc32q 24(bufp,chunk_bytes_q), crc1 + crc32q 24(bufp,chunk_bytes_q,2), crc2 + add $32, bufp + sub $4, %eax + jge .Lcrc_3lanes_4x_loop + +.Lcrc_3lanes_4x_done: + add $4, %eax + jz .Lcrc_3lanes_last_qword + +.Lcrc_3lanes_1x_loop: + crc32q (bufp), crc0_q + crc32q (bufp,chunk_bytes_q), crc1 + crc32q (bufp,chunk_bytes_q,2), crc2 + add $8, bufp + dec %eax + jnz .Lcrc_3lanes_1x_loop + +.Lcrc_3lanes_last_qword: + crc32q (bufp), crc0_q + crc32q (bufp,chunk_bytes_q), crc1 +# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet + + ################################################################ + ## 4) Combine three results: + ################################################################ + + lea (K_table-8)(%rip), %rax # first entry is for idx 1 + pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2 + lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3 + sub %rax, len # len -= chunk_bytes * 3 + + movq crc0_q, %xmm1 # CRC for block 1 + pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 + + movq crc1, %xmm2 # CRC for block 2 + pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 + + pxor %xmm2,%xmm1 + movq %xmm1, %rax + xor (bufp,chunk_bytes_q,2), %rax + mov crc2, crc0_q + crc32 %rax, crc0_q + lea 8(bufp,chunk_bytes_q,2), bufp + + ################################################################ + ## 5) If more blocks remain, goto (2): + ################################################################ + + cmp $128*24, len + jae .Lfull_block + cmp $SMALL_SIZE, len + jae .Lpartial_block + + ####################################################################### + ## 6) Process any remainder without interleaving: + ####################################################################### +.Lsmall: + test len_dw, len_dw + jz .Ldone + mov len_dw, %eax + shr $3, %eax + jz .Ldo_dword +.Ldo_qwords: + crc32q (bufp), crc0_q + add $8, bufp + dec %eax + jnz .Ldo_qwords +.Ldo_dword: + test $4, len_dw + jz .Ldo_word + crc32l (bufp), crc0 + add $4, bufp +.Ldo_word: + test $2, len_dw + jz .Ldo_byte + crc32w (bufp), crc0 + add $2, bufp +.Ldo_byte: + test $1, len_dw + jz .Ldone + crc32b (bufp), crc0 +.Ldone: + mov crc0, %eax + RET +SYM_FUNC_END(crc32c_x86_3way) + +.section .rodata, "a", @progbits + ################################################################ + ## PCLMULQDQ tables + ## Table is 128 entries x 2 words (8 bytes) each + ################################################################ +.align 8 +K_table: + .long 0x493c7d27, 0x00000001 + .long 0xba4fc28e, 0x493c7d27 + .long 0xddc0152b, 0xf20c0dfe + .long 0x9e4addf8, 0xba4fc28e + .long 0x39d3b296, 0x3da6d0cb + .long 0x0715ce53, 0xddc0152b + .long 0x47db8317, 0x1c291d04 + .long 0x0d3b6092, 0x9e4addf8 + .long 0xc96cfdc0, 0x740eef02 + .long 0x878a92a7, 0x39d3b296 + .long 0xdaece73e, 0x083a6eec + .long 0xab7aff2a, 0x0715ce53 + .long 0x2162d385, 0xc49f4f67 + .long 0x83348832, 0x47db8317 + .long 0x299847d5, 0x2ad91c30 + .long 0xb9e02b86, 0x0d3b6092 + .long 0x18b33a4e, 0x6992cea2 + .long 0xb6dd949b, 0xc96cfdc0 + .long 0x78d9ccb7, 0x7e908048 + .long 0xbac2fd7b, 0x878a92a7 + .long 0xa60ce07b, 0x1b3d8f29 + .long 0xce7f39f4, 0xdaece73e + .long 0x61d82e56, 0xf1d0f55e + .long 0xd270f1a2, 0xab7aff2a + .long 
0xc619809d, 0xa87ab8a8 + .long 0x2b3cac5d, 0x2162d385 + .long 0x65863b64, 0x8462d800 + .long 0x1b03397f, 0x83348832 + .long 0xebb883bd, 0x71d111a8 + .long 0xb3e32c28, 0x299847d5 + .long 0x064f7f26, 0xffd852c6 + .long 0xdd7e3b0c, 0xb9e02b86 + .long 0xf285651c, 0xdcb17aa4 + .long 0x10746f3c, 0x18b33a4e + .long 0xc7a68855, 0xf37c5aee + .long 0x271d9844, 0xb6dd949b + .long 0x8e766a0c, 0x6051d5a2 + .long 0x93a5f730, 0x78d9ccb7 + .long 0x6cb08e5c, 0x18b0d4ff + .long 0x6b749fb2, 0xbac2fd7b + .long 0x1393e203, 0x21f3d99c + .long 0xcec3662e, 0xa60ce07b + .long 0x96c515bb, 0x8f158014 + .long 0xe6fc4e6a, 0xce7f39f4 + .long 0x8227bb8a, 0xa00457f7 + .long 0xb0cd4768, 0x61d82e56 + .long 0x39c7ff35, 0x8d6d2c43 + .long 0xd7a4825c, 0xd270f1a2 + .long 0x0ab3844b, 0x00ac29cf + .long 0x0167d312, 0xc619809d + .long 0xf6076544, 0xe9adf796 + .long 0x26f6a60a, 0x2b3cac5d + .long 0xa741c1bf, 0x96638b34 + .long 0x98d8d9cb, 0x65863b64 + .long 0x49c3cc9c, 0xe0e9f351 + .long 0x68bce87a, 0x1b03397f + .long 0x57a3d037, 0x9af01f2d + .long 0x6956fc3b, 0xebb883bd + .long 0x42d98888, 0x2cff42cf + .long 0x3771e98f, 0xb3e32c28 + .long 0xb42ae3d9, 0x88f25a3a + .long 0x2178513a, 0x064f7f26 + .long 0xe0ac139e, 0x4e36f0b0 + .long 0x170076fa, 0xdd7e3b0c + .long 0x444dd413, 0xbd6f81f8 + .long 0x6f345e45, 0xf285651c + .long 0x41d17b64, 0x91c9bd4b + .long 0xff0dba97, 0x10746f3c + .long 0xa2b73df1, 0x885f087b + .long 0xf872e54c, 0xc7a68855 + .long 0x1e41e9fc, 0x4c144932 + .long 0x86d8e4d2, 0x271d9844 + .long 0x651bd98b, 0x52148f02 + .long 0x5bb8f1bc, 0x8e766a0c + .long 0xa90fd27a, 0xa3c6f37a + .long 0xb3af077a, 0x93a5f730 + .long 0x4984d782, 0xd7c0557f + .long 0xca6ef3ac, 0x6cb08e5c + .long 0x234e0b26, 0x63ded06a + .long 0xdd66cbbb, 0x6b749fb2 + .long 0x4597456a, 0x4d56973c + .long 0xe9e28eb4, 0x1393e203 + .long 0x7b3ff57a, 0x9669c9df + .long 0xc9c8b782, 0xcec3662e + .long 0x3f70cc6f, 0xe417f38a + .long 0x93e106a4, 0x96c515bb + .long 0x62ec6c6d, 0x4b9e0f71 + .long 0xd813b325, 0xe6fc4e6a + .long 0x0df04680, 0xd104b8fc + .long 0x2342001e, 0x8227bb8a + .long 0x0a2a8d7e, 0x5b397730 + .long 0x6d9a4957, 0xb0cd4768 + .long 0xe8b6368b, 0xe78eb416 + .long 0xd2c3ed1a, 0x39c7ff35 + .long 0x995a5724, 0x61ff0e01 + .long 0x9ef68d35, 0xd7a4825c + .long 0x0c139b31, 0x8d96551c + .long 0xf2271e60, 0x0ab3844b + .long 0x0b0bf8ca, 0x0bf80dd2 + .long 0x2664fd8b, 0x0167d312 + .long 0xed64812d, 0x8821abed + .long 0x02ee03b2, 0xf6076544 + .long 0x8604ae0f, 0x6a45d2b2 + .long 0x363bd6b3, 0x26f6a60a + .long 0x135c83fd, 0xd8d26619 + .long 0x5fabe670, 0xa741c1bf + .long 0x35ec3279, 0xde87806c + .long 0x00bcf5f6, 0x98d8d9cb + .long 0x8ae00689, 0x14338754 + .long 0x17f27698, 0x49c3cc9c + .long 0x58ca5f00, 0x5bd2011f + .long 0xaa7c7ad5, 0x68bce87a + .long 0xb5cfca28, 0xdd07448e + .long 0xded288f8, 0x57a3d037 + .long 0x59f229bc, 0xdde8f5b9 + .long 0x6d390dec, 0x6956fc3b + .long 0x37170390, 0xa3e3e02c + .long 0x6353c1cc, 0x42d98888 + .long 0xc4584f5c, 0xd73c7bea + .long 0xf48642e9, 0x3771e98f + .long 0x531377e2, 0x80ff0093 + .long 0xdd35bc8d, 0xb42ae3d9 + .long 0xb25b29f2, 0x8fe4c34d + .long 0x9a5ede41, 0x2178513a + .long 0xa563905d, 0xdf99fc11 + .long 0x45cddf4e, 0xe0ac139e + .long 0xacfa3103, 0x6c23e841 + .long 0xa51b6135, 0x170076fa diff --git a/lib/crc/x86/crc64-pclmul.S b/lib/crc/x86/crc64-pclmul.S new file mode 100644 index 000000000000..4173051b5197 --- /dev/null +++ b/lib/crc/x86/crc64-pclmul.S @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +// Copyright 2025 Google LLC + +#include "crc-pclmul-template.S" + +DEFINE_CRC_PCLMUL_FUNCS(crc64_msb, 
/* bits= */ 64, /* lsb= */ 0) +DEFINE_CRC_PCLMUL_FUNCS(crc64_lsb, /* bits= */ 64, /* lsb= */ 1) diff --git a/lib/crc/x86/crc64.h b/lib/crc/x86/crc64.h new file mode 100644 index 000000000000..fde1222c4c58 --- /dev/null +++ b/lib/crc/x86/crc64.h @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * CRC64 using [V]PCLMULQDQ instructions + * + * Copyright 2025 Google LLC + */ + +#include "crc-pclmul-template.h" + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); + +DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64); +DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64); + +static inline u64 crc64_be_arch(u64 crc, const u8 *p, size_t len) +{ + CRC_PCLMUL(crc, p, len, crc64_msb, crc64_msb_0x42f0e1eba9ea3693_consts, + have_pclmulqdq); + return crc64_be_generic(crc, p, len); +} + +static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) +{ + CRC_PCLMUL(crc, p, len, crc64_lsb, crc64_lsb_0x9a6c9329ac4bc9b5_consts, + have_pclmulqdq); + return crc64_nvme_generic(crc, p, len); +} + +#define crc64_mod_init_arch crc64_mod_init_arch +static inline void crc64_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { + static_branch_enable(&have_pclmulqdq); + if (have_vpclmul()) { + if (have_avx512()) { + static_call_update(crc64_msb_pclmul, + crc64_msb_vpclmul_avx512); + static_call_update(crc64_lsb_pclmul, + crc64_lsb_vpclmul_avx512); + } else { + static_call_update(crc64_msb_pclmul, + crc64_msb_vpclmul_avx2); + static_call_update(crc64_lsb_pclmul, + crc64_lsb_vpclmul_avx2); + } + } + } +} diff --git a/lib/crc32.c b/lib/crc32.c deleted file mode 100644 index fddd424ff224..000000000000 --- a/lib/crc32.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin - * cleaned up code to current version of sparse and added the slicing-by-8 - * algorithm to the closely similar existing slicing-by-4 algorithm. - * - * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com> - * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks! - * Code was from the public domain, copyright abandoned. Code was - * subsequently included in the kernel, thus was re-licensed under the - * GNU GPL v2. - * - * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com> - * Same crc32 function was used in 5 other places in the kernel. - * I made one version, and deleted the others. - * There are various incantations of crc32(). Some use a seed of 0 or ~0. - * Some xor at the end with ~0. The generic crc32() function takes - * seed as an argument, and doesn't xor at the end. Then individual - * users can do whatever they need. - * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0. - * fs/jffs2 uses seed 0, doesn't xor with ~0. - * fs/partitions/efi.c uses seed ~0, xor's with ~0. - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. 
- */ - -/* see: Documentation/staging/crc32.rst for a description of algorithms */ - -#include <linux/crc32.h> -#include <linux/crc32poly.h> -#include <linux/module.h> -#include <linux/types.h> - -#include "crc32table.h" - -MODULE_AUTHOR("Matt Domsch <Matt_Domsch@dell.com>"); -MODULE_DESCRIPTION("Various CRC32 calculations"); -MODULE_LICENSE("GPL"); - -u32 crc32_le_base(u32 crc, const u8 *p, size_t len) -{ - while (len--) - crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++]; - return crc; -} -EXPORT_SYMBOL(crc32_le_base); - -u32 crc32c_base(u32 crc, const u8 *p, size_t len) -{ - while (len--) - crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++]; - return crc; -} -EXPORT_SYMBOL(crc32c_base); - -/* - * This multiplies the polynomials x and y modulo the given modulus. - * This follows the "little-endian" CRC convention that the lsbit - * represents the highest power of x, and the msbit represents x^0. - */ -static u32 gf2_multiply(u32 x, u32 y, u32 modulus) -{ - u32 product = x & 1 ? y : 0; - int i; - - for (i = 0; i < 31; i++) { - product = (product >> 1) ^ (product & 1 ? modulus : 0); - x >>= 1; - product ^= x & 1 ? y : 0; - } - - return product; -} - -/** - * crc32_generic_shift - Append @len 0 bytes to crc, in logarithmic time - * @crc: The original little-endian CRC (i.e. lsbit is x^31 coefficient) - * @len: The number of bytes. @crc is multiplied by x^(8*@len) - * @polynomial: The modulus used to reduce the result to 32 bits. - * - * It's possible to parallelize CRC computations by computing a CRC - * over separate ranges of a buffer, then summing them. - * This shifts the given CRC by 8*len bits (i.e. produces the same effect - * as appending len bytes of zero to the data), in time proportional - * to log(len). - */ -static u32 crc32_generic_shift(u32 crc, size_t len, u32 polynomial) -{ - u32 power = polynomial; /* CRC of x^32 */ - int i; - - /* Shift up to 32 bits in the simple linear way */ - for (i = 0; i < 8 * (int)(len & 3); i++) - crc = (crc >> 1) ^ (crc & 1 ? polynomial : 0); - - len >>= 2; - if (!len) - return crc; - - for (;;) { - /* "power" is x^(2^i), modulo the polynomial */ - if (len & 1) - crc = gf2_multiply(crc, power, polynomial); - - len >>= 1; - if (!len) - break; - - /* Square power, advancing to x^(2^(i+1)) */ - power = gf2_multiply(power, power, polynomial); - } - - return crc; -} - -u32 crc32_le_shift(u32 crc, size_t len) -{ - return crc32_generic_shift(crc, len, CRC32_POLY_LE); -} -EXPORT_SYMBOL(crc32_le_shift); - -u32 crc32c_shift(u32 crc, size_t len) -{ - return crc32_generic_shift(crc, len, CRC32C_POLY_LE); -} -EXPORT_SYMBOL(crc32c_shift); - -u32 crc32_be_base(u32 crc, const u8 *p, size_t len) -{ - while (len--) - crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++]; - return crc; -} -EXPORT_SYMBOL(crc32_be_base); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 798972b29b68..c2b65b6a9bb6 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -2,6 +2,9 @@ menu "Crypto library routines" +config CRYPTO_HASH_INFO + bool + config CRYPTO_LIB_UTILS tristate @@ -50,22 +53,16 @@ config CRYPTO_ARCH_HAVE_LIB_CHACHA config CRYPTO_LIB_CHACHA_GENERIC tristate + default CRYPTO_LIB_CHACHA if !CRYPTO_ARCH_HAVE_LIB_CHACHA select CRYPTO_LIB_UTILS help - This symbol can be depended upon by arch implementations of the - ChaCha library interface that require the generic code as a - fallback, e.g., for SIMD implementations. If no arch specific - implementation is enabled, this implementation serves the users - of CRYPTO_LIB_CHACHA. 
- -config CRYPTO_LIB_CHACHA_INTERNAL - tristate - select CRYPTO_LIB_CHACHA_GENERIC if CRYPTO_ARCH_HAVE_LIB_CHACHA=n + This symbol can be selected by arch implementations of the ChaCha + library interface that require the generic code as a fallback, e.g., + for SIMD implementations. If no arch specific implementation is + enabled, this implementation serves the users of CRYPTO_LIB_CHACHA. config CRYPTO_LIB_CHACHA tristate - select CRYPTO - select CRYPTO_LIB_CHACHA_INTERNAL help Enable the ChaCha library interface. This interface may be fulfilled by either the generic implementation or an arch-specific one, if one @@ -120,21 +117,15 @@ config CRYPTO_ARCH_HAVE_LIB_POLY1305 config CRYPTO_LIB_POLY1305_GENERIC tristate + default CRYPTO_LIB_POLY1305 if !CRYPTO_ARCH_HAVE_LIB_POLY1305 help - This symbol can be depended upon by arch implementations of the - Poly1305 library interface that require the generic code as a - fallback, e.g., for SIMD implementations. If no arch specific - implementation is enabled, this implementation serves the users - of CRYPTO_LIB_POLY1305. - -config CRYPTO_LIB_POLY1305_INTERNAL - tristate - select CRYPTO_LIB_POLY1305_GENERIC if CRYPTO_ARCH_HAVE_LIB_POLY1305=n + This symbol can be selected by arch implementations of the Poly1305 + library interface that require the generic code as a fallback, e.g., + for SIMD implementations. If no arch specific implementation is + enabled, this implementation serves the users of CRYPTO_LIB_POLY1305. config CRYPTO_LIB_POLY1305 tristate - select CRYPTO - select CRYPTO_LIB_POLY1305_INTERNAL help Enable the Poly1305 library interface. This interface may be fulfilled by either the generic implementation or an arch-specific one, if one @@ -148,8 +139,85 @@ config CRYPTO_LIB_CHACHA20POLY1305 config CRYPTO_LIB_SHA1 tristate + help + The SHA-1 library functions. Select this if your module uses any of + the functions from <crypto/sha1.h>. + +config CRYPTO_LIB_SHA1_ARCH + bool + depends on CRYPTO_LIB_SHA1 && !UML + default y if ARM + default y if ARM64 && KERNEL_MODE_NEON + default y if MIPS && CPU_CAVIUM_OCTEON + default y if PPC + default y if S390 + default y if SPARC64 + default y if X86_64 config CRYPTO_LIB_SHA256 tristate + help + Enable the SHA-256 library interface. This interface may be fulfilled + by either the generic implementation or an arch-specific one, if one + is available and enabled. + +config CRYPTO_LIB_SHA256_ARCH + bool + depends on CRYPTO_LIB_SHA256 && !UML + default y if ARM && !CPU_V7M + default y if ARM64 + default y if MIPS && CPU_CAVIUM_OCTEON + default y if PPC && SPE + default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default y if S390 + default y if SPARC64 + default y if X86_64 + +config CRYPTO_LIB_SHA512 + tristate + help + The SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions. + Select this if your module uses any of these functions from + <crypto/sha2.h>. 
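+
+# For example (illustrative only), a module that calls these functions adds
+# "select CRYPTO_LIB_SHA512" to its own Kconfig entry:
+#
+#	config FOO
+#		tristate "Foo driver (hypothetical example)"
+#		select CRYPTO_LIB_SHA512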
+ +config CRYPTO_LIB_SHA512_ARCH + bool + depends on CRYPTO_LIB_SHA512 && !UML + default y if ARM && !CPU_V7M + default y if ARM64 + default y if MIPS && CPU_CAVIUM_OCTEON + default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default y if S390 + default y if SPARC64 + default y if X86_64 + +config CRYPTO_LIB_SM3 + tristate + +source "lib/crypto/tests/Kconfig" + +if !KMSAN # avoid false positives from assembly +if ARM +source "lib/crypto/arm/Kconfig" +endif +if ARM64 +source "lib/crypto/arm64/Kconfig" +endif +if MIPS +source "lib/crypto/mips/Kconfig" +endif +if PPC +source "lib/crypto/powerpc/Kconfig" +endif +if RISCV +source "lib/crypto/riscv/Kconfig" +endif +if S390 +source "lib/crypto/s390/Kconfig" +endif +if X86 +source "lib/crypto/x86/Kconfig" +endif +endif endmenu diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 01fac1cd05a1..e4151be2ebd4 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -1,5 +1,17 @@ # SPDX-License-Identifier: GPL-2.0 +aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) > $(@) + +quiet_cmd_perlasm_with_args = PERLASM $@ + cmd_perlasm_with_args = $(PERL) $(<) void $(@) + +obj-$(CONFIG_KUNIT) += tests/ + +obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o + obj-$(CONFIG_CRYPTO_LIB_UTILS) += libcryptoutils.o libcryptoutils-y := memneq.o utils.o @@ -25,38 +37,133 @@ obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o obj-y += libblake2s.o libblake2s-y := blake2s.o libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) += blake2s-generic.o +libblake2s-$(CONFIG_CRYPTO_SELFTESTS) += blake2s-selftest.o obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305) += libchacha20poly1305.o libchacha20poly1305-y += chacha20poly1305.o +libchacha20poly1305-$(CONFIG_CRYPTO_SELFTESTS) += chacha20poly1305-selftest.o obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += libcurve25519-generic.o libcurve25519-generic-y := curve25519-fiat32.o libcurve25519-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := curve25519-hacl64.o libcurve25519-generic-y += curve25519-generic.o +# clang versions prior to 18 may blow out the stack with KASAN +ifeq ($(call clang-min-version, 180000),) +KASAN_SANITIZE_curve25519-hacl64.o := n +endif obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o libcurve25519-y += curve25519.o +libcurve25519-$(CONFIG_CRYPTO_SELFTESTS) += curve25519-selftest.o obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o libdes-y := des.o -obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += libpoly1305.o -libpoly1305-y := poly1305-donna32.o -libpoly1305-$(CONFIG_ARCH_SUPPORTS_INT128) := poly1305-donna64.o +obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o libpoly1305-y += poly1305.o -obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o -libsha1-y := sha1.o +obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += libpoly1305-generic.o +libpoly1305-generic-y := poly1305-donna32.o +libpoly1305-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := poly1305-donna64.o +libpoly1305-generic-y += poly1305-generic.o + +################################################################################ + +obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o +libsha1-y := sha1.o +ifeq ($(CONFIG_CRYPTO_LIB_SHA1_ARCH),y) +CFLAGS_sha1.o += -I$(src)/$(SRCARCH) +ifeq ($(CONFIG_ARM),y) +libsha1-y += arm/sha1-armv4-large.o +libsha1-$(CONFIG_KERNEL_MODE_NEON) += arm/sha1-armv7-neon.o \ + arm/sha1-ce-core.o +endif +libsha1-$(CONFIG_ARM64) += arm64/sha1-ce-core.o +ifeq ($(CONFIG_PPC),y) +libsha1-y += powerpc/sha1-powerpc-asm.o +libsha1-$(CONFIG_SPE) += powerpc/sha1-spe-asm.o +endif 
+libsha1-$(CONFIG_SPARC) += sparc/sha1_asm.o +libsha1-$(CONFIG_X86) += x86/sha1-ssse3-and-avx.o \ + x86/sha1-avx2-asm.o \ + x86/sha1-ni-asm.o +endif # CONFIG_CRYPTO_LIB_SHA1_ARCH + +################################################################################ + +obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o +libsha256-y := sha256.o +ifeq ($(CONFIG_CRYPTO_LIB_SHA256_ARCH),y) +CFLAGS_sha256.o += -I$(src)/$(SRCARCH) + +ifeq ($(CONFIG_ARM),y) +libsha256-y += arm/sha256-ce.o arm/sha256-core.o +$(obj)/arm/sha256-core.S: $(src)/arm/sha256-armv4.pl + $(call cmd,perlasm) +clean-files += arm/sha256-core.S +AFLAGS_arm/sha256-core.o += $(aflags-thumb2-y) +endif + +ifeq ($(CONFIG_ARM64),y) +libsha256-y += arm64/sha256-core.o +$(obj)/arm64/sha256-core.S: $(src)/arm64/sha2-armv8.pl + $(call cmd,perlasm_with_args) +clean-files += arm64/sha256-core.S +libsha256-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha256-ce.o +endif -obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o -libsha256-y := sha256.o +libsha256-$(CONFIG_PPC) += powerpc/sha256-spe-asm.o +libsha256-$(CONFIG_RISCV) += riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.o +libsha256-$(CONFIG_SPARC) += sparc/sha256_asm.o +libsha256-$(CONFIG_X86) += x86/sha256-ssse3-asm.o \ + x86/sha256-avx-asm.o \ + x86/sha256-avx2-asm.o \ + x86/sha256-ni-asm.o +endif # CONFIG_CRYPTO_LIB_SHA256_ARCH + +################################################################################ + +obj-$(CONFIG_CRYPTO_LIB_SHA512) += libsha512.o +libsha512-y := sha512.o +ifeq ($(CONFIG_CRYPTO_LIB_SHA512_ARCH),y) +CFLAGS_sha512.o += -I$(src)/$(SRCARCH) + +ifeq ($(CONFIG_ARM),y) +libsha512-y += arm/sha512-core.o +$(obj)/arm/sha512-core.S: $(src)/arm/sha512-armv4.pl + $(call cmd,perlasm) +clean-files += arm/sha512-core.S +AFLAGS_arm/sha512-core.o += $(aflags-thumb2-y) +endif -ifneq ($(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS),y) -libblake2s-y += blake2s-selftest.o -libchacha20poly1305-y += chacha20poly1305-selftest.o -libcurve25519-y += curve25519-selftest.o +ifeq ($(CONFIG_ARM64),y) +libsha512-y += arm64/sha512-core.o +$(obj)/arm64/sha512-core.S: $(src)/arm64/sha2-armv8.pl + $(call cmd,perlasm_with_args) +clean-files += arm64/sha512-core.S +libsha512-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha512-ce-core.o endif +libsha512-$(CONFIG_RISCV) += riscv/sha512-riscv64-zvknhb-zvkb.o +libsha512-$(CONFIG_SPARC) += sparc/sha512_asm.o +libsha512-$(CONFIG_X86) += x86/sha512-ssse3-asm.o \ + x86/sha512-avx-asm.o \ + x86/sha512-avx2-asm.o +endif # CONFIG_CRYPTO_LIB_SHA512_ARCH + +################################################################################ + obj-$(CONFIG_MPILIB) += mpi/ -obj-$(CONFIG_CRYPTO_MANAGER_EXTRA_TESTS) += simd.o +obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o + +obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o +libsm3-y := sm3.o + +obj-$(CONFIG_ARM) += arm/ +obj-$(CONFIG_ARM64) += arm64/ +obj-$(CONFIG_MIPS) += mips/ +obj-$(CONFIG_PPC) += powerpc/ +obj-$(CONFIG_RISCV) += riscv/ +obj-$(CONFIG_S390) += s390/ +obj-$(CONFIG_X86) += x86/ diff --git a/lib/crypto/aes.c b/lib/crypto/aes.c index eafe14d021f5..b57fda3460f1 100644 --- a/lib/crypto/aes.c +++ b/lib/crypto/aes.c @@ -5,6 +5,7 @@ #include <crypto/aes.h> #include <linux/crypto.h> +#include <linux/export.h> #include <linux/module.h> #include <linux/unaligned.h> diff --git a/lib/crypto/aescfb.c b/lib/crypto/aescfb.c index 749dc1258a44..0f294c8cbf3c 100644 --- a/lib/crypto/aescfb.c +++ b/lib/crypto/aescfb.c @@ -5,11 +5,10 @@ * Copyright 2023 Google LLC */ -#include <linux/module.h> - -#include <crypto/algapi.h> #include <crypto/aes.h> - +#include 
<crypto/algapi.h> +#include <linux/export.h> +#include <linux/module.h> #include <asm/irqflags.h> static void aescfb_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst, @@ -99,18 +98,18 @@ MODULE_DESCRIPTION("Generic AES-CFB library"); MODULE_AUTHOR("Ard Biesheuvel <ardb@kernel.org>"); MODULE_LICENSE("GPL"); -#ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS +#ifdef CONFIG_CRYPTO_SELFTESTS /* * Test code below. Vectors taken from crypto/testmgr.h */ static struct { - u8 ptext[64]; - u8 ctext[64]; + u8 ptext[64] __nonstring; + u8 ctext[64] __nonstring; - u8 key[AES_MAX_KEY_SIZE]; - u8 iv[AES_BLOCK_SIZE]; + u8 key[AES_MAX_KEY_SIZE] __nonstring; + u8 iv[AES_BLOCK_SIZE] __nonstring; int klen; int len; diff --git a/lib/crypto/aesgcm.c b/lib/crypto/aesgcm.c index 902e49410aaf..ac0b2fcfd606 100644 --- a/lib/crypto/aesgcm.c +++ b/lib/crypto/aesgcm.c @@ -5,12 +5,11 @@ * Copyright 2022 Google LLC */ -#include <linux/module.h> - #include <crypto/algapi.h> #include <crypto/gcm.h> #include <crypto/ghash.h> - +#include <linux/export.h> +#include <linux/module.h> #include <asm/irqflags.h> static void aesgcm_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst, @@ -199,25 +198,25 @@ MODULE_DESCRIPTION("Generic AES-GCM library"); MODULE_AUTHOR("Ard Biesheuvel <ardb@kernel.org>"); MODULE_LICENSE("GPL"); -#ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS +#ifdef CONFIG_CRYPTO_SELFTESTS /* * Test code below. Vectors taken from crypto/testmgr.h */ -static const u8 __initconst ctext0[16] = +static const u8 __initconst ctext0[16] __nonstring = "\x58\xe2\xfc\xce\xfa\x7e\x30\x61" "\x36\x7f\x1d\x57\xa4\xe7\x45\x5a"; static const u8 __initconst ptext1[16]; -static const u8 __initconst ctext1[32] = +static const u8 __initconst ctext1[32] __nonstring = "\x03\x88\xda\xce\x60\xb6\xa3\x92" "\xf3\x28\xc2\xb9\x71\xb2\xfe\x78" "\xab\x6e\x47\xd4\x2c\xec\x13\xbd" "\xf5\x3a\x67\xb2\x12\x57\xbd\xdf"; -static const u8 __initconst ptext2[64] = +static const u8 __initconst ptext2[64] __nonstring = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" "\x86\xa7\xa9\x53\x15\x34\xf7\xda" @@ -227,7 +226,7 @@ static const u8 __initconst ptext2[64] = "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" "\xba\x63\x7b\x39\x1a\xaf\xd2\x55"; -static const u8 __initconst ctext2[80] = +static const u8 __initconst ctext2[80] __nonstring = "\x42\x83\x1e\xc2\x21\x77\x74\x24" "\x4b\x72\x21\xb7\x84\xd0\xd4\x9c" "\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0" @@ -239,7 +238,7 @@ static const u8 __initconst ctext2[80] = "\x4d\x5c\x2a\xf3\x27\xcd\x64\xa6" "\x2c\xf3\x5a\xbd\x2b\xa6\xfa\xb4"; -static const u8 __initconst ptext3[60] = +static const u8 __initconst ptext3[60] __nonstring = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" "\x86\xa7\xa9\x53\x15\x34\xf7\xda" @@ -249,7 +248,7 @@ static const u8 __initconst ptext3[60] = "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" "\xba\x63\x7b\x39"; -static const u8 __initconst ctext3[76] = +static const u8 __initconst ctext3[76] __nonstring = "\x42\x83\x1e\xc2\x21\x77\x74\x24" "\x4b\x72\x21\xb7\x84\xd0\xd4\x9c" "\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0" @@ -261,17 +260,17 @@ static const u8 __initconst ctext3[76] = "\x5b\xc9\x4f\xbc\x32\x21\xa5\xdb" "\x94\xfa\xe9\x5a\xe7\x12\x1a\x47"; -static const u8 __initconst ctext4[16] = +static const u8 __initconst ctext4[16] __nonstring = "\xcd\x33\xb2\x8a\xc7\x73\xf7\x4b" "\xa0\x0e\xd1\xf3\x12\x57\x24\x35"; -static const u8 __initconst ctext5[32] = +static const u8 __initconst ctext5[32] __nonstring = "\x98\xe7\x24\x7c\x07\xf0\xfe\x41" "\x1c\x26\x7e\x43\x84\xb0\xf6\x00" 
"\x2f\xf5\x8d\x80\x03\x39\x27\xab" "\x8e\xf4\xd4\x58\x75\x14\xf0\xfb"; -static const u8 __initconst ptext6[64] = +static const u8 __initconst ptext6[64] __nonstring = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" "\x86\xa7\xa9\x53\x15\x34\xf7\xda" @@ -281,7 +280,7 @@ static const u8 __initconst ptext6[64] = "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" "\xba\x63\x7b\x39\x1a\xaf\xd2\x55"; -static const u8 __initconst ctext6[80] = +static const u8 __initconst ctext6[80] __nonstring = "\x39\x80\xca\x0b\x3c\x00\xe8\x41" "\xeb\x06\xfa\xc4\x87\x2a\x27\x57" "\x85\x9e\x1c\xea\xa6\xef\xd9\x84" @@ -293,17 +292,17 @@ static const u8 __initconst ctext6[80] = "\x99\x24\xa7\xc8\x58\x73\x36\xbf" "\xb1\x18\x02\x4d\xb8\x67\x4a\x14"; -static const u8 __initconst ctext7[16] = +static const u8 __initconst ctext7[16] __nonstring = "\x53\x0f\x8a\xfb\xc7\x45\x36\xb9" "\xa9\x63\xb4\xf1\xc4\xcb\x73\x8b"; -static const u8 __initconst ctext8[32] = +static const u8 __initconst ctext8[32] __nonstring = "\xce\xa7\x40\x3d\x4d\x60\x6b\x6e" "\x07\x4e\xc5\xd3\xba\xf3\x9d\x18" "\xd0\xd1\xc8\xa7\x99\x99\x6b\xf0" "\x26\x5b\x98\xb5\xd4\x8a\xb9\x19"; -static const u8 __initconst ptext9[64] = +static const u8 __initconst ptext9[64] __nonstring = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" "\x86\xa7\xa9\x53\x15\x34\xf7\xda" @@ -313,7 +312,7 @@ static const u8 __initconst ptext9[64] = "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" "\xba\x63\x7b\x39\x1a\xaf\xd2\x55"; -static const u8 __initconst ctext9[80] = +static const u8 __initconst ctext9[80] __nonstring = "\x52\x2d\xc1\xf0\x99\x56\x7d\x07" "\xf4\x7f\x37\xa3\x2a\x84\x42\x7d" "\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9" @@ -325,7 +324,7 @@ static const u8 __initconst ctext9[80] = "\xb0\x94\xda\xc5\xd9\x34\x71\xbd" "\xec\x1a\x50\x22\x70\xe3\xcc\x6c"; -static const u8 __initconst ptext10[60] = +static const u8 __initconst ptext10[60] __nonstring = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" "\x86\xa7\xa9\x53\x15\x34\xf7\xda" @@ -335,7 +334,7 @@ static const u8 __initconst ptext10[60] = "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" "\xba\x63\x7b\x39"; -static const u8 __initconst ctext10[76] = +static const u8 __initconst ctext10[76] __nonstring = "\x52\x2d\xc1\xf0\x99\x56\x7d\x07" "\xf4\x7f\x37\xa3\x2a\x84\x42\x7d" "\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9" @@ -347,7 +346,7 @@ static const u8 __initconst ctext10[76] = "\x76\xfc\x6e\xce\x0f\x4e\x17\x68" "\xcd\xdf\x88\x53\xbb\x2d\x55\x1b"; -static const u8 __initconst ptext11[60] = +static const u8 __initconst ptext11[60] __nonstring = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" "\x86\xa7\xa9\x53\x15\x34\xf7\xda" @@ -357,7 +356,7 @@ static const u8 __initconst ptext11[60] = "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" "\xba\x63\x7b\x39"; -static const u8 __initconst ctext11[76] = +static const u8 __initconst ctext11[76] __nonstring = "\x39\x80\xca\x0b\x3c\x00\xe8\x41" "\xeb\x06\xfa\xc4\x87\x2a\x27\x57" "\x85\x9e\x1c\xea\xa6\xef\xd9\x84" @@ -369,7 +368,7 @@ static const u8 __initconst ctext11[76] = "\x25\x19\x49\x8e\x80\xf1\x47\x8f" "\x37\xba\x55\xbd\x6d\x27\x61\x8c"; -static const u8 __initconst ptext12[719] = +static const u8 __initconst ptext12[719] __nonstring = "\x42\xc1\xcc\x08\x48\x6f\x41\x3f" "\x2f\x11\x66\x8b\x2a\x16\xf0\xe0" "\x58\x83\xf0\xc3\x70\x14\xc0\x5b" @@ -461,7 +460,7 @@ static const u8 __initconst ptext12[719] = "\x59\xfa\xfa\xaa\x44\x04\x01\xa7" "\xa4\x78\xdb\x74\x3d\x8b\xb5"; -static const u8 __initconst ctext12[735] = +static const u8 __initconst ctext12[735] __nonstring 
= "\x84\x0b\xdb\xd5\xb7\xa8\xfe\x20" "\xbb\xb1\x12\x7f\x41\xea\xb3\xc0" "\xa2\xb4\x37\x19\x11\x58\xb6\x0b" @@ -559,9 +558,9 @@ static struct { const u8 *ptext; const u8 *ctext; - u8 key[AES_MAX_KEY_SIZE]; - u8 iv[GCM_AES_IV_SIZE]; - u8 assoc[20]; + u8 key[AES_MAX_KEY_SIZE] __nonstring; + u8 iv[GCM_AES_IV_SIZE] __nonstring; + u8 assoc[20] __nonstring; int klen; int clen; diff --git a/lib/crypto/arc4.c b/lib/crypto/arc4.c index 838812d18216..4e950e1e66d0 100644 --- a/lib/crypto/arc4.c +++ b/lib/crypto/arc4.c @@ -8,6 +8,7 @@ */ #include <crypto/arc4.h> +#include <linux/export.h> #include <linux/module.h> int arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len) diff --git a/lib/crypto/arm/.gitignore b/lib/crypto/arm/.gitignore new file mode 100644 index 000000000000..f6c4e8ef80da --- /dev/null +++ b/lib/crypto/arm/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +poly1305-core.S +sha256-core.S +sha512-core.S diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig new file mode 100644 index 000000000000..e8444fd0aae3 --- /dev/null +++ b/lib/crypto/arm/Kconfig @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_BLAKE2S_ARM + bool "Hash functions: BLAKE2s" + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S + help + BLAKE2s cryptographic hash function (RFC 7693) + + Architecture: arm + + This is faster than the generic implementations of BLAKE2s and + BLAKE2b, but slower than the NEON implementation of BLAKE2b. + There is no NEON implementation of BLAKE2s, since NEON doesn't + really help with it. + +config CRYPTO_CHACHA20_NEON + tristate + default CRYPTO_LIB_CHACHA + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_ARM + tristate + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile new file mode 100644 index 000000000000..4c042a4c77ed --- /dev/null +++ b/lib/crypto/arm/Makefile @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o +libblake2s-arm-y := blake2s-core.o blake2s-glue.o + +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o +chacha-neon-y := chacha-scalar-core.o chacha-glue.o +chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o + +obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o +poly1305-arm-y := poly1305-core.o poly1305-glue.o + +quiet_cmd_perl = PERL $@ + cmd_perl = $(PERL) $(<) > $(@) + +$(obj)/%-core.S: $(src)/%-armv4.pl + $(call cmd,perl) + +clean-files += poly1305-core.S + +aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 + +# massage the perlasm code a bit so we only get the NEON routine if we need it +poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 +poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 +AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y) diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S new file mode 100644 index 000000000000..df40e46601f1 --- /dev/null +++ b/lib/crypto/arm/blake2s-core.S @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * BLAKE2s digest algorithm, ARM scalar implementation + * + * Copyright 2020 Google LLC + * + * Author: Eric Biggers <ebiggers@google.com> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + // Registers used to hold message words temporarily. 
There aren't + // enough ARM registers to hold the whole message block, so we have to + // load the words on-demand. + M_0 .req r12 + M_1 .req r14 + +// The BLAKE2s initialization vector +.Lblake2s_IV: + .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 + +.macro __ldrd a, b, src, offset +#if __LINUX_ARM_ARCH__ >= 6 + ldrd \a, \b, [\src, #\offset] +#else + ldr \a, [\src, #\offset] + ldr \b, [\src, #\offset + 4] +#endif +.endm + +.macro __strd a, b, dst, offset +#if __LINUX_ARM_ARCH__ >= 6 + strd \a, \b, [\dst, #\offset] +#else + str \a, [\dst, #\offset] + str \b, [\dst, #\offset + 4] +#endif +.endm + +.macro _le32_bswap a, tmp +#ifdef __ARMEB__ + rev_l \a, \tmp +#endif +.endm + +.macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp + _le32_bswap \a, \tmp + _le32_bswap \b, \tmp + _le32_bswap \c, \tmp + _le32_bswap \d, \tmp + _le32_bswap \e, \tmp + _le32_bswap \f, \tmp + _le32_bswap \g, \tmp + _le32_bswap \h, \tmp +.endm + +// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals. +// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two +// columns/diagonals. s0-s1 are the word offsets to the message words the first +// column/diagonal needs, and likewise s2-s3 for the second column/diagonal. +// M_0 and M_1 are free to use, and the message block can be found at sp + 32. +// +// Note that to save instructions, the rotations don't happen when the +// pseudocode says they should, but rather they are delayed until the values are +// used. See the comment above _blake2s_round(). +.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3 + + ldr M_0, [sp, #32 + 4 * \s0] + ldr M_1, [sp, #32 + 4 * \s2] + + // a += b + m[blake2s_sigma[r][2*i + 0]]; + add \a0, \a0, \b0, ror #brot + add \a1, \a1, \b1, ror #brot + add \a0, \a0, M_0 + add \a1, \a1, M_1 + + // d = ror32(d ^ a, 16); + eor \d0, \a0, \d0, ror #drot + eor \d1, \a1, \d1, ror #drot + + // c += d; + add \c0, \c0, \d0, ror #16 + add \c1, \c1, \d1, ror #16 + + // b = ror32(b ^ c, 12); + eor \b0, \c0, \b0, ror #brot + eor \b1, \c1, \b1, ror #brot + + ldr M_0, [sp, #32 + 4 * \s1] + ldr M_1, [sp, #32 + 4 * \s3] + + // a += b + m[blake2s_sigma[r][2*i + 1]]; + add \a0, \a0, \b0, ror #12 + add \a1, \a1, \b1, ror #12 + add \a0, \a0, M_0 + add \a1, \a1, M_1 + + // d = ror32(d ^ a, 8); + eor \d0, \a0, \d0, ror#16 + eor \d1, \a1, \d1, ror#16 + + // c += d; + add \c0, \c0, \d0, ror#8 + add \c1, \c1, \d1, ror#8 + + // b = ror32(b ^ c, 7); + eor \b0, \c0, \b0, ror#12 + eor \b1, \c1, \b1, ror#12 +.endm + +// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9] +// are in r0..r9. The stack pointer points to 8 bytes of scratch space for +// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and +// r14 are free to use. The macro arguments s0-s15 give the order in which the +// message words are used in this round. +// +// All rotates are performed using the implicit rotate operand accepted by the +// 'add' and 'eor' instructions. This is faster than using explicit rotate +// instructions. To make this work, we allow the values in the second and last +// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the +// wrong rotation amount. The rotation amount is then fixed up just in time +// when the values are used. 'brot' is the number of bits the values in row 'b' +// need to be rotated right to arrive at the correct values, and 'drot' +// similarly for row 'd'. 
(brot, drot) start out as (0, 0) but we make it such +// that they end up as (7, 8) after every round. +.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \ + s8, s9, s10, s11, s12, s13, s14, s15 + + // Mix first two columns: + // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]). + __ldrd r10, r11, sp, 16 // load v[12] and v[13] + _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \ + \s0, \s1, \s2, \s3 + __strd r8, r9, sp, 0 + __strd r10, r11, sp, 16 + + // Mix second two columns: + // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]). + __ldrd r8, r9, sp, 8 // load v[10] and v[11] + __ldrd r10, r11, sp, 24 // load v[14] and v[15] + _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \ + \s4, \s5, \s6, \s7 + str r10, [sp, #24] // store v[14] + // v[10], v[11], and v[15] are used below, so no need to store them yet. + + .set brot, 7 + .set drot, 8 + + // Mix first two diagonals: + // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]). + ldr r10, [sp, #16] // load v[12] + _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \ + \s8, \s9, \s10, \s11 + __strd r8, r9, sp, 8 + str r11, [sp, #28] + str r10, [sp, #16] + + // Mix second two diagonals: + // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]). + __ldrd r8, r9, sp, 0 // load v[8] and v[9] + __ldrd r10, r11, sp, 20 // load v[13] and v[14] + _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \ + \s12, \s13, \s14, \s15 + __strd r10, r11, sp, 20 +.endm + +// +// void blake2s_compress(struct blake2s_state *state, +// const u8 *block, size_t nblocks, u32 inc); +// +// Only the first three fields of struct blake2s_state are used: +// u32 h[8]; (inout) +// u32 t[2]; (inout) +// u32 f[2]; (in) +// + .align 5 +ENTRY(blake2s_compress) + push {r0-r2,r4-r11,lr} // keep this an even number + +.Lnext_block: + // r0 is 'state' + // r1 is 'block' + // r3 is 'inc' + + // Load and increment the counter t[0..1]. + __ldrd r10, r11, r0, 32 + adds r10, r10, r3 + adc r11, r11, #0 + __strd r10, r11, r0, 32 + + // _blake2s_round is very short on registers, so copy the message block + // to the stack to save a register during the rounds. This also has the + // advantage that misalignment only needs to be dealt with in one place. + sub sp, sp, #64 + mov r12, sp + tst r1, #3 + bne .Lcopy_block_misaligned + ldmia r1!, {r2-r9} + _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 + stmia r12!, {r2-r9} + ldmia r1!, {r2-r9} + _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 + stmia r12, {r2-r9} +.Lcopy_block_done: + str r1, [sp, #68] // Update message pointer + + // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space + // for spilling v[8..9]. Leave v[8..9] in r8-r9. + mov r14, r0 // r14 = state + adr r12, .Lblake2s_IV + ldmia r12!, {r8-r9} // load IV[0..1] + __ldrd r0, r1, r14, 40 // load f[0..1] + ldm r12, {r2-r7} // load IV[3..7] + eor r4, r4, r10 // v[12] = IV[4] ^ t[0] + eor r5, r5, r11 // v[13] = IV[5] ^ t[1] + eor r6, r6, r0 // v[14] = IV[6] ^ f[0] + eor r7, r7, r1 // v[15] = IV[7] ^ f[1] + push {r2-r7} // push v[9..15] + sub sp, sp, #8 // leave space for v[8..9] + + // Load h[0..7] == v[0..7]. + ldm r14, {r0-r7} + + // Execute the rounds. Each round is provided the order in which it + // needs to use the message words. 
+ .set brot, 0 + .set drot, 0 + _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 + _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 + _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 + _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 + _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 + _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 + _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 + _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 + _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 + + // Fold the final state matrix into the hash chaining value: + // + // for (i = 0; i < 8; i++) + // h[i] ^= v[i] ^ v[i + 8]; + // + ldr r14, [sp, #96] // r14 = &h[0] + add sp, sp, #8 // v[8..9] are already loaded. + pop {r10-r11} // load v[10..11] + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + eor r3, r3, r11 + ldm r14, {r8-r11} // load h[0..3] + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + eor r3, r3, r11 + stmia r14!, {r0-r3} // store new h[0..3] + ldm r14, {r0-r3} // load old h[4..7] + pop {r8-r11} // load v[12..15] + eor r0, r0, r4, ror #brot + eor r1, r1, r5, ror #brot + eor r2, r2, r6, ror #brot + eor r3, r3, r7, ror #brot + eor r0, r0, r8, ror #drot + eor r1, r1, r9, ror #drot + eor r2, r2, r10, ror #drot + eor r3, r3, r11, ror #drot + add sp, sp, #64 // skip copy of message block + stm r14, {r0-r3} // store new h[4..7] + + // Advance to the next block, if there is one. Note that if there are + // multiple blocks, then 'inc' (the counter increment amount) must be + // 64. So we can simply set it to 64 without re-loading it. + ldm sp, {r0, r1, r2} // load (state, block, nblocks) + mov r3, #64 // set 'inc' + subs r2, r2, #1 // nblocks-- + str r2, [sp, #8] + bne .Lnext_block // nblocks != 0? + + pop {r0-r2,r4-r11,pc} + + // The next message block (pointed to by r1) isn't 4-byte aligned, so it + // can't be loaded using ldmia. Copy it to the stack buffer (pointed to + // by r12) using an alternative method. r2-r9 are free to use. +.Lcopy_block_misaligned: + mov r2, #64 +1: +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + ldr r3, [r1], #4 + _le32_bswap r3, r4 +#else + ldrb r3, [r1, #0] + ldrb r4, [r1, #1] + ldrb r5, [r1, #2] + ldrb r6, [r1, #3] + add r1, r1, #4 + orr r3, r3, r4, lsl #8 + orr r3, r3, r5, lsl #16 + orr r3, r3, r6, lsl #24 +#endif + subs r2, r2, #4 + str r3, [r12], #4 + bne 1b + b .Lcopy_block_done +ENDPROC(blake2s_compress) diff --git a/lib/crypto/arm/blake2s-glue.c b/lib/crypto/arm/blake2s-glue.c new file mode 100644 index 000000000000..0238a70d9581 --- /dev/null +++ b/lib/crypto/arm/blake2s-glue.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <crypto/internal/blake2s.h> +#include <linux/module.h> + +/* defined in blake2s-core.S */ +EXPORT_SYMBOL(blake2s_compress); diff --git a/lib/crypto/arm/chacha-glue.c b/lib/crypto/arm/chacha-glue.c new file mode 100644 index 000000000000..88ec96415283 --- /dev/null +++ b/lib/crypto/arm/chacha-glue.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ChaCha and HChaCha functions (ARM optimized) + * + * Copyright (C) 2016-2019 Linaro, Ltd. 
<ard.biesheuvel@linaro.org> + * Copyright (C) 2015 Martin Willi + */ + +#include <crypto/chacha.h> +#include <crypto/internal/simd.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/cputype.h> +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> + +asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, int nrounds); +asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, + int nrounds, unsigned int nbytes); +asmlinkage void hchacha_block_arm(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); +asmlinkage void hchacha_block_neon(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); + +asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, + const struct chacha_state *state, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon); + +static inline bool neon_usable(void) +{ + return static_branch_likely(&use_neon) && crypto_simd_usable(); +} + +static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + u8 buf[CHACHA_BLOCK_SIZE]; + + while (bytes > CHACHA_BLOCK_SIZE) { + unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U); + + chacha_4block_xor_neon(state, dst, src, nrounds, l); + bytes -= l; + src += l; + dst += l; + state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); + } + if (bytes) { + const u8 *s = src; + u8 *d = dst; + + if (bytes != CHACHA_BLOCK_SIZE) + s = d = memcpy(buf, src, bytes); + chacha_block_xor_neon(state, d, s, nrounds); + if (d != dst) + memcpy(dst, buf, bytes); + state->x[12]++; + } +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) { + hchacha_block_arm(state, out, nrounds); + } else { + kernel_neon_begin(); + hchacha_block_neon(state, out, nrounds); + kernel_neon_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() || + bytes <= CHACHA_BLOCK_SIZE) { + chacha_doarm(dst, src, bytes, state, nrounds); + state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE); + return; + } + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_neon_begin(); + chacha_doneon(state, dst, src, todo, nrounds); + kernel_neon_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + /* We always can use at least the ARM scalar implementation. */ + return true; +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_arm_mod_init(void) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { + switch (read_cpuid_part()) { + case ARM_CPU_PART_CORTEX_A7: + case ARM_CPU_PART_CORTEX_A5: + /* + * The Cortex-A7 and Cortex-A5 do not perform well with + * the NEON implementation but do incredibly with the + * scalar one and use less power. 
+ */ + break; + default: + static_branch_enable(&use_neon); + } + } + return 0; +} +subsys_initcall(chacha_arm_mod_init); + +static void __exit chacha_arm_mod_exit(void) +{ +} +module_exit(chacha_arm_mod_exit); + +MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm/chacha-neon-core.S b/lib/crypto/arm/chacha-neon-core.S new file mode 100644 index 000000000000..ddd62b6294a5 --- /dev/null +++ b/lib/crypto/arm/chacha-neon-core.S @@ -0,0 +1,643 @@ +/* + * ChaCha/HChaCha NEON helper functions + * + * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + + /* + * NEON doesn't have a rotate instruction. The alternatives are, more or less: + * + * (a) vshl.u32 + vsri.u32 (needs temporary register) + * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register) + * (c) vrev32.16 (16-bit rotations only) + * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only, + * needs index vector) + * + * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations, + * the only choices are (a) and (b). We use (a) since it takes two-thirds the + * cycles of (b) on both Cortex-A7 and Cortex-A53. + * + * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest + * and doesn't need a temporary register. + * + * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence + * is twice as fast as (a), even when doing (a) on multiple registers + * simultaneously to eliminate the stall between vshl and vsri. Also, it + * parallelizes better when temporary registers are scarce. + * + * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as + * (a), so the need to load the rotation table actually makes the vtbl method + * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it + * seems to be a good compromise to get a more significant speed boost on some + * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7. + */ + +#include <linux/linkage.h> +#include <asm/cache.h> + + .text + .fpu neon + .align 5 + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers q0-q3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * The round count is given in r3. 
+ * + * Clobbers: r3, ip, q4-q5 + */ +chacha_permute: + + adr ip, .Lrol8_table + vld1.8 {d10}, [ip, :64] + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vrev32.16 q3, q3 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #12 + vsri.u32 q1, q4, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vtbl.8 d6, {d6}, d10 + vtbl.8 d7, {d7}, d10 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #7 + vsri.u32 q1, q4, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vext.8 q1, q1, q1, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vext.8 q2, q2, q2, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vext.8 q3, q3, q3, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vrev32.16 q3, q3 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #12 + vsri.u32 q1, q4, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vtbl.8 d6, {d6}, d10 + vtbl.8 d7, {d7}, d10 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #7 + vsri.u32 q1, q4, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vext.8 q1, q1, q1, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vext.8 q2, q2, q2, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vext.8 q3, q3, q3, #4 + + subs r3, r3, #2 + bne .Ldoubleround + + bx lr +ENDPROC(chacha_permute) + +ENTRY(chacha_block_xor_neon) + // r0: Input state matrix, s + // r1: 1 data block output, o + // r2: 1 data block input, i + // r3: nrounds + push {lr} + + // x0..3 = s0..3 + add ip, r0, #0x20 + vld1.32 {q0-q1}, [r0] + vld1.32 {q2-q3}, [ip] + + vmov q8, q0 + vmov q9, q1 + vmov q10, q2 + vmov q11, q3 + + bl chacha_permute + + add ip, r2, #0x20 + vld1.8 {q4-q5}, [r2] + vld1.8 {q6-q7}, [ip] + + // o0 = i0 ^ (x0 + s0) + vadd.i32 q0, q0, q8 + veor q0, q0, q4 + + // o1 = i1 ^ (x1 + s1) + vadd.i32 q1, q1, q9 + veor q1, q1, q5 + + // o2 = i2 ^ (x2 + s2) + vadd.i32 q2, q2, q10 + veor q2, q2, q6 + + // o3 = i3 ^ (x3 + s3) + vadd.i32 q3, q3, q11 + veor q3, q3, q7 + + add ip, r1, #0x20 + vst1.8 {q0-q1}, [r1] + vst1.8 {q2-q3}, [ip] + + pop {pc} +ENDPROC(chacha_block_xor_neon) + +ENTRY(hchacha_block_neon) + // r0: Input state matrix, s + // r1: output (8 32-bit words) + // r2: nrounds + push {lr} + + vld1.32 {q0-q1}, [r0]! + vld1.32 {q2-q3}, [r0] + + mov r3, r2 + bl chacha_permute + + vst1.32 {q0}, [r1]! + vst1.32 {q3}, [r1] + + pop {pc} +ENDPROC(hchacha_block_neon) + + .align 4 +.Lctrinc: .word 0, 1, 2, 3 +.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6 + + .align 5 +ENTRY(chacha_4block_xor_neon) + push {r4, lr} + mov r4, sp // preserve the stack pointer + sub ip, sp, #0x20 // allocate a 32 byte buffer + bic ip, ip, #0x1f // aligned to 32 bytes + mov sp, ip + + // r0: Input state matrix, s + // r1: 4 data blocks output, o + // r2: 4 data blocks input, i + // r3: nrounds + + // + // This function encrypts four consecutive ChaCha blocks by loading + // the state matrix in NEON registers four times. The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. The words are re-interleaved before the + // final addition of the original state and the XORing step. 
+ // + + // x0..15[0-3] = s0..15[0-3] + add ip, r0, #0x20 + vld1.32 {q0-q1}, [r0] + vld1.32 {q2-q3}, [ip] + + adr lr, .Lctrinc + vdup.32 q15, d7[1] + vdup.32 q14, d7[0] + vld1.32 {q4}, [lr, :128] + vdup.32 q13, d6[1] + vdup.32 q12, d6[0] + vdup.32 q11, d5[1] + vdup.32 q10, d5[0] + vadd.u32 q12, q12, q4 // x12 += counter values 0-3 + vdup.32 q9, d4[1] + vdup.32 q8, d4[0] + vdup.32 q7, d3[1] + vdup.32 q6, d3[0] + vdup.32 q5, d2[1] + vdup.32 q4, d2[0] + vdup.32 q3, d1[1] + vdup.32 q2, d1[0] + vdup.32 q1, d0[1] + vdup.32 q0, d0[0] + + adr ip, .Lrol8_table + b 1f + +.Ldoubleround4: + vld1.32 {q8-q9}, [sp, :256] +1: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vadd.i32 q0, q0, q4 + vadd.i32 q1, q1, q5 + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + + veor q12, q12, q0 + veor q13, q13, q1 + veor q14, q14, q2 + veor q15, q15, q3 + + vrev32.16 q12, q12 + vrev32.16 q13, q13 + vrev32.16 q14, q14 + vrev32.16 q15, q15 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i32 q10, q10, q14 + vadd.i32 q11, q11, q15 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q4, q8 + veor q9, q5, q9 + vshl.u32 q4, q8, #12 + vshl.u32 q5, q9, #12 + vsri.u32 q4, q8, #20 + vsri.u32 q5, q9, #20 + + veor q8, q6, q10 + veor q9, q7, q11 + vshl.u32 q6, q8, #12 + vshl.u32 q7, q9, #12 + vsri.u32 q6, q8, #20 + vsri.u32 q7, q9, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vld1.8 {d16}, [ip, :64] + vadd.i32 q0, q0, q4 + vadd.i32 q1, q1, q5 + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + + veor q12, q12, q0 + veor q13, q13, q1 + veor q14, q14, q2 + veor q15, q15, q3 + + vtbl.8 d24, {d24}, d16 + vtbl.8 d25, {d25}, d16 + vtbl.8 d26, {d26}, d16 + vtbl.8 d27, {d27}, d16 + vtbl.8 d28, {d28}, d16 + vtbl.8 d29, {d29}, d16 + vtbl.8 d30, {d30}, d16 + vtbl.8 d31, {d31}, d16 + + vld1.32 {q8-q9}, [sp, :256] + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i32 q10, q10, q14 + vadd.i32 q11, q11, q15 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q4, q8 + veor q9, q5, q9 + vshl.u32 q4, q8, #7 + vshl.u32 q5, q9, #7 + vsri.u32 q4, q8, #25 + vsri.u32 q5, q9, #25 + + veor q8, q6, q10 + veor q9, q7, q11 + vshl.u32 q6, q8, #7 + vshl.u32 q7, q9, #7 + vsri.u32 q6, q8, #25 + vsri.u32 q7, q9, #25 + + vld1.32 {q8-q9}, [sp, :256] + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vadd.i32 q0, q0, q5 + vadd.i32 q1, q1, q6 + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q4 + + veor q15, q15, q0 + veor q12, q12, q1 + veor q13, q13, q2 + veor q14, q14, q3 + + vrev32.16 q15, q15 + vrev32.16 q12, q12 + vrev32.16 q13, q13 + vrev32.16 q14, q14 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vadd.i32 q10, q10, q15 + vadd.i32 q11, q11, q12 + vadd.i32 q8, q8, q13 + vadd.i32 q9, q9, q14 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q7, q8 + veor q9, q4, q9 + vshl.u32 
q7, q8, #12 + vshl.u32 q4, q9, #12 + vsri.u32 q7, q8, #20 + vsri.u32 q4, q9, #20 + + veor q8, q5, q10 + veor q9, q6, q11 + vshl.u32 q5, q8, #12 + vshl.u32 q6, q9, #12 + vsri.u32 q5, q8, #20 + vsri.u32 q6, q9, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vld1.8 {d16}, [ip, :64] + vadd.i32 q0, q0, q5 + vadd.i32 q1, q1, q6 + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q4 + + veor q15, q15, q0 + veor q12, q12, q1 + veor q13, q13, q2 + veor q14, q14, q3 + + vtbl.8 d30, {d30}, d16 + vtbl.8 d31, {d31}, d16 + vtbl.8 d24, {d24}, d16 + vtbl.8 d25, {d25}, d16 + vtbl.8 d26, {d26}, d16 + vtbl.8 d27, {d27}, d16 + vtbl.8 d28, {d28}, d16 + vtbl.8 d29, {d29}, d16 + + vld1.32 {q8-q9}, [sp, :256] + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vadd.i32 q10, q10, q15 + vadd.i32 q11, q11, q12 + vadd.i32 q8, q8, q13 + vadd.i32 q9, q9, q14 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q7, q8 + veor q9, q4, q9 + vshl.u32 q7, q8, #7 + vshl.u32 q4, q9, #7 + vsri.u32 q7, q8, #25 + vsri.u32 q4, q9, #25 + + veor q8, q5, q10 + veor q9, q6, q11 + vshl.u32 q5, q8, #7 + vshl.u32 q6, q9, #7 + vsri.u32 q5, q8, #25 + vsri.u32 q6, q9, #25 + + subs r3, r3, #2 + bne .Ldoubleround4 + + // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15. + // x8..9[0-3] are on the stack. + + // Re-interleave the words in the first two rows of each block (x0..7). + // Also add the counter values 0-3 to x12[0-3]. + vld1.32 {q8}, [lr, :128] // load counter values 0-3 + vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1) + vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3) + vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5) + vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7) + vadd.u32 q12, q8 // x12 += counter values 0-3 + vswp d1, d4 + vswp d3, d6 + vld1.32 {q8-q9}, [r0]! // load s0..7 + vswp d9, d12 + vswp d11, d14 + + // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1) + // after XORing the first 32 bytes. + vswp q1, q4 + + // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7) + + // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block) + vadd.u32 q0, q0, q8 + vadd.u32 q2, q2, q8 + vadd.u32 q4, q4, q8 + vadd.u32 q3, q3, q8 + + // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block) + vadd.u32 q1, q1, q9 + vadd.u32 q6, q6, q9 + vadd.u32 q5, q5, q9 + vadd.u32 q7, q7, q9 + + // XOR first 32 bytes using keystream from first two rows of first block + vld1.8 {q8-q9}, [r2]! + veor q8, q8, q0 + veor q9, q9, q1 + vst1.8 {q8-q9}, [r1]! + + // Re-interleave the words in the last two rows of each block (x8..15). 
+ vld1.32 {q8-q9}, [sp, :256] + mov sp, r4 // restore original stack pointer + ldr r4, [r4, #8] // load number of bytes + vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13) + vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15) + vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9) + vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11) + vld1.32 {q0-q1}, [r0] // load s8..15 + vswp d25, d28 + vswp d27, d30 + vswp d17, d20 + vswp d19, d22 + + // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15) + + // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block) + vadd.u32 q8, q8, q0 + vadd.u32 q10, q10, q0 + vadd.u32 q9, q9, q0 + vadd.u32 q11, q11, q0 + + // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block) + vadd.u32 q12, q12, q1 + vadd.u32 q14, q14, q1 + vadd.u32 q13, q13, q1 + vadd.u32 q15, q15, q1 + + // XOR the rest of the data with the keystream + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #96 + veor q0, q0, q8 + veor q1, q1, q12 + ble .Lle96 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q2 + veor q1, q1, q6 + ble .Lle128 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q10 + veor q1, q1, q14 + ble .Lle160 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q4 + veor q1, q1, q5 + ble .Lle192 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q9 + veor q1, q1, q13 + ble .Lle224 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q3 + veor q1, q1, q7 + blt .Llt256 +.Lout: + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2] + veor q0, q0, q11 + veor q1, q1, q15 + vst1.8 {q0-q1}, [r1] + + pop {r4, pc} + +.Lle192: + vmov q4, q9 + vmov q5, q13 + +.Lle160: + // nothing to do + +.Lfinalblock: + // Process the final block if processing less than 4 full blocks. + // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the + // previous 32 byte output block that still needs to be written at + // [r1] in q0-q1. + beq .Lfullblock + +.Lpartialblock: + adr lr, .Lpermute + 32 + add r2, r2, r4 + add lr, lr, r4 + add r4, r4, r1 + + vld1.8 {q2-q3}, [lr] + vld1.8 {q6-q7}, [r2] + + add r4, r4, #32 + + vtbl.8 d4, {q4-q5}, d4 + vtbl.8 d5, {q4-q5}, d5 + vtbl.8 d6, {q4-q5}, d6 + vtbl.8 d7, {q4-q5}, d7 + + veor q6, q6, q2 + veor q7, q7, q3 + + vst1.8 {q6-q7}, [r4] // overlapping stores + vst1.8 {q0-q1}, [r1] + pop {r4, pc} + +.Lfullblock: + vmov q11, q4 + vmov q15, q5 + b .Lout +.Lle96: + vmov q4, q2 + vmov q5, q6 + b .Lfinalblock +.Lle128: + vmov q4, q10 + vmov q5, q14 + b .Lfinalblock +.Lle224: + vmov q4, q3 + vmov q5, q7 + b .Lfinalblock +.Llt256: + vmov q4, q11 + vmov q5, q15 + b .Lpartialblock +ENDPROC(chacha_4block_xor_neon) + + .align L1_CACHE_SHIFT +.Lpermute: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f diff --git a/lib/crypto/arm/chacha-scalar-core.S b/lib/crypto/arm/chacha-scalar-core.S new file mode 100644 index 000000000000..4951df05c158 --- /dev/null +++ b/lib/crypto/arm/chacha-scalar-core.S @@ -0,0 +1,444 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Google, Inc. 
+ */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Design notes: + * + * 16 registers would be needed to hold the state matrix, but only 14 are + * available because 'sp' and 'pc' cannot be used. So we spill the elements + * (x8, x9) to the stack and swap them out with (x10, x11). This adds one + * 'ldrd' and one 'strd' instruction per round. + * + * All rotates are performed using the implicit rotate operand accepted by the + * 'add' and 'eor' instructions. This is faster than using explicit rotate + * instructions. To make this work, we allow the values in the second and last + * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the + * wrong rotation amount. The rotation amount is then fixed up just in time + * when the values are used. 'brot' is the number of bits the values in row 'b' + * need to be rotated right to arrive at the correct values, and 'drot' + * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such + * that they end up as (25, 24) after every round. + */ + + // ChaCha state registers + X0 .req r0 + X1 .req r1 + X2 .req r2 + X3 .req r3 + X4 .req r4 + X5 .req r5 + X6 .req r6 + X7 .req r7 + X8_X10 .req r8 // shared by x8 and x10 + X9_X11 .req r9 // shared by x9 and x11 + X12 .req r10 + X13 .req r11 + X14 .req r12 + X15 .req r14 + +.macro _le32_bswap_4x a, b, c, d, tmp +#ifdef __ARMEB__ + rev_l \a, \tmp + rev_l \b, \tmp + rev_l \c, \tmp + rev_l \d, \tmp +#endif +.endm + +.macro __ldrd a, b, src, offset +#if __LINUX_ARM_ARCH__ >= 6 + ldrd \a, \b, [\src, #\offset] +#else + ldr \a, [\src, #\offset] + ldr \b, [\src, #\offset + 4] +#endif +.endm + +.macro __strd a, b, dst, offset +#if __LINUX_ARM_ARCH__ >= 6 + strd \a, \b, [\dst, #\offset] +#else + str \a, [\dst, #\offset] + str \b, [\dst, #\offset + 4] +#endif +.endm + +.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2 + + // a += b; d ^= a; d = rol(d, 16); + add \a1, \a1, \b1, ror #brot + add \a2, \a2, \b2, ror #brot + eor \d1, \a1, \d1, ror #drot + eor \d2, \a2, \d2, ror #drot + // drot == 32 - 16 == 16 + + // c += d; b ^= c; b = rol(b, 12); + add \c1, \c1, \d1, ror #16 + add \c2, \c2, \d2, ror #16 + eor \b1, \c1, \b1, ror #brot + eor \b2, \c2, \b2, ror #brot + // brot == 32 - 12 == 20 + + // a += b; d ^= a; d = rol(d, 8); + add \a1, \a1, \b1, ror #20 + add \a2, \a2, \b2, ror #20 + eor \d1, \a1, \d1, ror #16 + eor \d2, \a2, \d2, ror #16 + // drot == 32 - 8 == 24 + + // c += d; b ^= c; b = rol(b, 7); + add \c1, \c1, \d1, ror #24 + add \c2, \c2, \d2, ror #24 + eor \b1, \c1, \b1, ror #20 + eor \b2, \c2, \b2, ror #20 + // brot == 32 - 7 == 25 +.endm + +.macro _doubleround + + // column round + + // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13) + _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13 + + // save (x8, x9); restore (x10, x11) + __strd X8_X10, X9_X11, sp, 0 + __ldrd X8_X10, X9_X11, sp, 8 + + // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15) + _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15 + + .set brot, 25 + .set drot, 24 + + // diagonal round + + // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12) + _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12 + + // save (x10, x11); restore (x8, x9) + __strd X8_X10, X9_X11, sp, 8 + __ldrd X8_X10, X9_X11, sp, 0 + + // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14) + _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14 +.endm + +.macro _chacha_permute nrounds + .set brot, 0 + .set drot, 0 + .rept \nrounds / 2 + _doubleround + .endr +.endm + +.macro _chacha nrounds + 
+.Lnext_block\@: + // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN + // Registers contain x0-x9,x12-x15. + + // Do the core ChaCha permutation to update x0-x15. + _chacha_permute \nrounds + + add sp, #8 + // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers contain x0-x9,x12-x15. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15). + push {X8_X10, X9_X11, X12, X13, X14, X15} + + // Load (OUT, IN, LEN). + ldr r14, [sp, #96] + ldr r12, [sp, #100] + ldr r11, [sp, #104] + + orr r10, r14, r12 + + // Use slow path if fewer than 64 bytes remain. + cmp r11, #64 + blt .Lxor_slowpath\@ + + // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on + // ARMv6+, since ldmia and stmia (used below) still require alignment. + tst r10, #3 + bne .Lxor_slowpath\@ + + // Fast path: XOR 64 bytes of aligned data. + + // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // x0-x3 + __ldrd r8, r9, sp, 32 + __ldrd r10, r11, sp, 40 + add X0, X0, r8 + add X1, X1, r9 + add X2, X2, r10 + add X3, X3, r11 + _le32_bswap_4x X0, X1, X2, X3, r8 + ldmia r12!, {r8-r11} + eor X0, X0, r8 + eor X1, X1, r9 + eor X2, X2, r10 + eor X3, X3, r11 + stmia r14!, {X0-X3} + + // x4-x7 + __ldrd r8, r9, sp, 48 + __ldrd r10, r11, sp, 56 + add X4, r8, X4, ror #brot + add X5, r9, X5, ror #brot + ldmia r12!, {X0-X3} + add X6, r10, X6, ror #brot + add X7, r11, X7, ror #brot + _le32_bswap_4x X4, X5, X6, X7, r8 + eor X4, X4, X0 + eor X5, X5, X1 + eor X6, X6, X2 + eor X7, X7, X3 + stmia r14!, {X4-X7} + + // x8-x15 + pop {r0-r7} // (x8-x9,x12-x15,x10-x11) + __ldrd r8, r9, sp, 32 + __ldrd r10, r11, sp, 40 + add r0, r0, r8 // x8 + add r1, r1, r9 // x9 + add r6, r6, r10 // x10 + add r7, r7, r11 // x11 + _le32_bswap_4x r0, r1, r6, r7, r8 + ldmia r12!, {r8-r11} + eor r0, r0, r8 // x8 + eor r1, r1, r9 // x9 + eor r6, r6, r10 // x10 + eor r7, r7, r11 // x11 + stmia r14!, {r0,r1,r6,r7} + ldmia r12!, {r0,r1,r6,r7} + __ldrd r8, r9, sp, 48 + __ldrd r10, r11, sp, 56 + add r2, r8, r2, ror #drot // x12 + add r3, r9, r3, ror #drot // x13 + add r4, r10, r4, ror #drot // x14 + add r5, r11, r5, ror #drot // x15 + _le32_bswap_4x r2, r3, r4, r5, r9 + ldr r9, [sp, #72] // load LEN + eor r2, r2, r0 // x12 + eor r3, r3, r1 // x13 + eor r4, r4, r6 // x14 + eor r5, r5, r7 // x15 + subs r9, #64 // decrement and check LEN + stmia r14!, {r2-r5} + + beq .Ldone\@ + +.Lprepare_for_next_block\@: + + // Stack: x0-x15 OUT IN LEN + + // Increment block counter (x12) + add r8, #1 + + // Store updated (OUT, IN, LEN) + str r14, [sp, #64] + str r12, [sp, #68] + str r9, [sp, #72] + + mov r14, sp + + // Store updated block counter (x12) + str r8, [sp, #48] + + sub sp, #16 + + // Reload state and do next block + ldmia r14!, {r0-r11} // load x0-x11 + __strd r10, r11, sp, 8 // store x10-x11 before state + ldmia r14, {r10-r12,r14} // load x12-x15 + b .Lnext_block\@ + +.Lxor_slowpath\@: + // Slow path: < 64 bytes remaining, or unaligned input or output buffer. + // We handle it by storing the 64 bytes of keystream to the stack, then + // XOR-ing the needed portion with the data. + + // Allocate keystream buffer + sub sp, #64 + mov r14, sp + + // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0. 
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // Save keystream for x0-x3 + __ldrd r8, r9, sp, 96 + __ldrd r10, r11, sp, 104 + add X0, X0, r8 + add X1, X1, r9 + add X2, X2, r10 + add X3, X3, r11 + _le32_bswap_4x X0, X1, X2, X3, r8 + stmia r14!, {X0-X3} + + // Save keystream for x4-x7 + __ldrd r8, r9, sp, 112 + __ldrd r10, r11, sp, 120 + add X4, r8, X4, ror #brot + add X5, r9, X5, ror #brot + add X6, r10, X6, ror #brot + add X7, r11, X7, ror #brot + _le32_bswap_4x X4, X5, X6, X7, r8 + add r8, sp, #64 + stmia r14!, {X4-X7} + + // Save keystream for x8-x15 + ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11) + __ldrd r8, r9, sp, 128 + __ldrd r10, r11, sp, 136 + add r0, r0, r8 // x8 + add r1, r1, r9 // x9 + add r6, r6, r10 // x10 + add r7, r7, r11 // x11 + _le32_bswap_4x r0, r1, r6, r7, r8 + stmia r14!, {r0,r1,r6,r7} + __ldrd r8, r9, sp, 144 + __ldrd r10, r11, sp, 152 + add r2, r8, r2, ror #drot // x12 + add r3, r9, r3, ror #drot // x13 + add r4, r10, r4, ror #drot // x14 + add r5, r11, r5, ror #drot // x15 + _le32_bswap_4x r2, r3, r4, r5, r9 + stmia r14, {r2-r5} + + // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN + // Registers: r8 is block counter, r12 is IN. + + ldr r9, [sp, #168] // LEN + ldr r14, [sp, #160] // OUT + cmp r9, #64 + mov r0, sp + movle r1, r9 + movgt r1, #64 + // r1 is number of bytes to XOR, in range [1, 64] + +.if __LINUX_ARM_ARCH__ < 6 + orr r2, r12, r14 + tst r2, #3 // IN or OUT misaligned? + bne .Lxor_next_byte\@ +.endif + + // XOR a word at a time +.rept 16 + subs r1, #4 + blt .Lxor_words_done\@ + ldr r2, [r12], #4 + ldr r3, [r0], #4 + eor r2, r2, r3 + str r2, [r14], #4 +.endr + b .Lxor_slowpath_done\@ +.Lxor_words_done\@: + ands r1, r1, #3 + beq .Lxor_slowpath_done\@ + + // XOR a byte at a time +.Lxor_next_byte\@: + ldrb r2, [r12], #1 + ldrb r3, [r0], #1 + eor r2, r2, r3 + strb r2, [r14], #1 + subs r1, #1 + bne .Lxor_next_byte\@ + +.Lxor_slowpath_done\@: + subs r9, #64 + add sp, #96 + bgt .Lprepare_for_next_block\@ + +.Ldone\@: +.endm // _chacha + +/* + * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, + * const struct chacha_state *state, int nrounds); + */ +ENTRY(chacha_doarm) + cmp r2, #0 // len == 0? + reteq lr + + ldr ip, [sp] + cmp ip, #12 + + push {r0-r2,r4-r11,lr} + + // Push state x0-x15 onto stack. + // Also store an extra copy of x10-x11 just before the state. + + add X12, r3, #48 + ldm X12, {X12,X13,X14,X15} + push {X12,X13,X14,X15} + sub sp, sp, #64 + + __ldrd X8_X10, X9_X11, r3, 40 + __strd X8_X10, X9_X11, sp, 8 + __strd X8_X10, X9_X11, sp, 56 + ldm r3, {X0-X9_X11} + __strd X0, X1, sp, 16 + __strd X2, X3, sp, 24 + __strd X4, X5, sp, 32 + __strd X6, X7, sp, 40 + __strd X8_X10, X9_X11, sp, 48 + + beq 1f + _chacha 20 + +0: add sp, #76 + pop {r4-r11, pc} + +1: _chacha 12 + b 0b +ENDPROC(chacha_doarm) + +/* + * void hchacha_block_arm(const struct chacha_state *state, + * u32 out[HCHACHA_OUT_WORDS], int nrounds); + */ +ENTRY(hchacha_block_arm) + push {r1,r4-r11,lr} + + cmp r2, #12 // ChaCha12 ? 
+ + mov r14, r0 + ldmia r14!, {r0-r11} // load x0-x11 + push {r10-r11} // store x10-x11 to stack + ldm r14, {r10-r12,r14} // load x12-x15 + sub sp, #8 + + beq 1f + _chacha_permute 20 + + // Skip over (unused0-unused1, x10-x11) +0: add sp, #16 + + // Fix up rotations of x12-x15 + ror X12, X12, #drot + ror X13, X13, #drot + pop {r4} // load 'out' + ror X14, X14, #drot + ror X15, X15, #drot + + // Store (x0-x3,x12-x15) to 'out' + stm r4, {X0,X1,X2,X3,X12,X13,X14,X15} + + pop {r4-r11,pc} + +1: _chacha_permute 12 + b 0b +ENDPROC(hchacha_block_arm) diff --git a/lib/crypto/arm/poly1305-armv4.pl b/lib/crypto/arm/poly1305-armv4.pl new file mode 100644 index 000000000000..dd7a996361a7 --- /dev/null +++ b/lib/crypto/arm/poly1305-armv4.pl @@ -0,0 +1,1236 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# IALU(*)/gcc-4.4 NEON +# +# ARM11xx(ARMv6) 7.78/+100% - +# Cortex-A5 6.35/+130% 3.00 +# Cortex-A8 6.25/+115% 2.36 +# Cortex-A9 5.10/+95% 2.55 +# Cortex-A15 3.85/+85% 1.25(**) +# Snapdragon S4 5.70/+100% 1.48(**) +# +# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data; +# (**) these are trade-off results, they can be improved by ~8% but at +# the cost of 15/12% regression on Cortex-A5/A7, it's even possible +# to improve Cortex-A9 result, but then A5/A7 loose more than 20%; + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ +# define poly1305_init poly1305_block_init_arch +# define poly1305_blocks poly1305_blocks_arm +# define poly1305_emit poly1305_emit_arch +#endif + +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.text + +.globl poly1305_emit +.globl poly1305_blocks +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: +.Lpoly1305_init: + stmdb sp!,{r4-r11} + + eor r3,r3,r3 + cmp $inp,#0 + str r3,[$ctx,#0] @ zero hash value + str r3,[$ctx,#4] + str r3,[$ctx,#8] + str r3,[$ctx,#12] + str r3,[$ctx,#16] + str r3,[$ctx,#36] @ clear is_base2_26 + add $ctx,$ctx,#20 + +#ifdef __thumb2__ + it eq +#endif + moveq r0,#0 + beq .Lno_key + +#if __ARM_MAX_ARCH__>=7 + mov r3,#-1 + str r3,[$ctx,#28] @ impossible key power value +# ifndef __KERNEL__ + adr r11,.Lpoly1305_init + ldr r12,.LOPENSSL_armcap +# endif +#endif + ldrb r4,[$inp,#0] + mov r10,#0x0fffffff + ldrb r5,[$inp,#1] + and r3,r10,#-4 @ 0x0ffffffc + ldrb r6,[$inp,#2] + ldrb r7,[$inp,#3] + orr r4,r4,r5,lsl#8 + ldrb r5,[$inp,#4] + orr r4,r4,r6,lsl#16 + ldrb r6,[$inp,#5] + orr r4,r4,r7,lsl#24 + ldrb r7,[$inp,#6] + and r4,r4,r10 + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +# if !defined(_WIN32) + ldr r12,[r11,r12] @ OPENSSL_armcap_P +# endif +# if defined(__APPLE__) || defined(_WIN32) + ldr r12,[r12] +# 
endif +#endif + ldrb r8,[$inp,#7] + orr r5,r5,r6,lsl#8 + ldrb r6,[$inp,#8] + orr r5,r5,r7,lsl#16 + ldrb r7,[$inp,#9] + orr r5,r5,r8,lsl#24 + ldrb r8,[$inp,#10] + and r5,r5,r3 + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + tst r12,#ARMV7_NEON @ check for NEON +# ifdef __thumb2__ + adr r9,.Lpoly1305_blocks_neon + adr r11,.Lpoly1305_blocks + it ne + movne r11,r9 + adr r12,.Lpoly1305_emit + orr r11,r11,#1 @ thumb-ify addresses + orr r12,r12,#1 +# else + add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) + ite eq + addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) + addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) +# endif +#endif + ldrb r9,[$inp,#11] + orr r6,r6,r7,lsl#8 + ldrb r7,[$inp,#12] + orr r6,r6,r8,lsl#16 + ldrb r8,[$inp,#13] + orr r6,r6,r9,lsl#24 + ldrb r9,[$inp,#14] + and r6,r6,r3 + + ldrb r10,[$inp,#15] + orr r7,r7,r8,lsl#8 + str r4,[$ctx,#0] + orr r7,r7,r9,lsl#16 + str r5,[$ctx,#4] + orr r7,r7,r10,lsl#24 + str r6,[$ctx,#8] + and r7,r7,r3 + str r7,[$ctx,#12] +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + stmia r2,{r11,r12} @ fill functions table + mov r0,#1 +#else + mov r0,#0 +#endif +.Lno_key: + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_init,.-poly1305_init +___ +{ +my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); +my ($s1,$s2,$s3)=($r1,$r2,$r3); + +$code.=<<___; +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: +.Lpoly1305_blocks: + stmdb sp!,{r3-r11,lr} + + ands $len,$len,#-16 + beq .Lno_data + + add $len,$len,$inp @ end pointer + sub sp,sp,#32 + +#if __ARM_ARCH__<7 + ldmia $ctx,{$h0-$r3} @ load context + add $ctx,$ctx,#20 + str $len,[sp,#16] @ offload stuff + str $ctx,[sp,#12] +#else + ldr lr,[$ctx,#36] @ is_base2_26 + ldmia $ctx!,{$h0-$h4} @ load hash value + str $len,[sp,#16] @ offload stuff + str $ctx,[sp,#12] + + adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 + mov $r1,$h1,lsr#6 + adcs $r1,$r1,$h2,lsl#20 + mov $r2,$h2,lsr#12 + adcs $r2,$r2,$h3,lsl#14 + mov $r3,$h3,lsr#18 + adcs $r3,$r3,$h4,lsl#8 + mov $len,#0 + teq lr,#0 + str $len,[$ctx,#16] @ clear is_base2_26 + adc $len,$len,$h4,lsr#24 + + itttt ne + movne $h0,$r0 @ choose between radixes + movne $h1,$r1 + movne $h2,$r2 + movne $h3,$r3 + ldmia $ctx,{$r0-$r3} @ load key + it ne + movne $h4,$len +#endif + + mov lr,$inp + cmp $padbit,#0 + str $r1,[sp,#20] + str $r2,[sp,#24] + str $r3,[sp,#28] + b .Loop + +.align 4 +.Loop: +#if __ARM_ARCH__<7 + ldrb r0,[lr],#16 @ load input +# ifdef __thumb2__ + it hi +# endif + addhi $h4,$h4,#1 @ 1<<128 + ldrb r1,[lr,#-15] + ldrb r2,[lr,#-14] + ldrb r3,[lr,#-13] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-12] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-11] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-10] + adds $h0,$h0,r3 @ accumulate input + + ldrb r3,[lr,#-9] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-8] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-7] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-6] + adcs $h1,$h1,r3 + + ldrb r3,[lr,#-5] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-4] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-3] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-2] + adcs $h2,$h2,r3 + + ldrb r3,[lr,#-1] + orr r1,r0,r1,lsl#8 + str lr,[sp,#8] @ offload input pointer + orr r2,r1,r2,lsl#16 + add $s1,$r1,$r1,lsr#2 + orr r3,r2,r3,lsl#24 +#else + ldr r0,[lr],#16 @ load input + it hi + addhi $h4,$h4,#1 @ padbit + ldr r1,[lr,#-12] + ldr r2,[lr,#-8] + ldr r3,[lr,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + adds $h0,$h0,r0 @ 
accumulate input + str lr,[sp,#8] @ offload input pointer + adcs $h1,$h1,r1 + add $s1,$r1,$r1,lsr#2 + adcs $h2,$h2,r2 +#endif + add $s2,$r2,$r2,lsr#2 + adcs $h3,$h3,r3 + add $s3,$r3,$r3,lsr#2 + + umull r2,r3,$h1,$r0 + adc $h4,$h4,#0 + umull r0,r1,$h0,$r0 + umlal r2,r3,$h4,$s1 + umlal r0,r1,$h3,$s1 + ldr $r1,[sp,#20] @ reload $r1 + umlal r2,r3,$h2,$s3 + umlal r0,r1,$h1,$s3 + umlal r2,r3,$h3,$s2 + umlal r0,r1,$h2,$s2 + umlal r2,r3,$h0,$r1 + str r0,[sp,#0] @ future $h0 + mul r0,$s2,$h4 + ldr $r2,[sp,#24] @ reload $r2 + adds r2,r2,r1 @ d1+=d0>>32 + eor r1,r1,r1 + adc lr,r3,#0 @ future $h2 + str r2,[sp,#4] @ future $h1 + + mul r2,$s3,$h4 + eor r3,r3,r3 + umlal r0,r1,$h3,$s3 + ldr $r3,[sp,#28] @ reload $r3 + umlal r2,r3,$h3,$r0 + umlal r0,r1,$h2,$r0 + umlal r2,r3,$h2,$r1 + umlal r0,r1,$h1,$r1 + umlal r2,r3,$h1,$r2 + umlal r0,r1,$h0,$r2 + umlal r2,r3,$h0,$r3 + ldr $h0,[sp,#0] + mul $h4,$r0,$h4 + ldr $h1,[sp,#4] + + adds $h2,lr,r0 @ d2+=d1>>32 + ldr lr,[sp,#8] @ reload input pointer + adc r1,r1,#0 + adds $h3,r2,r1 @ d3+=d2>>32 + ldr r0,[sp,#16] @ reload end pointer + adc r3,r3,#0 + add $h4,$h4,r3 @ h4+=d3>>32 + + and r1,$h4,#-4 + and $h4,$h4,#3 + add r1,r1,r1,lsr#2 @ *=5 + adds $h0,$h0,r1 + adcs $h1,$h1,#0 + adcs $h2,$h2,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 + + cmp r0,lr @ done yet? + bhi .Loop + + ldr $ctx,[sp,#12] + add sp,sp,#32 + stmdb $ctx,{$h0-$h4} @ store the result + +.Lno_data: +#if __ARM_ARCH__>=5 + ldmia sp!,{r3-r11,pc} +#else + ldmia sp!,{r3-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce)=map("r$_",(0..2)); +my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); +my $g4=$ctx; + +$code.=<<___; +.type poly1305_emit,%function +.align 5 +poly1305_emit: +.Lpoly1305_emit: + stmdb sp!,{r4-r11} + + ldmia $ctx,{$h0-$h4} + +#if __ARM_ARCH__>=7 + ldr ip,[$ctx,#36] @ is_base2_26 + + adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 + mov $g1,$h1,lsr#6 + adcs $g1,$g1,$h2,lsl#20 + mov $g2,$h2,lsr#12 + adcs $g2,$g2,$h3,lsl#14 + mov $g3,$h3,lsr#18 + adcs $g3,$g3,$h4,lsl#8 + mov $g4,#0 + adc $g4,$g4,$h4,lsr#24 + + tst ip,ip + itttt ne + movne $h0,$g0 + movne $h1,$g1 + movne $h2,$g2 + movne $h3,$g3 + it ne + movne $h4,$g4 +#endif + + adds $g0,$h0,#5 @ compare to modulus + adcs $g1,$h1,#0 + adcs $g2,$h2,#0 + adcs $g3,$h3,#0 + adc $g4,$h4,#0 + tst $g4,#4 @ did it carry/borrow? 
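For reference, the tail of poly1305_emit above — compute h + 5, test whether the result reaches 2^130 (i.e. h >= p = 2^130 - 5), keep h or the reduced value accordingly, then add the 128-bit nonce — corresponds to roughly the following C. This is only an illustrative sketch with made-up names, not code from the patch; the assembly uses conditional moves where the sketch branches, and the "did it carry/borrow?" flags set just above are consumed by the movne selection that follows below.

#include <stdint.h>
#include <string.h>

/* Finalize a Poly1305 accumulator held as four 32-bit words h[0..3] plus
 * a small high word h[4]: reduce modulo p = 2^130 - 5, then add the nonce
 * modulo 2^128 and emit the tag little endian. */
static void poly1305_emit_sketch(uint32_t h[5], uint8_t mac[16],
				 const uint32_t nonce[4])
{
	uint32_t g[5];
	uint64_t c = 5;
	int i;

	/* g = h + 5; bit 2 of g[4] set means h + 5 >= 2^130, i.e. h >= p */
	for (i = 0; i < 5; i++) {
		c += h[i];
		g[i] = (uint32_t)c;
		c >>= 32;
	}
	if (g[4] & 4)		/* the "did it carry/borrow?" test above */
		memcpy(h, g, 4 * sizeof(h[0]));

	/* mac = (h + nonce) mod 2^128, stored little endian */
	c = 0;
	for (i = 0; i < 4; i++) {
		c += (uint64_t)h[i] + nonce[i];
		mac[4 * i + 0] = (uint8_t)c;
		mac[4 * i + 1] = (uint8_t)(c >> 8);
		mac[4 * i + 2] = (uint8_t)(c >> 16);
		mac[4 * i + 3] = (uint8_t)(c >> 24);
		c >>= 32;
	}
}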
+ +#ifdef __thumb2__ + it ne +#endif + movne $h0,$g0 + ldr $g0,[$nonce,#0] +#ifdef __thumb2__ + it ne +#endif + movne $h1,$g1 + ldr $g1,[$nonce,#4] +#ifdef __thumb2__ + it ne +#endif + movne $h2,$g2 + ldr $g2,[$nonce,#8] +#ifdef __thumb2__ + it ne +#endif + movne $h3,$g3 + ldr $g3,[$nonce,#12] + + adds $h0,$h0,$g0 + adcs $h1,$h1,$g1 + adcs $h2,$h2,$g2 + adc $h3,$h3,$g3 + +#if __ARM_ARCH__>=7 +# ifdef __ARMEB__ + rev $h0,$h0 + rev $h1,$h1 + rev $h2,$h2 + rev $h3,$h3 +# endif + str $h0,[$mac,#0] + str $h1,[$mac,#4] + str $h2,[$mac,#8] + str $h3,[$mac,#12] +#else + strb $h0,[$mac,#0] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#4] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#8] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#12] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#1] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#5] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#9] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#13] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#2] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#6] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#10] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#14] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#3] + strb $h1,[$mac,#7] + strb $h2,[$mac,#11] + strb $h3,[$mac,#15] +#endif + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_emit,.-poly1305_emit +___ +{ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); +my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); +my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); + +my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.fpu neon + +.type poly1305_init_neon,%function +.align 5 +poly1305_init_neon: +.Lpoly1305_init_neon: + ldr r3,[$ctx,#48] @ first table element + cmp r3,#-1 @ is value impossible? 
+ bne .Lno_init_neon + + ldr r4,[$ctx,#20] @ load key base 2^32 + ldr r5,[$ctx,#24] + ldr r6,[$ctx,#28] + ldr r7,[$ctx,#32] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + and r3,r3,#0x03ffffff + and r4,r4,#0x03ffffff + and r5,r5,#0x03ffffff + + vdup.32 $R0,r2 @ r^1 in both lanes + add r2,r3,r3,lsl#2 @ *5 + vdup.32 $R1,r3 + add r3,r4,r4,lsl#2 + vdup.32 $S1,r2 + vdup.32 $R2,r4 + add r4,r5,r5,lsl#2 + vdup.32 $S2,r3 + vdup.32 $R3,r5 + add r5,r6,r6,lsl#2 + vdup.32 $S3,r4 + vdup.32 $R4,r6 + vdup.32 $S4,r5 + + mov $zeros,#2 @ counter + +.Lsquare_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + + vmull.u32 $D0,$R0,${R0}[1] + vmull.u32 $D1,$R1,${R0}[1] + vmull.u32 $D2,$R2,${R0}[1] + vmull.u32 $D3,$R3,${R0}[1] + vmull.u32 $D4,$R4,${R0}[1] + + vmlal.u32 $D0,$R4,${S1}[1] + vmlal.u32 $D1,$R0,${R1}[1] + vmlal.u32 $D2,$R1,${R1}[1] + vmlal.u32 $D3,$R2,${R1}[1] + vmlal.u32 $D4,$R3,${R1}[1] + + vmlal.u32 $D0,$R3,${S2}[1] + vmlal.u32 $D1,$R4,${S2}[1] + vmlal.u32 $D3,$R1,${R2}[1] + vmlal.u32 $D2,$R0,${R2}[1] + vmlal.u32 $D4,$R2,${R2}[1] + + vmlal.u32 $D0,$R2,${S3}[1] + vmlal.u32 $D3,$R0,${R3}[1] + vmlal.u32 $D1,$R3,${S3}[1] + vmlal.u32 $D2,$R4,${S3}[1] + vmlal.u32 $D4,$R1,${R3}[1] + + vmlal.u32 $D3,$R4,${S4}[1] + vmlal.u32 $D0,$R1,${S4}[1] + vmlal.u32 $D1,$R2,${S4}[1] + vmlal.u32 $D2,$R3,${S4}[1] + vmlal.u32 $D4,$R0,${R4}[1] + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + @ and P. Schwabe + @ + @ H0>>+H1>>+H2>>+H3>>+H4 + @ H3>>+H4>>*5+H0>>+H1 + @ + @ Trivia. + @ + @ Result of multiplication of n-bit number by m-bit number is + @ n+m bits wide. However! Even though 2^n is a n+1-bit number, + @ m-bit number multiplied by 2^n is still n+m bits wide. + @ + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit + @ one is n+1 bits wide. + @ + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 + @ can be 27. However! In cases when their width exceeds 26 bits + @ they are limited by 2^26+2^6. This in turn means that *sum* + @ of the products with these values can still be viewed as sum + @ of 52-bit numbers as long as the amount of addends is not a + @ power of 2. For example, + @ + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, + @ + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than + @ 8 * (2^52) or 2^55. However, the value is then multiplied by + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), + @ which is less than 32 * (2^52) or 2^57. And when processing + @ data we are looking at triple as many addends... + @ + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 + @ instruction accepts 2x32-bit input and writes 2x64-bit result. 
+ @ This means that result of reduction have to be compressed upon + @ loop wrap-around. This can be done in the process of reduction + @ to minimize amount of instructions [as well as amount of + @ 128-bit instructions, which benefits low-end processors], but + @ one has to watch for H2 (which is narrower than H0) and 5*H4 + @ not being wider than 58 bits, so that result of right shift + @ by 26 bits fits in 32 bits. This is also useful on x86, + @ because it allows to use paddd in place for paddq, which + @ benefits Atom, where paddq is ridiculously slow. + + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vbic.i32 $D4#lo,#0xfc000000 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vbic.i32 $D2#lo,#0xfc000000 + + vshr.u32 $T0#lo,$D0#lo,#26 + vbic.i32 $D0#lo,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + + subs $zeros,$zeros,#1 + beq .Lsquare_break_neon + + add $tbl0,$ctx,#(48+0*9*4) + add $tbl1,$ctx,#(48+1*9*4) + + vtrn.32 $R0,$D0#lo @ r^2:r^1 + vtrn.32 $R2,$D2#lo + vtrn.32 $R3,$D3#lo + vtrn.32 $R1,$D1#lo + vtrn.32 $R4,$D4#lo + + vshl.u32 $S2,$R2,#2 @ *5 + vshl.u32 $S3,$R3,#2 + vshl.u32 $S1,$R1,#2 + vshl.u32 $S4,$R4,#2 + vadd.i32 $S2,$S2,$R2 + vadd.i32 $S1,$S1,$R1 + vadd.i32 $S3,$S3,$R3 + vadd.i32 $S4,$S4,$R4 + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0,:32] + vst1.32 {${S4}[1]},[$tbl1,:32] + + b .Lsquare_neon + +.align 4 +.Lsquare_break_neon: + add $tbl0,$ctx,#(48+2*4*9) + add $tbl1,$ctx,#(48+3*4*9) + + vmov $R0,$D0#lo @ r^4:r^3 + vshl.u32 $S1,$D1#lo,#2 @ *5 + vmov $R1,$D1#lo + vshl.u32 $S2,$D2#lo,#2 + vmov $R2,$D2#lo + vshl.u32 $S3,$D3#lo,#2 + vmov $R3,$D3#lo + vshl.u32 $S4,$D4#lo,#2 + vmov $R4,$D4#lo + vadd.i32 $S1,$S1,$D1#lo + vadd.i32 $S2,$S2,$D2#lo + vadd.i32 $S3,$S3,$D3#lo + vadd.i32 $S4,$S4,$D4#lo + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0] + vst1.32 {${S4}[1]},[$tbl1] + +.Lno_init_neon: + ret @ bx lr +.size poly1305_init_neon,.-poly1305_init_neon + +.globl poly1305_blocks_neon +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: +.Lpoly1305_blocks_neon: + ldr ip,[$ctx,#36] @ is_base2_26 + + cmp $len,#64 + blo .Lpoly1305_blocks + + stmdb sp!,{r4-r7} + vstmdb sp!,{d8-d15} @ ABI specification says so + + tst ip,ip @ is_base2_26? 
+ bne .Lbase2_26_neon + + stmdb sp!,{r1-r3,lr} + bl .Lpoly1305_init_neon + + ldr r4,[$ctx,#0] @ load hash value base 2^32 + ldr r5,[$ctx,#4] + ldr r6,[$ctx,#8] + ldr r7,[$ctx,#12] + ldr ip,[$ctx,#16] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + veor $D0#lo,$D0#lo,$D0#lo + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + veor $D1#lo,$D1#lo,$D1#lo + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + veor $D2#lo,$D2#lo,$D2#lo + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + veor $D3#lo,$D3#lo,$D3#lo + and r3,r3,#0x03ffffff + orr r6,r6,ip,lsl#24 + veor $D4#lo,$D4#lo,$D4#lo + and r4,r4,#0x03ffffff + mov r1,#1 + and r5,r5,#0x03ffffff + str r1,[$ctx,#36] @ set is_base2_26 + + vmov.32 $D0#lo[0],r2 + vmov.32 $D1#lo[0],r3 + vmov.32 $D2#lo[0],r4 + vmov.32 $D3#lo[0],r5 + vmov.32 $D4#lo[0],r6 + adr $zeros,.Lzeros + + ldmia sp!,{r1-r3,lr} + b .Lhash_loaded + +.align 4 +.Lbase2_26_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ load hash value + + veor $D0#lo,$D0#lo,$D0#lo + veor $D1#lo,$D1#lo,$D1#lo + veor $D2#lo,$D2#lo,$D2#lo + veor $D3#lo,$D3#lo,$D3#lo + veor $D4#lo,$D4#lo,$D4#lo + vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + adr $zeros,.Lzeros + vld1.32 {$D4#lo[0]},[$ctx] + sub $ctx,$ctx,#16 @ rewind + +.Lhash_loaded: + add $in2,$inp,#32 + mov $padbit,$padbit,lsl#24 + tst $len,#31 + beq .Leven + + vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! + vmov.32 $H4#lo[0],$padbit + sub $len,$len,#16 + add $in2,$inp,#32 + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3#lo,$H3#lo,#18 + + vsri.u32 $H3#lo,$H2#lo,#14 + vshl.u32 $H2#lo,$H2#lo,#12 + vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi + + vbic.i32 $H3#lo,#0xfc000000 + vsri.u32 $H2#lo,$H1#lo,#20 + vshl.u32 $H1#lo,$H1#lo,#6 + + vbic.i32 $H2#lo,#0xfc000000 + vsri.u32 $H1#lo,$H0#lo,#26 + vadd.i32 $H3#hi,$H3#lo,$D3#lo + + vbic.i32 $H0#lo,#0xfc000000 + vbic.i32 $H1#lo,#0xfc000000 + vadd.i32 $H2#hi,$H2#lo,$D2#lo + + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + + mov $tbl1,$zeros + add $tbl0,$ctx,#48 + + cmp $len,$len + b .Long_tail + +.align 4 +.Leven: + subs $len,$len,#64 + it lo + movlo $in2,$zeros + + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + itt hi + addhi $tbl1,$ctx,#(48+1*9*4) + addhi $tbl0,$ctx,#(48+3*9*4) + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3,$H3,#18 + + vsri.u32 $H3,$H2,#14 + vshl.u32 $H2,$H2,#12 + + vbic.i32 $H3,#0xfc000000 + vsri.u32 $H2,$H1,#20 + vshl.u32 $H1,$H1,#6 + + vbic.i32 $H2,#0xfc000000 + vsri.u32 $H1,$H0,#26 + + vbic.i32 $H0,#0xfc000000 + vbic.i32 $H1,#0xfc000000 + + bls .Lskip_loop + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
+ b .Loop_neon + +.align 5 +.Loop_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + @ \___________________/ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + @ \___________________/ \____________________/ + @ + @ Note that we start with inp[2:3]*r^2. This is because it + @ doesn't depend on reduction in previous iteration. + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ inp[2:3]*r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] + vmull.u32 $D2,$H2#hi,${R0}[1] + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,${R0}[1] + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,${R0}[1] + vmlal.u32 $D2,$H1#hi,${R1}[1] + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,${R0}[1] + + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,${R0}[1] + subs $len,$len,#64 + vmlal.u32 $D0,$H4#hi,${S1}[1] + it lo + movlo $in2,$zeros + vmlal.u32 $D3,$H2#hi,${R1}[1] + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D1,$H0#hi,${R1}[1] + vmlal.u32 $D4,$H3#hi,${R1}[1] + + vmlal.u32 $D0,$H3#hi,${S2}[1] + vmlal.u32 $D3,$H1#hi,${R2}[1] + vmlal.u32 $D4,$H2#hi,${R2}[1] + vmlal.u32 $D1,$H4#hi,${S2}[1] + vmlal.u32 $D2,$H0#hi,${R2}[1] + + vmlal.u32 $D3,$H0#hi,${R3}[1] + vmlal.u32 $D0,$H2#hi,${S3}[1] + vmlal.u32 $D4,$H1#hi,${R3}[1] + vmlal.u32 $D1,$H3#hi,${S3}[1] + vmlal.u32 $D2,$H4#hi,${S3}[1] + + vmlal.u32 $D3,$H4#hi,${S4}[1] + vmlal.u32 $D0,$H1#hi,${S4}[1] + vmlal.u32 $D4,$H0#hi,${R4}[1] + vmlal.u32 $D1,$H2#hi,${S4}[1] + vmlal.u32 $D2,$H3#hi,${S4}[1] + + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4 and accumulate + + vmlal.u32 $D3,$H3#lo,${R0}[0] + vmlal.u32 $D0,$H0#lo,${R0}[0] + vmlal.u32 $D4,$H4#lo,${R0}[0] + vmlal.u32 $D1,$H1#lo,${R0}[0] + vmlal.u32 $D2,$H2#lo,${R0}[0] + vld1.32 ${S4}[0],[$tbl0,:32] + + vmlal.u32 $D3,$H2#lo,${R1}[0] + vmlal.u32 $D0,$H4#lo,${S1}[0] + vmlal.u32 $D4,$H3#lo,${R1}[0] + vmlal.u32 $D1,$H0#lo,${R1}[0] + vmlal.u32 $D2,$H1#lo,${R1}[0] + + vmlal.u32 $D3,$H1#lo,${R2}[0] + vmlal.u32 $D0,$H3#lo,${S2}[0] + vmlal.u32 $D4,$H2#lo,${R2}[0] + vmlal.u32 $D1,$H4#lo,${S2}[0] + vmlal.u32 $D2,$H0#lo,${R2}[0] + + vmlal.u32 $D3,$H0#lo,${R3}[0] + vmlal.u32 $D0,$H2#lo,${S3}[0] + vmlal.u32 $D4,$H1#lo,${R3}[0] + vmlal.u32 $D1,$H3#lo,${S3}[0] + vmlal.u32 $D3,$H4#lo,${S4}[0] + + vmlal.u32 $D2,$H4#lo,${S3}[0] + vmlal.u32 $D0,$H1#lo,${S4}[0] + vmlal.u32 $D4,$H0#lo,${R4}[0] + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vmlal.u32 $D1,$H2#lo,${S4}[0] + vmlal.u32 $D2,$H3#lo,${S4}[0] + + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 + vrev32.8 $H3,$H3 +# endif + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction interleaved with base 2^32 -> base 2^26 of + @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. 
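The "base 2^32 -> base 2^26" conversion referred to throughout this file (and spelled out by the and/lsr/orr sequences in poly1305_init_neon and poly1305_blocks_neon above) resplits a 130-bit value from four 32-bit words plus a high word into five 26-bit limbs. A rough C equivalent, with an illustrative function name that is not part of the patch:

#include <stdint.h>

/* Split a value held as four 32-bit words w[0..3] plus its bits above 128
 * ('hi') into five 26-bit limbs, mirroring the scalar conversion code in
 * the assembly above. */
static void base2_32_to_base2_26(const uint32_t w[4], uint32_t hi,
				 uint32_t limb[5])
{
	limb[0] =   w[0]                        & 0x03ffffff;
	limb[1] = ((w[0] >> 26) | (w[1] <<  6)) & 0x03ffffff;
	limb[2] = ((w[1] >> 20) | (w[2] << 12)) & 0x03ffffff;
	limb[3] = ((w[2] >> 14) | (w[3] << 18)) & 0x03ffffff;
	limb[4] =  (w[3] >>  8) | (hi   << 24);
}

With 26-bit limbs, a product of two limbs stays at 52 bits, so the 64-bit vmlal accumulators can sum several such products before a carry pass is needed — the headroom that the lazy-reduction bounds in the comment block above quantify.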
+ + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vshl.u32 $H3,$H3,#18 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vsri.u32 $H3,$H2,#14 + vbic.i32 $D4#lo,#0xfc000000 + vshl.u32 $H2,$H2,#12 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vbic.i32 $H3,#0xfc000000 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] + vsri.u32 $H2,$H1,#20 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vshl.u32 $H1,$H1,#6 + vbic.i32 $D2#lo,#0xfc000000 + vbic.i32 $H2,#0xfc000000 + + vshrn.u64 $T0#lo,$D0,#26 @ re-narrow + vmovn.i64 $D0#lo,$D0 + vsri.u32 $H1,$H0,#26 + vbic.i32 $H0,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vbic.i32 $D0#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + vbic.i32 $H1,#0xfc000000 + + bhi .Loop_neon + +.Lskip_loop: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + add $tbl1,$ctx,#(48+0*9*4) + add $tbl0,$ctx,#(48+1*9*4) + adds $len,$len,#32 + it ne + movne $len,#0 + bne .Long_tail + + vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H3#hi,$H3#lo,$D3#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + vadd.i32 $H4#hi,$H4#lo,$D4#lo + +.Long_tail: + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant + vmull.u32 $D2,$H2#hi,$R0 + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,$R0 + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,$R0 + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,$R0 + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,$R0 + + vmlal.u32 $D0,$H4#hi,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vmlal.u32 $D3,$H2#hi,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#hi,$R1 + vmlal.u32 $D4,$H3#hi,$R1 + vmlal.u32 $D2,$H1#hi,$R1 + + vmlal.u32 $D3,$H1#hi,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#hi,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#hi,$R2 + vmlal.u32 $D1,$H4#hi,$S2 + vmlal.u32 $D2,$H0#hi,$R2 + + vmlal.u32 $D3,$H0#hi,$R3 + it ne + addne $tbl1,$ctx,#(48+2*9*4) + vmlal.u32 $D0,$H2#hi,$S3 + it ne + addne $tbl0,$ctx,#(48+3*9*4) + vmlal.u32 $D4,$H1#hi,$R3 + vmlal.u32 $D1,$H3#hi,$S3 + vmlal.u32 $D2,$H4#hi,$S3 + + vmlal.u32 $D3,$H4#hi,$S4 + vorn $MASK,$MASK,$MASK @ all-ones, can be redundant + vmlal.u32 $D0,$H1#hi,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#hi,$R4 + vmlal.u32 $D1,$H2#hi,$S4 + vmlal.u32 $D2,$H3#hi,$S4 + + beq .Lshort_tail + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4:r^3 and accumulate + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + + vmlal.u32 $D2,$H2#lo,$R0 + vmlal.u32 $D0,$H0#lo,$R0 + vmlal.u32 $D3,$H3#lo,$R0 + vmlal.u32 $D1,$H1#lo,$R0 + vmlal.u32 $D4,$H4#lo,$R0 + + vmlal.u32 $D0,$H4#lo,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 
+ vmlal.u32 $D3,$H2#lo,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#lo,$R1 + vmlal.u32 $D4,$H3#lo,$R1 + vmlal.u32 $D2,$H1#lo,$R1 + + vmlal.u32 $D3,$H1#lo,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#lo,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#lo,$R2 + vmlal.u32 $D1,$H4#lo,$S2 + vmlal.u32 $D2,$H0#lo,$R2 + + vmlal.u32 $D3,$H0#lo,$R3 + vmlal.u32 $D0,$H2#lo,$S3 + vmlal.u32 $D4,$H1#lo,$R3 + vmlal.u32 $D1,$H3#lo,$S3 + vmlal.u32 $D2,$H4#lo,$S3 + + vmlal.u32 $D3,$H4#lo,$S4 + vorn $MASK,$MASK,$MASK @ all-ones + vmlal.u32 $D0,$H1#lo,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#lo,$R4 + vmlal.u32 $D1,$H2#lo,$S4 + vmlal.u32 $D2,$H3#lo,$S4 + +.Lshort_tail: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ horizontal addition + + vadd.i64 $D3#lo,$D3#lo,$D3#hi + vadd.i64 $D0#lo,$D0#lo,$D0#hi + vadd.i64 $D4#lo,$D4#lo,$D4#hi + vadd.i64 $D1#lo,$D1#lo,$D1#hi + vadd.i64 $D2#lo,$D2#lo,$D2#hi + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction, but without narrowing + + vshr.u64 $T0,$D3,#26 + vand.i64 $D3,$D3,$MASK + vshr.u64 $T1,$D0,#26 + vand.i64 $D0,$D0,$MASK + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + + vshr.u64 $T0,$D4,#26 + vand.i64 $D4,$D4,$MASK + vshr.u64 $T1,$D1,#26 + vand.i64 $D1,$D1,$MASK + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + + vadd.i64 $D0,$D0,$T0 + vshl.u64 $T0,$T0,#2 + vshr.u64 $T1,$D2,#26 + vand.i64 $D2,$D2,$MASK + vadd.i64 $D0,$D0,$T0 @ h4 -> h0 + vadd.i64 $D3,$D3,$T1 @ h2 -> h3 + + vshr.u64 $T0,$D0,#26 + vand.i64 $D0,$D0,$MASK + vshr.u64 $T1,$D3,#26 + vand.i64 $D3,$D3,$MASK + vadd.i64 $D1,$D1,$T0 @ h0 -> h1 + vadd.i64 $D4,$D4,$T1 @ h3 -> h4 + + cmp $len,#0 + bne .Leven + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ store hash value + + vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + vst1.32 {$D4#lo[0]},[$ctx] + + vldmia sp!,{d8-d15} @ epilogue + ldmia sp!,{r4-r7} + ret @ bx lr +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +#ifndef __KERNEL__ +.LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else +.word OPENSSL_armcap_P-.Lpoly1305_init +# endif +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif +#endif +___ +} } +$code.=<<___; +.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm" +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} +close STDOUT; # enforce flush diff --git a/lib/crypto/arm/poly1305-glue.c b/lib/crypto/arm/poly1305-glue.c new file mode 100644 index 000000000000..2d86c78af883 --- /dev/null +++ b/lib/crypto/arm/poly1305-glue.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM + * + * Copyright (C) 2019 Linaro Ltd. 
<ard.biesheuvel@linaro.org> + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/poly1305.h> +#include <linux/cpufeature.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/unaligned.h> + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + len = round_down(len, POLY1305_BLOCK_SIZE); + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(may_use_simd())) { + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(state, src, todo, padbit); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); + } else + poly1305_blocks_arm(state, src, len, padbit); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +bool poly1305_is_arch_optimized(void) +{ + /* We always can use at least the ARM scalar implementation. */ + return true; +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init arm_poly1305_mod_init(void) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + (elf_hwcap & HWCAP_NEON)) + static_branch_enable(&have_neon); + return 0; +} +subsys_initcall(arm_poly1305_mod_init); + +static void __exit arm_poly1305_mod_exit(void) +{ +} +module_exit(arm_poly1305_mod_exit); + +MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm/sha1-armv4-large.S b/lib/crypto/arm/sha1-armv4-large.S new file mode 100644 index 000000000000..1c8b685149f2 --- /dev/null +++ b/lib/crypto/arm/sha1-armv4-large.S @@ -0,0 +1,507 @@ +#define __ARM_ARCH__ __LINUX_ARM_ARCH__ +@ SPDX-License-Identifier: GPL-2.0 + +@ This code is taken from the OpenSSL project but the author (Andy Polyakov) +@ has relicensed it under the GPLv2. Therefore this program is free software; +@ you can redistribute it and/or modify it under the terms of the GNU General +@ Public License version 2 as published by the Free Software Foundation. +@ +@ The original headers, including the original license headers, are +@ included below for completeness. + +@ ==================================================================== +@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see https://www.openssl.org/~appro/cryptogams/. +@ ==================================================================== + +@ sha1_block procedure for ARMv4. +@ +@ January 2007. 
+ +@ Size/performance trade-off +@ ==================================================================== +@ impl size in bytes comp cycles[*] measured performance +@ ==================================================================== +@ thumb 304 3212 4420 +@ armv4-small 392/+29% 1958/+64% 2250/+96% +@ armv4-compact 740/+89% 1552/+26% 1840/+22% +@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***] +@ full unroll ~5100/+260% ~1260/+4% ~1300/+5% +@ ==================================================================== +@ thumb = same as 'small' but in Thumb instructions[**] and +@ with recurring code in two private functions; +@ small = detached Xload/update, loops are folded; +@ compact = detached Xload/update, 5x unroll; +@ large = interleaved Xload/update, 5x unroll; +@ full unroll = interleaved Xload/update, full unroll, estimated[!]; +@ +@ [*] Manually counted instructions in "grand" loop body. Measured +@ performance is affected by prologue and epilogue overhead, +@ i-cache availability, branch penalties, etc. +@ [**] While each Thumb instruction is twice smaller, they are not as +@ diverse as ARM ones: e.g., there are only two arithmetic +@ instructions with 3 arguments, no [fixed] rotate, addressing +@ modes are limited. As result it takes more instructions to do +@ the same job in Thumb, therefore the code is never twice as +@ small and always slower. +@ [***] which is also ~35% better than compiler generated code. Dual- +@ issue Cortex A8 core was measured to process input block in +@ ~990 cycles. + +@ August 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 13% improvement on +@ Cortex A8 core and in absolute terms ~870 cycles per input block +@ [or 13.6 cycles per byte]. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 10% +@ improvement on Cortex A8 core and 12.2 cycles per byte. + +#include <linux/linkage.h> + +.text + +.align 2 +ENTRY(sha1_block_data_order) + stmdb sp!,{r4-r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + ldmia r0,{r3,r4,r5,r6,r7} +.Lloop: + ldr r8,.LK_00_19 + mov r14,sp + sub sp,sp,#15*4 + mov r5,r5,ror#30 + mov r6,r6,ror#30 + mov r7,r7,ror#30 @ [6] +.L_00_15: +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r6,r8,r6,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r4,r5 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r6,r8,r6,ror#2 @ E+=K_00_19 + eor r10,r4,r5 @ F_xx_xx + add r6,r6,r7,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r3,r10,ror#2 + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! 
+ add r6,r6,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r5,r8,r5,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r3,r4 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r5,r8,r5,ror#2 @ E+=K_00_19 + eor r10,r3,r4 @ F_xx_xx + add r5,r5,r6,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r7,r10,ror#2 + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r5,r5,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r4,r8,r4,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r7,r3 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r4,r8,r4,ror#2 @ E+=K_00_19 + eor r10,r7,r3 @ F_xx_xx + add r4,r4,r5,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r6,r10,ror#2 + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r4,r4,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r3,r8,r3,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r6,r7 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r3,r8,r3,ror#2 @ E+=K_00_19 + eor r10,r6,r7 @ F_xx_xx + add r3,r3,r4,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r5,r10,ror#2 + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r3,r3,r10 @ E+=F_00_19(B,C,D) + cmp r14,sp + bne .L_00_15 @ [((11+4)*5+2)*3] + sub sp,sp,#25*4 +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + add r6,r6,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ and r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + add r5,r5,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + add r4,r4,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + add r3,r3,r10 @ E+=F_00_19(B,C,D) + + ldr r8,.LK_20_39 @ [+15+16*4] + cmn sp,#0 @ [+3], clear carry to denote 20_39 +.L_20_39_or_60_79: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r4,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ eor r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_20_39(B,C,D) + ARM( teq r14,sp ) @ preserve carry + THUMB( mov r11,sp ) + THUMB( teq r14,r11 ) @ preserve carry + bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] + bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes + + ldr r8,.LK_40_59 + sub sp,sp,#20*4 @ [+2] +.L_40_59: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r4,r10,ror#2 @ F_xx_xx + and r11,r5,r6 @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_40_59(B,C,D) + add r7,r7,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + and r11,r4,r5 @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_40_59(B,C,D) + add r6,r6,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + and r11,r3,r4 @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_40_59(B,C,D) + add r5,r5,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + and r11,r7,r3 @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_40_59(B,C,D) + add r4,r4,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ and r10,r5,r10,ror#2 @ F_xx_xx + and r11,r6,r7 @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_40_59(B,C,D) + add r3,r3,r11,ror#2 + cmp r14,sp + bne .L_40_59 @ [+((12+5)*5+2)*4] + + ldr r8,.LK_60_79 + sub sp,sp,#20*4 + cmp sp,#0 @ set carry to denote 60_79 + b .L_20_39_or_60_79 @ [+4], spare 300 bytes +.L_done: + add sp,sp,#80*4 @ "deallocate" stack frame + ldmia r0,{r8,r9,r10,r11,r12} + add r3,r8,r3 + add r4,r9,r4 + add r5,r10,r5,ror#2 + add r6,r11,r6,ror#2 + add r7,r12,r7,ror#2 + stmia r0,{r3,r4,r5,r6,r7} + teq r1,r2 + bne .Lloop @ [+18], total 1307 + + ldmia sp!,{r4-r12,pc} +.align 2 +.LK_00_19: .word 0x5a827999 +.LK_20_39: .word 0x6ed9eba1 +.LK_40_59: .word 0x8f1bbcdc +.LK_60_79: .word 0xca62c1d6 +ENDPROC(sha1_block_data_order) +.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>" +.align 2 diff --git a/lib/crypto/arm/sha1-armv7-neon.S b/lib/crypto/arm/sha1-armv7-neon.S new file mode 100644 index 000000000000..6edba3ab62e8 --- /dev/null +++ b/lib/crypto/arm/sha1-armv7-neon.S @@ -0,0 +1,633 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function + * + * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +.syntax unified +.fpu neon + +.text + + +/* Context structure */ + +#define state_h0 0 +#define state_h1 4 +#define state_h2 8 +#define state_h3 12 +#define state_h4 16 + + +/* Constants */ + +#define K1 0x5A827999 +#define K2 0x6ED9EBA1 +#define K3 0x8F1BBCDC +#define K4 0xCA62C1D6 +.align 4 +.LK_VEC: +.LK1: .long K1, K1, K1, K1 +.LK2: .long K2, K2, K2, K2 +.LK3: .long K3, K3, K3, K3 +.LK4: .long K4, K4, K4, K4 + + +/* Register macros */ + +#define RSTATE r0 +#define RDATA r1 +#define RNBLKS r2 +#define ROLDSTACK r3 +#define RWK lr + +#define _a r4 +#define _b r5 +#define _c r6 +#define _d r7 +#define _e r8 + +#define RT0 r9 +#define RT1 r10 +#define RT2 r11 +#define RT3 r12 + +#define W0 q0 +#define W1 q7 +#define W2 q2 +#define W3 q3 +#define W4 q4 +#define W5 q6 +#define W6 q5 +#define W7 q1 + +#define tmp0 q8 +#define tmp1 q9 +#define tmp2 q10 +#define tmp3 q11 + +#define qK1 q12 +#define qK2 q13 +#define qK3 q14 +#define qK4 q15 + +#ifdef CONFIG_CPU_BIG_ENDIAN +#define ARM_LE(code...) +#else +#define ARM_LE(code...) code +#endif + +/* Round function macros. 
*/ + +#define WK_offs(i) (((i) & 15) * 4) + +#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ldr RT3, [sp, WK_offs(i)]; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + bic RT0, d, b; \ + add e, e, a, ror #(32 - 5); \ + and RT1, c, b; \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add RT0, RT0, RT3; \ + add e, e, RT1; \ + ror b, #(32 - 30); \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT0; + +#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ldr RT3, [sp, WK_offs(i)]; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + eor RT0, d, b; \ + add e, e, a, ror #(32 - 5); \ + eor RT0, RT0, c; \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT3; \ + ror b, #(32 - 30); \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT0; \ + +#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ldr RT3, [sp, WK_offs(i)]; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + eor RT0, b, c; \ + and RT1, b, c; \ + add e, e, a, ror #(32 - 5); \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + and RT0, RT0, d; \ + add RT1, RT1, RT3; \ + add e, e, RT0; \ + ror b, #(32 - 30); \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT1; + +#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define R(a,b,c,d,e,f,i) \ + _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define dummy(...) + + +/* Input expansion macros. 
*/ + +/********* Precalc macros for rounds 0-15 *************************************/ + +#define W_PRECALC_00_15() \ + add RWK, sp, #(WK_offs(0)); \ + \ + vld1.32 {W0, W7}, [RDATA]!; \ + ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \ + vld1.32 {W6, W5}, [RDATA]!; \ + vadd.u32 tmp0, W0, curK; \ + ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \ + ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \ + vadd.u32 tmp1, W7, curK; \ + ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \ + vadd.u32 tmp2, W6, curK; \ + vst1.32 {tmp0, tmp1}, [RWK]!; \ + vadd.u32 tmp3, W5, curK; \ + vst1.32 {tmp2, tmp3}, [RWK]; \ + +#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vld1.32 {W0, W7}, [RDATA]!; \ + +#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + add RWK, sp, #(WK_offs(0)); \ + +#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \ + +#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vld1.32 {W6, W5}, [RDATA]!; \ + +#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp0, W0, curK; \ + +#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \ + +#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \ + +#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp1, W7, curK; \ + +#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \ + +#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp2, W6, curK; \ + +#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vst1.32 {tmp0, tmp1}, [RWK]!; \ + +#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp3, W5, curK; \ + +#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vst1.32 {tmp2, tmp3}, [RWK]; \ + + +/********* Precalc macros for rounds 16-31 ************************************/ + +#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp0, tmp0; \ + vext.8 W, W_m16, W_m12, #8; \ + +#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + add RWK, sp, #(WK_offs(i)); \ + vext.8 tmp0, W_m04, tmp0, #4; \ + +#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp0, tmp0, W_m16; \ + veor.32 W, W, W_m08; \ + +#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp1, tmp1; \ + veor W, W, tmp0; \ + +#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshl.u32 tmp0, W, #1; \ + +#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vext.8 tmp1, tmp1, W, #(16-12); \ + vshr.u32 W, W, #31; \ + +#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vorr tmp0, tmp0, W; \ + vshr.u32 W, tmp1, #30; \ + +#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshl.u32 tmp1, tmp1, #2; \ + +#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp0, tmp0, W; \ + +#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, tmp0, tmp1; \ + +#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp0, W, curK; \ + +#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + 
vst1.32 {tmp0}, [RWK]; + + +/********* Precalc macros for rounds 32-79 ************************************/ + +#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, W_m28; \ + +#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vext.8 tmp0, W_m08, W_m04, #8; \ + +#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, W_m16; \ + +#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, tmp0; \ + +#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + add RWK, sp, #(WK_offs(i&~3)); \ + +#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshl.u32 tmp1, W, #2; \ + +#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshr.u32 tmp0, W, #30; \ + +#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vorr W, tmp0, tmp1; \ + +#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp0, W, curK; \ + +#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vst1.32 {tmp0}, [RWK]; + + +/* + * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA. + * + * void sha1_transform_neon(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +.align 3 +ENTRY(sha1_transform_neon) + /* input: + * r0: state + * r1: data (64*nblocks bytes) + * r2: nblocks + */ + + cmp RNBLKS, #0; + beq .Ldo_nothing; + + push {r4-r12, lr}; + /*vpush {q4-q7};*/ + + adr RT3, .LK_VEC; + + mov ROLDSTACK, sp; + + /* Align stack. */ + sub RT0, sp, #(16*4); + and RT0, #(~(16-1)); + mov sp, RT0; + + vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */ + + /* Get the values of the chaining variables. */ + ldm RSTATE, {_a-_e}; + + vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */ + +#undef curK +#define curK qK1 + /* Precalc 0-15. */ + W_PRECALC_00_15(); + +.Loop: + /* Transform 0-15 + Precalc 16-31. 
*/ + _R( _a, _b, _c, _d, _e, F1, 0, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16, + W4, W5, W6, W7, W0, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 1, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16, + W4, W5, W6, W7, W0, _, _, _ ); + _R( _d, _e, _a, _b, _c, F1, 2, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16, + W4, W5, W6, W7, W0, _, _, _ ); + _R( _c, _d, _e, _a, _b, F1, 3, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16, + W4, W5, W6, W7, W0, _, _, _ ); + +#undef curK +#define curK qK2 + _R( _b, _c, _d, _e, _a, F1, 4, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20, + W3, W4, W5, W6, W7, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 5, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20, + W3, W4, W5, W6, W7, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 6, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20, + W3, W4, W5, W6, W7, _, _, _ ); + _R( _d, _e, _a, _b, _c, F1, 7, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20, + W3, W4, W5, W6, W7, _, _, _ ); + + _R( _c, _d, _e, _a, _b, F1, 8, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24, + W2, W3, W4, W5, W6, _, _, _ ); + _R( _b, _c, _d, _e, _a, F1, 9, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24, + W2, W3, W4, W5, W6, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 10, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24, + W2, W3, W4, W5, W6, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 11, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24, + W2, W3, W4, W5, W6, _, _, _ ); + + _R( _d, _e, _a, _b, _c, F1, 12, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28, + W1, W2, W3, W4, W5, _, _, _ ); + _R( _c, _d, _e, _a, _b, F1, 13, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28, + W1, W2, W3, W4, W5, _, _, _ ); + _R( _b, _c, _d, _e, _a, F1, 14, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28, + W1, W2, W3, W4, W5, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 15, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28, + W1, W2, W3, W4, W5, _, _, _ ); + + /* Transform 16-63 + Precalc 32-79. 
*/ + _R( _e, _a, _b, _c, _d, F1, 16, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _d, _e, _a, _b, _c, F1, 17, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _c, _d, _e, _a, _b, F1, 18, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _b, _c, _d, _e, _a, F1, 19, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + + _R( _a, _b, _c, _d, _e, F2, 20, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _e, _a, _b, _c, _d, F2, 21, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _d, _e, _a, _b, _c, F2, 22, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _c, _d, _e, _a, _b, F2, 23, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + +#undef curK +#define curK qK3 + _R( _b, _c, _d, _e, _a, F2, 24, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _a, _b, _c, _d, _e, F2, 25, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _e, _a, _b, _c, _d, F2, 26, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _d, _e, _a, _b, _c, F2, 27, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + + _R( _c, _d, _e, _a, _b, F2, 28, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _b, _c, _d, _e, _a, F2, 29, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _a, _b, _c, _d, _e, F2, 30, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _e, _a, _b, _c, _d, F2, 31, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + + _R( _d, _e, _a, _b, _c, F2, 32, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + _R( _c, _d, _e, _a, _b, F2, 33, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + _R( _b, _c, _d, _e, _a, F2, 34, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + _R( _a, _b, _c, _d, _e, F2, 35, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + + _R( _e, _a, _b, _c, _d, F2, 36, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + _R( _d, _e, _a, _b, _c, F2, 37, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + _R( _c, _d, _e, _a, _b, F2, 38, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + _R( _b, _c, _d, _e, _a, F2, 39, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + + _R( _a, _b, _c, _d, _e, F3, 40, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + _R( _e, _a, _b, _c, _d, F3, 41, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + _R( _d, _e, _a, _b, _c, F3, 42, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + _R( _c, _d, _e, _a, _b, F3, 43, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + +#undef curK +#define curK qK4 + _R( _b, _c, _d, 
_e, _a, F3, 44, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + _R( _a, _b, _c, _d, _e, F3, 45, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + _R( _e, _a, _b, _c, _d, F3, 46, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + _R( _d, _e, _a, _b, _c, F3, 47, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + + _R( _c, _d, _e, _a, _b, F3, 48, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _b, _c, _d, _e, _a, F3, 49, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _a, _b, _c, _d, _e, F3, 50, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _e, _a, _b, _c, _d, F3, 51, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + + _R( _d, _e, _a, _b, _c, F3, 52, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _c, _d, _e, _a, _b, F3, 53, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _b, _c, _d, _e, _a, F3, 54, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _a, _b, _c, _d, _e, F3, 55, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + + _R( _e, _a, _b, _c, _d, F3, 56, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _d, _e, _a, _b, _c, F3, 57, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _c, _d, _e, _a, _b, F3, 58, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _b, _c, _d, _e, _a, F3, 59, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + + subs RNBLKS, #1; + + _R( _a, _b, _c, _d, _e, F4, 60, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _e, _a, _b, _c, _d, F4, 61, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _d, _e, _a, _b, _c, F4, 62, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _c, _d, _e, _a, _b, F4, 63, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + + beq .Lend; + + /* Transform 64-79 + Precalc 0-15 of next block. 
*/ +#undef curK +#define curK qK1 + _R( _b, _c, _d, _e, _a, F4, 64, + WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _a, _b, _c, _d, _e, F4, 65, + WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _e, _a, _b, _c, _d, F4, 66, + WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _d, _e, _a, _b, _c, F4, 67, + WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _c, _d, _e, _a, _b, F4, 68, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 69, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _a, _b, _c, _d, _e, F4, 70, + WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _e, _a, _b, _c, _d, F4, 71, + WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _d, _e, _a, _b, _c, F4, 72, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _c, _d, _e, _a, _b, F4, 73, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 74, + WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _a, _b, _c, _d, _e, F4, 75, + WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _e, _a, _b, _c, _d, F4, 76, + WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _d, _e, _a, _b, _c, F4, 77, + WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _c, _d, _e, _a, _b, F4, 78, + WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 79, + WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ ); + + /* Update the chaining variables. */ + ldm RSTATE, {RT0-RT3}; + add _a, RT0; + ldr RT0, [RSTATE, #state_h4]; + add _b, RT1; + add _c, RT2; + add _d, RT3; + add _e, RT0; + stm RSTATE, {_a-_e}; + + b .Loop; + +.Lend: + /* Transform 64-79 */ + R( _b, _c, _d, _e, _a, F4, 64 ); + R( _a, _b, _c, _d, _e, F4, 65 ); + R( _e, _a, _b, _c, _d, F4, 66 ); + R( _d, _e, _a, _b, _c, F4, 67 ); + R( _c, _d, _e, _a, _b, F4, 68 ); + R( _b, _c, _d, _e, _a, F4, 69 ); + R( _a, _b, _c, _d, _e, F4, 70 ); + R( _e, _a, _b, _c, _d, F4, 71 ); + R( _d, _e, _a, _b, _c, F4, 72 ); + R( _c, _d, _e, _a, _b, F4, 73 ); + R( _b, _c, _d, _e, _a, F4, 74 ); + R( _a, _b, _c, _d, _e, F4, 75 ); + R( _e, _a, _b, _c, _d, F4, 76 ); + R( _d, _e, _a, _b, _c, F4, 77 ); + R( _c, _d, _e, _a, _b, F4, 78 ); + R( _b, _c, _d, _e, _a, F4, 79 ); + + mov sp, ROLDSTACK; + + /* Update the chaining variables. */ + ldm RSTATE, {RT0-RT3}; + add _a, RT0; + ldr RT0, [RSTATE, #state_h4]; + add _b, RT1; + add _c, RT2; + add _d, RT3; + /*vpop {q4-q7};*/ + add _e, RT0; + stm RSTATE, {_a-_e}; + + pop {r4-r12, pc}; + +.Ldo_nothing: + bx lr +ENDPROC(sha1_transform_neon) diff --git a/lib/crypto/arm/sha1-ce-core.S b/lib/crypto/arm/sha1-ce-core.S new file mode 100644 index 000000000000..2de40dd25e47 --- /dev/null +++ b/lib/crypto/arm/sha1-ce-core.S @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2015 Linaro Ltd. 
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a + .fpu crypto-neon-fp-armv8 + + k0 .req q0 + k1 .req q1 + k2 .req q2 + k3 .req q3 + + ta0 .req q4 + ta1 .req q5 + tb0 .req q5 + tb1 .req q4 + + dga .req q6 + dgb .req q7 + dgbs .req s28 + + dg0 .req q12 + dg1a0 .req q13 + dg1a1 .req q14 + dg1b0 .req q14 + dg1b1 .req q13 + + .macro add_only, op, ev, rc, s0, dg1 + .ifnb \s0 + vadd.u32 tb\ev, q\s0, \rc + .endif + sha1h.32 dg1b\ev, dg0 + .ifb \dg1 + sha1\op\().32 dg0, dg1a\ev, ta\ev + .else + sha1\op\().32 dg0, \dg1, ta\ev + .endif + .endm + + .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 + sha1su0.32 q\s0, q\s1, q\s2 + add_only \op, \ev, \rc, \s1, \dg1 + sha1su1.32 q\s0, q\s3 + .endm + + .align 6 +.Lsha1_rcon: + .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 + .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 + .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc + .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 + + /* + * void sha1_ce_transform(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +ENTRY(sha1_ce_transform) + /* load round constants */ + adr ip, .Lsha1_rcon + vld1.32 {k0-k1}, [ip, :128]! + vld1.32 {k2-k3}, [ip, :128] + + /* load state */ + vld1.32 {dga}, [r0] + vldr dgbs, [r0, #16] + + /* load input */ +0: vld1.32 {q8-q9}, [r1]! + vld1.32 {q10-q11}, [r1]! + subs r2, r2, #1 + +#ifndef CONFIG_CPU_BIG_ENDIAN + vrev32.8 q8, q8 + vrev32.8 q9, q9 + vrev32.8 q10, q10 + vrev32.8 q11, q11 +#endif + + vadd.u32 ta0, q8, k0 + vmov dg0, dga + + add_update c, 0, k0, 8, 9, 10, 11, dgb + add_update c, 1, k0, 9, 10, 11, 8 + add_update c, 0, k0, 10, 11, 8, 9 + add_update c, 1, k0, 11, 8, 9, 10 + add_update c, 0, k1, 8, 9, 10, 11 + + add_update p, 1, k1, 9, 10, 11, 8 + add_update p, 0, k1, 10, 11, 8, 9 + add_update p, 1, k1, 11, 8, 9, 10 + add_update p, 0, k1, 8, 9, 10, 11 + add_update p, 1, k2, 9, 10, 11, 8 + + add_update m, 0, k2, 10, 11, 8, 9 + add_update m, 1, k2, 11, 8, 9, 10 + add_update m, 0, k2, 8, 9, 10, 11 + add_update m, 1, k2, 9, 10, 11, 8 + add_update m, 0, k3, 10, 11, 8, 9 + + add_update p, 1, k3, 11, 8, 9, 10 + add_only p, 0, k3, 9 + add_only p, 1, k3, 10 + add_only p, 0, k3, 11 + add_only p, 1 + + /* update state */ + vadd.u32 dga, dga, dg0 + vadd.u32 dgb, dgb, dg1a0 + bne 0b + + /* store new state */ + vst1.32 {dga}, [r0] + vstr dgbs, [r0, #16] + bx lr +ENDPROC(sha1_ce_transform) diff --git a/lib/crypto/arm/sha1.h b/lib/crypto/arm/sha1.h new file mode 100644 index 000000000000..fa1e92419000 --- /dev/null +++ b/lib/crypto/arm/sha1.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for ARM + * + * Copyright 2025 Google LLC + */ +#include <asm/neon.h> +#include <asm/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +asmlinkage void sha1_block_data_order(struct sha1_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha1_transform_neon(struct sha1_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha1_ce_transform(struct sha1_block_state *state, + const u8 *data, size_t nblocks); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(may_use_simd())) { + kernel_neon_begin(); + if (static_branch_likely(&have_ce)) + sha1_ce_transform(state, data, nblocks); + else + 
sha1_transform_neon(state, data, nblocks); + kernel_neon_end(); + } else { + sha1_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) { + static_branch_enable(&have_neon); + if (elf_hwcap2 & HWCAP2_SHA1) + static_branch_enable(&have_ce); + } +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm/sha256-armv4.pl b/lib/crypto/arm/sha256-armv4.pl new file mode 100644 index 000000000000..f3a2b54efd4e --- /dev/null +++ b/lib/crypto/arm/sha256-armv4.pl @@ -0,0 +1,724 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA256 block procedure for ARMv4. May 2007. + +# Performance is ~2x better than gcc 3.4 generated code and in "abso- +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +# byte [on single-issue Xscale PXA250 core]. + +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 22% improvement on +# Cortex A8 core and ~20 cycles per processed byte. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 16% +# improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +# September 2013. +# +# Add NEON implementation. On Cortex A8 it was measured to process one +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +# code (meaning that latter performs sub-optimally, nothing was done +# about it). + +# May 2014. +# +# Add ARMv8 code path performing at 2.0 cpb on Apple A7. 
+ +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$ctx="r0"; $t0="r0"; +$inp="r1"; $t4="r1"; +$len="r2"; $t1="r2"; +$T1="r3"; $t3="r3"; +$A="r4"; +$B="r5"; +$C="r6"; +$D="r7"; +$E="r8"; +$F="r9"; +$G="r10"; +$H="r11"; +@V=($A,$B,$C,$D,$E,$F,$G,$H); +$t2="r12"; +$Ktbl="r14"; + +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); + +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___ if ($i<16); +#if __ARM_ARCH__>=7 + @ ldr $t1,[$inp],#4 @ $i +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +# ifndef __ARMEB__ + rev $t1,$t1 +# endif +#else + @ ldrb $t1,[$inp,#3] @ $i + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + ldrb $t2,[$inp,#2] + ldrb $t0,[$inp,#1] + orr $t1,$t1,$t2,lsl#8 + ldrb $t2,[$inp],#4 + orr $t1,$t1,$t0,lsl#16 +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + orr $t1,$t1,$t2,lsl#24 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +#endif +___ +$code.=<<___; + ldr $t2,[$Ktbl],#4 @ *K256++ + add $h,$h,$t1 @ h+=X[i] + str $t1,[sp,#`$i%16`*4] + eor $t1,$f,$g + add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) + and $t1,$t1,$e + add $h,$h,$t2 @ h+=K256[i] + eor $t1,$t1,$g @ Ch(e,f,g) + eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` + add $h,$h,$t1 @ h+=Ch(e,f,g) +#if $i==31 + and $t2,$t2,#0xff + cmp $t2,#0xf2 @ done? +#endif +#if $i<15 +# if __ARM_ARCH__>=7 + ldr $t1,[$inp],#4 @ prefetch +# else + ldrb $t1,[$inp,#3] +# endif + eor $t2,$a,$b @ a^b, b^c in next round +#else + ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx + eor $t2,$a,$b @ a^b, b^c in next round + ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx +#endif + eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) + and $t3,$t3,$t2 @ (b^c)&=(a^b) + add $d,$d,$h @ d+=h + eor $t3,$t3,$b @ Maj(a,b,c) + add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) + @ add $h,$h,$t3 @ h+=Maj(a,b,c) +___ + ($t2,$t3)=($t3,$t2); +} + +sub BODY_16_XX { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i + @ ldr $t4,[sp,#`($i+14)%16`*4] + mov $t0,$t1,ror#$sigma0[0] + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + mov $t2,$t4,ror#$sigma1[0] + eor $t0,$t0,$t1,ror#$sigma0[1] + eor $t2,$t2,$t4,ror#$sigma1[1] + eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) + ldr $t1,[sp,#`($i+0)%16`*4] + eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14]) + ldr $t4,[sp,#`($i+9)%16`*4] + + add $t2,$t2,$t0 + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 + add $t1,$t1,$t2 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) + add $t1,$t1,$t4 @ X[i] +___ + &BODY_00_15(@_); +} + +$code=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +.text +#if __ARM_ARCH__<7 +.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else +.code 32 +# endif +#endif + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 
0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha256_block_data_order +#endif +.align 5 + +.global sha256_block_data_order +.type sha256_block_data_order,%function +sha256_block_data_order: +.Lsha256_block_data_order: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ sha256_block_data_order +#else + adr r3,.Lsha256_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#ARMV8_SHA256 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} + ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} + sub $Ktbl,r3,#256+32 @ K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH__>=7 + ldr $t1,[$inp],#4 +# else + ldrb $t1,[$inp,#3] +# endif + eor $t3,$B,$C @ magic + eor $t2,$t2,$t2 +___ +for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=".Lrounds_16_xx:\n"; +for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +#if __ARM_ARCH__>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq $t3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t0,[$t3,#0] + ldr $t1,[$t3,#4] + ldr $t2,[$t3,#8] + add $A,$A,$t0 + ldr $t0,[$t3,#12] + add $B,$B,$t1 + ldr $t1,[$t3,#16] + add $C,$C,$t2 + ldr $t2,[$t3,#20] + add $D,$D,$t0 + ldr $t0,[$t3,#24] + add $E,$E,$t1 + ldr $t1,[$t3,#28] + add $F,$F,$t2 + ldr $inp,[sp,#17*4] @ pull inp + ldr $t2,[sp,#18*4] @ pull inp+len + add $G,$G,$t0 + add $H,$H,$t1 + stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H} + cmp $inp,$t2 + sub $Ktbl,$Ktbl,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#`16+3`*4 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha256_block_data_order,.-sha256_block_data_order +___ +###################################################################### +# NEON stuff +# +{{{ +my @X=map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); +my $Xfer=$t4; +my $j=0; + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 
($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + while($#insns>=2) { eval(shift(@insns)); } + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vrev32_8 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. 
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&eor ($t1,$f,$g)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&and ($t1,$t1,$e)', + '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&eor ($t1,$t1,$g)', # Ch(e,f,g) + '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&ldr ($t1,"[sp,#64]") if ($j==31)', + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&add ($d,$d,$h)', # d+=h + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a) + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global sha256_block_data_order_neon +.type sha256_block_data_order_neon,%function +.align 4 +sha256_block_data_order_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + + sub $H,sp,#16*4+16 + adr $Ktbl,.Lsha256_block_data_order + sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256 + bic $H,$H,#15 @ align for 128-bit stores + mov $t2,sp + mov sp,$H @ alloca + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + + vld1.8 {@X[0]},[$inp]! + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! + vld1.32 {$T0},[$Ktbl,:128]! + vld1.32 {$T1},[$Ktbl,:128]! + vld1.32 {$T2},[$Ktbl,:128]! + vld1.32 {$T3},[$Ktbl,:128]! + vrev32.8 @X[0],@X[0] @ yes, even on + str $ctx,[sp,#64] + vrev32.8 @X[1],@X[1] @ big-endian + str $inp,[sp,#68] + mov $Xfer,sp + vrev32.8 @X[2],@X[2] + str $len,[sp,#72] + vrev32.8 @X[3],@X[3] + str $t2,[sp,#76] @ save original sp + vadd.i32 $T0,$T0,@X[0] + vadd.i32 $T1,$T1,@X[1] + vst1.32 {$T0},[$Xfer,:128]! + vadd.i32 $T2,$T2,@X[2] + vst1.32 {$T1},[$Xfer,:128]! + vadd.i32 $T3,$T3,@X[3] + vst1.32 {$T2},[$Xfer,:128]! + vst1.32 {$T3},[$Xfer,:128]! + + ldmia $ctx,{$A-$H} + sub $Xfer,$Xfer,#64 + ldr $t1,[sp,#0] + eor $t2,$t2,$t2 + eor $t3,$B,$C + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + teq $t1,#0 @ check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + ldr $inp,[sp,#68] + ldr $t0,[sp,#72] + sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl + teq $inp,$t0 + it eq + subeq $inp,$inp,#64 @ avoid SEGV + vld1.8 {@X[0]},[$inp]! @ load next input block + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! 
+ it ne + strne $inp,[sp,#68] + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + ldr $t0,[$t1,#0] + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t2,[$t1,#4] + ldr $t3,[$t1,#8] + ldr $t4,[$t1,#12] + add $A,$A,$t0 @ accumulate + ldr $t0,[$t1,#16] + add $B,$B,$t2 + ldr $t2,[$t1,#20] + add $C,$C,$t3 + ldr $t3,[$t1,#24] + add $D,$D,$t4 + ldr $t4,[$t1,#28] + add $E,$E,$t0 + str $A,[$t1],#4 + add $F,$F,$t2 + str $B,[$t1],#4 + add $G,$G,$t3 + str $C,[$t1],#4 + add $H,$H,$t4 + str $D,[$t1],#4 + stmia $t1,{$E-$H} + + ittte ne + movne $Xfer,sp + ldrne $t1,[sp,#0] + eorne $t2,$t2,$t2 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne $t3,$B,$C + bne .L_00_48 + + ldmia sp!,{r4-r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +___ +}}} +###################################################################### +# ARMv8 stuff +# +{{{ +my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2)); +my @MSG=map("q$_",(8..11)); +my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15)); +my $Ktbl="r3"; + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + +# ifdef __thumb2__ +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + +.type sha256_block_data_order_armv8,%function +.align 5 +sha256_block_data_order_armv8: +.LARMv8: + vld1.32 {$ABCD,$EFGH},[$ctx] +# ifdef __thumb2__ + adr $Ktbl,.LARMv8 + sub $Ktbl,$Ktbl,#.LARMv8-K256 +# else + adrl $Ktbl,K256 +# endif + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + +.Loop_v8: + vld1.8 {@MSG[0]-@MSG[1]},[$inp]! + vld1.8 {@MSG[2]-@MSG[3]},[$inp]! + vld1.32 {$W0},[$Ktbl]! + vrev32.8 @MSG[0],@MSG[0] + vrev32.8 @MSG[1],@MSG[1] + vrev32.8 @MSG[2],@MSG[2] + vrev32.8 @MSG[3],@MSG[3] + vmov $ABCD_SAVE,$ABCD @ offload + vmov $EFGH_SAVE,$EFGH + teq $inp,$len +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + vld1.32 {$W1},[$Ktbl]! + vadd.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + vld1.32 {$W1},[$Ktbl]! + vadd.i32 $W0,$W0,@MSG[0] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + vld1.32 {$W0},[$Ktbl]! 
+ vadd.i32 $W1,$W1,@MSG[1] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + vld1.32 {$W1},[$Ktbl] + vadd.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#256-16 @ rewind + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + vadd.i32 $W1,$W1,@MSG[3] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + vadd.i32 $ABCD,$ABCD,$ABCD_SAVE + vadd.i32 $EFGH,$EFGH,$EFGH_SAVE + it ne + bne .Loop_v8 + + vst1.32 {$ABCD,$EFGH},[$ctx] + + ret @ bx lr +.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 +#endif +___ +}}} +$code.=<<___; +.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +{ my %opcode = ( + "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40, + "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } +} + +foreach (split($/,$code)) { + + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo; + + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} + +close STDOUT; # enforce flush diff --git a/lib/crypto/arm/sha256-ce.S b/lib/crypto/arm/sha256-ce.S new file mode 100644 index 000000000000..7481ac8e6c0d --- /dev/null +++ b/lib/crypto/arm/sha256-ce.S @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2015 Linaro Ltd. + * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a + .fpu crypto-neon-fp-armv8 + + k0 .req q7 + k1 .req q8 + rk .req r3 + + ta0 .req q9 + ta1 .req q10 + tb0 .req q10 + tb1 .req q9 + + dga .req q11 + dgb .req q12 + + dg0 .req q13 + dg1 .req q14 + dg2 .req q15 + + .macro add_only, ev, s0 + vmov dg2, dg0 + .ifnb \s0 + vld1.32 {k\ev}, [rk, :128]! 
+ .endif + sha256h.32 dg0, dg1, tb\ev + sha256h2.32 dg1, dg2, tb\ev + .ifnb \s0 + vadd.u32 ta\ev, q\s0, k\ev + .endif + .endm + + .macro add_update, ev, s0, s1, s2, s3 + sha256su0.32 q\s0, q\s1 + add_only \ev, \s1 + sha256su1.32 q\s0, q\s2, q\s3 + .endm + + .align 6 +.Lsha256_rcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + + /* + * void sha256_ce_transform(struct sha256_block_state *state, + * const u8 *data, size_t nblocks); + */ +ENTRY(sha256_ce_transform) + /* load state */ + vld1.32 {dga-dgb}, [r0] + + /* load input */ +0: vld1.32 {q0-q1}, [r1]! + vld1.32 {q2-q3}, [r1]! + subs r2, r2, #1 + +#ifndef CONFIG_CPU_BIG_ENDIAN + vrev32.8 q0, q0 + vrev32.8 q1, q1 + vrev32.8 q2, q2 + vrev32.8 q3, q3 +#endif + + /* load first round constant */ + adr rk, .Lsha256_rcon + vld1.32 {k0}, [rk, :128]! + + vadd.u32 ta0, q0, k0 + vmov dg0, dga + vmov dg1, dgb + + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + + add_only 1, 1 + add_only 0, 2 + add_only 1, 3 + add_only 0 + + /* update state */ + vadd.u32 dga, dga, dg0 + vadd.u32 dgb, dgb, dg1 + bne 0b + + /* store new state */ + vst1.32 {dga-dgb}, [r0] + bx lr +ENDPROC(sha256_ce_transform) diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h new file mode 100644 index 000000000000..da75cbdc51d4 --- /dev/null +++ b/lib/crypto/arm/sha256.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 optimized for ARM + * + * Copyright 2025 Google LLC + */ +#include <asm/neon.h> +#include <crypto/internal/simd.h> + +asmlinkage void sha256_block_data_order(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha256_ce_transform(struct sha256_block_state *state, + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && crypto_simd_usable()) { + kernel_neon_begin(); + if (static_branch_likely(&have_ce)) + sha256_ce_transform(state, data, nblocks); + else + sha256_block_data_order_neon(state, data, nblocks); + kernel_neon_end(); + } else { + sha256_block_data_order(state, data, nblocks); + } +} + +#ifdef 
CONFIG_KERNEL_MODE_NEON +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) { + static_branch_enable(&have_neon); + if (elf_hwcap2 & HWCAP2_SHA2) + static_branch_enable(&have_ce); + } +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm/sha512-armv4.pl b/lib/crypto/arm/sha512-armv4.pl new file mode 100644 index 000000000000..2fc3516912fa --- /dev/null +++ b/lib/crypto/arm/sha512-armv4.pl @@ -0,0 +1,657 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA512 block procedure for ARMv4. September 2007. + +# This code is ~4.5 (four and a half) times faster than code generated +# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue +# Xscale PXA250 core]. +# +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 6% improvement on +# Cortex A8 core and ~40 cycles per processed byte. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Coxtex A8 core and ~38 cycles per byte. + +# March 2011. +# +# Add NEON implementation. On Cortex A8 it was measured to process +# one byte in 23.3 cycles or ~60% faster than integer-only code. + +# August 2012. +# +# Improve NEON performance by 12% on Snapdragon S4. In absolute +# terms it's 22.6 cycles per byte, which is disappointing result. +# Technical writers asserted that 3-way S4 pipeline can sustain +# multiple NEON instructions per cycle, but dual NEON issue could +# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html +# for further details. On side note Cortex-A15 processes one byte in +# 16 cycles. + +# Byte order [in]dependence. ========================================= +# +# Originally caller was expected to maintain specific *dword* order in +# h[0-7], namely with most significant dword at *lower* address, which +# was reflected in below two parameters as 0 and 4. Now caller is +# expected to maintain native byte order for whole 64-bit values. 
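
As a minimal, hypothetical C sketch (not part of the patch) of what this convention means in memory: each 64-bit state word is kept in native byte order, and the LO/HI offsets defined just below simply select its least- and most-significant 32-bit halves. The sha512_state_half() helper name is made up for illustration.

#include <stdint.h>
#include <string.h>

/* Mirror the __ARMEL__/__ARMEB__ definitions emitted by the script below. */
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define HI 0	/* big endian: most significant dword at the lower address */
#define LO 4
#else
#define LO 0	/* little endian: least significant dword comes first */
#define HI 4
#endif

/* Fetch the LO or HI 32-bit half of the native-endian state word h[i]. */
static uint32_t sha512_state_half(const uint64_t *h, int i, int off)
{
	uint32_t half;

	memcpy(&half, (const uint8_t *)&h[i] + off, sizeof(half));
	return half;
}

On a little-endian build, sha512_state_half(h, 4, LO) returns the low word of h[4], which is what the scalar code below loads with "ldr $Elo,[$ctx,#$Eoff+$lo]".
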
+$hi="HI"; +$lo="LO"; +# ==================================================================== + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$ctx="r0"; # parameter block +$inp="r1"; +$len="r2"; + +$Tlo="r3"; +$Thi="r4"; +$Alo="r5"; +$Ahi="r6"; +$Elo="r7"; +$Ehi="r8"; +$t0="r9"; +$t1="r10"; +$t2="r11"; +$t3="r12"; +############ r13 is stack pointer +$Ktbl="r14"; +############ r15 is program counter + +$Aoff=8*0; +$Boff=8*1; +$Coff=8*2; +$Doff=8*3; +$Eoff=8*4; +$Foff=8*5; +$Goff=8*6; +$Hoff=8*7; +$Xoff=8*8; + +sub BODY_00_15() { +my $magic = shift; +$code.=<<___; + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov $t0,$Elo,lsr#14 + str $Tlo,[sp,#$Xoff+0] + mov $t1,$Ehi,lsr#14 + str $Thi,[sp,#$Xoff+4] + eor $t0,$t0,$Ehi,lsl#18 + ldr $t2,[sp,#$Hoff+0] @ h.lo + eor $t1,$t1,$Elo,lsl#18 + ldr $t3,[sp,#$Hoff+4] @ h.hi + eor $t0,$t0,$Elo,lsr#18 + eor $t1,$t1,$Ehi,lsr#18 + eor $t0,$t0,$Ehi,lsl#14 + eor $t1,$t1,$Elo,lsl#14 + eor $t0,$t0,$Ehi,lsr#9 + eor $t1,$t1,$Elo,lsr#9 + eor $t0,$t0,$Elo,lsl#23 + eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) + adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#$Foff+0] @ f.lo + adc $Thi,$Thi,$t1 @ T += Sigma1(e) + ldr $t1,[sp,#$Foff+4] @ f.hi + adds $Tlo,$Tlo,$t2 + ldr $t2,[sp,#$Goff+0] @ g.lo + adc $Thi,$Thi,$t3 @ T += h + ldr $t3,[sp,#$Goff+4] @ g.hi + + eor $t0,$t0,$t2 + str $Elo,[sp,#$Eoff+0] + eor $t1,$t1,$t3 + str $Ehi,[sp,#$Eoff+4] + and $t0,$t0,$Elo + str $Alo,[sp,#$Aoff+0] + and $t1,$t1,$Ehi + str $Ahi,[sp,#$Aoff+4] + eor $t0,$t0,$t2 + ldr $t2,[$Ktbl,#$lo] @ K[i].lo + eor $t1,$t1,$t3 @ Ch(e,f,g) + ldr $t3,[$Ktbl,#$hi] @ K[i].hi + + adds $Tlo,$Tlo,$t0 + ldr $Elo,[sp,#$Doff+0] @ d.lo + adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) + ldr $Ehi,[sp,#$Doff+4] @ d.hi + adds $Tlo,$Tlo,$t2 + and $t0,$t2,#0xff + adc $Thi,$Thi,$t3 @ T += K[i] + adds $Elo,$Elo,$Tlo + ldr $t2,[sp,#$Boff+0] @ b.lo + adc $Ehi,$Ehi,$Thi @ d += T + teq $t0,#$magic + + ldr $t3,[sp,#$Coff+0] @ c.lo +#if __ARM_ARCH__>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq $Ktbl,$Ktbl,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov $t0,$Alo,lsr#28 + mov $t1,$Ahi,lsr#28 + eor $t0,$t0,$Ahi,lsl#4 + eor $t1,$t1,$Alo,lsl#4 + eor $t0,$t0,$Ahi,lsr#2 + eor $t1,$t1,$Alo,lsr#2 + eor $t0,$t0,$Alo,lsl#30 + eor $t1,$t1,$Ahi,lsl#30 + eor $t0,$t0,$Ahi,lsr#7 + eor $t1,$t1,$Alo,lsr#7 + eor $t0,$t0,$Alo,lsl#25 + eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) + adds $Tlo,$Tlo,$t0 + and $t0,$Alo,$t2 + adc $Thi,$Thi,$t1 @ T += Sigma0(a) + + ldr $t1,[sp,#$Boff+4] @ b.hi + orr $Alo,$Alo,$t2 + ldr $t2,[sp,#$Coff+4] @ c.hi + and $Alo,$Alo,$t3 + and $t3,$Ahi,$t1 + orr $Ahi,$Ahi,$t1 + orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo + and $Ahi,$Ahi,$t2 + adds $Alo,$Alo,$Tlo + orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc $Ahi,$Ahi,$Thi @ h += T + tst $Ktbl,#1 + add $Ktbl,$Ktbl,#8 +___ +} +$code=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +#if __ARM_ARCH__<7 
+.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else +.code 32 +# endif +#endif + +.type K512,%object +.align 5 +K512: +WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) +WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) +WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) +WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) +WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) +WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) +WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) +WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) +WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) +WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) +WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) +WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) +WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) +WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) +WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) +WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) +WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) +WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) +WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) +WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) +WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) +WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) +WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) +WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) +WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) +WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) +WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) +WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) +WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) +WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) +WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) +WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) +WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) +WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) +WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) +WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) +WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) +WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) +WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) +WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) +.size K512,.-K512 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha512_block_data_order +.skip 32-4 +#else +.skip 32 +#endif + +.global sha512_block_data_order +.type sha512_block_data_order,%function +sha512_block_data_order: +.Lsha512_block_data_order: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ sha512_block_data_order +#else + adr r3,.Lsha512_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#1 + bne .LNEON +#endif + add $len,$inp,$len,lsl#7 @ len to point at the end of inp + stmdb sp!,{r4-r12,lr} + sub $Ktbl,r3,#672 @ K512 + sub sp,sp,#9*8 + + ldr $Elo,[$ctx,#$Eoff+$lo] + ldr $Ehi,[$ctx,#$Eoff+$hi] + ldr $t0, [$ctx,#$Goff+$lo] + ldr $t1, [$ctx,#$Goff+$hi] + ldr $t2, [$ctx,#$Hoff+$lo] + ldr $t3, [$ctx,#$Hoff+$hi] +.Loop: + str $t0, [sp,#$Goff+0] + str $t1, [sp,#$Goff+4] + str $t2, [sp,#$Hoff+0] + str $t3, [sp,#$Hoff+4] + ldr $Alo,[$ctx,#$Aoff+$lo] + ldr $Ahi,[$ctx,#$Aoff+$hi] + ldr $Tlo,[$ctx,#$Boff+$lo] + ldr $Thi,[$ctx,#$Boff+$hi] + ldr $t0, [$ctx,#$Coff+$lo] + ldr $t1, [$ctx,#$Coff+$hi] + ldr $t2, [$ctx,#$Doff+$lo] + ldr $t3, [$ctx,#$Doff+$hi] + str $Tlo,[sp,#$Boff+0] + str $Thi,[sp,#$Boff+4] + 
str $t0, [sp,#$Coff+0] + str $t1, [sp,#$Coff+4] + str $t2, [sp,#$Doff+0] + str $t3, [sp,#$Doff+4] + ldr $Tlo,[$ctx,#$Foff+$lo] + ldr $Thi,[$ctx,#$Foff+$hi] + str $Tlo,[sp,#$Foff+0] + str $Thi,[sp,#$Foff+4] + +.L00_15: +#if __ARM_ARCH__<7 + ldrb $Tlo,[$inp,#7] + ldrb $t0, [$inp,#6] + ldrb $t1, [$inp,#5] + ldrb $t2, [$inp,#4] + ldrb $Thi,[$inp,#3] + ldrb $t3, [$inp,#2] + orr $Tlo,$Tlo,$t0,lsl#8 + ldrb $t0, [$inp,#1] + orr $Tlo,$Tlo,$t1,lsl#16 + ldrb $t1, [$inp],#8 + orr $Tlo,$Tlo,$t2,lsl#24 + orr $Thi,$Thi,$t3,lsl#8 + orr $Thi,$Thi,$t0,lsl#16 + orr $Thi,$Thi,$t1,lsl#24 +#else + ldr $Tlo,[$inp,#4] + ldr $Thi,[$inp],#8 +#ifdef __ARMEL__ + rev $Tlo,$Tlo + rev $Thi,$Thi +#endif +#endif +___ + &BODY_00_15(0x94); +$code.=<<___; + tst $Ktbl,#1 + beq .L00_15 + ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] + bic $Ktbl,$Ktbl,#1 +.L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov $Tlo,$t0,lsr#1 + ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] + mov $Thi,$t1,lsr#1 + ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] + eor $Tlo,$Tlo,$t1,lsl#31 + eor $Thi,$Thi,$t0,lsl#31 + eor $Tlo,$Tlo,$t0,lsr#8 + eor $Thi,$Thi,$t1,lsr#8 + eor $Tlo,$Tlo,$t1,lsl#24 + eor $Thi,$Thi,$t0,lsl#24 + eor $Tlo,$Tlo,$t0,lsr#7 + eor $Thi,$Thi,$t1,lsr#7 + eor $Tlo,$Tlo,$t1,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov $t0,$t2,lsr#19 + mov $t1,$t3,lsr#19 + eor $t0,$t0,$t3,lsl#13 + eor $t1,$t1,$t2,lsl#13 + eor $t0,$t0,$t3,lsr#29 + eor $t1,$t1,$t2,lsr#29 + eor $t0,$t0,$t2,lsl#3 + eor $t1,$t1,$t3,lsl#3 + eor $t0,$t0,$t2,lsr#6 + eor $t1,$t1,$t3,lsr#6 + ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] + eor $t0,$t0,$t3,lsl#26 + + ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] + adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#`$Xoff+8*16`+0] + adc $Thi,$Thi,$t1 + + ldr $t1,[sp,#`$Xoff+8*16`+4] + adds $Tlo,$Tlo,$t2 + adc $Thi,$Thi,$t3 + adds $Tlo,$Tlo,$t0 + adc $Thi,$Thi,$t1 +___ + &BODY_00_15(0x17); +$code.=<<___; +#if __ARM_ARCH__>=7 + ittt eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] + beq .L16_79 + bic $Ktbl,$Ktbl,#1 + + ldr $Tlo,[sp,#$Boff+0] + ldr $Thi,[sp,#$Boff+4] + ldr $t0, [$ctx,#$Aoff+$lo] + ldr $t1, [$ctx,#$Aoff+$hi] + ldr $t2, [$ctx,#$Boff+$lo] + ldr $t3, [$ctx,#$Boff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Aoff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Aoff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Boff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Boff+$hi] + + ldr $Alo,[sp,#$Coff+0] + ldr $Ahi,[sp,#$Coff+4] + ldr $Tlo,[sp,#$Doff+0] + ldr $Thi,[sp,#$Doff+4] + ldr $t0, [$ctx,#$Coff+$lo] + ldr $t1, [$ctx,#$Coff+$hi] + ldr $t2, [$ctx,#$Doff+$lo] + ldr $t3, [$ctx,#$Doff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Coff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Coff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Doff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Doff+$hi] + + ldr $Tlo,[sp,#$Foff+0] + ldr $Thi,[sp,#$Foff+4] + ldr $t0, [$ctx,#$Eoff+$lo] + ldr $t1, [$ctx,#$Eoff+$hi] + ldr $t2, [$ctx,#$Foff+$lo] + ldr $t3, [$ctx,#$Foff+$hi] + adds $Elo,$Elo,$t0 + str $Elo,[$ctx,#$Eoff+$lo] + adc $Ehi,$Ehi,$t1 + str $Ehi,[$ctx,#$Eoff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Foff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Foff+$hi] + + ldr $Alo,[sp,#$Goff+0] + ldr $Ahi,[sp,#$Goff+4] + ldr $Tlo,[sp,#$Hoff+0] + ldr $Thi,[sp,#$Hoff+4] + ldr $t0, [$ctx,#$Goff+$lo] + ldr $t1, 
[$ctx,#$Goff+$hi] + ldr $t2, [$ctx,#$Hoff+$lo] + ldr $t3, [$ctx,#$Hoff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Goff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Goff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Hoff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Hoff+$hi] + + add sp,sp,#640 + sub $Ktbl,$Ktbl,#640 + + teq $inp,$len + bne .Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha512_block_data_order,.-sha512_block_data_order +___ + +{ +my @Sigma0=(28,34,39); +my @Sigma1=(14,18,41); +my @sigma0=(1, 8, 7); +my @sigma1=(19,61,6); + +my $Ktbl="r3"; +my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch + +my @X=map("d$_",(0..15)); +my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); + +sub NEON_00_15() { +my $i=shift; +my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; +my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps + +$code.=<<___ if ($i<16 || $i&1); + vshr.u64 $t0,$e,#@Sigma1[0] @ $i +#if $i<16 + vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned +#endif + vshr.u64 $t1,$e,#@Sigma1[1] +#if $i>0 + vadd.i64 $a,$Maj @ h+=Maj from the past +#endif + vshr.u64 $t2,$e,#@Sigma1[2] +___ +$code.=<<___; + vld1.64 {$K},[$Ktbl,:64]! @ K[i++] + vsli.64 $t0,$e,#`64-@Sigma1[0]` + vsli.64 $t1,$e,#`64-@Sigma1[1]` + vmov $Ch,$e + vsli.64 $t2,$e,#`64-@Sigma1[2]` +#if $i<16 && defined(__ARMEL__) + vrev64.8 @X[$i],@X[$i] +#endif + veor $t1,$t0 + vbsl $Ch,$f,$g @ Ch(e,f,g) + vshr.u64 $t0,$a,#@Sigma0[0] + veor $t2,$t1 @ Sigma1(e) + vadd.i64 $T1,$Ch,$h + vshr.u64 $t1,$a,#@Sigma0[1] + vsli.64 $t0,$a,#`64-@Sigma0[0]` + vadd.i64 $T1,$t2 + vshr.u64 $t2,$a,#@Sigma0[2] + vadd.i64 $K,@X[$i%16] + vsli.64 $t1,$a,#`64-@Sigma0[1]` + veor $Maj,$a,$b + vsli.64 $t2,$a,#`64-@Sigma0[2]` + veor $h,$t0,$t1 + vadd.i64 $T1,$K + vbsl $Maj,$c,$b @ Maj(a,b,c) + veor $h,$t2 @ Sigma0(a) + vadd.i64 $d,$T1 + vadd.i64 $Maj,$T1 + @ vadd.i64 $h,$Maj +___ +} + +sub NEON_16_79() { +my $i=shift; + +if ($i&1) { &NEON_00_15($i,@_); return; } + +# 2x-vectorized, therefore runs every 2nd round +my @X=map("q$_",(0..7)); # view @X as 128-bit vector +my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps +my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 +my $e=@_[4]; # $e from NEON_00_15 +$i /= 2; +$code.=<<___; + vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] + vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] + vadd.i64 @_[0],d30 @ h+=Maj from the past + vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] + vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` + vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] + vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` + veor $s1,$t0 + vshr.u64 $t0,$s0,#@sigma0[0] + veor $s1,$t1 @ sigma1(X[i+14]) + vshr.u64 $t1,$s0,#@sigma0[1] + vadd.i64 @X[$i%8],$s1 + vshr.u64 $s1,$s0,#@sigma0[2] + vsli.64 $t0,$s0,#`64-@sigma0[0]` + vsli.64 $t1,$s0,#`64-@sigma0[1]` + vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] + veor $s1,$t0 + vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s0 + vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 + veor $s1,$t1 @ sigma0(X[i+1]) + vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s1 +___ + &NEON_00_15(2*$i,@_); +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global sha512_block_data_order_neon +.type sha512_block_data_order_neon,%function +.align 4 +sha512_block_data_order_neon: +.LNEON: + dmb @ errata #451034 on early Cortex A8 + add $len,$inp,$len,lsl#7 @ len to point at 
the end of inp + VFP_ABI_PUSH + adr $Ktbl,.Lsha512_block_data_order + sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512 + vldmia $ctx,{$A-$H} @ load context +.Loop_neon: +___ +for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mov $cnt,#4 +.L16_79_neon: + subs $cnt,#1 +___ +for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bne .L16_79_neon + + vadd.i64 $A,d30 @ h+=Maj from the past + vldmia $ctx,{d24-d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia $ctx,{$A-$H} @ save context + teq $inp,$len + sub $Ktbl,#640 @ rewind K512 + bne .Loop_neon + + VFP_ABI_POP + ret @ bx lr +.size sha512_block_data_order_neon,.-sha512_block_data_order_neon +#endif +___ +} +$code.=<<___; +.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx lr/gm; + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +print $code; +close STDOUT; # enforce flush diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h new file mode 100644 index 000000000000..f147b6490d6c --- /dev/null +++ b/lib/crypto/arm/sha512.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * arm32-optimized SHA-512 block function + * + * Copyright 2025 Google LLC + */ + +#include <asm/neon.h> +#include <crypto/internal/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +asmlinkage void sha512_block_data_order(struct sha512_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha512_block_data_order_neon(struct sha512_block_state *state, + const u8 *data, size_t nblocks); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(crypto_simd_usable())) { + kernel_neon_begin(); + sha512_block_data_order_neon(state, data, nblocks); + kernel_neon_end(); + } else { + sha512_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (cpu_has_neon()) + static_branch_enable(&have_neon); +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm64/.gitignore b/lib/crypto/arm64/.gitignore new file mode 100644 index 000000000000..f6c4e8ef80da --- /dev/null +++ b/lib/crypto/arm64/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +poly1305-core.S +sha256-core.S +sha512-core.S diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig new file mode 100644 index 000000000000..0b903ef524d8 --- /dev/null +++ b/lib/crypto/arm64/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA20_NEON + tristate + depends on KERNEL_MODE_NEON + default CRYPTO_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_NEON + tristate + depends on KERNEL_MODE_NEON + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/arm64/Makefile b/lib/crypto/arm64/Makefile new file mode 100644 index 000000000000..6207088397a7 --- /dev/null +++ 
b/lib/crypto/arm64/Makefile @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o +chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o + +obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o +poly1305-neon-y := poly1305-core.o poly1305-glue.o +AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch +AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) void $(@) + +$(obj)/%-core.S: $(src)/%-armv8.pl + $(call cmd,perlasm) + +clean-files += poly1305-core.S diff --git a/lib/crypto/arm64/chacha-neon-core.S b/lib/crypto/arm64/chacha-neon-core.S new file mode 100644 index 000000000000..80079586ecc7 --- /dev/null +++ b/lib/crypto/arm64/chacha-neon-core.S @@ -0,0 +1,805 @@ +/* + * ChaCha/HChaCha NEON helper functions + * + * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Originally based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/cache.h> + + .text + .align 6 + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers v0-v3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * The round count is given in w3. 
+ * + * Clobbers: w3, x10, v4, v12 + */ +SYM_FUNC_START_LOCAL(chacha_permute) + + adr_l x10, ROT8 + ld1 {v12.4s}, [x10] + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + ext v1.16b, v1.16b, v1.16b, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + ext v3.16b, v3.16b, v3.16b, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + ext v1.16b, v1.16b, v1.16b, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + ext v3.16b, v3.16b, v3.16b, #4 + + subs w3, w3, #2 + b.ne .Ldoubleround + + ret +SYM_FUNC_END(chacha_permute) + +SYM_FUNC_START(chacha_block_xor_neon) + // x0: Input state matrix, s + // x1: 1 data block output, o + // x2: 1 data block input, i + // w3: nrounds + + stp x29, x30, [sp, #-16]! + mov x29, sp + + // x0..3 = s0..3 + ld1 {v0.4s-v3.4s}, [x0] + ld1 {v8.4s-v11.4s}, [x0] + + bl chacha_permute + + ld1 {v4.16b-v7.16b}, [x2] + + // o0 = i0 ^ (x0 + s0) + add v0.4s, v0.4s, v8.4s + eor v0.16b, v0.16b, v4.16b + + // o1 = i1 ^ (x1 + s1) + add v1.4s, v1.4s, v9.4s + eor v1.16b, v1.16b, v5.16b + + // o2 = i2 ^ (x2 + s2) + add v2.4s, v2.4s, v10.4s + eor v2.16b, v2.16b, v6.16b + + // o3 = i3 ^ (x3 + s3) + add v3.4s, v3.4s, v11.4s + eor v3.16b, v3.16b, v7.16b + + st1 {v0.16b-v3.16b}, [x1] + + ldp x29, x30, [sp], #16 + ret +SYM_FUNC_END(chacha_block_xor_neon) + +SYM_FUNC_START(hchacha_block_neon) + // x0: Input state matrix, s + // x1: output (8 32-bit words) + // w2: nrounds + + stp x29, x30, [sp, #-16]! + mov x29, sp + + ld1 {v0.4s-v3.4s}, [x0] + + mov w3, w2 + bl chacha_permute + + st1 {v0.4s}, [x1], #16 + st1 {v3.4s}, [x1] + + ldp x29, x30, [sp], #16 + ret +SYM_FUNC_END(hchacha_block_neon) + + a0 .req w12 + a1 .req w13 + a2 .req w14 + a3 .req w15 + a4 .req w16 + a5 .req w17 + a6 .req w19 + a7 .req w20 + a8 .req w21 + a9 .req w22 + a10 .req w23 + a11 .req w24 + a12 .req w25 + a13 .req w26 + a14 .req w27 + a15 .req w28 + + .align 6 +SYM_FUNC_START(chacha_4block_xor_neon) + frame_push 10 + + // x0: Input state matrix, s + // x1: 4 data blocks output, o + // x2: 4 data blocks input, i + // w3: nrounds + // x4: byte count + + adr_l x10, .Lpermute + and x5, x4, #63 + add x10, x10, x5 + + // + // This function encrypts four consecutive ChaCha blocks by loading + // the state matrix in NEON registers four times. 
The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. For final XORing step we transpose the + // matrix by interleaving 32- and then 64-bit words, which allows us to + // do XOR in NEON registers. + // + // At the same time, a fifth block is encrypted in parallel using + // scalar registers + // + adr_l x9, CTRINC // ... and ROT8 + ld1 {v30.4s-v31.4s}, [x9] + + // x0..15[0-3] = s0..3[0..3] + add x8, x0, #16 + ld4r { v0.4s- v3.4s}, [x0] + ld4r { v4.4s- v7.4s}, [x8], #16 + ld4r { v8.4s-v11.4s}, [x8], #16 + ld4r {v12.4s-v15.4s}, [x8] + + mov a0, v0.s[0] + mov a1, v1.s[0] + mov a2, v2.s[0] + mov a3, v3.s[0] + mov a4, v4.s[0] + mov a5, v5.s[0] + mov a6, v6.s[0] + mov a7, v7.s[0] + mov a8, v8.s[0] + mov a9, v9.s[0] + mov a10, v10.s[0] + mov a11, v11.s[0] + mov a12, v12.s[0] + mov a13, v13.s[0] + mov a14, v14.s[0] + mov a15, v15.s[0] + + // x12 += counter values 1-4 + add v12.4s, v12.4s, v30.4s + +.Ldoubleround4: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + add v0.4s, v0.4s, v4.4s + add a0, a0, a4 + add v1.4s, v1.4s, v5.4s + add a1, a1, a5 + add v2.4s, v2.4s, v6.4s + add a2, a2, a6 + add v3.4s, v3.4s, v7.4s + add a3, a3, a7 + + eor v12.16b, v12.16b, v0.16b + eor a12, a12, a0 + eor v13.16b, v13.16b, v1.16b + eor a13, a13, a1 + eor v14.16b, v14.16b, v2.16b + eor a14, a14, a2 + eor v15.16b, v15.16b, v3.16b + eor a15, a15, a3 + + rev32 v12.8h, v12.8h + ror a12, a12, #16 + rev32 v13.8h, v13.8h + ror a13, a13, #16 + rev32 v14.8h, v14.8h + ror a14, a14, #16 + rev32 v15.8h, v15.8h + ror a15, a15, #16 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + add v8.4s, v8.4s, v12.4s + add a8, a8, a12 + add v9.4s, v9.4s, v13.4s + add a9, a9, a13 + add v10.4s, v10.4s, v14.4s + add a10, a10, a14 + add v11.4s, v11.4s, v15.4s + add a11, a11, a15 + + eor v16.16b, v4.16b, v8.16b + eor a4, a4, a8 + eor v17.16b, v5.16b, v9.16b + eor a5, a5, a9 + eor v18.16b, v6.16b, v10.16b + eor a6, a6, a10 + eor v19.16b, v7.16b, v11.16b + eor a7, a7, a11 + + shl v4.4s, v16.4s, #12 + shl v5.4s, v17.4s, #12 + shl v6.4s, v18.4s, #12 + shl v7.4s, v19.4s, #12 + + sri v4.4s, v16.4s, #20 + ror a4, a4, #20 + sri v5.4s, v17.4s, #20 + ror a5, a5, #20 + sri v6.4s, v18.4s, #20 + ror a6, a6, #20 + sri v7.4s, v19.4s, #20 + ror a7, a7, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + add v0.4s, v0.4s, v4.4s + add a0, a0, a4 + add v1.4s, v1.4s, v5.4s + add a1, a1, a5 + add v2.4s, v2.4s, v6.4s + add a2, a2, a6 + add v3.4s, v3.4s, v7.4s + add a3, a3, a7 + + eor v12.16b, v12.16b, v0.16b + eor a12, a12, a0 + eor v13.16b, v13.16b, v1.16b + eor a13, a13, a1 + eor v14.16b, v14.16b, v2.16b + eor a14, a14, a2 + eor v15.16b, v15.16b, v3.16b + eor a15, a15, a3 + + tbl v12.16b, {v12.16b}, v31.16b + ror a12, a12, #24 + tbl v13.16b, {v13.16b}, v31.16b + ror a13, a13, #24 + tbl v14.16b, {v14.16b}, v31.16b + ror a14, a14, #24 + tbl v15.16b, {v15.16b}, v31.16b + ror a15, a15, #24 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + add v8.4s, v8.4s, v12.4s + add a8, a8, a12 + add v9.4s, v9.4s, v13.4s + add a9, a9, a13 
+ add v10.4s, v10.4s, v14.4s + add a10, a10, a14 + add v11.4s, v11.4s, v15.4s + add a11, a11, a15 + + eor v16.16b, v4.16b, v8.16b + eor a4, a4, a8 + eor v17.16b, v5.16b, v9.16b + eor a5, a5, a9 + eor v18.16b, v6.16b, v10.16b + eor a6, a6, a10 + eor v19.16b, v7.16b, v11.16b + eor a7, a7, a11 + + shl v4.4s, v16.4s, #7 + shl v5.4s, v17.4s, #7 + shl v6.4s, v18.4s, #7 + shl v7.4s, v19.4s, #7 + + sri v4.4s, v16.4s, #25 + ror a4, a4, #25 + sri v5.4s, v17.4s, #25 + ror a5, a5, #25 + sri v6.4s, v18.4s, #25 + ror a6, a6, #25 + sri v7.4s, v19.4s, #25 + ror a7, a7, #25 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + add v0.4s, v0.4s, v5.4s + add a0, a0, a5 + add v1.4s, v1.4s, v6.4s + add a1, a1, a6 + add v2.4s, v2.4s, v7.4s + add a2, a2, a7 + add v3.4s, v3.4s, v4.4s + add a3, a3, a4 + + eor v15.16b, v15.16b, v0.16b + eor a15, a15, a0 + eor v12.16b, v12.16b, v1.16b + eor a12, a12, a1 + eor v13.16b, v13.16b, v2.16b + eor a13, a13, a2 + eor v14.16b, v14.16b, v3.16b + eor a14, a14, a3 + + rev32 v15.8h, v15.8h + ror a15, a15, #16 + rev32 v12.8h, v12.8h + ror a12, a12, #16 + rev32 v13.8h, v13.8h + ror a13, a13, #16 + rev32 v14.8h, v14.8h + ror a14, a14, #16 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + add v10.4s, v10.4s, v15.4s + add a10, a10, a15 + add v11.4s, v11.4s, v12.4s + add a11, a11, a12 + add v8.4s, v8.4s, v13.4s + add a8, a8, a13 + add v9.4s, v9.4s, v14.4s + add a9, a9, a14 + + eor v16.16b, v5.16b, v10.16b + eor a5, a5, a10 + eor v17.16b, v6.16b, v11.16b + eor a6, a6, a11 + eor v18.16b, v7.16b, v8.16b + eor a7, a7, a8 + eor v19.16b, v4.16b, v9.16b + eor a4, a4, a9 + + shl v5.4s, v16.4s, #12 + shl v6.4s, v17.4s, #12 + shl v7.4s, v18.4s, #12 + shl v4.4s, v19.4s, #12 + + sri v5.4s, v16.4s, #20 + ror a5, a5, #20 + sri v6.4s, v17.4s, #20 + ror a6, a6, #20 + sri v7.4s, v18.4s, #20 + ror a7, a7, #20 + sri v4.4s, v19.4s, #20 + ror a4, a4, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + add v0.4s, v0.4s, v5.4s + add a0, a0, a5 + add v1.4s, v1.4s, v6.4s + add a1, a1, a6 + add v2.4s, v2.4s, v7.4s + add a2, a2, a7 + add v3.4s, v3.4s, v4.4s + add a3, a3, a4 + + eor v15.16b, v15.16b, v0.16b + eor a15, a15, a0 + eor v12.16b, v12.16b, v1.16b + eor a12, a12, a1 + eor v13.16b, v13.16b, v2.16b + eor a13, a13, a2 + eor v14.16b, v14.16b, v3.16b + eor a14, a14, a3 + + tbl v15.16b, {v15.16b}, v31.16b + ror a15, a15, #24 + tbl v12.16b, {v12.16b}, v31.16b + ror a12, a12, #24 + tbl v13.16b, {v13.16b}, v31.16b + ror a13, a13, #24 + tbl v14.16b, {v14.16b}, v31.16b + ror a14, a14, #24 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + add v10.4s, v10.4s, v15.4s + add a10, a10, a15 + add v11.4s, v11.4s, v12.4s + add a11, a11, a12 + add v8.4s, v8.4s, v13.4s + add a8, a8, a13 + add v9.4s, v9.4s, v14.4s + add a9, a9, a14 + + eor v16.16b, v5.16b, v10.16b + eor a5, a5, a10 + eor v17.16b, v6.16b, v11.16b + eor a6, a6, a11 + eor v18.16b, v7.16b, v8.16b + eor a7, a7, a8 + eor v19.16b, v4.16b, v9.16b + eor a4, a4, a9 + + shl v5.4s, v16.4s, #7 + shl v6.4s, v17.4s, #7 + shl v7.4s, v18.4s, #7 + shl v4.4s, v19.4s, #7 + + sri v5.4s, v16.4s, #25 + ror a5, a5, 
#25 + sri v6.4s, v17.4s, #25 + ror a6, a6, #25 + sri v7.4s, v18.4s, #25 + ror a7, a7, #25 + sri v4.4s, v19.4s, #25 + ror a4, a4, #25 + + subs w3, w3, #2 + b.ne .Ldoubleround4 + + ld4r {v16.4s-v19.4s}, [x0], #16 + ld4r {v20.4s-v23.4s}, [x0], #16 + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v30.4s + + // x0[0-3] += s0[0] + // x1[0-3] += s0[1] + // x2[0-3] += s0[2] + // x3[0-3] += s0[3] + add v0.4s, v0.4s, v16.4s + mov w6, v16.s[0] + mov w7, v17.s[0] + add v1.4s, v1.4s, v17.4s + mov w8, v18.s[0] + mov w9, v19.s[0] + add v2.4s, v2.4s, v18.4s + add a0, a0, w6 + add a1, a1, w7 + add v3.4s, v3.4s, v19.4s + add a2, a2, w8 + add a3, a3, w9 +CPU_BE( rev a0, a0 ) +CPU_BE( rev a1, a1 ) +CPU_BE( rev a2, a2 ) +CPU_BE( rev a3, a3 ) + + ld4r {v24.4s-v27.4s}, [x0], #16 + ld4r {v28.4s-v31.4s}, [x0] + + // x4[0-3] += s1[0] + // x5[0-3] += s1[1] + // x6[0-3] += s1[2] + // x7[0-3] += s1[3] + add v4.4s, v4.4s, v20.4s + mov w6, v20.s[0] + mov w7, v21.s[0] + add v5.4s, v5.4s, v21.4s + mov w8, v22.s[0] + mov w9, v23.s[0] + add v6.4s, v6.4s, v22.4s + add a4, a4, w6 + add a5, a5, w7 + add v7.4s, v7.4s, v23.4s + add a6, a6, w8 + add a7, a7, w9 +CPU_BE( rev a4, a4 ) +CPU_BE( rev a5, a5 ) +CPU_BE( rev a6, a6 ) +CPU_BE( rev a7, a7 ) + + // x8[0-3] += s2[0] + // x9[0-3] += s2[1] + // x10[0-3] += s2[2] + // x11[0-3] += s2[3] + add v8.4s, v8.4s, v24.4s + mov w6, v24.s[0] + mov w7, v25.s[0] + add v9.4s, v9.4s, v25.4s + mov w8, v26.s[0] + mov w9, v27.s[0] + add v10.4s, v10.4s, v26.4s + add a8, a8, w6 + add a9, a9, w7 + add v11.4s, v11.4s, v27.4s + add a10, a10, w8 + add a11, a11, w9 +CPU_BE( rev a8, a8 ) +CPU_BE( rev a9, a9 ) +CPU_BE( rev a10, a10 ) +CPU_BE( rev a11, a11 ) + + // x12[0-3] += s3[0] + // x13[0-3] += s3[1] + // x14[0-3] += s3[2] + // x15[0-3] += s3[3] + add v12.4s, v12.4s, v28.4s + mov w6, v28.s[0] + mov w7, v29.s[0] + add v13.4s, v13.4s, v29.4s + mov w8, v30.s[0] + mov w9, v31.s[0] + add v14.4s, v14.4s, v30.4s + add a12, a12, w6 + add a13, a13, w7 + add v15.4s, v15.4s, v31.4s + add a14, a14, w8 + add a15, a15, w9 +CPU_BE( rev a12, a12 ) +CPU_BE( rev a13, a13 ) +CPU_BE( rev a14, a14 ) +CPU_BE( rev a15, a15 ) + + // interleave 32-bit words in state n, n+1 + ldp w6, w7, [x2], #64 + zip1 v16.4s, v0.4s, v1.4s + ldp w8, w9, [x2, #-56] + eor a0, a0, w6 + zip2 v17.4s, v0.4s, v1.4s + eor a1, a1, w7 + zip1 v18.4s, v2.4s, v3.4s + eor a2, a2, w8 + zip2 v19.4s, v2.4s, v3.4s + eor a3, a3, w9 + ldp w6, w7, [x2, #-48] + zip1 v20.4s, v4.4s, v5.4s + ldp w8, w9, [x2, #-40] + eor a4, a4, w6 + zip2 v21.4s, v4.4s, v5.4s + eor a5, a5, w7 + zip1 v22.4s, v6.4s, v7.4s + eor a6, a6, w8 + zip2 v23.4s, v6.4s, v7.4s + eor a7, a7, w9 + ldp w6, w7, [x2, #-32] + zip1 v24.4s, v8.4s, v9.4s + ldp w8, w9, [x2, #-24] + eor a8, a8, w6 + zip2 v25.4s, v8.4s, v9.4s + eor a9, a9, w7 + zip1 v26.4s, v10.4s, v11.4s + eor a10, a10, w8 + zip2 v27.4s, v10.4s, v11.4s + eor a11, a11, w9 + ldp w6, w7, [x2, #-16] + zip1 v28.4s, v12.4s, v13.4s + ldp w8, w9, [x2, #-8] + eor a12, a12, w6 + zip2 v29.4s, v12.4s, v13.4s + eor a13, a13, w7 + zip1 v30.4s, v14.4s, v15.4s + eor a14, a14, w8 + zip2 v31.4s, v14.4s, v15.4s + eor a15, a15, w9 + + add x3, x2, x4 + sub x3, x3, #128 // start of last block + + subs x5, x4, #128 + csel x2, x2, x3, ge + + // interleave 64-bit words in state n, n+2 + zip1 v0.2d, v16.2d, v18.2d + zip2 v4.2d, v16.2d, v18.2d + stp a0, a1, [x1], #64 + zip1 v8.2d, v17.2d, v19.2d + zip2 v12.2d, v17.2d, v19.2d + stp a2, a3, [x1, #-56] + + subs x6, x4, #192 + ld1 {v16.16b-v19.16b}, [x2], #64 + csel x2, x2, x3, ge + + zip1 v1.2d, v20.2d, v22.2d + 
zip2 v5.2d, v20.2d, v22.2d + stp a4, a5, [x1, #-48] + zip1 v9.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + stp a6, a7, [x1, #-40] + + subs x7, x4, #256 + ld1 {v20.16b-v23.16b}, [x2], #64 + csel x2, x2, x3, ge + + zip1 v2.2d, v24.2d, v26.2d + zip2 v6.2d, v24.2d, v26.2d + stp a8, a9, [x1, #-32] + zip1 v10.2d, v25.2d, v27.2d + zip2 v14.2d, v25.2d, v27.2d + stp a10, a11, [x1, #-24] + + subs x8, x4, #320 + ld1 {v24.16b-v27.16b}, [x2], #64 + csel x2, x2, x3, ge + + zip1 v3.2d, v28.2d, v30.2d + zip2 v7.2d, v28.2d, v30.2d + stp a12, a13, [x1, #-16] + zip1 v11.2d, v29.2d, v31.2d + zip2 v15.2d, v29.2d, v31.2d + stp a14, a15, [x1, #-8] + + tbnz x5, #63, .Lt128 + ld1 {v28.16b-v31.16b}, [x2] + + // xor with corresponding input, write to output + eor v16.16b, v16.16b, v0.16b + eor v17.16b, v17.16b, v1.16b + eor v18.16b, v18.16b, v2.16b + eor v19.16b, v19.16b, v3.16b + + tbnz x6, #63, .Lt192 + + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v6.16b + eor v23.16b, v23.16b, v7.16b + + st1 {v16.16b-v19.16b}, [x1], #64 + tbnz x7, #63, .Lt256 + + eor v24.16b, v24.16b, v8.16b + eor v25.16b, v25.16b, v9.16b + eor v26.16b, v26.16b, v10.16b + eor v27.16b, v27.16b, v11.16b + + st1 {v20.16b-v23.16b}, [x1], #64 + tbnz x8, #63, .Lt320 + + eor v28.16b, v28.16b, v12.16b + eor v29.16b, v29.16b, v13.16b + eor v30.16b, v30.16b, v14.16b + eor v31.16b, v31.16b, v15.16b + + st1 {v24.16b-v27.16b}, [x1], #64 + st1 {v28.16b-v31.16b}, [x1] + +.Lout: frame_pop + ret + + // fewer than 192 bytes of in/output +.Lt192: cbz x5, 1f // exactly 128 bytes? + ld1 {v28.16b-v31.16b}, [x10] + add x5, x5, x1 + tbl v28.16b, {v4.16b-v7.16b}, v28.16b + tbl v29.16b, {v4.16b-v7.16b}, v29.16b + tbl v30.16b, {v4.16b-v7.16b}, v30.16b + tbl v31.16b, {v4.16b-v7.16b}, v31.16b + +0: eor v20.16b, v20.16b, v28.16b + eor v21.16b, v21.16b, v29.16b + eor v22.16b, v22.16b, v30.16b + eor v23.16b, v23.16b, v31.16b + st1 {v20.16b-v23.16b}, [x5] // overlapping stores +1: st1 {v16.16b-v19.16b}, [x1] + b .Lout + + // fewer than 128 bytes of in/output +.Lt128: ld1 {v28.16b-v31.16b}, [x10] + add x5, x5, x1 + sub x1, x1, #64 + tbl v28.16b, {v0.16b-v3.16b}, v28.16b + tbl v29.16b, {v0.16b-v3.16b}, v29.16b + tbl v30.16b, {v0.16b-v3.16b}, v30.16b + tbl v31.16b, {v0.16b-v3.16b}, v31.16b + ld1 {v16.16b-v19.16b}, [x1] // reload first output block + b 0b + + // fewer than 256 bytes of in/output +.Lt256: cbz x6, 2f // exactly 192 bytes? + ld1 {v4.16b-v7.16b}, [x10] + add x6, x6, x1 + tbl v0.16b, {v8.16b-v11.16b}, v4.16b + tbl v1.16b, {v8.16b-v11.16b}, v5.16b + tbl v2.16b, {v8.16b-v11.16b}, v6.16b + tbl v3.16b, {v8.16b-v11.16b}, v7.16b + + eor v28.16b, v28.16b, v0.16b + eor v29.16b, v29.16b, v1.16b + eor v30.16b, v30.16b, v2.16b + eor v31.16b, v31.16b, v3.16b + st1 {v28.16b-v31.16b}, [x6] // overlapping stores +2: st1 {v20.16b-v23.16b}, [x1] + b .Lout + + // fewer than 320 bytes of in/output +.Lt320: cbz x7, 3f // exactly 256 bytes? 
+ ld1 {v4.16b-v7.16b}, [x10] + add x7, x7, x1 + tbl v0.16b, {v12.16b-v15.16b}, v4.16b + tbl v1.16b, {v12.16b-v15.16b}, v5.16b + tbl v2.16b, {v12.16b-v15.16b}, v6.16b + tbl v3.16b, {v12.16b-v15.16b}, v7.16b + + eor v28.16b, v28.16b, v0.16b + eor v29.16b, v29.16b, v1.16b + eor v30.16b, v30.16b, v2.16b + eor v31.16b, v31.16b, v3.16b + st1 {v28.16b-v31.16b}, [x7] // overlapping stores +3: st1 {v24.16b-v27.16b}, [x1] + b .Lout +SYM_FUNC_END(chacha_4block_xor_neon) + + .section ".rodata", "a", %progbits + .align L1_CACHE_SHIFT +.Lpermute: + .set .Li, 0 + .rept 128 + .byte (.Li - 64) + .set .Li, .Li + 1 + .endr + +CTRINC: .word 1, 2, 3, 4 +ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/lib/crypto/arm64/chacha-neon-glue.c b/lib/crypto/arm64/chacha-neon-glue.c new file mode 100644 index 000000000000..d0188f974ca5 --- /dev/null +++ b/lib/crypto/arm64/chacha-neon-glue.c @@ -0,0 +1,119 @@ +/* + * ChaCha and HChaCha functions (ARM64 optimized) + * + * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <crypto/chacha.h> +#include <crypto/internal/simd.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> + +asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, int nrounds); +asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, + int nrounds, int bytes); +asmlinkage void hchacha_block_neon(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, + int bytes, int nrounds) +{ + while (bytes > 0) { + int l = min(bytes, CHACHA_BLOCK_SIZE * 5); + + if (l <= CHACHA_BLOCK_SIZE) { + u8 buf[CHACHA_BLOCK_SIZE]; + + memcpy(buf, src, l); + chacha_block_xor_neon(state, buf, buf, nrounds); + memcpy(dst, buf, l); + state->x[12] += 1; + break; + } + chacha_4block_xor_neon(state, dst, src, nrounds, l); + bytes -= l; + src += l; + dst += l; + state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); + } +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { + hchacha_block_generic(state, out, nrounds); + } else { + kernel_neon_begin(); + hchacha_block_neon(state, out, nrounds); + kernel_neon_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE || + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_neon_begin(); + chacha_doneon(state, dst, 
src, todo, nrounds); + kernel_neon_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return static_key_enabled(&have_neon); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_simd_mod_init(void) +{ + if (cpu_have_named_feature(ASIMD)) + static_branch_enable(&have_neon); + return 0; +} +subsys_initcall(chacha_simd_mod_init); + +static void __exit chacha_simd_mod_exit(void) +{ +} +module_exit(chacha_simd_mod_exit); + +MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm64/poly1305-armv8.pl b/lib/crypto/arm64/poly1305-armv8.pl new file mode 100644 index 000000000000..22c9069c0650 --- /dev/null +++ b/lib/crypto/arm64/poly1305-armv8.pl @@ -0,0 +1,917 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# This module implements Poly1305 hash for ARMv8. +# +# June 2015 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc-4.9 NEON +# +# Apple A7 1.86/+5% 0.72 +# Cortex-A53 2.69/+58% 1.47 +# Cortex-A57 2.70/+7% 1.14 +# Denver 1.64/+50% 1.18(*) +# X-Gene 2.13/+68% 2.27 +# Mongoose 1.77/+75% 1.12 +# Kryo 2.70/+55% 1.13 +# ThunderX2 1.17/+95% 1.36 +# +# (*) estimate based on resources availability is less than 1.0, +# i.e. measured result is worse than expected, presumably binary +# translator is not almighty; + +$flavour=shift; +$output=shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); +my ($mac,$nonce)=($inp,$len); + +my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +.extern OPENSSL_armcap_P +#endif + +.text + +// forward "declarations" are required for Apple +.globl poly1305_blocks +.globl poly1305_emit + +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: + cmp $inp,xzr + stp xzr,xzr,[$ctx] // zero hash value + stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] + + csel x0,xzr,x0,eq + b.eq .Lno_key + +#ifndef __KERNEL__ + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] +#endif + + ldp $r0,$r1,[$inp] // load key + mov $s1,#0xfffffffc0fffffff + movk $s1,#0x0fff,lsl#48 +#ifdef __AARCH64EB__ + rev $r0,$r0 // flip bytes + rev $r1,$r1 +#endif + and $r0,$r0,$s1 // &=0ffffffc0fffffff + and $s1,$s1,#-4 + and $r1,$r1,$s1 // &=0ffffffc0ffffffc + mov w#$s1,#-1 + stp $r0,$r1,[$ctx,#32] // save key value + str w#$s1,[$ctx,#48] // impossible key power value + +#ifndef __KERNEL__ + tst w17,#ARMV7_NEON + + adr $d0,.Lpoly1305_blocks + adr $r0,.Lpoly1305_blocks_neon + adr $d1,.Lpoly1305_emit + + csel $d0,$d0,$r0,eq + +# ifdef __ILP32__ + stp w#$d0,w#$d1,[$len] +# else + stp $d0,$d1,[$len] +# endif +#endif + mov x0,#1 +.Lno_key: + ret +.size poly1305_init,.-poly1305_init + +.type poly1305_blocks,%function +.align 5 
+poly1305_blocks: +.Lpoly1305_blocks: + ands $len,$len,#-16 + b.eq .Lno_data + + ldp $h0,$h1,[$ctx] // load hash value + ldp $h2,x17,[$ctx,#16] // [along with is_base2_26] + ldp $r0,$r1,[$ctx,#32] // load key value + +#ifdef __AARCH64EB__ + lsr $d0,$h0,#32 + mov w#$d1,w#$h0 + lsr $d2,$h1,#32 + mov w15,w#$h1 + lsr x16,$h2,#32 +#else + mov w#$d0,w#$h0 + lsr $d1,$h0,#32 + mov w#$d2,w#$h1 + lsr x15,$h1,#32 + mov w16,w#$h2 +#endif + + add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 + lsr $d1,$d2,#12 + adds $d0,$d0,$d2,lsl#52 + add $d1,$d1,x15,lsl#14 + adc $d1,$d1,xzr + lsr $d2,x16,#24 + adds $d1,$d1,x16,lsl#40 + adc $d2,$d2,xzr + + cmp x17,#0 // is_base2_26? + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + csel $h0,$h0,$d0,eq // choose between radixes + csel $h1,$h1,$d1,eq + csel $h2,$h2,$d2,eq + +.Loop: + ldp $t0,$t1,[$inp],#16 // load input + sub $len,$len,#16 +#ifdef __AARCH64EB__ + rev $t0,$t0 + rev $t1,$t1 +#endif + adds $h0,$h0,$t0 // accumulate input + adcs $h1,$h1,$t1 + + mul $d0,$h0,$r0 // h0*r0 + adc $h2,$h2,$padbit + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + cbnz $len,.Loop + + stp $h0,$h1,[$ctx] // store hash value + stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26] + +.Lno_data: + ret +.size poly1305_blocks,.-poly1305_blocks + +.type poly1305_emit,%function +.align 5 +poly1305_emit: +.Lpoly1305_emit: + ldp $h0,$h1,[$ctx] // load hash base 2^64 + ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26] + ldp $t0,$t1,[$nonce] // load nonce + +#ifdef __AARCH64EB__ + lsr $d0,$h0,#32 + mov w#$d1,w#$h0 + lsr $d2,$h1,#32 + mov w15,w#$h1 + lsr x16,$h2,#32 +#else + mov w#$d0,w#$h0 + lsr $d1,$h0,#32 + mov w#$d2,w#$h1 + lsr x15,$h1,#32 + mov w16,w#$h2 +#endif + + add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 + lsr $d1,$d2,#12 + adds $d0,$d0,$d2,lsl#52 + add $d1,$d1,x15,lsl#14 + adc $d1,$d1,xzr + lsr $d2,x16,#24 + adds $d1,$d1,x16,lsl#40 + adc $d2,$d2,xzr + + cmp $r0,#0 // is_base2_26? 
+ csel $h0,$h0,$d0,eq // choose between radixes + csel $h1,$h1,$d1,eq + csel $h2,$h2,$d2,eq + + adds $d0,$h0,#5 // compare to modulus + adcs $d1,$h1,xzr + adc $d2,$h2,xzr + + tst $d2,#-4 // see if it's carried/borrowed + + csel $h0,$h0,$d0,eq + csel $h1,$h1,$d1,eq + +#ifdef __AARCH64EB__ + ror $t0,$t0,#32 // flip nonce words + ror $t1,$t1,#32 +#endif + adds $h0,$h0,$t0 // accumulate nonce + adc $h1,$h1,$t1 +#ifdef __AARCH64EB__ + rev $h0,$h0 // flip output bytes + rev $h1,$h1 +#endif + stp $h0,$h1,[$mac] // write result + + ret +.size poly1305_emit,.-poly1305_emit +___ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); +my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); +my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); +my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); +my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); +my ($T0,$T1,$MASK) = map("v$_",(29..31)); + +my ($in2,$zeros)=("x16","x17"); +my $is_base2_26 = $zeros; # borrow + +$code.=<<___; +.type poly1305_mult,%function +.align 5 +poly1305_mult: + mul $d0,$h0,$r0 // h0*r0 + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + ret +.size poly1305_mult,.-poly1305_mult + +.type poly1305_splat,%function +.align 4 +poly1305_splat: + and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x13,$h0,#26,#26 + extr x14,$h1,$h0,#52 + and x14,x14,#0x03ffffff + ubfx x15,$h1,#14,#26 + extr x16,$h2,$h1,#40 + + str w12,[$ctx,#16*0] // r0 + add w12,w13,w13,lsl#2 // r1*5 + str w13,[$ctx,#16*1] // r1 + add w13,w14,w14,lsl#2 // r2*5 + str w12,[$ctx,#16*2] // s1 + str w14,[$ctx,#16*3] // r2 + add w14,w15,w15,lsl#2 // r3*5 + str w13,[$ctx,#16*4] // s2 + str w15,[$ctx,#16*5] // r3 + add w15,w16,w16,lsl#2 // r4*5 + str w14,[$ctx,#16*6] // s3 + str w16,[$ctx,#16*7] // r4 + str w15,[$ctx,#16*8] // s4 + + ret +.size poly1305_splat,.-poly1305_splat + +#ifdef __KERNEL__ +.globl poly1305_blocks_neon +#endif +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: +.Lpoly1305_blocks_neon: + ldr $is_base2_26,[$ctx,#24] + cmp $len,#128 + b.lo .Lpoly1305_blocks + + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + cbz $is_base2_26,.Lbase2_64_neon + + ldp w10,w11,[$ctx] // load hash value base 2^26 + ldp w12,w13,[$ctx,#8] + ldr w14,[$ctx,#16] + + tst $len,#31 + b.eq .Leven_neon + + ldp $r0,$r1,[$ctx,#32] // load key value + + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr $h1,x12,#12 + adds $h0,$h0,x12,lsl#52 + add $h1,$h1,x13,lsl#14 + adc $h1,$h1,xzr + lsr $h2,x14,#24 + adds $h1,$h1,x14,lsl#40 + adc $d2,$h2,xzr // can be partially reduced... 
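+	// Note: the vector code stores the accumulator as five 26-bit limbs,
+	// h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104, so that the
+	// 32x32->64-bit limb products of the NEON multiply cannot overflow.
+	// The shifts above (lsl#26, lsl#52, lsr#12/lsl#14, lsr#24/lsl#40)
+	// fold those limbs back into three 64-bit words so that a leading
+	// odd 16-byte block can be absorbed with the scalar poly1305_mult.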
+ + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + +#ifdef __AARCH64EB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl poly1305_mult + + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + b .Leven_neon + +.align 4 +.Lbase2_64_neon: + ldp $r0,$r1,[$ctx,#32] // load key value + + ldp $h0,$h1,[$ctx] // load hash value base 2^64 + ldr $h2,[$ctx,#16] + + tst $len,#31 + b.eq .Linit_neon + + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) +#ifdef __AARCH64EB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl poly1305_mult + +.Linit_neon: + ldr w17,[$ctx,#48] // first table element + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + cmp w17,#-1 // is value impossible? + b.ne .Leven_neon + + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + + ////////////////////////////////// initialize r^n table + mov $h0,$r0 // r^1 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + mov $h1,$r1 + mov $h2,xzr + add $ctx,$ctx,#48+12 + bl poly1305_splat + + bl poly1305_mult // r^2 + sub $ctx,$ctx,#4 + bl poly1305_splat + + bl poly1305_mult // r^3 + sub $ctx,$ctx,#4 + bl poly1305_splat + + bl poly1305_mult // r^4 + sub $ctx,$ctx,#4 + bl poly1305_splat + sub $ctx,$ctx,#48 // restore original $ctx + b .Ldo_neon + +.align 4 +.Leven_neon: + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + +.Ldo_neon: + ldp x8,x12,[$inp,#32] // inp[2:3] + subs $len,$len,#64 + ldp x9,x13,[$inp,#48] + add $in2,$inp,#96 + adrp $zeros,.Lzeros + add $zeros,$zeros,#:lo12:.Lzeros + + lsl $padbit,$padbit,#24 + add x15,$ctx,#48 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN23_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN23_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + fmov $IN23_2,x8 + fmov $IN23_3,x10 + fmov $IN23_4,x12 + + ldp x8,x12,[$inp],#16 // inp[0:1] + ldp x9,x13,[$inp],#48 + + ld1 {$R0,$R1,$S1,$R2},[x15],#64 + ld1 {$S2,$R3,$S3,$R4},[x15],#64 + ld1 {$S4},[x15] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN01_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN01_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add 
x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + movi $MASK.2d,#-1 + fmov $IN01_2,x8 + fmov $IN01_3,x10 + fmov $IN01_4,x12 + ushr $MASK.2d,$MASK.2d,#38 + + b.ls .Lskip_loop + +.align 4 +.Loop_neon: + //////////////////////////////////////////////////////////////// + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + // \___________________/ + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + // \___________________/ \____________________/ + // + // Note that we start with inp[2:3]*r^2. This is because it + // doesn't depend on reduction in previous iteration. + //////////////////////////////////////////////////////////////// + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + subs $len,$len,#64 + umull $ACC4,$IN23_0,${R4}[2] + csel $in2,$zeros,$in2,lo + umull $ACC3,$IN23_0,${R3}[2] + umull $ACC2,$IN23_0,${R2}[2] + ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) + umull $ACC1,$IN23_0,${R1}[2] + ldp x9,x13,[$in2],#48 + umull $ACC0,$IN23_0,${R0}[2] +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + umlal $ACC4,$IN23_1,${R3}[2] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC3,$IN23_1,${R2}[2] + and x5,x9,#0x03ffffff + umlal $ACC2,$IN23_1,${R1}[2] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN23_1,${R0}[2] + ubfx x7,x9,#26,#26 + umlal $ACC0,$IN23_1,${S4}[2] + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + + umlal $ACC4,$IN23_2,${R2}[2] + extr x8,x12,x8,#52 + umlal $ACC3,$IN23_2,${R1}[2] + extr x9,x13,x9,#52 + umlal $ACC2,$IN23_2,${R0}[2] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC1,$IN23_2,${S4}[2] + fmov $IN23_0,x4 + umlal $ACC0,$IN23_2,${S3}[2] + and x8,x8,#0x03ffffff + + umlal $ACC4,$IN23_3,${R1}[2] + and x9,x9,#0x03ffffff + umlal $ACC3,$IN23_3,${R0}[2] + ubfx x10,x12,#14,#26 + umlal $ACC2,$IN23_3,${S4}[2] + ubfx x11,x13,#14,#26 + umlal $ACC1,$IN23_3,${S3}[2] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC0,$IN23_3,${S2}[2] + fmov $IN23_1,x6 + + add $IN01_2,$IN01_2,$H2 + add x12,$padbit,x12,lsr#40 + umlal $ACC4,$IN23_4,${R0}[2] + add x13,$padbit,x13,lsr#40 + umlal $ACC3,$IN23_4,${S4}[2] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC2,$IN23_4,${S3}[2] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN23_4,${S2}[2] + fmov $IN23_2,x8 + umlal $ACC0,$IN23_4,${S1}[2] + fmov $IN23_3,x10 + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4 and accumulate + + add $IN01_0,$IN01_0,$H0 + fmov $IN23_4,x12 + umlal $ACC3,$IN01_2,${R1}[0] + ldp x8,x12,[$inp],#16 // inp[0:1] + umlal $ACC0,$IN01_2,${S3}[0] + ldp x9,x13,[$inp],#48 + umlal $ACC4,$IN01_2,${R2}[0] + umlal $ACC1,$IN01_2,${S4}[0] + umlal $ACC2,$IN01_2,${R0}[0] +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3}[0] + umlal $ACC4,$IN01_0,${R4}[0] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC2,$IN01_0,${R2}[0] + and x5,x9,#0x03ffffff + umlal $ACC0,$IN01_0,${R0}[0] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN01_0,${R1}[0] + ubfx x7,x9,#26,#26 + + add $IN01_3,$IN01_3,$H3 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + umlal $ACC3,$IN01_1,${R2}[0] + extr x8,x12,x8,#52 + umlal $ACC4,$IN01_1,${R3}[0] + extr 
x9,x13,x9,#52 + umlal $ACC0,$IN01_1,${S4}[0] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC2,$IN01_1,${R1}[0] + fmov $IN01_0,x4 + umlal $ACC1,$IN01_1,${R0}[0] + and x8,x8,#0x03ffffff + + add $IN01_4,$IN01_4,$H4 + and x9,x9,#0x03ffffff + umlal $ACC3,$IN01_3,${R0}[0] + ubfx x10,x12,#14,#26 + umlal $ACC0,$IN01_3,${S2}[0] + ubfx x11,x13,#14,#26 + umlal $ACC4,$IN01_3,${R1}[0] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC1,$IN01_3,${S3}[0] + fmov $IN01_1,x6 + umlal $ACC2,$IN01_3,${S4}[0] + add x12,$padbit,x12,lsr#40 + + umlal $ACC3,$IN01_4,${S4}[0] + add x13,$padbit,x13,lsr#40 + umlal $ACC0,$IN01_4,${S1}[0] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC4,$IN01_4,${R0}[0] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN01_4,${S2}[0] + fmov $IN01_2,x8 + umlal $ACC2,$IN01_4,${S3}[0] + fmov $IN01_3,x10 + fmov $IN01_4,x12 + + ///////////////////////////////////////////////////////////////// + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + // and P. Schwabe + // + // [see discussion in poly1305-armv4 module] + + ushr $T0.2d,$ACC3,#26 + xtn $H3,$ACC3 + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + bic $H3,#0xfc,lsl#24 // &=0x03ffffff + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + xtn $H4,$ACC4 + ushr $T1.2d,$ACC1,#26 + xtn $H1,$ACC1 + bic $H4,#0xfc,lsl#24 + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + shrn $T1.2s,$ACC2,#26 + xtn $H2,$ACC2 + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + bic $H1,#0xfc,lsl#24 + add $H3,$H3,$T1.2s // h2 -> h3 + bic $H2,#0xfc,lsl#24 + + shrn $T0.2s,$ACC0,#26 + xtn $H0,$ACC0 + ushr $T1.2s,$H3,#26 + bic $H3,#0xfc,lsl#24 + bic $H0,#0xfc,lsl#24 + add $H1,$H1,$T0.2s // h0 -> h1 + add $H4,$H4,$T1.2s // h3 -> h4 + + b.hi .Loop_neon + +.Lskip_loop: + dup $IN23_2,${IN23_2}[0] + add $IN01_2,$IN01_2,$H2 + + //////////////////////////////////////////////////////////////// + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + adds $len,$len,#32 + b.ne .Long_tail + + dup $IN23_2,${IN01_2}[0] + add $IN23_0,$IN01_0,$H0 + add $IN23_3,$IN01_3,$H3 + add $IN23_1,$IN01_1,$H1 + add $IN23_4,$IN01_4,$H4 + +.Long_tail: + dup $IN23_0,${IN23_0}[0] + umull2 $ACC0,$IN23_2,${S3} + umull2 $ACC3,$IN23_2,${R1} + umull2 $ACC4,$IN23_2,${R2} + umull2 $ACC2,$IN23_2,${R0} + umull2 $ACC1,$IN23_2,${S4} + + dup $IN23_1,${IN23_1}[0] + umlal2 $ACC0,$IN23_0,${R0} + umlal2 $ACC2,$IN23_0,${R2} + umlal2 $ACC3,$IN23_0,${R3} + umlal2 $ACC4,$IN23_0,${R4} + umlal2 $ACC1,$IN23_0,${R1} + + dup $IN23_3,${IN23_3}[0] + umlal2 $ACC0,$IN23_1,${S4} + umlal2 $ACC3,$IN23_1,${R2} + umlal2 $ACC2,$IN23_1,${R1} + umlal2 $ACC4,$IN23_1,${R3} + umlal2 $ACC1,$IN23_1,${R0} + + dup $IN23_4,${IN23_4}[0] + umlal2 $ACC3,$IN23_3,${R0} + umlal2 $ACC4,$IN23_3,${R1} + umlal2 $ACC0,$IN23_3,${S2} + umlal2 $ACC1,$IN23_3,${S3} + umlal2 $ACC2,$IN23_3,${S4} + + umlal2 $ACC3,$IN23_4,${S4} + umlal2 $ACC0,$IN23_4,${S1} + umlal2 $ACC4,$IN23_4,${R0} + umlal2 $ACC1,$IN23_4,${S2} + umlal2 $ACC2,$IN23_4,${S3} + + b.eq .Lshort_tail + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4:r^3 and accumulate + + add $IN01_0,$IN01_0,$H0 + umlal $ACC3,$IN01_2,${R1} + umlal $ACC0,$IN01_2,${S3} + umlal $ACC4,$IN01_2,${R2} + umlal $ACC1,$IN01_2,${S4} + umlal $ACC2,$IN01_2,${R0} + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3} + umlal $ACC0,$IN01_0,${R0} + umlal $ACC4,$IN01_0,${R4} + umlal $ACC1,$IN01_0,${R1} + umlal $ACC2,$IN01_0,${R2} + + add $IN01_3,$IN01_3,$H3 + 
umlal $ACC3,$IN01_1,${R2} + umlal $ACC0,$IN01_1,${S4} + umlal $ACC4,$IN01_1,${R3} + umlal $ACC1,$IN01_1,${R0} + umlal $ACC2,$IN01_1,${R1} + + add $IN01_4,$IN01_4,$H4 + umlal $ACC3,$IN01_3,${R0} + umlal $ACC0,$IN01_3,${S2} + umlal $ACC4,$IN01_3,${R1} + umlal $ACC1,$IN01_3,${S3} + umlal $ACC2,$IN01_3,${S4} + + umlal $ACC3,$IN01_4,${S4} + umlal $ACC0,$IN01_4,${S1} + umlal $ACC4,$IN01_4,${R0} + umlal $ACC1,$IN01_4,${S2} + umlal $ACC2,$IN01_4,${S3} + +.Lshort_tail: + //////////////////////////////////////////////////////////////// + // horizontal add + + addp $ACC3,$ACC3,$ACC3 + ldp d8,d9,[sp,#16] // meet ABI requirements + addp $ACC0,$ACC0,$ACC0 + ldp d10,d11,[sp,#32] + addp $ACC4,$ACC4,$ACC4 + ldp d12,d13,[sp,#48] + addp $ACC1,$ACC1,$ACC1 + ldp d14,d15,[sp,#64] + addp $ACC2,$ACC2,$ACC2 + ldr x30,[sp,#8] + + //////////////////////////////////////////////////////////////// + // lazy reduction, but without narrowing + + ushr $T0.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + and $ACC4,$ACC4,$MASK.2d + ushr $T1.2d,$ACC1,#26 + and $ACC1,$ACC1,$MASK.2d + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + ushr $T1.2d,$ACC2,#26 + and $ACC2,$ACC2,$MASK.2d + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + add $ACC3,$ACC3,$T1.2d // h2 -> h3 + + ushr $T0.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + ushr $T1.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + add $ACC1,$ACC1,$T0.2d // h0 -> h1 + add $ACC4,$ACC4,$T1.2d // h3 -> h4 + + //////////////////////////////////////////////////////////////// + // write the result, can be partially reduced + + st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 + mov x4,#1 + st1 {$ACC4}[0],[$ctx] + str x4,[$ctx,#8] // set is_base2_26 + + ldr x29,[sp],#80 + .inst 0xd50323bf // autiasp + ret +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.pushsection .rodata +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0 +.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" +.popsection + +.align 2 +#if !defined(__KERNEL__) && !defined(_WIN64) +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif +___ + +foreach (split("\n",$code)) { + s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or + s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or + (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or + (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or + (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or + (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or + (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); + + s/\.[124]([sd])\[/.$1\[/; + s/w#x([0-9]+)/w$1/g; + + print $_,"\n"; +} +close STDOUT; diff --git a/lib/crypto/arm64/poly1305-glue.c b/lib/crypto/arm64/poly1305-glue.c new file mode 100644 index 000000000000..31aea21ce42f --- /dev/null +++ b/lib/crypto/arm64/poly1305-glue.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 + * + * Copyright (C) 2019 Linaro Ltd. 
<ard.biesheuvel@linaro.org> + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/poly1305.h> +#include <linux/cpufeature.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/unaligned.h> + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + len = round_down(len, POLY1305_BLOCK_SIZE); + if (static_branch_likely(&have_neon) && likely(may_use_simd())) { + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(state, src, todo, padbit); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); + } else + poly1305_blocks(state, src, len, padbit); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +bool poly1305_is_arch_optimized(void) +{ + /* We always can use at least the ARM64 scalar implementation. */ + return true; +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init neon_poly1305_mod_init(void) +{ + if (cpu_have_named_feature(ASIMD)) + static_branch_enable(&have_neon); + return 0; +} +subsys_initcall(neon_poly1305_mod_init); + +static void __exit neon_poly1305_mod_exit(void) +{ +} +module_exit(neon_poly1305_mod_exit); + +MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm64/sha1-ce-core.S b/lib/crypto/arm64/sha1-ce-core.S new file mode 100644 index 000000000000..21efbbafd7d6 --- /dev/null +++ b/lib/crypto/arm64/sha1-ce-core.S @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a+crypto + + k0 .req v0 + k1 .req v1 + k2 .req v2 + k3 .req v3 + + t0 .req v4 + t1 .req v5 + + dga .req q6 + dgav .req v6 + dgb .req s7 + dgbv .req v7 + + dg0q .req q12 + dg0s .req s12 + dg0v .req v12 + dg1s .req s13 + dg1v .req v13 + dg2s .req s14 + + .macro add_only, op, ev, rc, s0, dg1 + .ifc \ev, ev + add t1.4s, v\s0\().4s, \rc\().4s + sha1h dg2s, dg0s + .ifnb \dg1 + sha1\op dg0q, \dg1, t0.4s + .else + sha1\op dg0q, dg1s, t0.4s + .endif + .else + .ifnb \s0 + add t0.4s, v\s0\().4s, \rc\().4s + .endif + sha1h dg1s, dg0s + sha1\op dg0q, dg2s, t1.4s + .endif + .endm + + .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 + sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s + add_only \op, \ev, \rc, \s1, \dg1 + sha1su1 v\s0\().4s, v\s3\().4s + .endm + + .macro loadrc, k, val, tmp + movz \tmp, :abs_g0_nc:\val + movk \tmp, :abs_g1:\val + dup \k, \tmp + .endm + + /* + * size_t __sha1_ce_transform(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +SYM_FUNC_START(__sha1_ce_transform) + /* load round constants */ + loadrc k0.4s, 0x5a827999, w6 + loadrc k1.4s, 
0x6ed9eba1, w6 + loadrc k2.4s, 0x8f1bbcdc, w6 + loadrc k3.4s, 0xca62c1d6, w6 + + /* load state */ + ld1 {dgav.4s}, [x0] + ldr dgb, [x0, #16] + + /* load input */ +0: ld1 {v8.4s-v11.4s}, [x1], #64 + sub x2, x2, #1 + +CPU_LE( rev32 v8.16b, v8.16b ) +CPU_LE( rev32 v9.16b, v9.16b ) +CPU_LE( rev32 v10.16b, v10.16b ) +CPU_LE( rev32 v11.16b, v11.16b ) + + add t0.4s, v8.4s, k0.4s + mov dg0v.16b, dgav.16b + + add_update c, ev, k0, 8, 9, 10, 11, dgb + add_update c, od, k0, 9, 10, 11, 8 + add_update c, ev, k0, 10, 11, 8, 9 + add_update c, od, k0, 11, 8, 9, 10 + add_update c, ev, k1, 8, 9, 10, 11 + + add_update p, od, k1, 9, 10, 11, 8 + add_update p, ev, k1, 10, 11, 8, 9 + add_update p, od, k1, 11, 8, 9, 10 + add_update p, ev, k1, 8, 9, 10, 11 + add_update p, od, k2, 9, 10, 11, 8 + + add_update m, ev, k2, 10, 11, 8, 9 + add_update m, od, k2, 11, 8, 9, 10 + add_update m, ev, k2, 8, 9, 10, 11 + add_update m, od, k2, 9, 10, 11, 8 + add_update m, ev, k3, 10, 11, 8, 9 + + add_update p, od, k3, 11, 8, 9, 10 + add_only p, ev, k3, 9 + add_only p, od, k3, 10 + add_only p, ev, k3, 11 + add_only p, od + + /* update state */ + add dgbv.2s, dgbv.2s, dg1v.2s + add dgav.4s, dgav.4s, dg0v.4s + + /* return early if voluntary preemption is needed */ + cond_yield 1f, x5, x6 + + /* handled all input blocks? */ + cbnz x2, 0b + + /* store new state */ +1: st1 {dgav.4s}, [x0] + str dgb, [x0, #16] + mov x0, x2 + ret +SYM_FUNC_END(__sha1_ce_transform) diff --git a/lib/crypto/arm64/sha1.h b/lib/crypto/arm64/sha1.h new file mode 100644 index 000000000000..f822563538cc --- /dev/null +++ b/lib/crypto/arm64/sha1.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for ARM64 + * + * Copyright 2025 Google LLC + */ +#include <asm/neon.h> +#include <asm/simd.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +asmlinkage size_t __sha1_ce_transform(struct sha1_block_state *state, + const u8 *data, size_t nblocks); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_ce) && likely(may_use_simd())) { + do { + size_t rem; + + kernel_neon_begin(); + rem = __sha1_ce_transform(state, data, nblocks); + kernel_neon_end(); + data += (nblocks - rem) * SHA1_BLOCK_SIZE; + nblocks = rem; + } while (nblocks); + } else { + sha1_blocks_generic(state, data, nblocks); + } +} + +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + if (cpu_have_named_feature(SHA1)) + static_branch_enable(&have_ce); +} diff --git a/lib/crypto/arm64/sha2-armv8.pl b/lib/crypto/arm64/sha2-armv8.pl new file mode 100644 index 000000000000..35ec9ae99fe1 --- /dev/null +++ b/lib/crypto/arm64/sha2-armv8.pl @@ -0,0 +1,786 @@ +#! /usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256/512 for ARMv8. +# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +# SHA256-hw SHA256(*) SHA512 +# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +# Denver 2.01 10.5 (+26%) 6.70 (+8%) +# X-Gene 20.0 (+100%) 12.8 (+300%(***)) +# Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +# +# (*) Software SHA256 results are of lesser relevance, presented +# mostly for informational purposes. +# (**) The result is a trade-off: it's possible to improve it by +# 10% (or by 1 cycle per round), but at the cost of 20% loss +# on Cortex-A53 (or by 4 cycles per round). +# (***) Super-impressive coefficients over gcc-generated code are +# indication of some compiler "pathology", most notably code +# generated with -mgeneral-regs-only is significantly faster +# and the gap is only 40-90%. +# +# October 2016. +# +# Originally it was reckoned that it makes no sense to implement NEON +# version of SHA256 for 64-bit processors. This is because performance +# improvement on most wide-spread Cortex-A5x processors was observed +# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was +# observed that 32-bit NEON SHA256 performs significantly better than +# 64-bit scalar version on *some* of the more recent processors. As +# result 64-bit NEON version of SHA256 was added to provide best +# all-round performance. For example it executes ~30% faster on X-Gene +# and Mongoose. [For reference, NEON version of SHA512 is bound to +# deliver much less improvement, likely *negative* on Cortex-A5x. +# Which is why NEON support is limited to SHA256.] 
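+#
+# Note: this file is a generator rather than assembly that is built
+# directly. The preamble below pops a flavour and an output-file
+# argument, pipes the result through arm-xlate.pl unless the flavour is
+# "void", and picks the SHA-256 or SHA-512 parameter set from the output
+# file name, so an invocation such as "perl sha2-armv8.pl void
+# sha256-core.S" (file name illustrative) emits the SHA-256 variant as
+# plain assembly.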
+ +$output=pop; +$flavour=pop; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" $xlate $flavour $output"; + *STDOUT=*OUT; +} else { + open STDOUT,">$output"; +} + +if ($output =~ /512/) { + $BITS=512; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $reg_t="x"; +} else { + $BITS=256; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $reg_t="w"; +} + +$func="sha${BITS}_block_data_order"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +@X=map("$reg_t$_",(3..15,0..2)); +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); +($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); + +sub BODY_00_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my $j=($i+1)&15; +my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); + $T0=@X[$i+3] if ($i<11); + +$code.=<<___ if ($i<16); +#ifndef __AARCH64EB__ + rev @X[$i],@X[$i] // $i +#endif +___ +$code.=<<___ if ($i<13 && ($i&1)); + ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ +___ +$code.=<<___ if ($i==13); + ldp @X[14],@X[15],[$inp] +___ +$code.=<<___ if ($i>=14); + ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] +___ +$code.=<<___ if ($i>0 && $i<16); + add $a,$a,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=11); + str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] +___ +# While ARMv8 specifies merged rotate-n-logical operation such as +# 'eor x,y,z,ror#n', it was found to negatively affect performance +# on Apple A7. The reason seems to be that it requires even 'y' to +# be available earlier. This means that such merged instruction is +# not necessarily best choice on critical path... On the other hand +# Cortex-A5x handles merged instructions much better than disjoint +# rotate and logical... See (**) footnote above. 
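For reference, each BODY_00_xx iteration emitted below is one round of the standard FIPS 180-4 compression function. A minimal plain-C sketch of that round, shown for the SHA-256 case (32-bit words, using the @Sigma0=(2,13,22) and @Sigma1=(6,11,25) rotation counts defined above; SHA-512 differs only in word size and rotation counts), for illustration and not part of the patch:

	#include <stdint.h>

	static inline uint32_t ror32(uint32_t x, int n)
	{
		return (x >> n) | (x << (32 - n));
	}

	/* One SHA-256 round: s[] holds the working state a..h, w = W[i], k = K[i]. */
	static void sha256_round(uint32_t s[8], uint32_t w, uint32_t k)
	{
		uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
		uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
		uint32_t sig1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);	/* Sigma1(e) */
		uint32_t sig0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);	/* Sigma0(a) */
		uint32_t ch   = (e & f) ^ (~e & g);
		uint32_t maj  = (a & b) ^ (a & c) ^ (b & c);
		uint32_t t1   = h + sig1 + ch + k + w;
		uint32_t t2   = sig0 + maj;

		s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
		s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
	}

The assembly computes ch as (f & e) | (g & ~e), which is equivalent because the two terms can never both be set, and maj through the identity ((a ^ b) & (b ^ c)) ^ b, so the a^b value of one round is reused as the b^c of the next, as the comments in the round body note.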
+$code.=<<___ if ($i<15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` + and $t1,$f,$e + bic $t2,$g,$e + add $h,$h,@X[$i&15] // h+=X[i] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) + ror $T0,$a,#$Sigma0[0] + add $h,$h,$t1 // h+=Ch(e,f,g) + eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` + add $h,$h,$t0 // h+=Sigma1(e) + and $t3,$t3,$t2 // (b^c)&=(a^b) + add $d,$d,$h // d+=h + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + //add $h,$h,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + ror $T1,@X[($j+1)&15],#$sigma0[0] + and $t1,$f,$e + ror $T2,@X[($j+14)&15],#$sigma1[0] + bic $t2,$g,$e + ror $T0,$a,#$Sigma0[0] + add $h,$h,@X[$i&15] // h+=X[i] + eor $t0,$t0,$e,ror#$Sigma1[1] + eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) + eor $T0,$T0,$a,ror#$Sigma0[1] + add $h,$h,$t1 // h+=Ch(e,f,g) + and $t3,$t3,$t2 // (b^c)&=(a^b) + eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] + eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) + add $h,$h,$t0 // h+=Sigma1(e) + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) + eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) + add @X[$j],@X[$j],@X[($j+9)&15] + add $d,$d,$h // d+=h + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + add @X[$j],@X[$j],$T1 + add $h,$h,$t1 // h+=Sigma0(a) + add @X[$j],@X[$j],$T2 +___ + ($t2,$t3)=($t3,$t2); +} + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#endif + +.text + +.extern OPENSSL_armcap_P +.globl $func +.type $func,%function +.align 6 +$func: +___ +$code.=<<___ if ($SZ==4); +#ifndef __KERNEL__ +# ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +# else + ldr x16,.LOPENSSL_armcap_P +# endif + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst w16,#ARMV8_SHA256 + b.ne .Lv8_entry + tst w16,#ARMV7_NEON + b.ne .Lneon_entry +#endif +___ +$code.=<<___; + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*$SZ + + ldp $A,$B,[$ctx] // load context + ldp $C,$D,[$ctx,#2*$SZ] + ldp $E,$F,[$ctx,#4*$SZ] + add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input + ldp $G,$H,[$ctx,#6*$SZ] + adr $Ktbl,.LK$BITS + stp $ctx,$num,[x29,#96] + +.Loop: + ldp @X[0],@X[1],[$inp],#2*$SZ + ldr $t2,[$Ktbl],#$SZ // *K++ + eor $t3,$B,$C // magic seed + str $inp,[x29,#112] +___ +for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=".Loop_16_xx:\n"; +for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + cbnz $t2,.Loop_16_xx + + ldp $ctx,$num,[x29,#96] + ldr $inp,[x29,#112] + sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind + + ldp @X[0],@X[1],[$ctx] + ldp @X[2],@X[3],[$ctx,#2*$SZ] + add $inp,$inp,#14*$SZ // advance input pointer + ldp @X[4],@X[5],[$ctx,#4*$SZ] + add $A,$A,@X[0] + ldp @X[6],@X[7],[$ctx,#6*$SZ] + add $B,$B,@X[1] + add $C,$C,@X[2] + add $D,$D,@X[3] + stp $A,$B,[$ctx] + add $E,$E,@X[4] + add $F,$F,@X[5] + stp $C,$D,[$ctx,#2*$SZ] + add $G,$G,@X[6] + add $H,$H,@X[7] + cmp $inp,$num + stp $E,$F,[$ctx,#4*$SZ] + stp $G,$H,[$ctx,#6*$SZ] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*$SZ + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size $func,.-$func + +.align 6 +.type .LK$BITS,%object +.LK$BITS: +___ +$code.=<<___ if ($SZ==8); + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0 // terminator +___ +$code.=<<___ if ($SZ==4); + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +___ +$code.=<<___; +.size .LK$BITS,.-.LK$BITS +#ifndef __KERNEL__ +.align 3 +.LOPENSSL_armcap_P: +# ifdef __ILP32__ + .long OPENSSL_armcap_P-. +# else + .quad OPENSSL_armcap_P-. +# endif +#endif +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +#ifndef __KERNEL__ +.type sha256_block_armv8,%function +.align 6 +sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adr $Ktbl,.LK256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr x29,[sp],#16 + ret +.size sha256_block_armv8,.-sha256_block_armv8 +#endif +___ +} + +if ($SZ==4) { ######################################### NEON stuff # +# You'll surely note a lot of similarities with sha256-armv4 module, +# and of course it's not a coincidence. sha256-armv4 was used as +# initial template, but was adapted for ARMv8 instruction set and +# extensively re-tuned for all-round performance. 
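The NEON code that follows vectorizes the SHA-256 message schedule: Xupdate produces four expanded words per call and immediately adds the round constants, staging W[i]+K[i] on the stack for the scalar round code. A plain-C reference for the recurrence it implements, using the @sigma0=(7,18,3) and @sigma1=(17,19,10) counts defined above (an illustrative sketch, not part of the patch):

	#include <stdint.h>

	static inline uint32_t ror32(uint32_t x, int n)
	{
		return (x >> n) | (x << (32 - n));
	}

	/* Expand the 16 loaded message words w[0..15] into the full 64-word schedule. */
	static void sha256_schedule(uint32_t w[64])
	{
		for (int t = 16; t < 64; t++) {
			uint32_t s0 = ror32(w[t - 15], 7) ^ ror32(w[t - 15], 18) ^ (w[t - 15] >> 3);
			uint32_t s1 = ror32(w[t - 2], 17) ^ ror32(w[t - 2], 19) ^ (w[t - 2] >> 10);

			w[t] = w[t - 16] + s0 + w[t - 7] + s1;
		}
	}

The scalar sha256_block_data_order above interleaves the same recurrence with the round bodies instead of precomputing it, which is why BODY_00_xx carries the sigma0/sigma1 terms for rounds 15 and later.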
+ +my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); +my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); +my $Ktbl="x16"; +my $Xfer="x17"; +my @X = map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); +my $j=0; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } +sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } +sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + &ushr_32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] + eval(shift(@insns)); + &sli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T4,$T7,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T4,$T7,32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T5,$T7,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T7,$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_u32 ($T3,$T7,32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T6,@X[0],$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T7,@X[0],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T6,@X[0],32-$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T5,@X[0],$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T6); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T5,@X[0],32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl], #16"); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T5); + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dhi($T5), &Dlo($T7)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + while($#insns>=1) { eval(shift(@insns)); } + &st1_32 ("{$T0}","[$Xfer], #16"); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub 
Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_8 ("{@X[0]}","[$inp],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &rev32 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &st1_32 ("{$T0}","[$Xfer], #16"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past + '&and ($t1,$f,$e)', + '&bic ($t4,$g,$e)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&orr ($t1,$t1,$t4)', # Ch(e,f,g) + '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ror ($t0,$t0,"#$Sigma1[0]")', + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t0)', # h+=Sigma1(e) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&ror ($t4,$t4,"#$Sigma0[0]")', + '&add ($d,$d,$h)', # d+=h + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#ifdef __KERNEL__ +.globl sha256_block_neon +#endif +.type sha256_block_neon,%function +.align 4 +sha256_block_neon: +.Lneon_entry: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr $Ktbl,.LK256 + add $num,$inp,$num,lsl#6 // len to point at the end of inp + + ld1.8 {@X[0]},[$inp], #16 + ld1.8 {@X[1]},[$inp], #16 + ld1.8 {@X[2]},[$inp], #16 + ld1.8 {@X[3]},[$inp], #16 + ld1.32 {$T0},[$Ktbl], #16 + ld1.32 {$T1},[$Ktbl], #16 + ld1.32 {$T2},[$Ktbl], #16 + ld1.32 {$T3},[$Ktbl], #16 + rev32 @X[0],@X[0] // yes, even on + rev32 @X[1],@X[1] // big-endian + rev32 @X[2],@X[2] + rev32 @X[3],@X[3] + mov $Xfer,sp + add.32 $T0,$T0,@X[0] + add.32 $T1,$T1,@X[1] + add.32 $T2,$T2,@X[2] + st1.32 {$T0-$T1},[$Xfer], #32 + add.32 $T3,$T3,@X[3] + st1.32 {$T2-$T3},[$Xfer] + sub $Xfer,$Xfer,#32 + + ldp $A,$B,[$ctx] + ldp $C,$D,[$ctx,#8] + ldp $E,$F,[$ctx,#16] + ldp $G,$H,[$ctx,#24] + ldr $t1,[sp,#0] + mov $t2,wzr + eor $t3,$B,$C + mov $t4,wzr + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + cmp $t1,#0 // check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + sub $Ktbl,$Ktbl,#256 // rewind $Ktbl + cmp $inp,$num + mov $Xfer, #64 + csel $Xfer, $Xfer, xzr, eq + sub $inp,$inp,$Xfer // avoid SEGV + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + add $A,$A,$t4 // h+=Sigma0(a) from the past + ldp $t0,$t1,[$ctx,#0] + add $A,$A,$t2 // h+=Maj(a,b,c) from the past + ldp $t2,$t3,[$ctx,#8] + add $A,$A,$t0 // accumulate + add $B,$B,$t1 + ldp $t0,$t1,[$ctx,#16] + add $C,$C,$t2 + add $D,$D,$t3 + ldp $t2,$t3,[$ctx,#24] + add $E,$E,$t0 + add $F,$F,$t1 + ldr $t1,[sp,#0] + stp $A,$B,[$ctx,#0] + add $G,$G,$t2 + mov $t2,wzr + stp $C,$D,[$ctx,#8] + add $H,$H,$t3 + stp $E,$F,[$ctx,#16] + eor $t3,$B,$C + stp $G,$H,[$ctx,#24] + mov $t4,wzr + mov $Xfer,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size sha256_block_neon,.-sha256_block_neon +___ +} + +$code.=<<___; +#ifndef __KERNEL__ +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; + + s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers + + s/\.[ui]?8(\s)/$1/; + s/\.\w?32\b// and s/\.16b/\.4s/g; + m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; + + print $_,"\n"; +} + +close STDOUT; diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S new file mode 100644 index 000000000000..b99d9589c421 --- /dev/null +++ b/lib/crypto/arm64/sha256-ce.S @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a+crypto + + dga .req q20 + dgav .req v20 + dgb .req q21 + dgbv .req v21 + + t0 .req v22 + t1 .req v23 + + dg0q .req q24 + dg0v .req v24 + dg1q .req q25 + dg1v .req v25 + dg2q .req q26 + dg2v .req v26 + + .macro add_only, ev, rc, s0 + mov dg2v.16b, dg0v.16b + .ifeq \ev + add 
t1.4s, v\s0\().4s, \rc\().4s + sha256h dg0q, dg1q, t0.4s + sha256h2 dg1q, dg2q, t0.4s + .else + .ifnb \s0 + add t0.4s, v\s0\().4s, \rc\().4s + .endif + sha256h dg0q, dg1q, t1.4s + sha256h2 dg1q, dg2q, t1.4s + .endif + .endm + + .macro add_update, ev, rc, s0, s1, s2, s3 + sha256su0 v\s0\().4s, v\s1\().4s + add_only \ev, \rc, \s1 + sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s + .endm + + /* + * The SHA-256 round constants + */ + .section ".rodata", "a" + .align 4 +.Lsha2_rcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + + /* + * size_t __sha256_ce_transform(struct sha256_block_state *state, + * const u8 *data, size_t nblocks); + */ + .text +SYM_FUNC_START(__sha256_ce_transform) + /* load round constants */ + adr_l x8, .Lsha2_rcon + ld1 { v0.4s- v3.4s}, [x8], #64 + ld1 { v4.4s- v7.4s}, [x8], #64 + ld1 { v8.4s-v11.4s}, [x8], #64 + ld1 {v12.4s-v15.4s}, [x8] + + /* load state */ + ld1 {dgav.4s, dgbv.4s}, [x0] + + /* load input */ +0: ld1 {v16.4s-v19.4s}, [x1], #64 + sub x2, x2, #1 + +CPU_LE( rev32 v16.16b, v16.16b ) +CPU_LE( rev32 v17.16b, v17.16b ) +CPU_LE( rev32 v18.16b, v18.16b ) +CPU_LE( rev32 v19.16b, v19.16b ) + + add t0.4s, v16.4s, v0.4s + mov dg0v.16b, dgav.16b + mov dg1v.16b, dgbv.16b + + add_update 0, v1, 16, 17, 18, 19 + add_update 1, v2, 17, 18, 19, 16 + add_update 0, v3, 18, 19, 16, 17 + add_update 1, v4, 19, 16, 17, 18 + + add_update 0, v5, 16, 17, 18, 19 + add_update 1, v6, 17, 18, 19, 16 + add_update 0, v7, 18, 19, 16, 17 + add_update 1, v8, 19, 16, 17, 18 + + add_update 0, v9, 16, 17, 18, 19 + add_update 1, v10, 17, 18, 19, 16 + add_update 0, v11, 18, 19, 16, 17 + add_update 1, v12, 19, 16, 17, 18 + + add_only 0, v13, 17 + add_only 1, v14, 18 + add_only 0, v15, 19 + add_only 1 + + /* update state */ + add dgav.4s, dgav.4s, dg0v.4s + add dgbv.4s, dgbv.4s, dg1v.4s + + /* return early if voluntary preemption is needed */ + cond_yield 1f, x5, x6 + + /* handled all input blocks? 
*/ + cbnz x2, 0b + + /* store new state */ +1: st1 {dgav.4s, dgbv.4s}, [x0] + mov x0, x2 + ret +SYM_FUNC_END(__sha256_ce_transform) diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h new file mode 100644 index 000000000000..a211966c124a --- /dev/null +++ b/lib/crypto/arm64/sha256.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 optimized for ARM64 + * + * Copyright 2025 Google LLC + */ +#include <asm/neon.h> +#include <crypto/internal/simd.h> +#include <linux/cpufeature.h> + +asmlinkage void sha256_block_data_order(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha256_block_neon(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state, + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && crypto_simd_usable()) { + if (static_branch_likely(&have_ce)) { + do { + size_t rem; + + kernel_neon_begin(); + rem = __sha256_ce_transform(state, + data, nblocks); + kernel_neon_end(); + data += (nblocks - rem) * SHA256_BLOCK_SIZE; + nblocks = rem; + } while (nblocks); + } else { + kernel_neon_begin(); + sha256_block_neon(state, data, nblocks); + kernel_neon_end(); + } + } else { + sha256_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + if (cpu_have_named_feature(ASIMD)) { + static_branch_enable(&have_neon); + if (cpu_have_named_feature(SHA2)) + static_branch_enable(&have_ce); + } +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm64/sha512-ce-core.S b/lib/crypto/arm64/sha512-ce-core.S new file mode 100644 index 000000000000..22f1ded89bc8 --- /dev/null +++ b/lib/crypto/arm64/sha512-ce-core.S @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + /* + * We have to specify the "sha3" feature here, since the GNU and clang + * assemblers both consider the SHA-512 instructions to be part of the + * "sha3" feature. (Except binutils 2.30 through 2.42, which used + * "sha2". But "sha3" implies "sha2", so "sha3" still works in those + * versions.) "sha3" doesn't make a lot of sense, since SHA-512 is part + * of the SHA-2 family of algorithms, and also the Arm Architecture + * Reference Manual defines FEAT_SHA512 and FEAT_SHA3 separately. + * Regardless, we must use "sha3" to be compatible with the assemblers. 
+ */ + .arch armv8-a+sha3 + + /* + * The SHA-512 round constants + */ + .section ".rodata", "a" + .align 4 +.Lsha512_rcon: + .quad 0x428a2f98d728ae22, 0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538, 0x59f111f1b605d019 + .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242, 0x12835b0145706fbe + .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235, 0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .quad 0x983e5152ee66dfab, 0xa831c66d2db43210 + .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .quad 0x06ca6351e003826f, 0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6, 0x92722c851482353b + .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .quad 0xd192e819d6ef5218, 0xd69906245565a910 + .quad 0xf40e35855771202a, 0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .quad 0x90befffa23631e28, 0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .quad 0xca273eceea26619c, 0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae, 0x1b710b35131c471b + .quad 0x28db77f523047d84, 0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 + + .macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4 + .ifnb \rc1 + ld1 {v\rc1\().2d}, [x4], #16 + .endif + add v5.2d, v\rc0\().2d, v\in0\().2d + ext v6.16b, v\i2\().16b, v\i3\().16b, #8 + ext v5.16b, v5.16b, v5.16b, #8 + ext v7.16b, v\i1\().16b, v\i2\().16b, #8 + add v\i3\().2d, v\i3\().2d, v5.2d + .ifnb \in1 + ext v5.16b, v\in3\().16b, v\in4\().16b, #8 + sha512su0 v\in0\().2d, v\in1\().2d + .endif + sha512h q\i3, q6, v7.2d + .ifnb \in1 + sha512su1 v\in0\().2d, v\in2\().2d, v5.2d + .endif + add v\i4\().2d, v\i1\().2d, v\i3\().2d + sha512h2 q\i3, q\i1, v\i0\().2d + .endm + + /* + * size_t __sha512_ce_transform(struct sha512_block_state *state, + * const u8 *data, size_t nblocks); + */ + .text +SYM_FUNC_START(__sha512_ce_transform) + /* load state */ + ld1 {v8.2d-v11.2d}, [x0] + + /* load first 4 round constants */ + adr_l x3, .Lsha512_rcon + ld1 {v20.2d-v23.2d}, [x3], #64 + + /* load input */ +0: ld1 {v12.2d-v15.2d}, [x1], #64 + ld1 {v16.2d-v19.2d}, [x1], #64 + sub x2, x2, #1 + +CPU_LE( rev64 v12.16b, v12.16b ) +CPU_LE( rev64 v13.16b, v13.16b ) +CPU_LE( rev64 v14.16b, v14.16b ) +CPU_LE( rev64 v15.16b, v15.16b ) +CPU_LE( rev64 v16.16b, v16.16b ) +CPU_LE( rev64 v17.16b, v17.16b ) +CPU_LE( rev64 v18.16b, v18.16b ) +CPU_LE( rev64 v19.16b, v19.16b ) + + mov x4, x3 // rc pointer + + mov v0.16b, v8.16b + mov v1.16b, v9.16b + mov v2.16b, v10.16b + mov v3.16b, v11.16b + + // v0 ab cd -- ef gh ab + // v1 cd -- ef gh ab cd + // v2 ef gh ab cd -- ef + // v3 gh ab cd -- ef gh + // v4 -- ef gh ab cd -- + + dround 0, 1, 
2, 3, 4, 20, 24, 12, 13, 19, 16, 17 + dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18 + dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19 + dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12 + dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13 + + dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14 + dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15 + dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16 + dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17 + dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18 + + dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19 + dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12 + dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13 + dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14 + dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15 + + dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16 + dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17 + dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18 + dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19 + dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12 + + dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13 + dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14 + dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15 + dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16 + dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17 + + dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18 + dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19 + dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12 + dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13 + dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14 + + dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15 + dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16 + dround 2, 3, 1, 4, 0, 28, 24, 12 + dround 4, 2, 0, 1, 3, 29, 25, 13 + dround 1, 4, 3, 0, 2, 30, 26, 14 + + dround 0, 1, 2, 3, 4, 31, 27, 15 + dround 3, 0, 4, 2, 1, 24, , 16 + dround 2, 3, 1, 4, 0, 25, , 17 + dround 4, 2, 0, 1, 3, 26, , 18 + dround 1, 4, 3, 0, 2, 27, , 19 + + /* update state */ + add v8.2d, v8.2d, v0.2d + add v9.2d, v9.2d, v1.2d + add v10.2d, v10.2d, v2.2d + add v11.2d, v11.2d, v3.2d + + cond_yield 3f, x4, x5 + /* handled all input blocks? 
*/ + cbnz x2, 0b + + /* store new state */ +3: st1 {v8.2d-v11.2d}, [x0] + mov x0, x2 + ret +SYM_FUNC_END(__sha512_ce_transform) diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h new file mode 100644 index 000000000000..6abb40b467f2 --- /dev/null +++ b/lib/crypto/arm64/sha512.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * arm64-optimized SHA-512 block function + * + * Copyright 2025 Google LLC + */ + +#include <asm/neon.h> +#include <crypto/internal/simd.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_insns); + +asmlinkage void sha512_block_data_order(struct sha512_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage size_t __sha512_ce_transform(struct sha512_block_state *state, + const u8 *data, size_t nblocks); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_sha512_insns) && + likely(crypto_simd_usable())) { + do { + size_t rem; + + kernel_neon_begin(); + rem = __sha512_ce_transform(state, data, nblocks); + kernel_neon_end(); + data += (nblocks - rem) * SHA512_BLOCK_SIZE; + nblocks = rem; + } while (nblocks); + } else { + sha512_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (cpu_have_named_feature(SHA512)) + static_branch_enable(&have_sha512_insns); +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c index 09682136b57c..9828176a2efe 100644 --- a/lib/crypto/blake2s-generic.c +++ b/lib/crypto/blake2s-generic.c @@ -9,11 +9,12 @@ */ #include <crypto/internal/blake2s.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/init.h> #include <linux/bug.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/types.h> #include <linux/unaligned.h> static const u8 blake2s_sigma[10][16] = { diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index 71a316552cc5..f6ec68c3dcda 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -9,12 +9,13 @@ */ #include <crypto/internal/blake2s.h> -#include <linux/types.h> -#include <linux/string.h> +#include <linux/bug.h> +#include <linux/export.h> +#include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/init.h> -#include <linux/bug.h> +#include <linux/string.h> +#include <linux/types.h> static inline void blake2s_set_lastblock(struct blake2s_state *state) { @@ -60,7 +61,7 @@ EXPORT_SYMBOL(blake2s_final); static int __init blake2s_mod_init(void) { - if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) && + if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) && WARN_ON(!blake2s_selftest())) return -ENODEV; return 0; diff --git a/lib/crypto/chacha.c b/lib/crypto/chacha.c index 3cdda3b5ee06..77f68de71066 100644 --- a/lib/crypto/chacha.c +++ b/lib/crypto/chacha.c @@ -5,16 +5,17 @@ * Copyright (C) 2015 Martin Willi */ +#include <crypto/chacha.h> +#include <linux/bitops.h> #include <linux/bug.h> -#include <linux/kernel.h> #include <linux/export.h> -#include <linux/bitops.h> +#include <linux/kernel.h> #include <linux/string.h> #include <linux/unaligned.h> -#include <crypto/chacha.h> -static void chacha_permute(u32 *x, int nrounds) +static void chacha_permute(struct chacha_state *state, int 
nrounds) { + u32 *x = state->x; int i; /* whitelist the allowed round counts */ @@ -65,34 +66,34 @@ static void chacha_permute(u32 *x, int nrounds) /** * chacha_block_generic - generate one keystream block and increment block counter - * @state: input state matrix (16 32-bit words) - * @stream: output keystream block (64 bytes) + * @state: input state matrix + * @out: output keystream block * @nrounds: number of rounds (20 or 12; 20 is recommended) * * This is the ChaCha core, a function from 64-byte strings to 64-byte strings. * The caller has already converted the endianness of the input. This function * also handles incrementing the block counter in the input matrix. */ -void chacha_block_generic(u32 *state, u8 *stream, int nrounds) +void chacha_block_generic(struct chacha_state *state, + u8 out[CHACHA_BLOCK_SIZE], int nrounds) { - u32 x[16]; + struct chacha_state permuted_state = *state; int i; - memcpy(x, state, 64); + chacha_permute(&permuted_state, nrounds); - chacha_permute(x, nrounds); + for (i = 0; i < ARRAY_SIZE(state->x); i++) + put_unaligned_le32(permuted_state.x[i] + state->x[i], + &out[i * sizeof(u32)]); - for (i = 0; i < ARRAY_SIZE(x); i++) - put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]); - - state[12]++; + state->x[12]++; } EXPORT_SYMBOL(chacha_block_generic); /** * hchacha_block_generic - abbreviated ChaCha core, for XChaCha - * @state: input state matrix (16 32-bit words) - * @stream: output (8 32-bit words) + * @state: input state matrix + * @out: the output words * @nrounds: number of rounds (20 or 12; 20 is recommended) * * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step @@ -100,15 +101,14 @@ EXPORT_SYMBOL(chacha_block_generic); * skips the final addition of the initial state, and outputs only certain words * of the state. It should not be used for streaming directly. 
*/ -void hchacha_block_generic(const u32 *state, u32 *stream, int nrounds) +void hchacha_block_generic(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) { - u32 x[16]; - - memcpy(x, state, 64); + struct chacha_state permuted_state = *state; - chacha_permute(x, nrounds); + chacha_permute(&permuted_state, nrounds); - memcpy(&stream[0], &x[0], 16); - memcpy(&stream[4], &x[12], 16); + memcpy(&out[0], &permuted_state.x[0], 16); + memcpy(&out[4], &permuted_state.x[12], 16); } EXPORT_SYMBOL(hchacha_block_generic); diff --git a/lib/crypto/chacha20poly1305-selftest.c b/lib/crypto/chacha20poly1305-selftest.c index 2ea61c28be4f..e4c85bc5a6d7 100644 --- a/lib/crypto/chacha20poly1305-selftest.c +++ b/lib/crypto/chacha20poly1305-selftest.c @@ -8832,7 +8832,7 @@ chacha20poly1305_encrypt_bignonce(u8 *dst, const u8 *src, const size_t src_len, { const u8 *pad0 = page_address(ZERO_PAGE(0)); struct poly1305_desc_ctx poly1305_state; - u32 chacha20_state[CHACHA_STATE_WORDS]; + struct chacha_state chacha20_state; union { u8 block0[POLY1305_KEY_SIZE]; __le64 lens[2]; @@ -8844,12 +8844,12 @@ chacha20poly1305_encrypt_bignonce(u8 *dst, const u8 *src, const size_t src_len, memcpy(&bottom_row[4], nonce, 12); for (i = 0; i < 8; ++i) le_key[i] = get_unaligned_le32(key + sizeof(le_key[i]) * i); - chacha_init(chacha20_state, le_key, bottom_row); - chacha20_crypt(chacha20_state, b.block0, b.block0, sizeof(b.block0)); + chacha_init(&chacha20_state, le_key, bottom_row); + chacha20_crypt(&chacha20_state, b.block0, b.block0, sizeof(b.block0)); poly1305_init(&poly1305_state, b.block0); poly1305_update(&poly1305_state, ad, ad_len); poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf); - chacha20_crypt(chacha20_state, dst, src, src_len); + chacha20_crypt(&chacha20_state, dst, src, src_len); poly1305_update(&poly1305_state, dst, src_len); poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf); b.lens[0] = cpu_to_le64(ad_len); diff --git a/lib/crypto/chacha20poly1305.c b/lib/crypto/chacha20poly1305.c index 9cfa886f1f89..0b49d6aedefd 100644 --- a/lib/crypto/chacha20poly1305.c +++ b/lib/crypto/chacha20poly1305.c @@ -7,18 +7,16 @@ * Information: https://tools.ietf.org/html/rfc8439 */ -#include <crypto/chacha20poly1305.h> #include <crypto/chacha.h> +#include <crypto/chacha20poly1305.h> #include <crypto/poly1305.h> #include <crypto/utils.h> - -#include <linux/unaligned.h> -#include <linux/kernel.h> +#include <linux/export.h> #include <linux/init.h> +#include <linux/kernel.h> #include <linux/mm.h> #include <linux/module.h> - -#define CHACHA_KEY_WORDS (CHACHA_KEY_SIZE / sizeof(u32)) +#include <linux/unaligned.h> static void chacha_load_key(u32 *k, const u8 *in) { @@ -32,7 +30,8 @@ static void chacha_load_key(u32 *k, const u8 *in) k[7] = get_unaligned_le32(in + 28); } -static void xchacha_init(u32 *chacha_state, const u8 *key, const u8 *nonce) +static void xchacha_init(struct chacha_state *chacha_state, + const u8 *key, const u8 *nonce) { u32 k[CHACHA_KEY_WORDS]; u8 iv[CHACHA_IV_SIZE]; @@ -54,7 +53,8 @@ static void xchacha_init(u32 *chacha_state, const u8 *key, const u8 *nonce) static void __chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, - const u8 *ad, const size_t ad_len, u32 *chacha_state) + const u8 *ad, const size_t ad_len, + struct chacha_state *chacha_state) { const u8 *pad0 = page_address(ZERO_PAGE(0)); struct poly1305_desc_ctx poly1305_state; @@ -82,7 +82,7 @@ __chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, poly1305_final(&poly1305_state, 
dst + src_len); - memzero_explicit(chacha_state, CHACHA_STATE_WORDS * sizeof(u32)); + chacha_zeroize_state(chacha_state); memzero_explicit(&b, sizeof(b)); } @@ -91,7 +91,7 @@ void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, const u64 nonce, const u8 key[CHACHA20POLY1305_KEY_SIZE]) { - u32 chacha_state[CHACHA_STATE_WORDS]; + struct chacha_state chacha_state; u32 k[CHACHA_KEY_WORDS]; __le64 iv[2]; @@ -100,8 +100,9 @@ void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, iv[0] = 0; iv[1] = cpu_to_le64(nonce); - chacha_init(chacha_state, k, (u8 *)iv); - __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, chacha_state); + chacha_init(&chacha_state, k, (u8 *)iv); + __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, + &chacha_state); memzero_explicit(iv, sizeof(iv)); memzero_explicit(k, sizeof(k)); @@ -113,16 +114,18 @@ void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE], const u8 key[CHACHA20POLY1305_KEY_SIZE]) { - u32 chacha_state[CHACHA_STATE_WORDS]; + struct chacha_state chacha_state; - xchacha_init(chacha_state, key, nonce); - __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, chacha_state); + xchacha_init(&chacha_state, key, nonce); + __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, + &chacha_state); } EXPORT_SYMBOL(xchacha20poly1305_encrypt); static bool __chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, - const u8 *ad, const size_t ad_len, u32 *chacha_state) + const u8 *ad, const size_t ad_len, + struct chacha_state *chacha_state) { const u8 *pad0 = page_address(ZERO_PAGE(0)); struct poly1305_desc_ctx poly1305_state; @@ -169,7 +172,7 @@ bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, const u64 nonce, const u8 key[CHACHA20POLY1305_KEY_SIZE]) { - u32 chacha_state[CHACHA_STATE_WORDS]; + struct chacha_state chacha_state; u32 k[CHACHA_KEY_WORDS]; __le64 iv[2]; bool ret; @@ -179,11 +182,11 @@ bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, iv[0] = 0; iv[1] = cpu_to_le64(nonce); - chacha_init(chacha_state, k, (u8 *)iv); + chacha_init(&chacha_state, k, (u8 *)iv); ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, - chacha_state); + &chacha_state); - memzero_explicit(chacha_state, sizeof(chacha_state)); + chacha_zeroize_state(&chacha_state); memzero_explicit(iv, sizeof(iv)); memzero_explicit(k, sizeof(k)); return ret; @@ -195,11 +198,11 @@ bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE], const u8 key[CHACHA20POLY1305_KEY_SIZE]) { - u32 chacha_state[CHACHA_STATE_WORDS]; + struct chacha_state chacha_state; - xchacha_init(chacha_state, key, nonce); + xchacha_init(&chacha_state, key, nonce); return __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, - chacha_state); + &chacha_state); } EXPORT_SYMBOL(xchacha20poly1305_decrypt); @@ -213,7 +216,7 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src, { const u8 *pad0 = page_address(ZERO_PAGE(0)); struct poly1305_desc_ctx poly1305_state; - u32 chacha_state[CHACHA_STATE_WORDS]; + struct chacha_state chacha_state; struct sg_mapping_iter miter; size_t partial = 0; unsigned int flags; @@ -240,8 +243,8 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src, b.iv[0] = 0; b.iv[1] = cpu_to_le64(nonce); - chacha_init(chacha_state, b.k, (u8 *)b.iv); - chacha20_crypt(chacha_state, b.block0, pad0, sizeof(b.block0)); + 
chacha_init(&chacha_state, b.k, (u8 *)b.iv); + chacha20_crypt(&chacha_state, b.block0, pad0, sizeof(b.block0)); poly1305_init(&poly1305_state, b.block0); if (unlikely(ad_len)) { @@ -276,13 +279,13 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src, if (unlikely(length < sl)) l &= ~(CHACHA_BLOCK_SIZE - 1); - chacha20_crypt(chacha_state, addr, addr, l); + chacha20_crypt(&chacha_state, addr, addr, l); addr += l; length -= l; } if (unlikely(length > 0)) { - chacha20_crypt(chacha_state, b.chacha_stream, pad0, + chacha20_crypt(&chacha_state, b.chacha_stream, pad0, CHACHA_BLOCK_SIZE); crypto_xor(addr, b.chacha_stream, length); partial = length; @@ -323,7 +326,7 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src, !crypto_memneq(b.mac[0], b.mac[1], POLY1305_DIGEST_SIZE); } - memzero_explicit(chacha_state, sizeof(chacha_state)); + chacha_zeroize_state(&chacha_state); memzero_explicit(&b, sizeof(b)); return ret; @@ -355,7 +358,7 @@ EXPORT_SYMBOL(chacha20poly1305_decrypt_sg_inplace); static int __init chacha20poly1305_init(void) { - if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) && + if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) && WARN_ON(!chacha20poly1305_selftest())) return -ENODEV; return 0; diff --git a/lib/crypto/curve25519-generic.c b/lib/crypto/curve25519-generic.c index de7c99172fa2..f8aa70c9f559 100644 --- a/lib/crypto/curve25519-generic.c +++ b/lib/crypto/curve25519-generic.c @@ -10,6 +10,7 @@ */ #include <crypto/curve25519.h> +#include <linux/export.h> #include <linux/module.h> const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 }; diff --git a/lib/crypto/curve25519.c b/lib/crypto/curve25519.c index 064b352c6907..6850b76a80c9 100644 --- a/lib/crypto/curve25519.c +++ b/lib/crypto/curve25519.c @@ -15,7 +15,7 @@ static int __init curve25519_init(void) { - if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) && + if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) && WARN_ON(!curve25519_selftest())) return -ENODEV; return 0; diff --git a/lib/crypto/des.c b/lib/crypto/des.c index d3423b34a8e9..a906070136dc 100644 --- a/lib/crypto/des.c +++ b/lib/crypto/des.c @@ -7,21 +7,20 @@ * Copyright (c) 2005 Dag Arne Osvik <da@osvik.no> */ +#include <crypto/des.h> +#include <crypto/internal/des.h> #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/crypto.h> #include <linux/errno.h> +#include <linux/export.h> #include <linux/fips.h> #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> #include <linux/types.h> - #include <linux/unaligned.h> -#include <crypto/des.h> -#include <crypto/internal/des.h> - #define ROL(x, r) ((x) = rol32((x), (r))) #define ROR(x, r) ((x) = ror32((x), (r))) diff --git a/lib/crypto/gf128mul.c b/lib/crypto/gf128mul.c index fbe72cb3453a..2a34590fe3f1 100644 --- a/lib/crypto/gf128mul.c +++ b/lib/crypto/gf128mul.c @@ -49,6 +49,7 @@ */ #include <crypto/gf128mul.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> diff --git a/lib/crypto/hash_info.c b/lib/crypto/hash_info.c new file mode 100644 index 000000000000..9a467638c971 --- /dev/null +++ b/lib/crypto/hash_info.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Hash Info: Hash algorithms information + * + * Copyright (c) 2013 Dmitry Kasatkin <d.kasatkin@samsung.com> + */ + +#include <linux/export.h> +#include <crypto/hash_info.h> + +const char *const hash_algo_name[HASH_ALGO__LAST] = { + [HASH_ALGO_MD4] = "md4", + [HASH_ALGO_MD5] = "md5", + [HASH_ALGO_SHA1] = "sha1", + 
[HASH_ALGO_RIPE_MD_160] = "rmd160", + [HASH_ALGO_SHA256] = "sha256", + [HASH_ALGO_SHA384] = "sha384", + [HASH_ALGO_SHA512] = "sha512", + [HASH_ALGO_SHA224] = "sha224", + [HASH_ALGO_RIPE_MD_128] = "rmd128", + [HASH_ALGO_RIPE_MD_256] = "rmd256", + [HASH_ALGO_RIPE_MD_320] = "rmd320", + [HASH_ALGO_WP_256] = "wp256", + [HASH_ALGO_WP_384] = "wp384", + [HASH_ALGO_WP_512] = "wp512", + [HASH_ALGO_TGR_128] = "tgr128", + [HASH_ALGO_TGR_160] = "tgr160", + [HASH_ALGO_TGR_192] = "tgr192", + [HASH_ALGO_SM3_256] = "sm3", + [HASH_ALGO_STREEBOG_256] = "streebog256", + [HASH_ALGO_STREEBOG_512] = "streebog512", + [HASH_ALGO_SHA3_256] = "sha3-256", + [HASH_ALGO_SHA3_384] = "sha3-384", + [HASH_ALGO_SHA3_512] = "sha3-512", +}; +EXPORT_SYMBOL_GPL(hash_algo_name); + +const int hash_digest_size[HASH_ALGO__LAST] = { + [HASH_ALGO_MD4] = MD5_DIGEST_SIZE, + [HASH_ALGO_MD5] = MD5_DIGEST_SIZE, + [HASH_ALGO_SHA1] = SHA1_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_160] = RMD160_DIGEST_SIZE, + [HASH_ALGO_SHA256] = SHA256_DIGEST_SIZE, + [HASH_ALGO_SHA384] = SHA384_DIGEST_SIZE, + [HASH_ALGO_SHA512] = SHA512_DIGEST_SIZE, + [HASH_ALGO_SHA224] = SHA224_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_128] = RMD128_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_256] = RMD256_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_320] = RMD320_DIGEST_SIZE, + [HASH_ALGO_WP_256] = WP256_DIGEST_SIZE, + [HASH_ALGO_WP_384] = WP384_DIGEST_SIZE, + [HASH_ALGO_WP_512] = WP512_DIGEST_SIZE, + [HASH_ALGO_TGR_128] = TGR128_DIGEST_SIZE, + [HASH_ALGO_TGR_160] = TGR160_DIGEST_SIZE, + [HASH_ALGO_TGR_192] = TGR192_DIGEST_SIZE, + [HASH_ALGO_SM3_256] = SM3256_DIGEST_SIZE, + [HASH_ALGO_STREEBOG_256] = STREEBOG256_DIGEST_SIZE, + [HASH_ALGO_STREEBOG_512] = STREEBOG512_DIGEST_SIZE, + [HASH_ALGO_SHA3_256] = SHA3_256_DIGEST_SIZE, + [HASH_ALGO_SHA3_384] = SHA3_384_DIGEST_SIZE, + [HASH_ALGO_SHA3_512] = SHA3_512_DIGEST_SIZE, +}; +EXPORT_SYMBOL_GPL(hash_digest_size); diff --git a/lib/crypto/libchacha.c b/lib/crypto/libchacha.c index cc1be0496eb9..26862ad90a96 100644 --- a/lib/crypto/libchacha.c +++ b/lib/crypto/libchacha.c @@ -5,14 +5,13 @@ * Copyright (C) 2015 Martin Willi */ -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/module.h> - #include <crypto/algapi.h> // for crypto_xor_cpy #include <crypto/chacha.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/module.h> -void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src, +void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { /* aligned to potentially speed up crypto_xor() */ diff --git a/lib/crypto/memneq.c b/lib/crypto/memneq.c index a2afd10349c9..44daacb8cb51 100644 --- a/lib/crypto/memneq.c +++ b/lib/crypto/memneq.c @@ -59,9 +59,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include <linux/unaligned.h> #include <crypto/algapi.h> +#include <linux/export.h> #include <linux/module.h> +#include <linux/unaligned.h> /* Generic path for arbitrary size */ static inline unsigned long diff --git a/lib/crypto/mips/.gitignore b/lib/crypto/mips/.gitignore new file mode 100644 index 000000000000..0d47d4f21c6d --- /dev/null +++ b/lib/crypto/mips/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +poly1305-core.S diff --git a/lib/crypto/mips/Kconfig b/lib/crypto/mips/Kconfig new file mode 100644 index 000000000000..0670a170c1be --- /dev/null +++ b/lib/crypto/mips/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA_MIPS + tristate + depends on CPU_MIPS32_R2 + default CRYPTO_LIB_CHACHA + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_MIPS + tristate + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/mips/Makefile b/lib/crypto/mips/Makefile new file mode 100644 index 000000000000..804488c7aded --- /dev/null +++ b/lib/crypto/mips/Makefile @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o +chacha-mips-y := chacha-core.o chacha-glue.o +AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots + +obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o +poly1305-mips-y := poly1305-core.o poly1305-glue.o + +perlasm-flavour-$(CONFIG_32BIT) := o32 +perlasm-flavour-$(CONFIG_64BIT) := 64 + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@) + +$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE + $(call if_changed,perlasm) + +targets += poly1305-core.S diff --git a/lib/crypto/mips/chacha-core.S b/lib/crypto/mips/chacha-core.S new file mode 100644 index 000000000000..706aeb850fb0 --- /dev/null +++ b/lib/crypto/mips/chacha-core.S @@ -0,0 +1,491 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved. + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#define MASK_U32 0x3c +#define CHACHA20_BLOCK_SIZE 64 +#define STACK_SIZE 32 + +#define X0 $t0 +#define X1 $t1 +#define X2 $t2 +#define X3 $t3 +#define X4 $t4 +#define X5 $t5 +#define X6 $t6 +#define X7 $t7 +#define X8 $t8 +#define X9 $t9 +#define X10 $v1 +#define X11 $s6 +#define X12 $s5 +#define X13 $s4 +#define X14 $s3 +#define X15 $s2 +/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ +#define T0 $s1 +#define T1 $s0 +#define T(n) T ## n +#define X(n) X ## n + +/* Input arguments */ +#define STATE $a0 +#define OUT $a1 +#define IN $a2 +#define BYTES $a3 + +/* Output argument */ +/* NONCE[0] is kept in a register and not in memory. + * We don't want to touch original value in memory. + * Must be incremented every loop iteration. + */ +#define NONCE_0 $v0 + +/* SAVED_X and SAVED_CA are set in the jump table. + * Use regs which are overwritten on exit else we don't leak clear data. + * They are used to handling the last bytes which are not multiple of 4. 
+ */ +#define SAVED_X X15 +#define SAVED_CA $s7 + +#define IS_UNALIGNED $s7 + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define MSB 0 +#define LSB 3 +#define CPU_TO_LE32(n) \ + wsbh n, n; \ + rotr n, 16; +#else +#define MSB 3 +#define LSB 0 +#define CPU_TO_LE32(n) +#endif + +#define FOR_EACH_WORD(x) \ + x( 0); \ + x( 1); \ + x( 2); \ + x( 3); \ + x( 4); \ + x( 5); \ + x( 6); \ + x( 7); \ + x( 8); \ + x( 9); \ + x(10); \ + x(11); \ + x(12); \ + x(13); \ + x(14); \ + x(15); + +#define FOR_EACH_WORD_REV(x) \ + x(15); \ + x(14); \ + x(13); \ + x(12); \ + x(11); \ + x(10); \ + x( 9); \ + x( 8); \ + x( 7); \ + x( 6); \ + x( 5); \ + x( 4); \ + x( 3); \ + x( 2); \ + x( 1); \ + x( 0); + +#define PLUS_ONE_0 1 +#define PLUS_ONE_1 2 +#define PLUS_ONE_2 3 +#define PLUS_ONE_3 4 +#define PLUS_ONE_4 5 +#define PLUS_ONE_5 6 +#define PLUS_ONE_6 7 +#define PLUS_ONE_7 8 +#define PLUS_ONE_8 9 +#define PLUS_ONE_9 10 +#define PLUS_ONE_10 11 +#define PLUS_ONE_11 12 +#define PLUS_ONE_12 13 +#define PLUS_ONE_13 14 +#define PLUS_ONE_14 15 +#define PLUS_ONE_15 16 +#define PLUS_ONE(x) PLUS_ONE_ ## x +#define _CONCAT3(a,b,c) a ## b ## c +#define CONCAT3(a,b,c) _CONCAT3(a,b,c) + +#define STORE_UNALIGNED(x) \ +CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ + .if (x != 12); \ + lw T0, (x*4)(STATE); \ + .endif; \ + lwl T1, (x*4)+MSB ## (IN); \ + lwr T1, (x*4)+LSB ## (IN); \ + .if (x == 12); \ + addu X ## x, NONCE_0; \ + .else; \ + addu X ## x, T0; \ + .endif; \ + CPU_TO_LE32(X ## x); \ + xor X ## x, T1; \ + swl X ## x, (x*4)+MSB ## (OUT); \ + swr X ## x, (x*4)+LSB ## (OUT); + +#define STORE_ALIGNED(x) \ +CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ + .if (x != 12); \ + lw T0, (x*4)(STATE); \ + .endif; \ + lw T1, (x*4) ## (IN); \ + .if (x == 12); \ + addu X ## x, NONCE_0; \ + .else; \ + addu X ## x, T0; \ + .endif; \ + CPU_TO_LE32(X ## x); \ + xor X ## x, T1; \ + sw X ## x, (x*4) ## (OUT); + +/* Jump table macro. + * Used for setup and handling the last bytes, which are not multiple of 4. + * X15 is free to store Xn + * Every jumptable entry must be equal in size. + */ +#define JMPTBL_ALIGNED(x) \ +.Lchacha_mips_jmptbl_aligned_ ## x: ; \ + .set noreorder; \ + b .Lchacha_mips_xor_aligned_ ## x ## _b; \ + .if (x == 12); \ + addu SAVED_X, X ## x, NONCE_0; \ + .else; \ + addu SAVED_X, X ## x, SAVED_CA; \ + .endif; \ + .set reorder + +#define JMPTBL_UNALIGNED(x) \ +.Lchacha_mips_jmptbl_unaligned_ ## x: ; \ + .set noreorder; \ + b .Lchacha_mips_xor_unaligned_ ## x ## _b; \ + .if (x == 12); \ + addu SAVED_X, X ## x, NONCE_0; \ + .else; \ + addu SAVED_X, X ## x, SAVED_CA; \ + .endif; \ + .set reorder + +#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ + addu X(A), X(K); \ + addu X(B), X(L); \ + addu X(C), X(M); \ + addu X(D), X(N); \ + xor X(V), X(A); \ + xor X(W), X(B); \ + xor X(Y), X(C); \ + xor X(Z), X(D); \ + rotr X(V), 32 - S; \ + rotr X(W), 32 - S; \ + rotr X(Y), 32 - S; \ + rotr X(Z), 32 - S; + +.text +.set reorder +.set noat +.globl chacha_crypt_arch +.ent chacha_crypt_arch +chacha_crypt_arch: + .frame $sp, STACK_SIZE, $ra + + /* Load number of rounds */ + lw $at, 16($sp) + + addiu $sp, -STACK_SIZE + + /* Return bytes = 0. */ + beqz BYTES, .Lchacha_mips_end + + lw NONCE_0, 48(STATE) + + /* Save s0-s7 */ + sw $s0, 0($sp) + sw $s1, 4($sp) + sw $s2, 8($sp) + sw $s3, 12($sp) + sw $s4, 16($sp) + sw $s5, 20($sp) + sw $s6, 24($sp) + sw $s7, 28($sp) + + /* Test IN or OUT is unaligned. 
+ * IS_UNALIGNED = ( IN | OUT ) & 0x00000003 + */ + or IS_UNALIGNED, IN, OUT + andi IS_UNALIGNED, 0x3 + + b .Lchacha_rounds_start + +.align 4 +.Loop_chacha_rounds: + addiu IN, CHACHA20_BLOCK_SIZE + addiu OUT, CHACHA20_BLOCK_SIZE + addiu NONCE_0, 1 + +.Lchacha_rounds_start: + lw X0, 0(STATE) + lw X1, 4(STATE) + lw X2, 8(STATE) + lw X3, 12(STATE) + + lw X4, 16(STATE) + lw X5, 20(STATE) + lw X6, 24(STATE) + lw X7, 28(STATE) + lw X8, 32(STATE) + lw X9, 36(STATE) + lw X10, 40(STATE) + lw X11, 44(STATE) + + move X12, NONCE_0 + lw X13, 52(STATE) + lw X14, 56(STATE) + lw X15, 60(STATE) + +.Loop_chacha_xor_rounds: + addiu $at, -2 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); + bnez $at, .Loop_chacha_xor_rounds + + addiu BYTES, -(CHACHA20_BLOCK_SIZE) + + /* Is data src/dst unaligned? Jump */ + bnez IS_UNALIGNED, .Loop_chacha_unaligned + + /* Set number rounds here to fill delayslot. */ + lw $at, (STACK_SIZE+16)($sp) + + /* BYTES < 0, it has no full block. */ + bltz BYTES, .Lchacha_mips_no_full_block_aligned + + FOR_EACH_WORD_REV(STORE_ALIGNED) + + /* BYTES > 0? Loop again. */ + bgtz BYTES, .Loop_chacha_rounds + + /* Place this here to fill delay slot */ + addiu NONCE_0, 1 + + /* BYTES < 0? Handle last bytes */ + bltz BYTES, .Lchacha_mips_xor_bytes + +.Lchacha_mips_xor_done: + /* Restore used registers */ + lw $s0, 0($sp) + lw $s1, 4($sp) + lw $s2, 8($sp) + lw $s3, 12($sp) + lw $s4, 16($sp) + lw $s5, 20($sp) + lw $s6, 24($sp) + lw $s7, 28($sp) + + /* Write NONCE_0 back to right location in state */ + sw NONCE_0, 48(STATE) + +.Lchacha_mips_end: + addiu $sp, STACK_SIZE + jr $ra + +.Lchacha_mips_no_full_block_aligned: + /* Restore the offset on BYTES */ + addiu BYTES, CHACHA20_BLOCK_SIZE + + /* Get number of full WORDS */ + andi $at, BYTES, MASK_U32 + + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0) + + /* Calculate lower half jump table offset */ + ins T0, $at, 1, 6 + + /* Add offset to STATE */ + addu T1, STATE, $at + + /* Add lower half jump table addr */ + addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0) + + /* Read value from STATE */ + lw SAVED_CA, 0(T1) + + /* Store remaining bytecounter as negative value */ + subu BYTES, $at, BYTES + + jr T0 + + /* Jump table */ + FOR_EACH_WORD(JMPTBL_ALIGNED) + + +.Loop_chacha_unaligned: + /* Set number rounds here to fill delayslot. */ + lw $at, (STACK_SIZE+16)($sp) + + /* BYTES > 0, it has no full block. */ + bltz BYTES, .Lchacha_mips_no_full_block_unaligned + + FOR_EACH_WORD_REV(STORE_UNALIGNED) + + /* BYTES > 0? Loop again. 
*/ + bgtz BYTES, .Loop_chacha_rounds + + /* Write NONCE_0 back to right location in state */ + sw NONCE_0, 48(STATE) + + .set noreorder + /* Fall through to byte handling */ + bgez BYTES, .Lchacha_mips_xor_done +.Lchacha_mips_xor_unaligned_0_b: +.Lchacha_mips_xor_aligned_0_b: + /* Place this here to fill delay slot */ + addiu NONCE_0, 1 + .set reorder + +.Lchacha_mips_xor_bytes: + addu IN, $at + addu OUT, $at + /* First byte */ + lbu T1, 0(IN) + addiu $at, BYTES, 1 + xor T1, SAVED_X + sb T1, 0(OUT) + beqz $at, .Lchacha_mips_xor_done + /* Second byte */ + lbu T1, 1(IN) + addiu $at, BYTES, 2 + rotr SAVED_X, 8 + xor T1, SAVED_X + sb T1, 1(OUT) + beqz $at, .Lchacha_mips_xor_done + /* Third byte */ + lbu T1, 2(IN) + rotr SAVED_X, 8 + xor T1, SAVED_X + sb T1, 2(OUT) + b .Lchacha_mips_xor_done + +.Lchacha_mips_no_full_block_unaligned: + /* Restore the offset on BYTES */ + addiu BYTES, CHACHA20_BLOCK_SIZE + + /* Get number of full WORDS */ + andi $at, BYTES, MASK_U32 + + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0) + + /* Calculate lower half jump table offset */ + ins T0, $at, 1, 6 + + /* Add offset to STATE */ + addu T1, STATE, $at + + /* Add lower half jump table addr */ + addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0) + + /* Read value from STATE */ + lw SAVED_CA, 0(T1) + + /* Store remaining bytecounter as negative value */ + subu BYTES, $at, BYTES + + jr T0 + + /* Jump table */ + FOR_EACH_WORD(JMPTBL_UNALIGNED) +.end chacha_crypt_arch +.set at + +/* Input arguments + * STATE $a0 + * OUT $a1 + * NROUND $a2 + */ + +#undef X12 +#undef X13 +#undef X14 +#undef X15 + +#define X12 $a3 +#define X13 $at +#define X14 $v0 +#define X15 STATE + +.set noat +.globl hchacha_block_arch +.ent hchacha_block_arch +hchacha_block_arch: + .frame $sp, STACK_SIZE, $ra + + addiu $sp, -STACK_SIZE + + /* Save X11(s6) */ + sw X11, 0($sp) + + lw X0, 0(STATE) + lw X1, 4(STATE) + lw X2, 8(STATE) + lw X3, 12(STATE) + lw X4, 16(STATE) + lw X5, 20(STATE) + lw X6, 24(STATE) + lw X7, 28(STATE) + lw X8, 32(STATE) + lw X9, 36(STATE) + lw X10, 40(STATE) + lw X11, 44(STATE) + lw X12, 48(STATE) + lw X13, 52(STATE) + lw X14, 56(STATE) + lw X15, 60(STATE) + +.Loop_hchacha_xor_rounds: + addiu $a2, -2 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); + bnez $a2, .Loop_hchacha_xor_rounds + + /* Restore used register */ + lw X11, 0($sp) + + sw X0, 0(OUT) + sw X1, 4(OUT) + sw X2, 8(OUT) + sw X3, 12(OUT) + sw X12, 16(OUT) + sw X13, 20(OUT) + sw X14, 24(OUT) + sw X15, 28(OUT) + + addiu $sp, STACK_SIZE + jr $ra +.end hchacha_block_arch +.set at diff --git a/lib/crypto/mips/chacha-glue.c b/lib/crypto/mips/chacha-glue.c new file mode 100644 index 000000000000..88c097594eb0 --- /dev/null +++ b/lib/crypto/mips/chacha-glue.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ChaCha and HChaCha functions (MIPS optimized) + * + * Copyright (C) 2019 Linaro, Ltd. 
<ard.biesheuvel@linaro.org> + */ + +#include <crypto/chacha.h> +#include <linux/kernel.h> +#include <linux/module.h> + +asmlinkage void chacha_crypt_arch(struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int bytes, int nrounds); +EXPORT_SYMBOL(chacha_crypt_arch); + +asmlinkage void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); +EXPORT_SYMBOL(hchacha_block_arch); + +bool chacha_is_arch_optimized(void) +{ + return true; +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/poly1305-glue.c b/lib/crypto/mips/poly1305-glue.c new file mode 100644 index 000000000000..764a38a65200 --- /dev/null +++ b/lib/crypto/mips/poly1305-glue.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS + * + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <crypto/internal/poly1305.h> +#include <linux/cpufeature.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/unaligned.h> + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks_arch(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); +asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +bool poly1305_is_arch_optimized(void) +{ + return true; +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/poly1305-mips.pl b/lib/crypto/mips/poly1305-mips.pl new file mode 100644 index 000000000000..399f10c3e385 --- /dev/null +++ b/lib/crypto/mips/poly1305-mips.pl @@ -0,0 +1,1273 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL +# project. +# ==================================================================== + +# Poly1305 hash for MIPS. +# +# May 2016 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc +# R1x000 ~5.5/+130% (big-endian) +# Octeon II 2.50/+70% (little-endian) +# +# March 2019 +# +# Add 32-bit code path. +# +# October 2019 +# +# Modulo-scheduling reduction allows to omit dependency chain at the +# end of inner loop and improve performance. Also optimize MIPS32R2 +# code path for MIPS 1004K core. Per René von Dorst's suggestions. +# +# IALU/gcc +# R1x000 ~9.8/? (big-endian) +# Octeon II 3.65/+140% (little-endian) +# MT7621/1004K 4.75/? (little-endian) +# +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. 
Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp [o32 can be +# excluded from the rule, because it's specified volatile]; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +# <appro@openssl.org> +# +###################################################################### + +$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64 + +$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; + +if ($flavour =~ /64|n32/i) {{{ +###################################################################### +# 64-bit code path +# + +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); + +$code.=<<___; +#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ + defined(_MIPS_ARCH_MIPS64R6)) \\ + && !defined(_MIPS_ARCH_MIPS64R2) +# define _MIPS_ARCH_MIPS64R2 +#endif + +#if defined(_MIPS_ARCH_MIPS64R6) +# define dmultu(rs,rt) +# define mflo(rd,rs,rt) dmulu rd,rs,rt +# define mfhi(rd,rs,rt) dmuhu rd,rs,rt +#else +# define dmultu(rs,rt) dmultu rs,rt +# define mflo(rd,rs,rt) mflo rd +# define mfhi(rd,rs,rt) mfhi rd +#endif + +#ifdef __KERNEL__ +# define poly1305_init poly1305_block_init_arch +# define poly1305_blocks poly1305_blocks_arch +# define poly1305_emit poly1305_emit_arch +#endif + +#if defined(__MIPSEB__) && !defined(MIPSEB) +# define MIPSEB +#endif + +#ifdef MIPSEB +# define MSB 0 +# define LSB 7 +#else +# define MSB 7 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sd $zero,0($ctx) + sd $zero,8($ctx) + sd $zero,16($ctx) + + beqz $inp,.Lno_key + +#if defined(_MIPS_ARCH_MIPS64R6) + andi $tmp0,$inp,7 # $inp % 8 + dsubu $inp,$inp,$tmp0 # align $inp + sll $tmp0,$tmp0,3 # byte to bit offset + ld $in0,0($inp) + ld $in1,8($inp) + beqz $tmp0,.Laligned_key + ld $tmp2,16($inp) + + subu $tmp1,$zero,$tmp0 +# ifdef MIPSEB + dsllv $in0,$in0,$tmp0 + dsrlv $tmp3,$in1,$tmp1 + dsllv $in1,$in1,$tmp0 + dsrlv $tmp2,$tmp2,$tmp1 +# else + dsrlv $in0,$in0,$tmp0 + dsllv $tmp3,$in1,$tmp1 + dsrlv $in1,$in1,$tmp0 + dsllv $tmp2,$tmp2,$tmp1 +# endif + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 +.Laligned_key: +#else + ldl $in0,0+MSB($inp) + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 
0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + li $tmp0,1 + dsll $tmp0,32 # 0x0000000100000000 + daddiu $tmp0,-63 # 0x00000000ffffffc1 + dsll $tmp0,28 # 0x0ffffffc10000000 + daddiu $tmp0,-1 # 0x0ffffffc0fffffff + + and $in0,$tmp0 + daddiu $tmp0,-3 # 0x0ffffffc0ffffffc + and $in1,$tmp0 + + sd $in0,24($ctx) + dsrl $tmp0,$in1,2 + sd $in1,32($ctx) + daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) + sd $tmp0,40($ctx) + +.Lno_key: + li $v0,0 # return 0 + jr $ra +.end poly1305_init +___ +{ +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; + +my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = + ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); +my ($shr,$shl) = ($s6,$s7); # used on R6 + +$code.=<<___; +.align 5 +.globl poly1305_blocks +.ent poly1305_blocks +poly1305_blocks: + .set noreorder + dsrl $len,4 # number of complete blocks + bnez $len,poly1305_blocks_internal + nop + jr $ra + nop +.end poly1305_blocks + +.align 5 +.ent poly1305_blocks_internal +poly1305_blocks_internal: + .set noreorder +#if defined(_MIPS_ARCH_MIPS64R6) + .frame $sp,8*8,$ra + .mask $SAVED_REGS_MASK|0x000c0000,-8 + dsubu $sp,8*8 + sd $s7,56($sp) + sd $s6,48($sp) +#else + .frame $sp,6*8,$ra + .mask $SAVED_REGS_MASK,-8 + dsubu $sp,6*8 +#endif + sd $s5,40($sp) + sd $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sd $s3,24($sp) + sd $s2,16($sp) + sd $s1,8($sp) + sd $s0,0($sp) +___ +$code.=<<___; + .set reorder + +#if defined(_MIPS_ARCH_MIPS64R6) + andi $shr,$inp,7 + dsubu $inp,$inp,$shr # align $inp + sll $shr,$shr,3 # byte to bit offset + subu $shl,$zero,$shr +#endif + + ld $h0,0($ctx) # load hash value + ld $h1,8($ctx) + ld $h2,16($ctx) + + ld $r0,24($ctx) # load key + ld $r1,32($ctx) + ld $rs1,40($ctx) + + dsll $len,4 + daddu $len,$inp # end of buffer + b .Loop + +.align 4 +.Loop: +#if defined(_MIPS_ARCH_MIPS64R6) + ld $in0,0($inp) # load input + ld $in1,8($inp) + beqz $shr,.Laligned_inp + + ld $tmp2,16($inp) +# ifdef MIPSEB + dsllv $in0,$in0,$shr + dsrlv $tmp3,$in1,$shl + dsllv $in1,$in1,$shr + dsrlv $tmp2,$tmp2,$shl +# else + dsrlv $in0,$in0,$shr + dsllv $tmp3,$in1,$shl + dsrlv $in1,$in1,$shr + dsllv $tmp2,$tmp2,$shl +# endif + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 +.Laligned_inp: +#else + ldl $in0,0+MSB($inp) # load input + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif + daddiu $inp,16 +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + dsrl $tmp1,$h2,2 # modulo-scheduled reduction + andi $h2,$h2,3 + 
dsll $tmp0,$tmp1,2 + + daddu $d0,$h0,$in0 # accumulate input + daddu $tmp1,$tmp0 + sltu $tmp0,$d0,$h0 + daddu $d0,$d0,$tmp1 # ... and residue + sltu $tmp1,$d0,$tmp1 + daddu $d1,$h1,$in1 + daddu $tmp0,$tmp1 + sltu $tmp1,$d1,$h1 + daddu $d1,$tmp0 + + dmultu ($r0,$d0) # h0*r0 + daddu $d2,$h2,$padbit + sltu $tmp0,$d1,$tmp0 + mflo ($h0,$r0,$d0) + mfhi ($h1,$r0,$d0) + + dmultu ($rs1,$d1) # h1*5*r1 + daddu $d2,$tmp1 + daddu $d2,$tmp0 + mflo ($tmp0,$rs1,$d1) + mfhi ($tmp1,$rs1,$d1) + + dmultu ($r1,$d0) # h0*r1 + mflo ($tmp2,$r1,$d0) + mfhi ($h2,$r1,$d0) + daddu $h0,$tmp0 + daddu $h1,$tmp1 + sltu $tmp0,$h0,$tmp0 + + dmultu ($r0,$d1) # h1*r0 + daddu $h1,$tmp0 + daddu $h1,$tmp2 + mflo ($tmp0,$r0,$d1) + mfhi ($tmp1,$r0,$d1) + + dmultu ($rs1,$d2) # h2*5*r1 + sltu $tmp2,$h1,$tmp2 + daddu $h2,$tmp2 + mflo ($tmp2,$rs1,$d2) + + dmultu ($r0,$d2) # h2*r0 + daddu $h1,$tmp0 + daddu $h2,$tmp1 + mflo ($tmp3,$r0,$d2) + sltu $tmp0,$h1,$tmp0 + daddu $h2,$tmp0 + + daddu $h1,$tmp2 + sltu $tmp2,$h1,$tmp2 + daddu $h2,$tmp2 + daddu $h2,$tmp3 + + bne $inp,$len,.Loop + + sd $h0,0($ctx) # store hash value + sd $h1,8($ctx) + sd $h2,16($ctx) + + .set noreorder +#if defined(_MIPS_ARCH_MIPS64R6) + ld $s7,56($sp) + ld $s6,48($sp) +#endif + ld $s5,40($sp) # epilogue + ld $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue + ld $s3,24($sp) + ld $s2,16($sp) + ld $s1,8($sp) + ld $s0,0($sp) +___ +$code.=<<___; + jr $ra +#if defined(_MIPS_ARCH_MIPS64R6) + daddu $sp,8*8 +#else + daddu $sp,6*8 +#endif +.end poly1305_blocks_internal +___ +} +{ +my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + ld $tmp2,16($ctx) + ld $tmp0,0($ctx) + ld $tmp1,8($ctx) + + li $in0,-4 # final reduction + dsrl $in1,$tmp2,2 + and $in0,$tmp2 + andi $tmp2,$tmp2,3 + daddu $in0,$in1 + + daddu $tmp0,$tmp0,$in0 + sltu $in1,$tmp0,$in0 + daddiu $in0,$tmp0,5 # compare to modulus + daddu $tmp1,$tmp1,$in1 + sltiu $tmp3,$in0,5 + sltu $tmp4,$tmp1,$in1 + daddu $in1,$tmp1,$tmp3 + daddu $tmp2,$tmp2,$tmp4 + sltu $tmp3,$in1,$tmp3 + daddu $tmp2,$tmp2,$tmp3 + + dsrl $tmp2,2 # see if it carried/borrowed + dsubu $tmp2,$zero,$tmp2 + + xor $in0,$tmp0 + xor $in1,$tmp1 + and $in0,$tmp2 + and $in1,$tmp2 + xor $in0,$tmp0 + xor $in1,$tmp1 + + lwu $tmp0,0($nonce) # load nonce + lwu $tmp1,4($nonce) + lwu $tmp2,8($nonce) + lwu $tmp3,12($nonce) + dsll $tmp1,32 + dsll $tmp3,32 + or $tmp0,$tmp1 + or $tmp2,$tmp3 + + daddu $in0,$tmp0 # accumulate nonce + daddu $in1,$tmp2 + sltu $tmp0,$in0,$tmp0 + daddu $in1,$tmp0 + + dsrl $tmp0,$in0,8 # write mac value + dsrl $tmp1,$in0,16 + dsrl $tmp2,$in0,24 + sb $in0,0($mac) + dsrl $tmp3,$in0,32 + sb $tmp0,1($mac) + dsrl $tmp0,$in0,40 + sb $tmp1,2($mac) + dsrl $tmp1,$in0,48 + sb $tmp2,3($mac) + dsrl $tmp2,$in0,56 + sb $tmp3,4($mac) + dsrl $tmp3,$in1,8 + sb $tmp0,5($mac) + dsrl $tmp0,$in1,16 + sb $tmp1,6($mac) + dsrl $tmp1,$in1,24 + sb $tmp2,7($mac) + + sb $in1,8($mac) + dsrl $tmp2,$in1,32 + sb $tmp3,9($mac) + dsrl $tmp3,$in1,40 + sb $tmp0,10($mac) + dsrl $tmp0,$in1,48 + sb $tmp1,11($mac) + dsrl $tmp1,$in1,56 + sb $tmp2,12($mac) + sb $tmp3,13($mac) + sb $tmp0,14($mac) + sb $tmp1,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm" +.align 2 +___ +} +}}} else {{{ +###################################################################### +# 32-bit code path +# + +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = + 
($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2); + +$code.=<<___; +#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\ + defined(_MIPS_ARCH_MIPS32R6)) \\ + && !defined(_MIPS_ARCH_MIPS32R2) +# define _MIPS_ARCH_MIPS32R2 +#endif + +#if defined(_MIPS_ARCH_MIPS32R6) +# define multu(rs,rt) +# define mflo(rd,rs,rt) mulu rd,rs,rt +# define mfhi(rd,rs,rt) muhu rd,rs,rt +#else +# define multu(rs,rt) multu rs,rt +# define mflo(rd,rs,rt) mflo rd +# define mfhi(rd,rs,rt) mfhi rd +#endif + +#ifdef __KERNEL__ +# define poly1305_init poly1305_block_init_arch +# define poly1305_blocks poly1305_blocks_arch +# define poly1305_emit poly1305_emit_arch +#endif + +#if defined(__MIPSEB__) && !defined(MIPSEB) +# define MIPSEB +#endif + +#ifdef MIPSEB +# define MSB 0 +# define LSB 3 +#else +# define MSB 3 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sw $zero,0($ctx) + sw $zero,4($ctx) + sw $zero,8($ctx) + sw $zero,12($ctx) + sw $zero,16($ctx) + + beqz $inp,.Lno_key + +#if defined(_MIPS_ARCH_MIPS32R6) + andi $tmp0,$inp,3 # $inp % 4 + subu $inp,$inp,$tmp0 # align $inp + sll $tmp0,$tmp0,3 # byte to bit offset + lw $in0,0($inp) + lw $in1,4($inp) + lw $in2,8($inp) + lw $in3,12($inp) + beqz $tmp0,.Laligned_key + + lw $tmp2,16($inp) + subu $tmp1,$zero,$tmp0 +# ifdef MIPSEB + sllv $in0,$in0,$tmp0 + srlv $tmp3,$in1,$tmp1 + sllv $in1,$in1,$tmp0 + or $in0,$in0,$tmp3 + srlv $tmp3,$in2,$tmp1 + sllv $in2,$in2,$tmp0 + or $in1,$in1,$tmp3 + srlv $tmp3,$in3,$tmp1 + sllv $in3,$in3,$tmp0 + or $in2,$in2,$tmp3 + srlv $tmp2,$tmp2,$tmp1 + or $in3,$in3,$tmp2 +# else + srlv $in0,$in0,$tmp0 + sllv $tmp3,$in1,$tmp1 + srlv $in1,$in1,$tmp0 + or $in0,$in0,$tmp3 + sllv $tmp3,$in2,$tmp1 + srlv $in2,$in2,$tmp0 + or $in1,$in1,$tmp3 + sllv $tmp3,$in3,$tmp1 + srlv $in3,$in3,$tmp0 + or $in2,$in2,$tmp3 + sllv $tmp2,$tmp2,$tmp1 + or $in3,$in3,$tmp2 +# endif +.Laligned_key: +#else + lwl $in0,0+MSB($inp) + lwl $in1,4+MSB($inp) + lwl $in2,8+MSB($inp) + lwl $in3,12+MSB($inp) + lwr $in0,0+LSB($inp) + lwr $in1,4+LSB($inp) + lwr $in2,8+LSB($inp) + lwr $in3,12+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS32R2) + wsbh $in0,$in0 # byte swap + wsbh $in1,$in1 + wsbh $in2,$in2 + wsbh $in3,$in3 + rotr $in0,$in0,16 + rotr $in1,$in1,16 + rotr $in2,$in2,16 + rotr $in3,$in3,16 +# else + srl $tmp0,$in0,24 # byte swap + srl $tmp1,$in0,8 + andi $tmp2,$in0,0xFF00 + sll $in0,$in0,24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or $in0,$tmp0 + srl $tmp0,$in1,24 + or $tmp1,$tmp2 + srl $tmp2,$in1,8 + or $in0,$tmp1 + andi $tmp1,$in1,0xFF00 + sll $in1,$in1,24 + andi $tmp2,0xFF00 + sll $tmp1,$tmp1,8 + or $in1,$tmp0 + srl $tmp0,$in2,24 + or $tmp2,$tmp1 + srl $tmp1,$in2,8 + or $in1,$tmp2 + andi $tmp2,$in2,0xFF00 + sll $in2,$in2,24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or $in2,$tmp0 + srl $tmp0,$in3,24 + or $tmp1,$tmp2 + srl $tmp2,$in3,8 + or $in2,$tmp1 + andi $tmp1,$in3,0xFF00 + sll $in3,$in3,24 + andi $tmp2,0xFF00 + sll $tmp1,$tmp1,8 + or $in3,$tmp0 + or $tmp2,$tmp1 + or $in3,$tmp2 +# endif +#endif + lui $tmp0,0x0fff + ori $tmp0,0xffff # 0x0fffffff + and $in0,$in0,$tmp0 + subu $tmp0,3 # 0x0ffffffc + and $in1,$in1,$tmp0 + and $in2,$in2,$tmp0 + and $in3,$in3,$tmp0 + + sw $in0,20($ctx) + sw $in1,24($ctx) + sw $in2,28($ctx) + sw $in3,32($ctx) + + srl $tmp1,$in1,2 + srl $tmp2,$in2,2 + srl $tmp3,$in3,2 + addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) + addu $in2,$in2,$tmp2 + addu $in3,$in3,$tmp3 + sw $in1,36($ctx) + sw $in2,40($ctx) + sw 
$in3,44($ctx) +.Lno_key: + li $v0,0 + jr $ra +.end poly1305_init +___ +{ +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000"; + +my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = + ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); +my ($d0,$d1,$d2,$d3) = + ($a4,$a5,$a6,$a7); +my $shr = $t2; # used on R6 +my $one = $t2; # used on R2 + +$code.=<<___; +.globl poly1305_blocks +.align 5 +.ent poly1305_blocks +poly1305_blocks: + .frame $sp,16*4,$ra + .mask $SAVED_REGS_MASK,-4 + .set noreorder + subu $sp, $sp,4*12 + sw $s11,4*11($sp) + sw $s10,4*10($sp) + sw $s9, 4*9($sp) + sw $s8, 4*8($sp) + sw $s7, 4*7($sp) + sw $s6, 4*6($sp) + sw $s5, 4*5($sp) + sw $s4, 4*4($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sw $s3, 4*3($sp) + sw $s2, 4*2($sp) + sw $s1, 4*1($sp) + sw $s0, 4*0($sp) +___ +$code.=<<___; + .set reorder + + srl $len,4 # number of complete blocks + li $one,1 + beqz $len,.Labort + +#if defined(_MIPS_ARCH_MIPS32R6) + andi $shr,$inp,3 + subu $inp,$inp,$shr # align $inp + sll $shr,$shr,3 # byte to bit offset +#endif + + lw $h0,0($ctx) # load hash value + lw $h1,4($ctx) + lw $h2,8($ctx) + lw $h3,12($ctx) + lw $h4,16($ctx) + + lw $r0,20($ctx) # load key + lw $r1,24($ctx) + lw $r2,28($ctx) + lw $r3,32($ctx) + lw $rs1,36($ctx) + lw $rs2,40($ctx) + lw $rs3,44($ctx) + + sll $len,4 + addu $len,$len,$inp # end of buffer + b .Loop + +.align 4 +.Loop: +#if defined(_MIPS_ARCH_MIPS32R6) + lw $d0,0($inp) # load input + lw $d1,4($inp) + lw $d2,8($inp) + lw $d3,12($inp) + beqz $shr,.Laligned_inp + + lw $t0,16($inp) + subu $t1,$zero,$shr +# ifdef MIPSEB + sllv $d0,$d0,$shr + srlv $at,$d1,$t1 + sllv $d1,$d1,$shr + or $d0,$d0,$at + srlv $at,$d2,$t1 + sllv $d2,$d2,$shr + or $d1,$d1,$at + srlv $at,$d3,$t1 + sllv $d3,$d3,$shr + or $d2,$d2,$at + srlv $t0,$t0,$t1 + or $d3,$d3,$t0 +# else + srlv $d0,$d0,$shr + sllv $at,$d1,$t1 + srlv $d1,$d1,$shr + or $d0,$d0,$at + sllv $at,$d2,$t1 + srlv $d2,$d2,$shr + or $d1,$d1,$at + sllv $at,$d3,$t1 + srlv $d3,$d3,$shr + or $d2,$d2,$at + sllv $t0,$t0,$t1 + or $d3,$d3,$t0 +# endif +.Laligned_inp: +#else + lwl $d0,0+MSB($inp) # load input + lwl $d1,4+MSB($inp) + lwl $d2,8+MSB($inp) + lwl $d3,12+MSB($inp) + lwr $d0,0+LSB($inp) + lwr $d1,4+LSB($inp) + lwr $d2,8+LSB($inp) + lwr $d3,12+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS32R2) + wsbh $d0,$d0 # byte swap + wsbh $d1,$d1 + wsbh $d2,$d2 + wsbh $d3,$d3 + rotr $d0,$d0,16 + rotr $d1,$d1,16 + rotr $d2,$d2,16 + rotr $d3,$d3,16 +# else + srl $at,$d0,24 # byte swap + srl $t0,$d0,8 + andi $t1,$d0,0xFF00 + sll $d0,$d0,24 + andi $t0,0xFF00 + sll $t1,$t1,8 + or $d0,$at + srl $at,$d1,24 + or $t0,$t1 + srl $t1,$d1,8 + or $d0,$t0 + andi $t0,$d1,0xFF00 + sll $d1,$d1,24 + andi $t1,0xFF00 + sll $t0,$t0,8 + or $d1,$at + srl $at,$d2,24 + or $t1,$t0 + srl $t0,$d2,8 + or $d1,$t1 + andi $t1,$d2,0xFF00 + sll $d2,$d2,24 + andi $t0,0xFF00 + sll $t1,$t1,8 + or $d2,$at + srl $at,$d3,24 + or $t0,$t1 + srl $t1,$d3,8 + or $d2,$t0 + andi $t0,$d3,0xFF00 + sll $d3,$d3,24 + andi $t1,0xFF00 + sll $t0,$t0,8 + or $d3,$at + or $t1,$t0 + or $d3,$t1 +# endif +#endif + srl $t0,$h4,2 # modulo-scheduled reduction + andi $h4,$h4,3 + sll $at,$t0,2 + + addu $d0,$d0,$h0 # accumulate input + addu $t0,$t0,$at + sltu $h0,$d0,$h0 + addu $d0,$d0,$t0 # ... 
and residue + sltu $at,$d0,$t0 + + addu $d1,$d1,$h1 + addu $h0,$h0,$at # carry + sltu $h1,$d1,$h1 + addu $d1,$d1,$h0 + sltu $h0,$d1,$h0 + + addu $d2,$d2,$h2 + addu $h1,$h1,$h0 # carry + sltu $h2,$d2,$h2 + addu $d2,$d2,$h1 + sltu $h1,$d2,$h1 + + addu $d3,$d3,$h3 + addu $h2,$h2,$h1 # carry + sltu $h3,$d3,$h3 + addu $d3,$d3,$h2 + +#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6) + multu $r0,$d0 # d0*r0 + sltu $h2,$d3,$h2 + maddu $rs3,$d1 # d1*s3 + addu $h3,$h3,$h2 # carry + maddu $rs2,$d2 # d2*s2 + addu $h4,$h4,$padbit + maddu $rs1,$d3 # d3*s1 + addu $h4,$h4,$h3 + mfhi $at + mflo $h0 + + multu $r1,$d0 # d0*r1 + maddu $r0,$d1 # d1*r0 + maddu $rs3,$d2 # d2*s3 + maddu $rs2,$d3 # d3*s2 + maddu $rs1,$h4 # h4*s1 + maddu $at,$one # hi*1 + mfhi $at + mflo $h1 + + multu $r2,$d0 # d0*r2 + maddu $r1,$d1 # d1*r1 + maddu $r0,$d2 # d2*r0 + maddu $rs3,$d3 # d3*s3 + maddu $rs2,$h4 # h4*s2 + maddu $at,$one # hi*1 + mfhi $at + mflo $h2 + + mul $t0,$r0,$h4 # h4*r0 + + multu $r3,$d0 # d0*r3 + maddu $r2,$d1 # d1*r2 + maddu $r1,$d2 # d2*r1 + maddu $r0,$d3 # d3*r0 + maddu $rs3,$h4 # h4*s3 + maddu $at,$one # hi*1 + mfhi $at + mflo $h3 + + addiu $inp,$inp,16 + + addu $h4,$t0,$at +#else + multu ($r0,$d0) # d0*r0 + mflo ($h0,$r0,$d0) + mfhi ($h1,$r0,$d0) + + sltu $h2,$d3,$h2 + addu $h3,$h3,$h2 # carry + + multu ($rs3,$d1) # d1*s3 + mflo ($at,$rs3,$d1) + mfhi ($t0,$rs3,$d1) + + addu $h4,$h4,$padbit + addiu $inp,$inp,16 + addu $h4,$h4,$h3 + + multu ($rs2,$d2) # d2*s2 + mflo ($a3,$rs2,$d2) + mfhi ($t1,$rs2,$d2) + addu $h0,$h0,$at + addu $h1,$h1,$t0 + multu ($rs1,$d3) # d3*s1 + sltu $at,$h0,$at + addu $h1,$h1,$at + + mflo ($at,$rs1,$d3) + mfhi ($t0,$rs1,$d3) + addu $h0,$h0,$a3 + addu $h1,$h1,$t1 + multu ($r1,$d0) # d0*r1 + sltu $a3,$h0,$a3 + addu $h1,$h1,$a3 + + + mflo ($a3,$r1,$d0) + mfhi ($h2,$r1,$d0) + addu $h0,$h0,$at + addu $h1,$h1,$t0 + multu ($r0,$d1) # d1*r0 + sltu $at,$h0,$at + addu $h1,$h1,$at + + mflo ($at,$r0,$d1) + mfhi ($t0,$r0,$d1) + addu $h1,$h1,$a3 + sltu $a3,$h1,$a3 + multu ($rs3,$d2) # d2*s3 + addu $h2,$h2,$a3 + + mflo ($a3,$rs3,$d2) + mfhi ($t1,$rs3,$d2) + addu $h1,$h1,$at + addu $h2,$h2,$t0 + multu ($rs2,$d3) # d3*s2 + sltu $at,$h1,$at + addu $h2,$h2,$at + + mflo ($at,$rs2,$d3) + mfhi ($t0,$rs2,$d3) + addu $h1,$h1,$a3 + addu $h2,$h2,$t1 + multu ($rs1,$h4) # h4*s1 + sltu $a3,$h1,$a3 + addu $h2,$h2,$a3 + + mflo ($a3,$rs1,$h4) + addu $h1,$h1,$at + addu $h2,$h2,$t0 + multu ($r2,$d0) # d0*r2 + sltu $at,$h1,$at + addu $h2,$h2,$at + + + mflo ($at,$r2,$d0) + mfhi ($h3,$r2,$d0) + addu $h1,$h1,$a3 + sltu $a3,$h1,$a3 + multu ($r1,$d1) # d1*r1 + addu $h2,$h2,$a3 + + mflo ($a3,$r1,$d1) + mfhi ($t1,$r1,$d1) + addu $h2,$h2,$at + sltu $at,$h2,$at + multu ($r0,$d2) # d2*r0 + addu $h3,$h3,$at + + mflo ($at,$r0,$d2) + mfhi ($t0,$r0,$d2) + addu $h2,$h2,$a3 + addu $h3,$h3,$t1 + multu ($rs3,$d3) # d3*s3 + sltu $a3,$h2,$a3 + addu $h3,$h3,$a3 + + mflo ($a3,$rs3,$d3) + mfhi ($t1,$rs3,$d3) + addu $h2,$h2,$at + addu $h3,$h3,$t0 + multu ($rs2,$h4) # h4*s2 + sltu $at,$h2,$at + addu $h3,$h3,$at + + mflo ($at,$rs2,$h4) + addu $h2,$h2,$a3 + addu $h3,$h3,$t1 + multu ($r3,$d0) # d0*r3 + sltu $a3,$h2,$a3 + addu $h3,$h3,$a3 + + + mflo ($a3,$r3,$d0) + mfhi ($t1,$r3,$d0) + addu $h2,$h2,$at + sltu $at,$h2,$at + multu ($r2,$d1) # d1*r2 + addu $h3,$h3,$at + + mflo ($at,$r2,$d1) + mfhi ($t0,$r2,$d1) + addu $h3,$h3,$a3 + sltu $a3,$h3,$a3 + multu ($r0,$d3) # d3*r0 + addu $t1,$t1,$a3 + + mflo ($a3,$r0,$d3) + mfhi ($d3,$r0,$d3) + addu $h3,$h3,$at + addu $t1,$t1,$t0 + multu ($r1,$d2) # d2*r1 + sltu $at,$h3,$at + addu $t1,$t1,$at + 
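For readers tracing the multiply-and-reduce chain here: the rs1/rs2/rs3 operands are the values stored earlier as "s1 = r1 + (r1 >> 2)" and friends. A minimal C sketch of why that identity is enough (illustrative only, not part of the patch; the helper name is made up):

	#include <stdint.h>

	/*
	 * With p = 2^130 - 5 and 32-bit limbs, a product such as h1*r3 lands
	 * at weight 2^(32+96) = 2^128 = 2^130/4, so modulo p it contributes
	 * the same as h1 * (5*r3/4).  The key clamp clears the low two bits
	 * of r1..r3, so 5*r/4 is an exact integer:
	 */
	static inline uint32_t poly1305_s(uint32_t r)
	{
		return r + (r >> 2);	/* == 5*r/4 when (r & 3) == 0 */
	}

This is why the d1*s3, h4*s1, h4*s2, ... terms in the surrounding comments need no extra shifting before being accumulated.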
+ mflo ($at,$r1,$d2) + mfhi ($t0,$r1,$d2) + addu $h3,$h3,$a3 + addu $t1,$t1,$d3 + multu ($rs3,$h4) # h4*s3 + sltu $a3,$h3,$a3 + addu $t1,$t1,$a3 + + mflo ($a3,$rs3,$h4) + addu $h3,$h3,$at + addu $t1,$t1,$t0 + multu ($r0,$h4) # h4*r0 + sltu $at,$h3,$at + addu $t1,$t1,$at + + + mflo ($h4,$r0,$h4) + addu $h3,$h3,$a3 + sltu $a3,$h3,$a3 + addu $t1,$t1,$a3 + addu $h4,$h4,$t1 + + li $padbit,1 # if we loop, padbit is 1 +#endif + bne $inp,$len,.Loop + + sw $h0,0($ctx) # store hash value + sw $h1,4($ctx) + sw $h2,8($ctx) + sw $h3,12($ctx) + sw $h4,16($ctx) + + .set noreorder +.Labort: + lw $s11,4*11($sp) + lw $s10,4*10($sp) + lw $s9, 4*9($sp) + lw $s8, 4*8($sp) + lw $s7, 4*7($sp) + lw $s6, 4*6($sp) + lw $s5, 4*5($sp) + lw $s4, 4*4($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + lw $s3, 4*3($sp) + lw $s2, 4*2($sp) + lw $s1, 4*1($sp) + lw $s0, 4*0($sp) +___ +$code.=<<___; + jr $ra + addu $sp,$sp,4*12 +.end poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + lw $tmp4,16($ctx) + lw $tmp0,0($ctx) + lw $tmp1,4($ctx) + lw $tmp2,8($ctx) + lw $tmp3,12($ctx) + + li $in0,-4 # final reduction + srl $ctx,$tmp4,2 + and $in0,$in0,$tmp4 + andi $tmp4,$tmp4,3 + addu $ctx,$ctx,$in0 + + addu $tmp0,$tmp0,$ctx + sltu $ctx,$tmp0,$ctx + addiu $in0,$tmp0,5 # compare to modulus + addu $tmp1,$tmp1,$ctx + sltiu $in1,$in0,5 + sltu $ctx,$tmp1,$ctx + addu $in1,$in1,$tmp1 + addu $tmp2,$tmp2,$ctx + sltu $in2,$in1,$tmp1 + sltu $ctx,$tmp2,$ctx + addu $in2,$in2,$tmp2 + addu $tmp3,$tmp3,$ctx + sltu $in3,$in2,$tmp2 + sltu $ctx,$tmp3,$ctx + addu $in3,$in3,$tmp3 + addu $tmp4,$tmp4,$ctx + sltu $ctx,$in3,$tmp3 + addu $ctx,$tmp4 + + srl $ctx,2 # see if it carried/borrowed + subu $ctx,$zero,$ctx + + xor $in0,$tmp0 + xor $in1,$tmp1 + xor $in2,$tmp2 + xor $in3,$tmp3 + and $in0,$ctx + and $in1,$ctx + and $in2,$ctx + and $in3,$ctx + xor $in0,$tmp0 + xor $in1,$tmp1 + xor $in2,$tmp2 + xor $in3,$tmp3 + + lw $tmp0,0($nonce) # load nonce + lw $tmp1,4($nonce) + lw $tmp2,8($nonce) + lw $tmp3,12($nonce) + + addu $in0,$tmp0 # accumulate nonce + sltu $ctx,$in0,$tmp0 + + addu $in1,$tmp1 + sltu $tmp1,$in1,$tmp1 + addu $in1,$ctx + sltu $ctx,$in1,$ctx + addu $ctx,$tmp1 + + addu $in2,$tmp2 + sltu $tmp2,$in2,$tmp2 + addu $in2,$ctx + sltu $ctx,$in2,$ctx + addu $ctx,$tmp2 + + addu $in3,$tmp3 + addu $in3,$ctx + + srl $tmp0,$in0,8 # write mac value + srl $tmp1,$in0,16 + srl $tmp2,$in0,24 + sb $in0, 0($mac) + sb $tmp0,1($mac) + srl $tmp0,$in1,8 + sb $tmp1,2($mac) + srl $tmp1,$in1,16 + sb $tmp2,3($mac) + srl $tmp2,$in1,24 + sb $in1, 4($mac) + sb $tmp0,5($mac) + srl $tmp0,$in2,8 + sb $tmp1,6($mac) + srl $tmp1,$in2,16 + sb $tmp2,7($mac) + srl $tmp2,$in2,24 + sb $in2, 8($mac) + sb $tmp0,9($mac) + srl $tmp0,$in3,8 + sb $tmp1,10($mac) + srl $tmp1,$in3,16 + sb $tmp2,11($mac) + srl $tmp2,$in3,24 + sb $in3, 12($mac) + sb $tmp0,13($mac) + sb $tmp1,14($mac) + sb $tmp2,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm" +.align 2 +___ +} +}}} + +$output=pop and open STDOUT,">$output"; +print $code; +close STDOUT; diff --git a/lib/crypto/mips/sha1.h b/lib/crypto/mips/sha1.h new file mode 100644 index 000000000000..ba1965002e4a --- /dev/null +++ b/lib/crypto/mips/sha1.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Cryptographic API. + * + * SHA1 Secure Hash Algorithm. 
+ * + * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>. + * + * Based on crypto/sha1_generic.c, which is: + * + * Copyright (c) Alan Smithee. + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> + */ + +#include <asm/octeon/crypto.h> +#include <asm/octeon/octeon.h> + +/* + * We pass everything as 64-bit. OCTEON can handle misaligned data. + */ + +static void octeon_sha1_store_hash(struct sha1_block_state *state) +{ + u64 *hash = (u64 *)&state->h[0]; + union { + u32 word[2]; + u64 dword; + } hash_tail = { { state->h[4], } }; + + write_octeon_64bit_hash_dword(hash[0], 0); + write_octeon_64bit_hash_dword(hash[1], 1); + write_octeon_64bit_hash_dword(hash_tail.dword, 2); + memzero_explicit(&hash_tail.word[0], sizeof(hash_tail.word[0])); +} + +static void octeon_sha1_read_hash(struct sha1_block_state *state) +{ + u64 *hash = (u64 *)&state->h[0]; + union { + u32 word[2]; + u64 dword; + } hash_tail; + + hash[0] = read_octeon_64bit_hash_dword(0); + hash[1] = read_octeon_64bit_hash_dword(1); + hash_tail.dword = read_octeon_64bit_hash_dword(2); + state->h[4] = hash_tail.word[0]; + memzero_explicit(&hash_tail.dword, sizeof(hash_tail.dword)); +} + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + struct octeon_cop2_state cop2_state; + unsigned long flags; + + if (!octeon_has_crypto()) + return sha1_blocks_generic(state, data, nblocks); + + flags = octeon_crypto_enable(&cop2_state); + octeon_sha1_store_hash(state); + + do { + const u64 *block = (const u64 *)data; + + write_octeon_64bit_block_dword(block[0], 0); + write_octeon_64bit_block_dword(block[1], 1); + write_octeon_64bit_block_dword(block[2], 2); + write_octeon_64bit_block_dword(block[3], 3); + write_octeon_64bit_block_dword(block[4], 4); + write_octeon_64bit_block_dword(block[5], 5); + write_octeon_64bit_block_dword(block[6], 6); + octeon_sha1_start(block[7]); + + data += SHA1_BLOCK_SIZE; + } while (--nblocks); + + octeon_sha1_read_hash(state); + octeon_crypto_disable(&cop2_state, flags); +} diff --git a/lib/crypto/mips/sha256.h b/lib/crypto/mips/sha256.h new file mode 100644 index 000000000000..ccccfd131634 --- /dev/null +++ b/lib/crypto/mips/sha256.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 Secure Hash Algorithm. + * + * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>. + * + * Based on crypto/sha256_generic.c, which is: + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> + * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com> + */ + +#include <asm/octeon/crypto.h> +#include <asm/octeon/octeon.h> + +/* + * We pass everything as 64-bit. OCTEON can handle misaligned data. 
+ */ + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + struct octeon_cop2_state cop2_state; + u64 *state64 = (u64 *)state; + unsigned long flags; + + if (!octeon_has_crypto()) + return sha256_blocks_generic(state, data, nblocks); + + flags = octeon_crypto_enable(&cop2_state); + write_octeon_64bit_hash_dword(state64[0], 0); + write_octeon_64bit_hash_dword(state64[1], 1); + write_octeon_64bit_hash_dword(state64[2], 2); + write_octeon_64bit_hash_dword(state64[3], 3); + + do { + const u64 *block = (const u64 *)data; + + write_octeon_64bit_block_dword(block[0], 0); + write_octeon_64bit_block_dword(block[1], 1); + write_octeon_64bit_block_dword(block[2], 2); + write_octeon_64bit_block_dword(block[3], 3); + write_octeon_64bit_block_dword(block[4], 4); + write_octeon_64bit_block_dword(block[5], 5); + write_octeon_64bit_block_dword(block[6], 6); + octeon_sha256_start(block[7]); + + data += SHA256_BLOCK_SIZE; + } while (--nblocks); + + state64[0] = read_octeon_64bit_hash_dword(0); + state64[1] = read_octeon_64bit_hash_dword(1); + state64[2] = read_octeon_64bit_hash_dword(2); + state64[3] = read_octeon_64bit_hash_dword(3); + octeon_crypto_disable(&cop2_state, flags); +} diff --git a/lib/crypto/mips/sha512.h b/lib/crypto/mips/sha512.h new file mode 100644 index 000000000000..b3ffbc1e8ca8 --- /dev/null +++ b/lib/crypto/mips/sha512.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Cryptographic API. + * + * SHA-512 and SHA-384 Secure Hash Algorithm. + * + * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>. + * + * Based on crypto/sha512_generic.c, which is: + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2003 Kyle McMartin <kyle@debian.org> + */ + +#include <asm/octeon/crypto.h> +#include <asm/octeon/octeon.h> + +/* + * We pass everything as 64-bit. OCTEON can handle misaligned data. 
+ */ + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + struct octeon_cop2_state cop2_state; + unsigned long flags; + + if (!octeon_has_crypto()) + return sha512_blocks_generic(state, data, nblocks); + + flags = octeon_crypto_enable(&cop2_state); + write_octeon_64bit_hash_sha512(state->h[0], 0); + write_octeon_64bit_hash_sha512(state->h[1], 1); + write_octeon_64bit_hash_sha512(state->h[2], 2); + write_octeon_64bit_hash_sha512(state->h[3], 3); + write_octeon_64bit_hash_sha512(state->h[4], 4); + write_octeon_64bit_hash_sha512(state->h[5], 5); + write_octeon_64bit_hash_sha512(state->h[6], 6); + write_octeon_64bit_hash_sha512(state->h[7], 7); + + do { + const u64 *block = (const u64 *)data; + + write_octeon_64bit_block_sha512(block[0], 0); + write_octeon_64bit_block_sha512(block[1], 1); + write_octeon_64bit_block_sha512(block[2], 2); + write_octeon_64bit_block_sha512(block[3], 3); + write_octeon_64bit_block_sha512(block[4], 4); + write_octeon_64bit_block_sha512(block[5], 5); + write_octeon_64bit_block_sha512(block[6], 6); + write_octeon_64bit_block_sha512(block[7], 7); + write_octeon_64bit_block_sha512(block[8], 8); + write_octeon_64bit_block_sha512(block[9], 9); + write_octeon_64bit_block_sha512(block[10], 10); + write_octeon_64bit_block_sha512(block[11], 11); + write_octeon_64bit_block_sha512(block[12], 12); + write_octeon_64bit_block_sha512(block[13], 13); + write_octeon_64bit_block_sha512(block[14], 14); + octeon_sha512_start(block[15]); + + data += SHA512_BLOCK_SIZE; + } while (--nblocks); + + state->h[0] = read_octeon_64bit_hash_sha512(0); + state->h[1] = read_octeon_64bit_hash_sha512(1); + state->h[2] = read_octeon_64bit_hash_sha512(2); + state->h[3] = read_octeon_64bit_hash_sha512(3); + state->h[4] = read_octeon_64bit_hash_sha512(4); + state->h[5] = read_octeon_64bit_hash_sha512(5); + state->h[6] = read_octeon_64bit_hash_sha512(6); + state->h[7] = read_octeon_64bit_hash_sha512(7); + octeon_crypto_disable(&cop2_state, flags); +} diff --git a/lib/crypto/mpi/mpi-add.c b/lib/crypto/mpi/mpi-add.c index 3015140d4860..c0375c1672fa 100644 --- a/lib/crypto/mpi/mpi-add.c +++ b/lib/crypto/mpi/mpi-add.c @@ -11,6 +11,8 @@ * to avoid revealing of sensitive data due to paging etc. */ +#include <linux/export.h> + #include "mpi-internal.h" int mpi_add(MPI w, MPI u, MPI v) diff --git a/lib/crypto/mpi/mpi-bit.c b/lib/crypto/mpi/mpi-bit.c index 934d81311360..b3d0e7ddbc03 100644 --- a/lib/crypto/mpi/mpi-bit.c +++ b/lib/crypto/mpi/mpi-bit.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ +#include <linux/export.h> + #include "mpi-internal.h" #include "longlong.h" diff --git a/lib/crypto/mpi/mpi-cmp.c b/lib/crypto/mpi/mpi-cmp.c index ceaebe181cd7..b42929296bce 100644 --- a/lib/crypto/mpi/mpi-cmp.c +++ b/lib/crypto/mpi/mpi-cmp.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ +#include <linux/export.h> + #include "mpi-internal.h" int mpi_cmp_ui(MPI u, unsigned long v) diff --git a/lib/crypto/mpi/mpi-mul.c b/lib/crypto/mpi/mpi-mul.c index 7e6ff1ce3e9b..d79f186ad90b 100644 --- a/lib/crypto/mpi/mpi-mul.c +++ b/lib/crypto/mpi/mpi-mul.c @@ -11,6 +11,8 @@ * to avoid revealing of sensitive data due to paging etc. 
*/ +#include <linux/export.h> + #include "mpi-internal.h" int mpi_mul(MPI w, MPI u, MPI v) diff --git a/lib/crypto/mpi/mpi-pow.c b/lib/crypto/mpi/mpi-pow.c index 67fbd4c2503d..9e695a3bda3a 100644 --- a/lib/crypto/mpi/mpi-pow.c +++ b/lib/crypto/mpi/mpi-pow.c @@ -13,8 +13,10 @@ * however I decided to publish this code under the plain GPL. */ +#include <linux/export.h> #include <linux/sched.h> #include <linux/string.h> + #include "mpi-internal.h" #include "longlong.h" diff --git a/lib/crypto/mpi/mpi-sub-ui.c b/lib/crypto/mpi/mpi-sub-ui.c index b41b082b5f3e..0edcdbd24833 100644 --- a/lib/crypto/mpi/mpi-sub-ui.c +++ b/lib/crypto/mpi/mpi-sub-ui.c @@ -32,6 +32,8 @@ * see https://www.gnu.org/licenses/. */ +#include <linux/export.h> + #include "mpi-internal.h" int mpi_sub_ui(MPI w, MPI u, unsigned long vval) diff --git a/lib/crypto/mpi/mpicoder.c b/lib/crypto/mpi/mpicoder.c index dde01030807d..47f6939599b3 100644 --- a/lib/crypto/mpi/mpicoder.c +++ b/lib/crypto/mpi/mpicoder.c @@ -19,8 +19,9 @@ */ #include <linux/bitops.h> -#include <linux/count_zeros.h> #include <linux/byteorder/generic.h> +#include <linux/count_zeros.h> +#include <linux/export.h> #include <linux/scatterlist.h> #include <linux/string.h> #include "mpi-internal.h" diff --git a/lib/crypto/mpi/mpiutil.c b/lib/crypto/mpi/mpiutil.c index 979ece5a81d2..7f2db830f404 100644 --- a/lib/crypto/mpi/mpiutil.c +++ b/lib/crypto/mpi/mpiutil.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ +#include <linux/export.h> + #include "mpi-internal.h" /**************** diff --git a/lib/crypto/poly1305-donna32.c b/lib/crypto/poly1305-donna32.c index 0a4a2d99e365..b66131b3f6d4 100644 --- a/lib/crypto/poly1305-donna32.c +++ b/lib/crypto/poly1305-donna32.c @@ -6,9 +6,10 @@ * public domain. */ +#include <crypto/internal/poly1305.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/unaligned.h> -#include <crypto/internal/poly1305.h> void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[POLY1305_BLOCK_SIZE]) diff --git a/lib/crypto/poly1305-donna64.c b/lib/crypto/poly1305-donna64.c index 530287531b2e..8a72a5a84944 100644 --- a/lib/crypto/poly1305-donna64.c +++ b/lib/crypto/poly1305-donna64.c @@ -6,9 +6,10 @@ * public domain. */ +#include <crypto/internal/poly1305.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/unaligned.h> -#include <crypto/internal/poly1305.h> void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[POLY1305_BLOCK_SIZE]) diff --git a/lib/crypto/poly1305-generic.c b/lib/crypto/poly1305-generic.c new file mode 100644 index 000000000000..71a16c5c538b --- /dev/null +++ b/lib/crypto/poly1305-generic.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Poly1305 authenticator algorithm, RFC7539 + * + * Copyright (C) 2015 Martin Willi + * + * Based on public domain code by Andrew Moon and Daniel J. Bernstein. 
+ */ + +#include <crypto/internal/poly1305.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/module.h> + +void poly1305_block_init_generic(struct poly1305_block_state *desc, + const u8 raw_key[POLY1305_BLOCK_SIZE]) +{ + poly1305_core_init(&desc->h); + poly1305_core_setkey(&desc->core_r, raw_key); +} +EXPORT_SYMBOL_GPL(poly1305_block_init_generic); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); +MODULE_DESCRIPTION("Poly1305 algorithm (generic implementation)"); diff --git a/lib/crypto/poly1305.c b/lib/crypto/poly1305.c index 6e80214ebad8..a6dc182b6c22 100644 --- a/lib/crypto/poly1305.c +++ b/lib/crypto/poly1305.c @@ -7,72 +7,68 @@ * Based on public domain code by Andrew Moon and Daniel J. Bernstein. */ +#include <crypto/internal/blockhash.h> #include <crypto/internal/poly1305.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/string.h> #include <linux/unaligned.h> -void poly1305_init_generic(struct poly1305_desc_ctx *desc, - const u8 key[POLY1305_KEY_SIZE]) +void poly1305_init(struct poly1305_desc_ctx *desc, + const u8 key[POLY1305_KEY_SIZE]) { - poly1305_core_setkey(&desc->core_r, key); desc->s[0] = get_unaligned_le32(key + 16); desc->s[1] = get_unaligned_le32(key + 20); desc->s[2] = get_unaligned_le32(key + 24); desc->s[3] = get_unaligned_le32(key + 28); - poly1305_core_init(&desc->h); desc->buflen = 0; - desc->sset = true; - desc->rset = 2; + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) + poly1305_block_init_arch(&desc->state, key); + else + poly1305_block_init_generic(&desc->state, key); } -EXPORT_SYMBOL_GPL(poly1305_init_generic); +EXPORT_SYMBOL(poly1305_init); -void poly1305_update_generic(struct poly1305_desc_ctx *desc, const u8 *src, - unsigned int nbytes) +static inline void poly1305_blocks(struct poly1305_block_state *state, + const u8 *src, unsigned int len) { - unsigned int bytes; - - if (unlikely(desc->buflen)) { - bytes = min(nbytes, POLY1305_BLOCK_SIZE - desc->buflen); - memcpy(desc->buf + desc->buflen, src, bytes); - src += bytes; - nbytes -= bytes; - desc->buflen += bytes; - - if (desc->buflen == POLY1305_BLOCK_SIZE) { - poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf, - 1, 1); - desc->buflen = 0; - } - } - - if (likely(nbytes >= POLY1305_BLOCK_SIZE)) { - poly1305_core_blocks(&desc->h, &desc->core_r, src, - nbytes / POLY1305_BLOCK_SIZE, 1); - src += nbytes - (nbytes % POLY1305_BLOCK_SIZE); - nbytes %= POLY1305_BLOCK_SIZE; - } + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) + poly1305_blocks_arch(state, src, len, 1); + else + poly1305_blocks_generic(state, src, len, 1); +} - if (unlikely(nbytes)) { - desc->buflen = nbytes; - memcpy(desc->buf, src, nbytes); - } +void poly1305_update(struct poly1305_desc_ctx *desc, + const u8 *src, unsigned int nbytes) +{ + desc->buflen = BLOCK_HASH_UPDATE(poly1305_blocks, &desc->state, + src, nbytes, POLY1305_BLOCK_SIZE, + desc->buf, desc->buflen); } -EXPORT_SYMBOL_GPL(poly1305_update_generic); +EXPORT_SYMBOL(poly1305_update); -void poly1305_final_generic(struct poly1305_desc_ctx *desc, u8 *dst) +void poly1305_final(struct poly1305_desc_ctx *desc, u8 *dst) { if (unlikely(desc->buflen)) { desc->buf[desc->buflen++] = 1; memset(desc->buf + desc->buflen, 0, POLY1305_BLOCK_SIZE - desc->buflen); - poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf, 1, 0); + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) + poly1305_blocks_arch(&desc->state, desc->buf, + POLY1305_BLOCK_SIZE, 0); + else + 
poly1305_blocks_generic(&desc->state, desc->buf, + POLY1305_BLOCK_SIZE, 0); } - poly1305_core_emit(&desc->h, desc->s, dst); + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) + poly1305_emit_arch(&desc->state.h, dst, desc->s); + else + poly1305_emit_generic(&desc->state.h, dst, desc->s); *desc = (struct poly1305_desc_ctx){}; } -EXPORT_SYMBOL_GPL(poly1305_final_generic); +EXPORT_SYMBOL(poly1305_final); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); diff --git a/lib/crypto/powerpc/Kconfig b/lib/crypto/powerpc/Kconfig new file mode 100644 index 000000000000..2eaeb7665a6a --- /dev/null +++ b/lib/crypto/powerpc/Kconfig @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA20_P10 + tristate + depends on PPC64 && CPU_LITTLE_ENDIAN && VSX + default CRYPTO_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_P10 + tristate + depends on PPC64 && CPU_LITTLE_ENDIAN && VSX + depends on BROKEN # Needs to be fixed to work in softirq context + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 + select CRYPTO_LIB_POLY1305_GENERIC diff --git a/lib/crypto/powerpc/Makefile b/lib/crypto/powerpc/Makefile new file mode 100644 index 000000000000..5709ae14258a --- /dev/null +++ b/lib/crypto/powerpc/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o +chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o + +obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o +poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o diff --git a/lib/crypto/powerpc/chacha-p10-glue.c b/lib/crypto/powerpc/chacha-p10-glue.c new file mode 100644 index 000000000000..fcd23c6f1590 --- /dev/null +++ b/lib/crypto/powerpc/chacha-p10-glue.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ChaCha stream cipher (P10 accelerated) + * + * Copyright 2023- IBM Corp. All rights reserved. 
+ */ + +#include <crypto/chacha.h> +#include <crypto/internal/simd.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/cpufeature.h> +#include <linux/sizes.h> +#include <asm/simd.h> +#include <asm/switch_to.h> + +asmlinkage void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int len, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + unsigned int l = bytes & ~0x0FF; + + if (l > 0) { + chacha_p10le_8x(state, dst, src, l, nrounds); + bytes -= l; + src += l; + dst += l; + state->x[12] += l / CHACHA_BLOCK_SIZE; + } + + if (bytes > 0) + chacha_crypt_generic(state, dst, src, bytes, nrounds); +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + hchacha_block_generic(state, out, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE || + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + vsx_begin(); + chacha_p10_do_8x(state, dst, src, todo, nrounds); + vsx_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return static_key_enabled(&have_p10); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_p10_init(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + static_branch_enable(&have_p10); + return 0; +} +subsys_initcall(chacha_p10_init); + +static void __exit chacha_p10_exit(void) +{ +} +module_exit(chacha_p10_exit); + +MODULE_DESCRIPTION("ChaCha stream cipher (P10 accelerated)"); +MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/powerpc/chacha-p10le-8x.S b/lib/crypto/powerpc/chacha-p10le-8x.S new file mode 100644 index 000000000000..b29562bd5d40 --- /dev/null +++ b/lib/crypto/powerpc/chacha-p10le-8x.S @@ -0,0 +1,840 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# Accelerated chacha20 implementation for ppc64le. +# +# Copyright 2023- IBM Corp. All rights reserved +# +#=================================================================================== +# Written by Danny Tsen <dtsen@us.ibm.com> +# +# do rounds, 8 quarter rounds +# 1. a += b; d ^= a; d <<<= 16; +# 2. c += d; b ^= c; b <<<= 12; +# 3. a += b; d ^= a; d <<<= 8; +# 4. c += d; b ^= c; b <<<= 7 +# +# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16 +# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12 +# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8 +# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7 +# +# 4 blocks (a b c d) +# +# a0 b0 c0 d0 +# a1 b1 c1 d1 +# ... +# a4 b4 c4 d4 +# ... +# a8 b8 c8 d8 +# ... +# a12 b12 c12 d12 +# a13 ... +# a14 ... 
+# a15 b15 c15 d15 +# +# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) +# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) +# + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/asm-compat.h> +#include <linux/linkage.h> + +.machine "any" +.text + +.macro SAVE_GPR GPR OFFSET FRAME + std \GPR,\OFFSET(\FRAME) +.endm + +.macro SAVE_VRS VRS OFFSET FRAME + li 16, \OFFSET + stvx \VRS, 16, \FRAME +.endm + +.macro SAVE_VSX VSX OFFSET FRAME + li 16, \OFFSET + stxvx \VSX, 16, \FRAME +.endm + +.macro RESTORE_GPR GPR OFFSET FRAME + ld \GPR,\OFFSET(\FRAME) +.endm + +.macro RESTORE_VRS VRS OFFSET FRAME + li 16, \OFFSET + lvx \VRS, 16, \FRAME +.endm + +.macro RESTORE_VSX VSX OFFSET FRAME + li 16, \OFFSET + lxvx \VSX, 16, \FRAME +.endm + +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-752(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + addi 9, 1, 256 + SAVE_VRS 20, 0, 9 + SAVE_VRS 21, 16, 9 + SAVE_VRS 22, 32, 9 + SAVE_VRS 23, 48, 9 + SAVE_VRS 24, 64, 9 + SAVE_VRS 25, 80, 9 + SAVE_VRS 26, 96, 9 + SAVE_VRS 27, 112, 9 + SAVE_VRS 28, 128, 9 + SAVE_VRS 29, 144, 9 + SAVE_VRS 30, 160, 9 + SAVE_VRS 31, 176, 9 + + SAVE_VSX 14, 192, 9 + SAVE_VSX 15, 208, 9 + SAVE_VSX 16, 224, 9 + SAVE_VSX 17, 240, 9 + SAVE_VSX 18, 256, 9 + SAVE_VSX 19, 272, 9 + SAVE_VSX 20, 288, 9 + SAVE_VSX 21, 304, 9 + SAVE_VSX 22, 320, 9 + SAVE_VSX 23, 336, 9 + SAVE_VSX 24, 352, 9 + SAVE_VSX 25, 368, 9 + SAVE_VSX 26, 384, 9 + SAVE_VSX 27, 400, 9 + SAVE_VSX 28, 416, 9 + SAVE_VSX 29, 432, 9 + SAVE_VSX 30, 448, 9 + SAVE_VSX 31, 464, 9 +.endm # SAVE_REGS + +.macro RESTORE_REGS + addi 9, 1, 256 + RESTORE_VRS 20, 0, 9 + RESTORE_VRS 21, 16, 9 + RESTORE_VRS 22, 32, 9 + RESTORE_VRS 23, 48, 9 + RESTORE_VRS 24, 64, 9 + RESTORE_VRS 25, 80, 9 + RESTORE_VRS 26, 96, 9 + RESTORE_VRS 27, 112, 9 + RESTORE_VRS 28, 128, 9 + RESTORE_VRS 29, 144, 9 + RESTORE_VRS 30, 160, 9 + RESTORE_VRS 31, 176, 9 + + RESTORE_VSX 14, 192, 9 + RESTORE_VSX 15, 208, 9 + RESTORE_VSX 16, 224, 9 + RESTORE_VSX 17, 240, 9 + RESTORE_VSX 18, 256, 9 + RESTORE_VSX 19, 272, 9 + RESTORE_VSX 20, 288, 9 + RESTORE_VSX 21, 304, 9 + RESTORE_VSX 22, 320, 9 + RESTORE_VSX 23, 336, 9 + RESTORE_VSX 24, 352, 9 + RESTORE_VSX 25, 368, 9 + RESTORE_VSX 26, 384, 9 + RESTORE_VSX 27, 400, 9 + RESTORE_VSX 28, 416, 9 + RESTORE_VSX 29, 432, 9 + RESTORE_VSX 30, 448, 9 + RESTORE_VSX 31, 464, 9 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE_GPR 17, 136, 1 + RESTORE_GPR 18, 144, 1 + RESTORE_GPR 19, 152, 1 + RESTORE_GPR 20, 160, 1 + RESTORE_GPR 21, 168, 1 + RESTORE_GPR 22, 176, 1 + RESTORE_GPR 23, 184, 1 + RESTORE_GPR 24, 192, 1 + RESTORE_GPR 25, 200, 1 + RESTORE_GPR 26, 208, 1 + RESTORE_GPR 27, 216, 1 + RESTORE_GPR 28, 224, 1 + RESTORE_GPR 29, 232, 1 + RESTORE_GPR 30, 240, 1 + RESTORE_GPR 31, 248, 1 + + addi 1, 1, 752 + ld 0, 16(1) + mtlr 0 +.endm # RESTORE_REGS + +.macro QT_loop_8x + # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) + xxlor 0, 32+25, 32+25 + xxlor 32+25, 20, 20 + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vadduwm 16, 16, 20 + vadduwm 17, 17, 21 + vadduwm 18, 18, 22 
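For reference, a plain-C sketch of the quarter round that QT_loop_8x applies to eight blocks in parallel (illustrative only, not part of the patch; the names are made up). In the vector code, the vpermxor steps perform the xor together with the byte-aligned 16- and 8-bit rotates via the permx constants, while vrlw performs the 12- and 7-bit rotates:

	#include <stdint.h>

	#define ROTL32(v, n)	(((v) << (n)) | ((v) >> (32 - (n))))

	/* One ChaCha quarter round on four 32-bit state words. */
	static inline void chacha_quarter_round(uint32_t *a, uint32_t *b,
						uint32_t *c, uint32_t *d)
	{
		*a += *b; *d ^= *a; *d = ROTL32(*d, 16);
		*c += *d; *b ^= *c; *b = ROTL32(*b, 12);
		*a += *b; *d ^= *a; *d = ROTL32(*d, 8);
		*c += *d; *b ^= *c; *b = ROTL32(*b, 7);
	}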
+ vadduwm 19, 19, 23 + + vpermxor 12, 12, 0, 25 + vpermxor 13, 13, 1, 25 + vpermxor 14, 14, 2, 25 + vpermxor 15, 15, 3, 25 + vpermxor 28, 28, 16, 25 + vpermxor 29, 29, 17, 25 + vpermxor 30, 30, 18, 25 + vpermxor 31, 31, 19, 25 + xxlor 32+25, 0, 0 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vadduwm 24, 24, 28 + vadduwm 25, 25, 29 + vadduwm 26, 26, 30 + vadduwm 27, 27, 31 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vxor 20, 20, 24 + vxor 21, 21, 25 + vxor 22, 22, 26 + vxor 23, 23, 27 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 21, 21 + vrlw 4, 4, 25 # + vrlw 5, 5, 25 + vrlw 6, 6, 25 + vrlw 7, 7, 25 + vrlw 20, 20, 25 # + vrlw 21, 21, 25 + vrlw 22, 22, 25 + vrlw 23, 23, 25 + xxlor 32+25, 0, 0 + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vadduwm 16, 16, 20 + vadduwm 17, 17, 21 + vadduwm 18, 18, 22 + vadduwm 19, 19, 23 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 22, 22 + vpermxor 12, 12, 0, 25 + vpermxor 13, 13, 1, 25 + vpermxor 14, 14, 2, 25 + vpermxor 15, 15, 3, 25 + vpermxor 28, 28, 16, 25 + vpermxor 29, 29, 17, 25 + vpermxor 30, 30, 18, 25 + vpermxor 31, 31, 19, 25 + xxlor 32+25, 0, 0 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vadduwm 24, 24, 28 + vadduwm 25, 25, 29 + vadduwm 26, 26, 30 + vadduwm 27, 27, 31 + xxlor 0, 32+28, 32+28 + xxlor 32+28, 23, 23 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vxor 20, 20, 24 + vxor 21, 21, 25 + vxor 22, 22, 26 + vxor 23, 23, 27 + vrlw 4, 4, 28 # + vrlw 5, 5, 28 + vrlw 6, 6, 28 + vrlw 7, 7, 28 + vrlw 20, 20, 28 # + vrlw 21, 21, 28 + vrlw 22, 22, 28 + vrlw 23, 23, 28 + xxlor 32+28, 0, 0 + + # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) + xxlor 0, 32+25, 32+25 + xxlor 32+25, 20, 20 + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vadduwm 16, 16, 21 + vadduwm 17, 17, 22 + vadduwm 18, 18, 23 + vadduwm 19, 19, 20 + + vpermxor 15, 15, 0, 25 + vpermxor 12, 12, 1, 25 + vpermxor 13, 13, 2, 25 + vpermxor 14, 14, 3, 25 + vpermxor 31, 31, 16, 25 + vpermxor 28, 28, 17, 25 + vpermxor 29, 29, 18, 25 + vpermxor 30, 30, 19, 25 + + xxlor 32+25, 0, 0 + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vadduwm 26, 26, 31 + vadduwm 27, 27, 28 + vadduwm 24, 24, 29 + vadduwm 25, 25, 30 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vxor 21, 21, 26 + vxor 22, 22, 27 + vxor 23, 23, 24 + vxor 20, 20, 25 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 21, 21 + vrlw 5, 5, 25 + vrlw 6, 6, 25 + vrlw 7, 7, 25 + vrlw 4, 4, 25 + vrlw 21, 21, 25 + vrlw 22, 22, 25 + vrlw 23, 23, 25 + vrlw 20, 20, 25 + xxlor 32+25, 0, 0 + + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vadduwm 16, 16, 21 + vadduwm 17, 17, 22 + vadduwm 18, 18, 23 + vadduwm 19, 19, 20 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 22, 22 + vpermxor 15, 15, 0, 25 + vpermxor 12, 12, 1, 25 + vpermxor 13, 13, 2, 25 + vpermxor 14, 14, 3, 25 + vpermxor 31, 31, 16, 25 + vpermxor 28, 28, 17, 25 + vpermxor 29, 29, 18, 25 + vpermxor 30, 30, 19, 25 + xxlor 32+25, 0, 0 + + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vadduwm 26, 26, 31 + vadduwm 27, 27, 28 + vadduwm 24, 24, 29 + vadduwm 25, 25, 30 + + xxlor 0, 32+28, 32+28 + xxlor 32+28, 23, 23 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vxor 21, 21, 26 + vxor 22, 22, 27 + vxor 23, 23, 24 + vxor 20, 20, 25 + vrlw 5, 5, 28 + vrlw 6, 6, 28 + vrlw 7, 7, 28 + vrlw 4, 4, 28 + vrlw 21, 21, 
28 + vrlw 22, 22, 28 + vrlw 23, 23, 28 + vrlw 20, 20, 28 + xxlor 32+28, 0, 0 +.endm + +.macro QT_loop_4x + # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vpermxor 12, 12, 0, 20 + vpermxor 13, 13, 1, 20 + vpermxor 14, 14, 2, 20 + vpermxor 15, 15, 3, 20 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vrlw 4, 4, 21 + vrlw 5, 5, 21 + vrlw 6, 6, 21 + vrlw 7, 7, 21 + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vpermxor 12, 12, 0, 22 + vpermxor 13, 13, 1, 22 + vpermxor 14, 14, 2, 22 + vpermxor 15, 15, 3, 22 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vrlw 4, 4, 23 + vrlw 5, 5, 23 + vrlw 6, 6, 23 + vrlw 7, 7, 23 + + # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vpermxor 15, 15, 0, 20 + vpermxor 12, 12, 1, 20 + vpermxor 13, 13, 2, 20 + vpermxor 14, 14, 3, 20 + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vrlw 5, 5, 21 + vrlw 6, 6, 21 + vrlw 7, 7, 21 + vrlw 4, 4, 21 + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vpermxor 15, 15, 0, 22 + vpermxor 12, 12, 1, 22 + vpermxor 13, 13, 2, 22 + vpermxor 14, 14, 3, 22 + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vrlw 5, 5, 23 + vrlw 6, 6, 23 + vrlw 7, 7, 23 + vrlw 4, 4, 23 +.endm + +# Transpose +.macro TP_4x a0 a1 a2 a3 + xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1 + xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3 + xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1 + xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3 + xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3 + xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3 + xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3 + xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3 +.endm + +# key stream = working state + state +.macro Add_state S + vadduwm \S+0, \S+0, 16-\S + vadduwm \S+4, \S+4, 17-\S + vadduwm \S+8, \S+8, 18-\S + vadduwm \S+12, \S+12, 19-\S + + vadduwm \S+1, \S+1, 16-\S + vadduwm \S+5, \S+5, 17-\S + vadduwm \S+9, \S+9, 18-\S + vadduwm \S+13, \S+13, 19-\S + + vadduwm \S+2, \S+2, 16-\S + vadduwm \S+6, \S+6, 17-\S + vadduwm \S+10, \S+10, 18-\S + vadduwm \S+14, \S+14, 19-\S + + vadduwm \S+3, \S+3, 16-\S + vadduwm \S+7, \S+7, 17-\S + vadduwm \S+11, \S+11, 18-\S + vadduwm \S+15, \S+15, 19-\S +.endm + +# +# write 256 bytes +# +.macro Write_256 S + add 9, 14, 5 + add 16, 14, 4 + lxvw4x 0, 0, 9 + lxvw4x 1, 17, 9 + lxvw4x 2, 18, 9 + lxvw4x 3, 19, 9 + lxvw4x 4, 20, 9 + lxvw4x 5, 21, 9 + lxvw4x 6, 22, 9 + lxvw4x 7, 23, 9 + lxvw4x 8, 24, 9 + lxvw4x 9, 25, 9 + lxvw4x 10, 26, 9 + lxvw4x 11, 27, 9 + lxvw4x 12, 28, 9 + lxvw4x 13, 29, 9 + lxvw4x 14, 30, 9 + lxvw4x 15, 31, 9 + + xxlxor \S+32, \S+32, 0 + xxlxor \S+36, \S+36, 1 + xxlxor \S+40, \S+40, 2 + xxlxor \S+44, \S+44, 3 + xxlxor \S+33, \S+33, 4 + xxlxor \S+37, \S+37, 5 + xxlxor \S+41, \S+41, 6 + xxlxor \S+45, \S+45, 7 + xxlxor \S+34, \S+34, 8 + xxlxor \S+38, \S+38, 9 + xxlxor \S+42, \S+42, 10 + xxlxor \S+46, \S+46, 11 + xxlxor \S+35, \S+35, 12 + xxlxor \S+39, \S+39, 13 + xxlxor \S+43, \S+43, 14 + xxlxor \S+47, \S+47, 15 + + stxvw4x \S+32, 0, 16 + stxvw4x \S+36, 17, 
16 + stxvw4x \S+40, 18, 16 + stxvw4x \S+44, 19, 16 + + stxvw4x \S+33, 20, 16 + stxvw4x \S+37, 21, 16 + stxvw4x \S+41, 22, 16 + stxvw4x \S+45, 23, 16 + + stxvw4x \S+34, 24, 16 + stxvw4x \S+38, 25, 16 + stxvw4x \S+42, 26, 16 + stxvw4x \S+46, 27, 16 + + stxvw4x \S+35, 28, 16 + stxvw4x \S+39, 29, 16 + stxvw4x \S+43, 30, 16 + stxvw4x \S+47, 31, 16 + +.endm + +# +# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src, +# unsigned int len, int nrounds); +# +SYM_FUNC_START(chacha_p10le_8x) +.align 5 + cmpdi 6, 0 + ble Out_no_chacha + + SAVE_REGS + + # r17 - r31 mainly for Write_256 macro. + li 17, 16 + li 18, 32 + li 19, 48 + li 20, 64 + li 21, 80 + li 22, 96 + li 23, 112 + li 24, 128 + li 25, 144 + li 26, 160 + li 27, 176 + li 28, 192 + li 29, 208 + li 30, 224 + li 31, 240 + + mr 15, 6 # len + li 14, 0 # offset to inp and outp + + lxvw4x 48, 0, 3 # vr16, constants + lxvw4x 49, 17, 3 # vr17, key 1 + lxvw4x 50, 18, 3 # vr18, key 2 + lxvw4x 51, 19, 3 # vr19, counter, nonce + + # create (0, 1, 2, 3) counters + vspltisw 0, 0 + vspltisw 1, 1 + vspltisw 2, 2 + vspltisw 3, 3 + vmrghw 4, 0, 1 + vmrglw 5, 2, 3 + vsldoi 30, 4, 5, 8 # vr30 counter, 4 (0, 1, 2, 3) + + vspltisw 21, 12 + vspltisw 23, 7 + + addis 11, 2, permx@toc@ha + addi 11, 11, permx@toc@l + lxvw4x 32+20, 0, 11 + lxvw4x 32+22, 17, 11 + + sradi 8, 7, 1 + + mtctr 8 + + # save constants to vsx + xxlor 16, 48, 48 + xxlor 17, 49, 49 + xxlor 18, 50, 50 + xxlor 19, 51, 51 + + vspltisw 25, 4 + vspltisw 26, 8 + + xxlor 25, 32+26, 32+26 + xxlor 24, 32+25, 32+25 + + vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4) + xxlor 30, 32+30, 32+30 + xxlor 31, 32+31, 32+31 + + xxlor 20, 32+20, 32+20 + xxlor 21, 32+21, 32+21 + xxlor 22, 32+22, 32+22 + xxlor 23, 32+23, 32+23 + + cmpdi 6, 512 + blt Loop_last + +Loop_8x: + xxspltw 32+0, 16, 0 + xxspltw 32+1, 16, 1 + xxspltw 32+2, 16, 2 + xxspltw 32+3, 16, 3 + + xxspltw 32+4, 17, 0 + xxspltw 32+5, 17, 1 + xxspltw 32+6, 17, 2 + xxspltw 32+7, 17, 3 + xxspltw 32+8, 18, 0 + xxspltw 32+9, 18, 1 + xxspltw 32+10, 18, 2 + xxspltw 32+11, 18, 3 + xxspltw 32+12, 19, 0 + xxspltw 32+13, 19, 1 + xxspltw 32+14, 19, 2 + xxspltw 32+15, 19, 3 + vadduwm 12, 12, 30 # increase counter + + xxspltw 32+16, 16, 0 + xxspltw 32+17, 16, 1 + xxspltw 32+18, 16, 2 + xxspltw 32+19, 16, 3 + + xxspltw 32+20, 17, 0 + xxspltw 32+21, 17, 1 + xxspltw 32+22, 17, 2 + xxspltw 32+23, 17, 3 + xxspltw 32+24, 18, 0 + xxspltw 32+25, 18, 1 + xxspltw 32+26, 18, 2 + xxspltw 32+27, 18, 3 + xxspltw 32+28, 19, 0 + xxspltw 32+29, 19, 1 + vadduwm 28, 28, 31 # increase counter + xxspltw 32+30, 19, 2 + xxspltw 32+31, 19, 3 + +.align 5 +quarter_loop_8x: + QT_loop_8x + + bdnz quarter_loop_8x + + xxlor 0, 32+30, 32+30 + xxlor 32+30, 30, 30 + vadduwm 12, 12, 30 + xxlor 32+30, 0, 0 + TP_4x 0, 1, 2, 3 + TP_4x 4, 5, 6, 7 + TP_4x 8, 9, 10, 11 + TP_4x 12, 13, 14, 15 + + xxlor 0, 48, 48 + xxlor 1, 49, 49 + xxlor 2, 50, 50 + xxlor 3, 51, 51 + xxlor 48, 16, 16 + xxlor 49, 17, 17 + xxlor 50, 18, 18 + xxlor 51, 19, 19 + Add_state 0 + xxlor 48, 0, 0 + xxlor 49, 1, 1 + xxlor 50, 2, 2 + xxlor 51, 3, 3 + Write_256 0 + addi 14, 14, 256 # offset +=256 + addi 15, 15, -256 # len -=256 + + xxlor 5, 32+31, 32+31 + xxlor 32+31, 31, 31 + vadduwm 28, 28, 31 + xxlor 32+31, 5, 5 + TP_4x 16+0, 16+1, 16+2, 16+3 + TP_4x 16+4, 16+5, 16+6, 16+7 + TP_4x 16+8, 16+9, 16+10, 16+11 + TP_4x 16+12, 16+13, 16+14, 16+15 + + xxlor 32, 16, 16 + xxlor 33, 17, 17 + xxlor 34, 18, 18 + xxlor 35, 19, 19 + Add_state 16 + Write_256 16 + addi 14, 14, 256 # offset +=256 + addi 15, 15, -256 # 
len +=256 + + xxlor 32+24, 24, 24 + xxlor 32+25, 25, 25 + xxlor 32+30, 30, 30 + vadduwm 30, 30, 25 + vadduwm 31, 30, 24 + xxlor 30, 32+30, 32+30 + xxlor 31, 32+31, 32+31 + + cmpdi 15, 0 + beq Out_loop + + cmpdi 15, 512 + blt Loop_last + + mtctr 8 + b Loop_8x + +Loop_last: + lxvw4x 48, 0, 3 # vr16, constants + lxvw4x 49, 17, 3 # vr17, key 1 + lxvw4x 50, 18, 3 # vr18, key 2 + lxvw4x 51, 19, 3 # vr19, counter, nonce + + vspltisw 21, 12 + vspltisw 23, 7 + addis 11, 2, permx@toc@ha + addi 11, 11, permx@toc@l + lxvw4x 32+20, 0, 11 + lxvw4x 32+22, 17, 11 + + sradi 8, 7, 1 + mtctr 8 + +Loop_4x: + vspltw 0, 16, 0 + vspltw 1, 16, 1 + vspltw 2, 16, 2 + vspltw 3, 16, 3 + + vspltw 4, 17, 0 + vspltw 5, 17, 1 + vspltw 6, 17, 2 + vspltw 7, 17, 3 + vspltw 8, 18, 0 + vspltw 9, 18, 1 + vspltw 10, 18, 2 + vspltw 11, 18, 3 + vspltw 12, 19, 0 + vadduwm 12, 12, 30 # increase counter + vspltw 13, 19, 1 + vspltw 14, 19, 2 + vspltw 15, 19, 3 + +.align 5 +quarter_loop: + QT_loop_4x + + bdnz quarter_loop + + vadduwm 12, 12, 30 + TP_4x 0, 1, 2, 3 + TP_4x 4, 5, 6, 7 + TP_4x 8, 9, 10, 11 + TP_4x 12, 13, 14, 15 + + Add_state 0 + Write_256 0 + addi 14, 14, 256 # offset += 256 + addi 15, 15, -256 # len += 256 + + # Update state counter + vspltisw 25, 4 + vadduwm 30, 30, 25 + + cmpdi 15, 0 + beq Out_loop + cmpdi 15, 256 + blt Out_loop + + mtctr 8 + b Loop_4x + +Out_loop: + RESTORE_REGS + blr + +Out_no_chacha: + li 3, 0 + blr +SYM_FUNC_END(chacha_p10le_8x) + +SYM_DATA_START_LOCAL(PERMX) +.align 5 +permx: +.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd +.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc +SYM_DATA_END(PERMX) diff --git a/lib/crypto/powerpc/poly1305-p10-glue.c b/lib/crypto/powerpc/poly1305-p10-glue.c new file mode 100644 index 000000000000..3f1664a724b6 --- /dev/null +++ b/lib/crypto/powerpc/poly1305-p10-glue.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Poly1305 authenticator algorithm, RFC7539. + * + * Copyright 2023- IBM Corp. All rights reserved. 
+ */ +#include <asm/switch_to.h> +#include <crypto/internal/poly1305.h> +#include <linux/cpufeature.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/unaligned.h> + +asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen); +asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit); +asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +void poly1305_block_init_arch(struct poly1305_block_state *dctx, + const u8 raw_key[POLY1305_BLOCK_SIZE]) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_block_init_generic(dctx, raw_key); + + dctx->h = (struct poly1305_state){}; + dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0); + dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8); +} +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_blocks_generic(state, src, len, padbit); + vsx_begin(); + if (len >= POLY1305_BLOCK_SIZE * 4) { + poly1305_p10le_4blocks(state, src, len); + src += len - (len % (POLY1305_BLOCK_SIZE * 4)); + len %= POLY1305_BLOCK_SIZE * 4; + } + while (len >= POLY1305_BLOCK_SIZE) { + poly1305_64s(state, src, POLY1305_BLOCK_SIZE, padbit); + len -= POLY1305_BLOCK_SIZE; + src += POLY1305_BLOCK_SIZE; + } + vsx_end(); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_emit_generic(state, digest, nonce); + poly1305_emit_64(state, nonce, digest); +} +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +bool poly1305_is_arch_optimized(void) +{ + return static_key_enabled(&have_p10); +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init poly1305_p10_init(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + static_branch_enable(&have_p10); + return 0; +} +subsys_initcall(poly1305_p10_init); + +static void __exit poly1305_p10_exit(void) +{ +} +module_exit(poly1305_p10_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>"); +MODULE_DESCRIPTION("Optimized Poly1305 for P10"); diff --git a/lib/crypto/powerpc/poly1305-p10le_64.S b/lib/crypto/powerpc/poly1305-p10le_64.S new file mode 100644 index 000000000000..a3c1987f1ecd --- /dev/null +++ b/lib/crypto/powerpc/poly1305-p10le_64.S @@ -0,0 +1,1075 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# Accelerated poly1305 implementation for ppc64le. +# +# Copyright 2023- IBM Corp. All rights reserved +# +#=================================================================================== +# Written by Danny Tsen <dtsen@us.ibm.com> +# +# Poly1305 - this version mainly using vector/VSX/Scalar +# - 26 bits limbs +# - Handle multiple 64 byte blcok. 
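As an aside (not part of the file): the recurrence sketched in the comment lines here is, for each 16-byte block m with the 2^128 pad bit appended,

	a = ((a + m) * r) mod (2^130 - 5)

with the final tag computed as (a + s) mod 2^128. Unrolling four steps of the recurrence gives

	a' = (a + m1)*r^4 + m2*r^3 + m3*r^2 + m4*r   (mod 2^130 - 5)

which is the sum of products that the r^4, r^3, r^2, r and s vectors described below are set up to evaluate.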
+# +# Block size 16 bytes +# key = (r, s) +# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF +# p = 2^130 - 5 +# a += m +# a = (r + a) % p +# a += s +# +# Improve performance by breaking down polynominal to the sum of products with +# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r +# +# 07/22/21 - this revison based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0 +# to 9 vectors for multiplications. +# +# setup r^4, r^3, r^2, r vectors +# vs [r^1, r^3, r^2, r^4] +# vs0 = [r0,.....] +# vs1 = [r1,.....] +# vs2 = [r2,.....] +# vs3 = [r3,.....] +# vs4 = [r4,.....] +# vs5 = [r1*5,...] +# vs6 = [r2*5,...] +# vs7 = [r2*5,...] +# vs8 = [r4*5,...] +# +# Each word in a vector consists a member of a "r/s" in [a * r/s]. +# +# r0, r4*5, r3*5, r2*5, r1*5; +# r1, r0, r4*5, r3*5, r2*5; +# r2, r1, r0, r4*5, r3*5; +# r3, r2, r1, r0, r4*5; +# r4, r3, r2, r1, r0 ; +# +# +# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) +# k = 32 bytes key +# r3 = k (r, s) +# r4 = mlen +# r5 = m +# +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/asm-compat.h> +#include <linux/linkage.h> + +.machine "any" + +.text + +.macro SAVE_GPR GPR OFFSET FRAME + std \GPR,\OFFSET(\FRAME) +.endm + +.macro SAVE_VRS VRS OFFSET FRAME + li 16, \OFFSET + stvx \VRS, 16, \FRAME +.endm + +.macro SAVE_VSX VSX OFFSET FRAME + li 16, \OFFSET + stxvx \VSX, 16, \FRAME +.endm + +.macro RESTORE_GPR GPR OFFSET FRAME + ld \GPR,\OFFSET(\FRAME) +.endm + +.macro RESTORE_VRS VRS OFFSET FRAME + li 16, \OFFSET + lvx \VRS, 16, \FRAME +.endm + +.macro RESTORE_VSX VSX OFFSET FRAME + li 16, \OFFSET + lxvx \VSX, 16, \FRAME +.endm + +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-752(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + addi 9, 1, 256 + SAVE_VRS 20, 0, 9 + SAVE_VRS 21, 16, 9 + SAVE_VRS 22, 32, 9 + SAVE_VRS 23, 48, 9 + SAVE_VRS 24, 64, 9 + SAVE_VRS 25, 80, 9 + SAVE_VRS 26, 96, 9 + SAVE_VRS 27, 112, 9 + SAVE_VRS 28, 128, 9 + SAVE_VRS 29, 144, 9 + SAVE_VRS 30, 160, 9 + SAVE_VRS 31, 176, 9 + + SAVE_VSX 14, 192, 9 + SAVE_VSX 15, 208, 9 + SAVE_VSX 16, 224, 9 + SAVE_VSX 17, 240, 9 + SAVE_VSX 18, 256, 9 + SAVE_VSX 19, 272, 9 + SAVE_VSX 20, 288, 9 + SAVE_VSX 21, 304, 9 + SAVE_VSX 22, 320, 9 + SAVE_VSX 23, 336, 9 + SAVE_VSX 24, 352, 9 + SAVE_VSX 25, 368, 9 + SAVE_VSX 26, 384, 9 + SAVE_VSX 27, 400, 9 + SAVE_VSX 28, 416, 9 + SAVE_VSX 29, 432, 9 + SAVE_VSX 30, 448, 9 + SAVE_VSX 31, 464, 9 +.endm # SAVE_REGS + +.macro RESTORE_REGS + addi 9, 1, 256 + RESTORE_VRS 20, 0, 9 + RESTORE_VRS 21, 16, 9 + RESTORE_VRS 22, 32, 9 + RESTORE_VRS 23, 48, 9 + RESTORE_VRS 24, 64, 9 + RESTORE_VRS 25, 80, 9 + RESTORE_VRS 26, 96, 9 + RESTORE_VRS 27, 112, 9 + RESTORE_VRS 28, 128, 9 + RESTORE_VRS 29, 144, 9 + RESTORE_VRS 30, 160, 9 + RESTORE_VRS 31, 176, 9 + + RESTORE_VSX 14, 192, 9 + RESTORE_VSX 15, 208, 9 + RESTORE_VSX 16, 224, 9 + RESTORE_VSX 17, 240, 9 + RESTORE_VSX 18, 256, 9 + RESTORE_VSX 19, 272, 9 + RESTORE_VSX 20, 288, 9 + RESTORE_VSX 21, 304, 9 + RESTORE_VSX 22, 320, 9 + RESTORE_VSX 23, 336, 9 + RESTORE_VSX 24, 352, 9 + RESTORE_VSX 25, 368, 9 + RESTORE_VSX 26, 384, 9 + RESTORE_VSX 27, 400, 9 + RESTORE_VSX 28, 416, 9 + RESTORE_VSX 29, 432, 9 + RESTORE_VSX 30, 448, 9 + 
RESTORE_VSX 31, 464, 9 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE_GPR 17, 136, 1 + RESTORE_GPR 18, 144, 1 + RESTORE_GPR 19, 152, 1 + RESTORE_GPR 20, 160, 1 + RESTORE_GPR 21, 168, 1 + RESTORE_GPR 22, 176, 1 + RESTORE_GPR 23, 184, 1 + RESTORE_GPR 24, 192, 1 + RESTORE_GPR 25, 200, 1 + RESTORE_GPR 26, 208, 1 + RESTORE_GPR 27, 216, 1 + RESTORE_GPR 28, 224, 1 + RESTORE_GPR 29, 232, 1 + RESTORE_GPR 30, 240, 1 + RESTORE_GPR 31, 248, 1 + + addi 1, 1, 752 + ld 0, 16(1) + mtlr 0 +.endm # RESTORE_REGS + +# +# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5; +# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5; +# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5; +# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5; +# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ; +# +# [r^2, r^3, r^1, r^4] +# [m3, m2, m4, m1] +# +# multiply odd and even words +.macro mul_odd + vmulouw 14, 4, 26 + vmulouw 10, 5, 3 + vmulouw 11, 6, 2 + vmulouw 12, 7, 1 + vmulouw 13, 8, 0 + vmulouw 15, 4, 27 + vaddudm 14, 14, 10 + vaddudm 14, 14, 11 + vmulouw 10, 5, 26 + vmulouw 11, 6, 3 + vaddudm 14, 14, 12 + vaddudm 14, 14, 13 # x0 + vaddudm 15, 15, 10 + vaddudm 15, 15, 11 + vmulouw 12, 7, 2 + vmulouw 13, 8, 1 + vaddudm 15, 15, 12 + vaddudm 15, 15, 13 # x1 + vmulouw 16, 4, 28 + vmulouw 10, 5, 27 + vmulouw 11, 6, 26 + vaddudm 16, 16, 10 + vaddudm 16, 16, 11 + vmulouw 12, 7, 3 + vmulouw 13, 8, 2 + vaddudm 16, 16, 12 + vaddudm 16, 16, 13 # x2 + vmulouw 17, 4, 29 + vmulouw 10, 5, 28 + vmulouw 11, 6, 27 + vaddudm 17, 17, 10 + vaddudm 17, 17, 11 + vmulouw 12, 7, 26 + vmulouw 13, 8, 3 + vaddudm 17, 17, 12 + vaddudm 17, 17, 13 # x3 + vmulouw 18, 4, 30 + vmulouw 10, 5, 29 + vmulouw 11, 6, 28 + vaddudm 18, 18, 10 + vaddudm 18, 18, 11 + vmulouw 12, 7, 27 + vmulouw 13, 8, 26 + vaddudm 18, 18, 12 + vaddudm 18, 18, 13 # x4 +.endm + +.macro mul_even + vmuleuw 9, 4, 26 + vmuleuw 10, 5, 3 + vmuleuw 11, 6, 2 + vmuleuw 12, 7, 1 + vmuleuw 13, 8, 0 + vaddudm 14, 14, 9 + vaddudm 14, 14, 10 + vaddudm 14, 14, 11 + vaddudm 14, 14, 12 + vaddudm 14, 14, 13 # x0 + + vmuleuw 9, 4, 27 + vmuleuw 10, 5, 26 + vmuleuw 11, 6, 3 + vmuleuw 12, 7, 2 + vmuleuw 13, 8, 1 + vaddudm 15, 15, 9 + vaddudm 15, 15, 10 + vaddudm 15, 15, 11 + vaddudm 15, 15, 12 + vaddudm 15, 15, 13 # x1 + + vmuleuw 9, 4, 28 + vmuleuw 10, 5, 27 + vmuleuw 11, 6, 26 + vmuleuw 12, 7, 3 + vmuleuw 13, 8, 2 + vaddudm 16, 16, 9 + vaddudm 16, 16, 10 + vaddudm 16, 16, 11 + vaddudm 16, 16, 12 + vaddudm 16, 16, 13 # x2 + + vmuleuw 9, 4, 29 + vmuleuw 10, 5, 28 + vmuleuw 11, 6, 27 + vmuleuw 12, 7, 26 + vmuleuw 13, 8, 3 + vaddudm 17, 17, 9 + vaddudm 17, 17, 10 + vaddudm 17, 17, 11 + vaddudm 17, 17, 12 + vaddudm 17, 17, 13 # x3 + + vmuleuw 9, 4, 30 + vmuleuw 10, 5, 29 + vmuleuw 11, 6, 28 + vmuleuw 12, 7, 27 + vmuleuw 13, 8, 26 + vaddudm 18, 18, 9 + vaddudm 18, 18, 10 + vaddudm 18, 18, 11 + vaddudm 18, 18, 12 + vaddudm 18, 18, 13 # x4 +.endm + +# +# poly1305_setup_r +# +# setup r^4, r^3, r^2, r vectors +# [r, r^3, r^2, r^4] +# vs0 = [r0,...] +# vs1 = [r1,...] +# vs2 = [r2,...] +# vs3 = [r3,...] +# vs4 = [r4,...] +# vs5 = [r4*5,...] +# vs6 = [r3*5,...] +# vs7 = [r2*5,...] +# vs8 = [r1*5,...] 
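
The p[0]..p[4] product sums listed above, combined with the carry pass in do_mul below, form the standard radix-2^26 "multiply then partially reduce" step. A scalar sketch of one such step follows (illustrative names, not code from the patch; s[i] stands for r[i+1]*5, the same multiples held in vs5-vs8, and the vector code evaluates several of these in parallel per iteration):

#include <linux/types.h>

static void poly1305_mul26(u32 h[5], const u32 r[5], const u32 s[4])
{
	u64 d[5], c;
	int i;

	d[0] = (u64)h[0] * r[0] + (u64)h[1] * s[3] + (u64)h[2] * s[2] +
	       (u64)h[3] * s[1] + (u64)h[4] * s[0];
	d[1] = (u64)h[0] * r[1] + (u64)h[1] * r[0] + (u64)h[2] * s[3] +
	       (u64)h[3] * s[2] + (u64)h[4] * s[1];
	d[2] = (u64)h[0] * r[2] + (u64)h[1] * r[1] + (u64)h[2] * r[0] +
	       (u64)h[3] * s[3] + (u64)h[4] * s[2];
	d[3] = (u64)h[0] * r[3] + (u64)h[1] * r[2] + (u64)h[2] * r[1] +
	       (u64)h[3] * r[0] + (u64)h[4] * s[3];
	d[4] = (u64)h[0] * r[4] + (u64)h[1] * r[3] + (u64)h[2] * r[2] +
	       (u64)h[3] * r[1] + (u64)h[4] * r[0];

	/* carry reduction: fold the carry out of limb 4 back in, times 5 */
	for (i = 0; i < 4; i++) {
		d[i + 1] += d[i] >> 26;
		h[i] = d[i] & 0x3ffffff;
	}
	c = d[4] >> 26;
	h[4] = d[4] & 0x3ffffff;
	h[0] += c * 5;		/* fits in 32 bits: d[4] is built from r, not r*5 */
	h[1] += h[0] >> 26;
	h[0] &= 0x3ffffff;
}
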
+# +# r0, r4*5, r3*5, r2*5, r1*5; +# r1, r0, r4*5, r3*5, r2*5; +# r2, r1, r0, r4*5, r3*5; +# r3, r2, r1, r0, r4*5; +# r4, r3, r2, r1, r0 ; +# +.macro poly1305_setup_r + + # save r + xxlor 26, 58, 58 + xxlor 27, 59, 59 + xxlor 28, 60, 60 + xxlor 29, 61, 61 + xxlor 30, 62, 62 + + xxlxor 31, 31, 31 + +# [r, r^3, r^2, r^4] + # compute r^2 + vmr 4, 26 + vmr 5, 27 + vmr 6, 28 + vmr 7, 29 + vmr 8, 30 + bl do_mul # r^2 r^1 + xxpermdi 58, 58, 36, 0x3 # r0 + xxpermdi 59, 59, 37, 0x3 # r1 + xxpermdi 60, 60, 38, 0x3 # r2 + xxpermdi 61, 61, 39, 0x3 # r3 + xxpermdi 62, 62, 40, 0x3 # r4 + xxpermdi 36, 36, 36, 0x3 + xxpermdi 37, 37, 37, 0x3 + xxpermdi 38, 38, 38, 0x3 + xxpermdi 39, 39, 39, 0x3 + xxpermdi 40, 40, 40, 0x3 + vspltisb 13, 2 + vsld 9, 27, 13 + vsld 10, 28, 13 + vsld 11, 29, 13 + vsld 12, 30, 13 + vaddudm 0, 9, 27 + vaddudm 1, 10, 28 + vaddudm 2, 11, 29 + vaddudm 3, 12, 30 + + bl do_mul # r^4 r^3 + vmrgow 26, 26, 4 + vmrgow 27, 27, 5 + vmrgow 28, 28, 6 + vmrgow 29, 29, 7 + vmrgow 30, 30, 8 + vspltisb 13, 2 + vsld 9, 27, 13 + vsld 10, 28, 13 + vsld 11, 29, 13 + vsld 12, 30, 13 + vaddudm 0, 9, 27 + vaddudm 1, 10, 28 + vaddudm 2, 11, 29 + vaddudm 3, 12, 30 + + # r^2 r^4 + xxlor 0, 58, 58 + xxlor 1, 59, 59 + xxlor 2, 60, 60 + xxlor 3, 61, 61 + xxlor 4, 62, 62 + xxlor 5, 32, 32 + xxlor 6, 33, 33 + xxlor 7, 34, 34 + xxlor 8, 35, 35 + + vspltw 9, 26, 3 + vspltw 10, 26, 2 + vmrgow 26, 10, 9 + vspltw 9, 27, 3 + vspltw 10, 27, 2 + vmrgow 27, 10, 9 + vspltw 9, 28, 3 + vspltw 10, 28, 2 + vmrgow 28, 10, 9 + vspltw 9, 29, 3 + vspltw 10, 29, 2 + vmrgow 29, 10, 9 + vspltw 9, 30, 3 + vspltw 10, 30, 2 + vmrgow 30, 10, 9 + + vsld 9, 27, 13 + vsld 10, 28, 13 + vsld 11, 29, 13 + vsld 12, 30, 13 + vaddudm 0, 9, 27 + vaddudm 1, 10, 28 + vaddudm 2, 11, 29 + vaddudm 3, 12, 30 +.endm + +SYM_FUNC_START_LOCAL(do_mul) + mul_odd + + # do reduction ( h %= p ) + # carry reduction + vspltisb 9, 2 + vsrd 10, 14, 31 + vsrd 11, 17, 31 + vand 7, 17, 25 + vand 4, 14, 25 + vaddudm 18, 18, 11 + vsrd 12, 18, 31 + vaddudm 15, 15, 10 + + vsrd 11, 15, 31 + vand 8, 18, 25 + vand 5, 15, 25 + vaddudm 4, 4, 12 + vsld 10, 12, 9 + vaddudm 6, 16, 11 + + vsrd 13, 6, 31 + vand 6, 6, 25 + vaddudm 4, 4, 10 + vsrd 10, 4, 31 + vaddudm 7, 7, 13 + + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 5, 5, 10 + vaddudm 8, 8, 11 + blr +SYM_FUNC_END(do_mul) + +# +# init key +# +.macro do_poly1305_init + addis 10, 2, rmask@toc@ha + addi 10, 10, rmask@toc@l + + ld 11, 0(10) + ld 12, 8(10) + + li 14, 16 + li 15, 32 + addis 10, 2, cnum@toc@ha + addi 10, 10, cnum@toc@l + lvx 25, 0, 10 # v25 - mask + lvx 31, 14, 10 # v31 = 1a + lvx 19, 15, 10 # v19 = 1 << 24 + lxv 24, 48(10) # vs24 + lxv 25, 64(10) # vs25 + + # initialize + # load key from r3 to vectors + ld 9, 24(3) + ld 10, 32(3) + and. 9, 9, 11 + and. 
10, 10, 12 + + # break 26 bits + extrdi 14, 9, 26, 38 + extrdi 15, 9, 26, 12 + extrdi 16, 9, 12, 0 + mtvsrdd 58, 0, 14 + insrdi 16, 10, 14, 38 + mtvsrdd 59, 0, 15 + extrdi 17, 10, 26, 24 + mtvsrdd 60, 0, 16 + extrdi 18, 10, 24, 0 + mtvsrdd 61, 0, 17 + mtvsrdd 62, 0, 18 + + # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5 + li 9, 5 + mtvsrdd 36, 0, 9 + vmulouw 0, 27, 4 # v0 = rr0 + vmulouw 1, 28, 4 # v1 = rr1 + vmulouw 2, 29, 4 # v2 = rr2 + vmulouw 3, 30, 4 # v3 = rr3 +.endm + +# +# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) +# k = 32 bytes key +# r3 = k (r, s) +# r4 = mlen +# r5 = m +# +SYM_FUNC_START(poly1305_p10le_4blocks) +.align 5 + cmpdi 5, 64 + blt Out_no_poly1305 + + SAVE_REGS + + do_poly1305_init + + li 21, 0 # counter to message + + poly1305_setup_r + + # load previous H state + # break/convert r6 to 26 bits + ld 9, 0(3) + ld 10, 8(3) + ld 19, 16(3) + sldi 19, 19, 24 + mtvsrdd 41, 0, 19 + extrdi 14, 9, 26, 38 + extrdi 15, 9, 26, 12 + extrdi 16, 9, 12, 0 + mtvsrdd 36, 0, 14 + insrdi 16, 10, 14, 38 + mtvsrdd 37, 0, 15 + extrdi 17, 10, 26, 24 + mtvsrdd 38, 0, 16 + extrdi 18, 10, 24, 0 + mtvsrdd 39, 0, 17 + mtvsrdd 40, 0, 18 + vor 8, 8, 9 + + # input m1 m2 + add 20, 4, 21 + xxlor 49, 24, 24 + xxlor 50, 25, 25 + lxvw4x 43, 0, 20 + addi 17, 20, 16 + lxvw4x 44, 0, 17 + vperm 14, 11, 12, 17 + vperm 15, 11, 12, 18 + vand 9, 14, 25 # a0 + vsrd 10, 14, 31 # >> 26 + vsrd 11, 10, 31 # 12 bits left + vand 10, 10, 25 # a1 + vspltisb 13, 12 + vand 16, 15, 25 + vsld 12, 16, 13 + vor 11, 11, 12 + vand 11, 11, 25 # a2 + vspltisb 13, 14 + vsrd 12, 15, 13 # >> 14 + vsrd 13, 12, 31 # >> 26, a4 + vand 12, 12, 25 # a3 + + vaddudm 20, 4, 9 + vaddudm 21, 5, 10 + vaddudm 22, 6, 11 + vaddudm 23, 7, 12 + vaddudm 24, 8, 13 + + # m3 m4 + addi 17, 17, 16 + lxvw4x 43, 0, 17 + addi 17, 17, 16 + lxvw4x 44, 0, 17 + vperm 14, 11, 12, 17 + vperm 15, 11, 12, 18 + vand 9, 14, 25 # a0 + vsrd 10, 14, 31 # >> 26 + vsrd 11, 10, 31 # 12 bits left + vand 10, 10, 25 # a1 + vspltisb 13, 12 + vand 16, 15, 25 + vsld 12, 16, 13 + vspltisb 13, 14 + vor 11, 11, 12 + vand 11, 11, 25 # a2 + vsrd 12, 15, 13 # >> 14 + vsrd 13, 12, 31 # >> 26, a4 + vand 12, 12, 25 # a3 + + # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] + vmrgow 4, 9, 20 + vmrgow 5, 10, 21 + vmrgow 6, 11, 22 + vmrgow 7, 12, 23 + vmrgow 8, 13, 24 + vaddudm 8, 8, 19 + + addi 5, 5, -64 # len -= 64 + addi 21, 21, 64 # offset += 64 + + li 9, 64 + divdu 31, 5, 9 + + cmpdi 31, 0 + ble Skip_block_loop + + mtctr 31 + +# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r +# Rewrite the polynominal sum of product as follows, +# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2 +# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r*4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h0 + m4) r^2 +# .... 
Repeat +# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 --> +# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r +# +loop_4blocks: + + # Multiply odd words and even words + mul_odd + mul_even + # carry reduction + vspltisb 9, 2 + vsrd 10, 14, 31 + vsrd 11, 17, 31 + vand 7, 17, 25 + vand 4, 14, 25 + vaddudm 18, 18, 11 + vsrd 12, 18, 31 + vaddudm 15, 15, 10 + + vsrd 11, 15, 31 + vand 8, 18, 25 + vand 5, 15, 25 + vaddudm 4, 4, 12 + vsld 10, 12, 9 + vaddudm 6, 16, 11 + + vsrd 13, 6, 31 + vand 6, 6, 25 + vaddudm 4, 4, 10 + vsrd 10, 4, 31 + vaddudm 7, 7, 13 + + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 5, 5, 10 + vaddudm 8, 8, 11 + + # input m1 m2 m3 m4 + add 20, 4, 21 + xxlor 49, 24, 24 + xxlor 50, 25, 25 + lxvw4x 43, 0, 20 + addi 17, 20, 16 + lxvw4x 44, 0, 17 + vperm 14, 11, 12, 17 + vperm 15, 11, 12, 18 + addi 17, 17, 16 + lxvw4x 43, 0, 17 + addi 17, 17, 16 + lxvw4x 44, 0, 17 + vperm 17, 11, 12, 17 + vperm 18, 11, 12, 18 + + vand 20, 14, 25 # a0 + vand 9, 17, 25 # a0 + vsrd 21, 14, 31 # >> 26 + vsrd 22, 21, 31 # 12 bits left + vsrd 10, 17, 31 # >> 26 + vsrd 11, 10, 31 # 12 bits left + + vand 21, 21, 25 # a1 + vand 10, 10, 25 # a1 + + vspltisb 13, 12 + vand 16, 15, 25 + vsld 23, 16, 13 + vor 22, 22, 23 + vand 22, 22, 25 # a2 + vand 16, 18, 25 + vsld 12, 16, 13 + vor 11, 11, 12 + vand 11, 11, 25 # a2 + vspltisb 13, 14 + vsrd 23, 15, 13 # >> 14 + vsrd 24, 23, 31 # >> 26, a4 + vand 23, 23, 25 # a3 + vsrd 12, 18, 13 # >> 14 + vsrd 13, 12, 31 # >> 26, a4 + vand 12, 12, 25 # a3 + + vaddudm 4, 4, 20 + vaddudm 5, 5, 21 + vaddudm 6, 6, 22 + vaddudm 7, 7, 23 + vaddudm 8, 8, 24 + + # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] + vmrgow 4, 9, 4 + vmrgow 5, 10, 5 + vmrgow 6, 11, 6 + vmrgow 7, 12, 7 + vmrgow 8, 13, 8 + vaddudm 8, 8, 19 + + addi 5, 5, -64 # len -= 64 + addi 21, 21, 64 # offset += 64 + + bdnz loop_4blocks + +Skip_block_loop: + xxlor 58, 0, 0 + xxlor 59, 1, 1 + xxlor 60, 2, 2 + xxlor 61, 3, 3 + xxlor 62, 4, 4 + xxlor 32, 5, 5 + xxlor 33, 6, 6 + xxlor 34, 7, 7 + xxlor 35, 8, 8 + + # Multiply odd words and even words + mul_odd + mul_even + + # Sum the products. 
+ xxpermdi 41, 31, 46, 0 + xxpermdi 42, 31, 47, 0 + vaddudm 4, 14, 9 + xxpermdi 36, 31, 36, 3 + vaddudm 5, 15, 10 + xxpermdi 37, 31, 37, 3 + xxpermdi 43, 31, 48, 0 + vaddudm 6, 16, 11 + xxpermdi 38, 31, 38, 3 + xxpermdi 44, 31, 49, 0 + vaddudm 7, 17, 12 + xxpermdi 39, 31, 39, 3 + xxpermdi 45, 31, 50, 0 + vaddudm 8, 18, 13 + xxpermdi 40, 31, 40, 3 + + # carry reduction + vspltisb 9, 2 + vsrd 10, 4, 31 + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 8, 8, 11 + vsrd 12, 8, 31 + vaddudm 5, 5, 10 + + vsrd 11, 5, 31 + vand 8, 8, 25 + vand 5, 5, 25 + vaddudm 4, 4, 12 + vsld 10, 12, 9 + vaddudm 6, 6, 11 + + vsrd 13, 6, 31 + vand 6, 6, 25 + vaddudm 4, 4, 10 + vsrd 10, 4, 31 + vaddudm 7, 7, 13 + + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 5, 5, 10 + vsrd 10, 5, 31 + vand 5, 5, 25 + vaddudm 6, 6, 10 + vaddudm 8, 8, 11 + + b do_final_update + +do_final_update: + # combine 26 bit limbs + # v4, v5, v6, v7 and v8 are 26 bit vectors + vsld 5, 5, 31 + vor 20, 4, 5 + vspltisb 11, 12 + vsrd 12, 6, 11 + vsld 6, 6, 31 + vsld 6, 6, 31 + vor 20, 20, 6 + vspltisb 11, 14 + vsld 7, 7, 11 + vor 21, 7, 12 + mfvsrld 16, 40 # save last 2 bytes + vsld 8, 8, 11 + vsld 8, 8, 31 + vor 21, 21, 8 + mfvsrld 17, 52 + mfvsrld 19, 53 + srdi 16, 16, 24 + + std 17, 0(3) + std 19, 8(3) + stw 16, 16(3) + +Out_loop: + li 3, 0 + + RESTORE_REGS + + blr + +Out_no_poly1305: + li 3, 0 + blr +SYM_FUNC_END(poly1305_p10le_4blocks) + +# +# ======================================================================= +# The following functions implement 64 x 64 bits multiplication poly1305. +# +SYM_FUNC_START_LOCAL(Poly1305_init_64) + # mask 0x0FFFFFFC0FFFFFFC + # mask 0x0FFFFFFC0FFFFFFF + addis 10, 2, rmask@toc@ha + addi 10, 10, rmask@toc@l + ld 11, 0(10) + ld 12, 8(10) + + # initialize + # load key from r3 + ld 9, 24(3) + ld 10, 32(3) + and. 9, 9, 11 # cramp mask r0 + and. 10, 10, 12 # cramp mask r1 + + srdi 21, 10, 2 + add 19, 21, 10 # s1: r19 - (r1 >> 2) *5 + + # setup r and s + li 25, 0 + mtvsrdd 32+0, 9, 19 # r0, s1 + mtvsrdd 32+1, 10, 9 # r1, r0 + mtvsrdd 32+2, 19, 25 # s1 + mtvsrdd 32+3, 9, 25 # r0 + + blr +SYM_FUNC_END(Poly1305_init_64) + +# Poly1305_mult +# v6 = (h0, h1), v8 = h2 +# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0 +# +# Output: v7, v10, v11 +# +SYM_FUNC_START_LOCAL(Poly1305_mult) + # + # d0 = h0 * r0 + h1 * s1 + vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1 + + # d1 = h0 * r1 + h1 * r0 + h2 * s1 + vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0 + vmsumudm 10, 8, 2, 11 # d1 += h2 * s1 + + # d2 = r0 + vmsumudm 11, 8, 3, 9 # d2 = h2 * r0 + blr +SYM_FUNC_END(Poly1305_mult) + +# +# carry reduction +# h %=p +# +# Input: v7, v10, v11 +# Output: r27, r28, r29 +# +SYM_FUNC_START_LOCAL(Carry_reduction) + mfvsrld 27, 32+7 + mfvsrld 28, 32+10 + mfvsrld 29, 32+11 + mfvsrd 20, 32+7 # h0.h + mfvsrd 21, 32+10 # h1.h + + addc 28, 28, 20 + adde 29, 29, 21 + srdi 22, 29, 0x2 + sldi 23, 22, 0x2 + add 23, 23, 22 # (h2 & 3) * 5 + addc 27, 27, 23 # h0 + addze 28, 28 # h1 + andi. 
29, 29, 0x3 # h2 + blr +SYM_FUNC_END(Carry_reduction) + +# +# poly1305 multiplication +# h *= r, h %= p +# d0 = h0 * r0 + h1 * s1 +# d1 = h0 * r1 + h1 * r0 + h2 * s1 +# d2 = h0 * r0 +# +# +# unsigned int poly1305_test_64s(unisgned char *state, const byte *src, size_t len, highbit) +# - no highbit if final leftover block (highbit = 0) +# +SYM_FUNC_START(poly1305_64s) + cmpdi 5, 0 + ble Out_no_poly1305_64 + + mflr 0 + std 0, 16(1) + stdu 1,-400(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + # Init poly1305 + bl Poly1305_init_64 + + li 25, 0 # offset to inp and outp + + add 11, 25, 4 + + # load h + # h0, h1, h2? + ld 27, 0(3) + ld 28, 8(3) + lwz 29, 16(3) + + li 30, 16 + divdu 31, 5, 30 + + mtctr 31 + + mr 24, 6 # highbit + +Loop_block_64: + vxor 9, 9, 9 + + ld 20, 0(11) + ld 21, 8(11) + addi 11, 11, 16 + + addc 27, 27, 20 + adde 28, 28, 21 + adde 29, 29, 24 + + li 22, 0 + mtvsrdd 32+6, 27, 28 # h0, h1 + mtvsrdd 32+8, 29, 22 # h2 + + bl Poly1305_mult + + bl Carry_reduction + + bdnz Loop_block_64 + + std 27, 0(3) + std 28, 8(3) + stw 29, 16(3) + + li 3, 0 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE_GPR 17, 136, 1 + RESTORE_GPR 18, 144, 1 + RESTORE_GPR 19, 152, 1 + RESTORE_GPR 20, 160, 1 + RESTORE_GPR 21, 168, 1 + RESTORE_GPR 22, 176, 1 + RESTORE_GPR 23, 184, 1 + RESTORE_GPR 24, 192, 1 + RESTORE_GPR 25, 200, 1 + RESTORE_GPR 26, 208, 1 + RESTORE_GPR 27, 216, 1 + RESTORE_GPR 28, 224, 1 + RESTORE_GPR 29, 232, 1 + RESTORE_GPR 30, 240, 1 + RESTORE_GPR 31, 248, 1 + + addi 1, 1, 400 + ld 0, 16(1) + mtlr 0 + + blr + +Out_no_poly1305_64: + li 3, 0 + blr +SYM_FUNC_END(poly1305_64s) + +# +# Input: r3 = h, r4 = s, r5 = mac +# mac = h + s +# +SYM_FUNC_START(poly1305_emit_64) + ld 10, 0(3) + ld 11, 8(3) + ld 12, 16(3) + + # compare modulus + # h + 5 + (-p) + mr 6, 10 + mr 7, 11 + mr 8, 12 + addic. 6, 6, 5 + addze 7, 7 + addze 8, 8 + srdi 9, 8, 2 # overflow? + cmpdi 9, 0 + beq Skip_h64 + mr 10, 6 + mr 11, 7 + mr 12, 8 + +Skip_h64: + ld 6, 0(4) + ld 7, 8(4) + addc 10, 10, 6 + adde 11, 11, 7 + addze 12, 12 + + std 10, 0(5) + std 11, 8(5) + blr +SYM_FUNC_END(poly1305_emit_64) + +SYM_DATA_START_LOCAL(RMASK) +.align 5 +rmask: +.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f +cnum: +.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000 +.long 0x1a, 0x00, 0x1a, 0x00 +.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 +.long 0x00010203, 0x04050607, 0x10111213, 0x14151617 +.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f +SYM_DATA_END(RMASK) diff --git a/lib/crypto/powerpc/sha1-powerpc-asm.S b/lib/crypto/powerpc/sha1-powerpc-asm.S new file mode 100644 index 000000000000..f0d5ed557ab1 --- /dev/null +++ b/lib/crypto/powerpc/sha1-powerpc-asm.S @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * SHA-1 implementation for PowerPC. 
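
As a companion to poly1305_emit_64 above, the same final step in scalar C (a sketch with illustrative names; h is kept as h0 + 2^64*h1 + 2^128*h2 with h2 small, as in the 64-bit path): the routine checks whether h + 5 carries into bit 130, which is exactly the condition h >= 2^130 - 5, selects the reduced value accordingly, and then adds the 128-bit s; only the low 128 bits form the tag.

#include <linux/types.h>
#include <linux/unaligned.h>

static void poly1305_emit_sketch(const u64 h[3], const u32 nonce[4],
				 u8 mac[16])
{
	u64 h0 = h[0], h1 = h[1], h2 = h[2];
	u64 g0, g1, g2, s0, s1, t0, t1, c;

	g0 = h0 + 5;		c = (g0 < 5);
	g1 = h1 + c;		c = (g1 < c);
	g2 = h2 + c;
	if (g2 >> 2) {		/* h + 5 reached 2^130: take h - p instead of h */
		h0 = g0;	/* only the low 128 bits are used for the tag */
		h1 = g1;
	}

	s0 = ((u64)nonce[1] << 32) | nonce[0];
	s1 = ((u64)nonce[3] << 32) | nonce[2];

	t0 = h0 + s0;		c = (t0 < s0);
	t1 = h1 + s1 + c;	/* any carry out of bit 128 is discarded */

	put_unaligned_le64(t0, mac);
	put_unaligned_le64(t1, mac + 8);
}
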
+ * + * Copyright (C) 2005 Paul Mackerras <paulus@samba.org> + */ + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/asm-compat.h> + +#ifdef __BIG_ENDIAN__ +#define LWZ(rt, d, ra) \ + lwz rt,d(ra) +#else +#define LWZ(rt, d, ra) \ + li rt,d; \ + lwbrx rt,rt,ra +#endif + +/* + * We roll the registers for T, A, B, C, D, E around on each + * iteration; T on iteration t is A on iteration t+1, and so on. + * We use registers 7 - 12 for this. + */ +#define RT(t) ((((t)+5)%6)+7) +#define RA(t) ((((t)+4)%6)+7) +#define RB(t) ((((t)+3)%6)+7) +#define RC(t) ((((t)+2)%6)+7) +#define RD(t) ((((t)+1)%6)+7) +#define RE(t) ((((t)+0)%6)+7) + +/* We use registers 16 - 31 for the W values */ +#define W(t) (((t)%16)+16) + +#define LOADW(t) \ + LWZ(W(t),(t)*4,r4) + +#define STEPD0_LOAD(t) \ + andc r0,RD(t),RB(t); \ + and r6,RB(t),RC(t); \ + rotlwi RT(t),RA(t),5; \ + or r6,r6,r0; \ + add r0,RE(t),r15; \ + add RT(t),RT(t),r6; \ + add r14,r0,W(t); \ + LWZ(W((t)+4),((t)+4)*4,r4); \ + rotlwi RB(t),RB(t),30; \ + add RT(t),RT(t),r14 + +#define STEPD0_UPDATE(t) \ + and r6,RB(t),RC(t); \ + andc r0,RD(t),RB(t); \ + rotlwi RT(t),RA(t),5; \ + rotlwi RB(t),RB(t),30; \ + or r6,r6,r0; \ + add r0,RE(t),r15; \ + xor r5,W((t)+4-3),W((t)+4-8); \ + add RT(t),RT(t),r6; \ + xor W((t)+4),W((t)+4-16),W((t)+4-14); \ + add r0,r0,W(t); \ + xor W((t)+4),W((t)+4),r5; \ + add RT(t),RT(t),r0; \ + rotlwi W((t)+4),W((t)+4),1 + +#define STEPD1(t) \ + xor r6,RB(t),RC(t); \ + rotlwi RT(t),RA(t),5; \ + rotlwi RB(t),RB(t),30; \ + xor r6,r6,RD(t); \ + add r0,RE(t),r15; \ + add RT(t),RT(t),r6; \ + add r0,r0,W(t); \ + add RT(t),RT(t),r0 + +#define STEPD1_UPDATE(t) \ + xor r6,RB(t),RC(t); \ + rotlwi RT(t),RA(t),5; \ + rotlwi RB(t),RB(t),30; \ + xor r6,r6,RD(t); \ + add r0,RE(t),r15; \ + xor r5,W((t)+4-3),W((t)+4-8); \ + add RT(t),RT(t),r6; \ + xor W((t)+4),W((t)+4-16),W((t)+4-14); \ + add r0,r0,W(t); \ + xor W((t)+4),W((t)+4),r5; \ + add RT(t),RT(t),r0; \ + rotlwi W((t)+4),W((t)+4),1 + +#define STEPD2_UPDATE(t) \ + and r6,RB(t),RC(t); \ + and r0,RB(t),RD(t); \ + rotlwi RT(t),RA(t),5; \ + or r6,r6,r0; \ + rotlwi RB(t),RB(t),30; \ + and r0,RC(t),RD(t); \ + xor r5,W((t)+4-3),W((t)+4-8); \ + or r6,r6,r0; \ + xor W((t)+4),W((t)+4-16),W((t)+4-14); \ + add r0,RE(t),r15; \ + add RT(t),RT(t),r6; \ + add r0,r0,W(t); \ + xor W((t)+4),W((t)+4),r5; \ + add RT(t),RT(t),r0; \ + rotlwi W((t)+4),W((t)+4),1 + +#define STEP0LD4(t) \ + STEPD0_LOAD(t); \ + STEPD0_LOAD((t)+1); \ + STEPD0_LOAD((t)+2); \ + STEPD0_LOAD((t)+3) + +#define STEPUP4(t, fn) \ + STEP##fn##_UPDATE(t); \ + STEP##fn##_UPDATE((t)+1); \ + STEP##fn##_UPDATE((t)+2); \ + STEP##fn##_UPDATE((t)+3) + +#define STEPUP20(t, fn) \ + STEPUP4(t, fn); \ + STEPUP4((t)+4, fn); \ + STEPUP4((t)+8, fn); \ + STEPUP4((t)+12, fn); \ + STEPUP4((t)+16, fn) + +_GLOBAL(powerpc_sha_transform) + PPC_STLU r1,-INT_FRAME_SIZE(r1) + SAVE_GPRS(14, 31, r1) + + /* Load up A - E */ + lwz RA(0),0(r3) /* A */ + lwz RB(0),4(r3) /* B */ + lwz RC(0),8(r3) /* C */ + lwz RD(0),12(r3) /* D */ + lwz RE(0),16(r3) /* E */ + + LOADW(0) + LOADW(1) + LOADW(2) + LOADW(3) + + lis r15,0x5a82 /* K0-19 */ + ori r15,r15,0x7999 + STEP0LD4(0) + STEP0LD4(4) + STEP0LD4(8) + STEPUP4(12, D0) + STEPUP4(16, D0) + + lis r15,0x6ed9 /* K20-39 */ + ori r15,r15,0xeba1 + STEPUP20(20, D1) + + lis r15,0x8f1b /* K40-59 */ + ori r15,r15,0xbcdc + STEPUP20(40, D2) + + lis r15,0xca62 /* K60-79 */ + ori r15,r15,0xc1d6 + STEPUP4(60, D1) + STEPUP4(64, D1) + STEPUP4(68, D1) + STEPUP4(72, D1) + lwz r20,16(r3) + STEPD1(76) + lwz r19,12(r3) + STEPD1(77) + lwz 
r18,8(r3) + STEPD1(78) + lwz r17,4(r3) + STEPD1(79) + + lwz r16,0(r3) + add r20,RE(80),r20 + add RD(0),RD(80),r19 + add RC(0),RC(80),r18 + add RB(0),RB(80),r17 + add RA(0),RA(80),r16 + mr RE(0),r20 + stw RA(0),0(r3) + stw RB(0),4(r3) + stw RC(0),8(r3) + stw RD(0),12(r3) + stw RE(0),16(r3) + + REST_GPRS(14, 31, r1) + addi r1,r1,INT_FRAME_SIZE + blr diff --git a/lib/crypto/powerpc/sha1-spe-asm.S b/lib/crypto/powerpc/sha1-spe-asm.S new file mode 100644 index 000000000000..0f447523be5e --- /dev/null +++ b/lib/crypto/powerpc/sha1-spe-asm.S @@ -0,0 +1,294 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Fast SHA-1 implementation for SPE instruction set (PPC) + * + * This code makes use of the SPE SIMD instruction set as defined in + * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf + * Implementation is based on optimization guide notes from + * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +#define rHP r3 /* pointer to hash value */ +#define rWP r4 /* pointer to input */ +#define rKP r5 /* pointer to constants */ + +#define rW0 r14 /* 64 bit round words */ +#define rW1 r15 +#define rW2 r16 +#define rW3 r17 +#define rW4 r18 +#define rW5 r19 +#define rW6 r20 +#define rW7 r21 + +#define rH0 r6 /* 32 bit hash values */ +#define rH1 r7 +#define rH2 r8 +#define rH3 r9 +#define rH4 r10 + +#define rT0 r22 /* 64 bit temporary */ +#define rT1 r0 /* 32 bit temporaries */ +#define rT2 r11 +#define rT3 r12 + +#define rK r23 /* 64 bit constant in volatile register */ + +#define LOAD_K01 + +#define LOAD_K11 \ + evlwwsplat rK,0(rKP); + +#define LOAD_K21 \ + evlwwsplat rK,4(rKP); + +#define LOAD_K31 \ + evlwwsplat rK,8(rKP); + +#define LOAD_K41 \ + evlwwsplat rK,12(rKP); + +#define INITIALIZE \ + stwu r1,-128(r1); /* create stack frame */ \ + evstdw r14,8(r1); /* We must save non volatile */ \ + evstdw r15,16(r1); /* registers. Take the chance */ \ + evstdw r16,24(r1); /* and save the SPE part too */ \ + evstdw r17,32(r1); \ + evstdw r18,40(r1); \ + evstdw r19,48(r1); \ + evstdw r20,56(r1); \ + evstdw r21,64(r1); \ + evstdw r22,72(r1); \ + evstdw r23,80(r1); + + +#define FINALIZE \ + evldw r14,8(r1); /* restore SPE registers */ \ + evldw r15,16(r1); \ + evldw r16,24(r1); \ + evldw r17,32(r1); \ + evldw r18,40(r1); \ + evldw r19,48(r1); \ + evldw r20,56(r1); \ + evldw r21,64(r1); \ + evldw r22,72(r1); \ + evldw r23,80(r1); \ + xor r0,r0,r0; \ + stw r0,8(r1); /* Delete sensitive data */ \ + stw r0,16(r1); /* that we might have pushed */ \ + stw r0,24(r1); /* from other context that runs */ \ + stw r0,32(r1); /* the same code. 
Assume that */ \ + stw r0,40(r1); /* the lower part of the GPRs */ \ + stw r0,48(r1); /* were already overwritten on */ \ + stw r0,56(r1); /* the way down to here */ \ + stw r0,64(r1); \ + stw r0,72(r1); \ + stw r0,80(r1); \ + addi r1,r1,128; /* cleanup stack frame */ + +#ifdef __BIG_ENDIAN__ +#define LOAD_DATA(reg, off) \ + lwz reg,off(rWP); /* load data */ +#define NEXT_BLOCK \ + addi rWP,rWP,64; /* increment per block */ +#else +#define LOAD_DATA(reg, off) \ + lwbrx reg,0,rWP; /* load data */ \ + addi rWP,rWP,4; /* increment per word */ +#define NEXT_BLOCK /* nothing to do */ +#endif + +#define R_00_15(a, b, c, d, e, w0, w1, k, off) \ + LOAD_DATA(w0, off) /* 1: W */ \ + and rT2,b,c; /* 1: F' = B and C */ \ + LOAD_K##k##1 \ + andc rT1,d,b; /* 1: F" = ~B and D */ \ + rotrwi rT0,a,27; /* 1: A' = A rotl 5 */ \ + or rT2,rT2,rT1; /* 1: F = F' or F" */ \ + add e,e,rT0; /* 1: E = E + A' */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + add e,e,w0; /* 1: E = E + W */ \ + LOAD_DATA(w1, off+4) /* 2: W */ \ + add e,e,rT2; /* 1: E = E + F */ \ + and rT1,a,b; /* 2: F' = B and C */ \ + add e,e,rK; /* 1: E = E + K */ \ + andc rT2,c,a; /* 2: F" = ~B and D */ \ + add d,d,rK; /* 2: E = E + K */ \ + or rT2,rT2,rT1; /* 2: F = F' or F" */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + add d,d,w1; /* 2: E = E + W */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + add d,d,rT0; /* 2: E = E + A' */ \ + evmergelo w1,w1,w0; /* mix W[0]/W[1] */ \ + add d,d,rT2 /* 2: E = E + F */ + +#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + and rT2,b,c; /* 1: F' = B and C */ \ + evmergelohi rT0,w7,w6; /* W[-3] */ \ + andc rT1,d,b; /* 1: F" = ~B and D */ \ + evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ + or rT1,rT1,rT2; /* 1: F = F' or F" */ \ + evxor w0,w0,w4; /* W = W xor W[-8] */ \ + add e,e,rT1; /* 1: E = E + F */ \ + evxor w0,w0,w1; /* W = W xor W[-14] */ \ + rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ + evrlwi w0,w0,1; /* W = W rotl 1 */ \ + add e,e,rT2; /* 1: E = E + A' */ \ + evaddw rT0,w0,rK; /* WK = W + K */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + LOAD_K##k##1 \ + evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ + add e,e,rT0; /* 1: E = E + WK */ \ + add d,d,rT1; /* 2: E = E + WK */ \ + and rT2,a,b; /* 2: F' = B and C */ \ + andc rT1,c,a; /* 2: F" = ~B and D */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + or rT1,rT1,rT2; /* 2: F = F' or F" */ \ + add d,d,rT0; /* 2: E = E + A' */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + add d,d,rT1 /* 2: E = E + F */ + +#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + evmergelohi rT0,w7,w6; /* W[-3] */ \ + xor rT2,b,c; /* 1: F' = B xor C */ \ + evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ + xor rT2,rT2,d; /* 1: F = F' xor D */ \ + evxor w0,w0,w4; /* W = W xor W[-8] */ \ + add e,e,rT2; /* 1: E = E + F */ \ + evxor w0,w0,w1; /* W = W xor W[-14] */ \ + rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ + evrlwi w0,w0,1; /* W = W rotl 1 */ \ + add e,e,rT2; /* 1: E = E + A' */ \ + evaddw rT0,w0,rK; /* WK = W + K */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + LOAD_K##k##1 \ + evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ + add e,e,rT0; /* 1: E = E + WK */ \ + xor rT2,a,b; /* 2: F' = B xor C */ \ + add d,d,rT1; /* 2: E = E + WK */ \ + xor rT2,rT2,c; /* 2: F = F' xor D */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + add d,d,rT2; /* 2: E = E + F */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + add d,d,rT0 /* 2: E = E + A' */ + +#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + and rT2,b,c; /* 1: F' = B and C */ \ + evmergelohi rT0,w7,w6; /* W[-3] */ \ + or rT1,b,c; /* 1: F" = B or C 
*/ \ + evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ + and rT1,d,rT1; /* 1: F" = F" and D */ \ + evxor w0,w0,w4; /* W = W xor W[-8] */ \ + or rT2,rT2,rT1; /* 1: F = F' or F" */ \ + evxor w0,w0,w1; /* W = W xor W[-14] */ \ + add e,e,rT2; /* 1: E = E + F */ \ + evrlwi w0,w0,1; /* W = W rotl 1 */ \ + rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ + evaddw rT0,w0,rK; /* WK = W + K */ \ + add e,e,rT2; /* 1: E = E + A' */ \ + LOAD_K##k##1 \ + evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + add e,e,rT0; /* 1: E = E + WK */ \ + and rT2,a,b; /* 2: F' = B and C */ \ + or rT0,a,b; /* 2: F" = B or C */ \ + add d,d,rT1; /* 2: E = E + WK */ \ + and rT0,c,rT0; /* 2: F" = F" and D */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + or rT2,rT2,rT0; /* 2: F = F' or F" */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + add d,d,rT2; /* 2: E = E + F */ \ + add d,d,rT0 /* 2: E = E + A' */ + +#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) + +_GLOBAL(ppc_spe_sha1_transform) + INITIALIZE + + lwz rH0,0(rHP) + lwz rH1,4(rHP) + mtctr r5 + lwz rH2,8(rHP) + lis rKP,PPC_SPE_SHA1_K@h + lwz rH3,12(rHP) + ori rKP,rKP,PPC_SPE_SHA1_K@l + lwz rH4,16(rHP) + +ppc_spe_sha1_main: + R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0) + R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8) + R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16) + R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24) + R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32) + R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40) + R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48) + R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56) + + R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0) + R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2) + + R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0) + R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0) + R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0) + R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0) + R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0) + R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0) + R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0) + R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0) + R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0) + R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3) + + R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0) + R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0) + R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0) + R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0) + R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0) + R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0) + R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0) + R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0) + R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0) + R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4) + + R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0) + R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0) + R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0) + R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0) + R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0) + R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0) + R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0) + lwz rT3,0(rHP) + R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, 
rW1, rW3, rW4, 0) + lwz rW1,4(rHP) + R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0) + lwz rW2,8(rHP) + R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0) + lwz rW3,12(rHP) + NEXT_BLOCK + lwz rW4,16(rHP) + + add rH0,rH0,rT3 + stw rH0,0(rHP) + add rH1,rH1,rW1 + stw rH1,4(rHP) + add rH2,rH2,rW2 + stw rH2,8(rHP) + add rH3,rH3,rW3 + stw rH3,12(rHP) + add rH4,rH4,rW4 + stw rH4,16(rHP) + + bdnz ppc_spe_sha1_main + + FINALIZE + blr + +.data +.align 4 +PPC_SPE_SHA1_K: + .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6 diff --git a/lib/crypto/powerpc/sha1.h b/lib/crypto/powerpc/sha1.h new file mode 100644 index 000000000000..e2c010f0370b --- /dev/null +++ b/lib/crypto/powerpc/sha1.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for PowerPC + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + */ + +#include <asm/switch_to.h> +#include <linux/preempt.h> + +#ifdef CONFIG_SPE +/* + * MAX_BYTES defines the number of bytes that are allowed to be processed + * between preempt_disable() and preempt_enable(). SHA1 takes ~1000 + * operations per 64 bytes. e500 cores can issue two arithmetic instructions + * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). + * Thus 2KB of input data will need an estimated maximum of 18,000 cycles. + * Headroom for cache misses included. Even with the low end model clocked + * at 667 MHz this equals to a critical time window of less than 27us. + * + */ +#define MAX_BYTES 2048 + +asmlinkage void ppc_spe_sha1_transform(struct sha1_block_state *state, + const u8 *data, u32 nblocks); + +static void spe_begin(void) +{ + /* We just start SPE operations and will save SPE registers later. */ + preempt_disable(); + enable_kernel_spe(); +} + +static void spe_end(void) +{ + disable_kernel_spe(); + /* reenable preemption */ + preempt_enable(); +} + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + u32 unit = min_t(size_t, nblocks, MAX_BYTES / SHA1_BLOCK_SIZE); + + spe_begin(); + ppc_spe_sha1_transform(state, data, unit); + spe_end(); + + data += unit * SHA1_BLOCK_SIZE; + nblocks -= unit; + } while (nblocks); +} +#else /* CONFIG_SPE */ +asmlinkage void powerpc_sha_transform(struct sha1_block_state *state, + const u8 data[SHA1_BLOCK_SIZE]); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + powerpc_sha_transform(state, data); + data += SHA1_BLOCK_SIZE; + } while (--nblocks); +} +#endif /* !CONFIG_SPE */ diff --git a/lib/crypto/powerpc/sha256-spe-asm.S b/lib/crypto/powerpc/sha256-spe-asm.S new file mode 100644 index 000000000000..cd99d71dae34 --- /dev/null +++ b/lib/crypto/powerpc/sha256-spe-asm.S @@ -0,0 +1,318 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Fast SHA-256 implementation for SPE instruction set (PPC) + * + * This code makes use of the SPE SIMD instruction set as defined in + * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf + * Implementation is based on optimization guide notes from + * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +#define rHP r3 /* pointer to hash values in memory */ +#define rKP r24 /* pointer to round constants */ +#define rWP r4 /* pointer to input data */ + +#define rH0 r5 /* 8 32 bit hash values in 8 registers */ +#define rH1 r6 +#define rH2 r7 +#define 
rH3 r8 +#define rH4 r9 +#define rH5 r10 +#define rH6 r11 +#define rH7 r12 + +#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */ +#define rW1 r15 +#define rW2 r16 +#define rW3 r17 +#define rW4 r18 +#define rW5 r19 +#define rW6 r20 +#define rW7 r21 + +#define rT0 r22 /* 64 bit temporaries */ +#define rT1 r23 +#define rT2 r0 /* 32 bit temporaries */ +#define rT3 r25 + +#define CMP_KN_LOOP +#define CMP_KC_LOOP \ + cmpwi rT1,0; + +#define INITIALIZE \ + stwu r1,-128(r1); /* create stack frame */ \ + evstdw r14,8(r1); /* We must save non volatile */ \ + evstdw r15,16(r1); /* registers. Take the chance */ \ + evstdw r16,24(r1); /* and save the SPE part too */ \ + evstdw r17,32(r1); \ + evstdw r18,40(r1); \ + evstdw r19,48(r1); \ + evstdw r20,56(r1); \ + evstdw r21,64(r1); \ + evstdw r22,72(r1); \ + evstdw r23,80(r1); \ + stw r24,88(r1); /* save normal registers */ \ + stw r25,92(r1); + + +#define FINALIZE \ + evldw r14,8(r1); /* restore SPE registers */ \ + evldw r15,16(r1); \ + evldw r16,24(r1); \ + evldw r17,32(r1); \ + evldw r18,40(r1); \ + evldw r19,48(r1); \ + evldw r20,56(r1); \ + evldw r21,64(r1); \ + evldw r22,72(r1); \ + evldw r23,80(r1); \ + lwz r24,88(r1); /* restore normal registers */ \ + lwz r25,92(r1); \ + xor r0,r0,r0; \ + stw r0,8(r1); /* Delete sensitive data */ \ + stw r0,16(r1); /* that we might have pushed */ \ + stw r0,24(r1); /* from other context that runs */ \ + stw r0,32(r1); /* the same code. Assume that */ \ + stw r0,40(r1); /* the lower part of the GPRs */ \ + stw r0,48(r1); /* was already overwritten on */ \ + stw r0,56(r1); /* the way down to here */ \ + stw r0,64(r1); \ + stw r0,72(r1); \ + stw r0,80(r1); \ + addi r1,r1,128; /* cleanup stack frame */ + +#ifdef __BIG_ENDIAN__ +#define LOAD_DATA(reg, off) \ + lwz reg,off(rWP); /* load data */ +#define NEXT_BLOCK \ + addi rWP,rWP,64; /* increment per block */ +#else +#define LOAD_DATA(reg, off) \ + lwbrx reg,0,rWP; /* load data */ \ + addi rWP,rWP,4; /* increment per word */ +#define NEXT_BLOCK /* nothing to do */ +#endif + +#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \ + LOAD_DATA(w, off) /* 1: W */ \ + rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \ + rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \ + rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \ + and rT3,e,f; /* 1: ch = e and f */ \ + xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \ + andc rT1,g,e; /* 1: ch' = ~e and g */ \ + lwz rT2,off(rKP); /* 1: K */ \ + xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \ + add h,h,rT0; /* 1: temp1 = h + S1 */ \ + add rT3,rT3,w; /* 1: temp1' = ch + w */ \ + rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \ + add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \ + rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \ + add h,h,rT2; /* 1: temp1 = temp1 + K */ \ + rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \ + add d,d,h; /* 1: d = d + temp1 */ \ + xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \ + evmergelo w,w,w; /* shift W */ \ + or rT2,a,b; /* 1: maj = a or b */ \ + and rT1,a,b; /* 1: maj' = a and b */ \ + and rT2,rT2,c; /* 1: maj = maj and c */ \ + LOAD_DATA(w, off+4) /* 2: W */ \ + or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \ + rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ + add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \ + rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ + add h,h,rT3; /* 1: h = temp1 + temp2 */ \ + rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ + and rT3,d,e; /* 2: ch = e and f */ \ + xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ 
\ + andc rT1,f,d; /* 2: ch' = ~e and g */ \ + lwz rT2,off+4(rKP); /* 2: K */ \ + xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ + add g,g,rT0; /* 2: temp1 = h + S1 */ \ + add rT3,rT3,w; /* 2: temp1' = ch + w */ \ + rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ + add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \ + rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ + add g,g,rT2; /* 2: temp1 = temp1 + K */ \ + rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ + or rT2,h,a; /* 2: maj = a or b */ \ + xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ + and rT1,h,a; /* 2: maj' = a and b */ \ + and rT2,rT2,b; /* 2: maj = maj and c */ \ + add c,c,g; /* 2: d = d + temp1 */ \ + or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ + add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ + add g,g,rT3 /* 2: h = temp1 + temp2 */ + +#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \ + rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \ + evmergelohi rT0,w0,w1; /* w[-15] */ \ + rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \ + evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \ + xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ + evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \ + rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \ + evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \ + xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ + evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \ + add h,h,rT2; /* 1: temp1 = h + S1 */ \ + evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \ + and rT2,e,f; /* 1: ch = e and f */ \ + evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \ + andc rT3,g,e; /* 1: ch' = ~e and g */ \ + evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \ + xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \ + evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \ + add h,h,rT2; /* 1: temp1 = temp1 + ch */ \ + evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ + rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \ + evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \ + rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \ + evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ + xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ + evldw rT1,off(rKP); /* k */ \ + rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \ + evaddw w0,w0,rT0; /* w = w + s1 */ \ + xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ + evmergelohi rT0,w4,w5; /* w[-7] */ \ + and rT3,a,b; /* 1: maj = a and b */ \ + evaddw w0,w0,rT0; /* w = w + w[-7] */ \ + CMP_K##k##_LOOP \ + add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \ + evaddw rT1,rT1,w0; /* wk = w + k */ \ + xor rT3,a,b; /* 1: maj = a xor b */ \ + evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \ + and rT3,rT3,c; /* 1: maj = maj and c */ \ + add h,h,rT0; /* 1: temp1 = temp1 + wk */ \ + add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \ + add g,g,rT1; /* 2: temp1 = temp1 + wk */ \ + add d,d,h; /* 1: d = d + temp1 */ \ + rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ + add h,h,rT2; /* 1: h = temp1 + temp2 */ \ + rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ + rotrwi rT2,d,25; /* 2: S" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ + and rT3,d,e; /* 2: ch = e and f */ \ + xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ + andc rT1,f,d; /* 2: ch' = ~e and g */ \ + add g,g,rT0; /* 2: temp1 = h + S1 */ \ + xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ + rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ + add g,g,rT3; /* 2: temp1 = temp1 + ch */ \ + rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ + rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ + or rT2,h,a; /* 2: maj = a or b */ \ + and rT1,h,a; /* 2: maj' = a and b */ \ + and rT2,rT2,b; /* 2: maj = maj and c */ \ + xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ + 
or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ + add c,c,g; /* 2: d = d + temp1 */ \ + add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ + add g,g,rT3 /* 2: h = temp1 + temp2 */ + +_GLOBAL(ppc_spe_sha256_transform) + INITIALIZE + + mtctr r5 + lwz rH0,0(rHP) + lwz rH1,4(rHP) + lwz rH2,8(rHP) + lwz rH3,12(rHP) + lwz rH4,16(rHP) + lwz rH5,20(rHP) + lwz rH6,24(rHP) + lwz rH7,28(rHP) + +ppc_spe_sha256_main: + lis rKP,PPC_SPE_SHA256_K@ha + addi rKP,rKP,PPC_SPE_SHA256_K@l + + R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0) + R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8) + R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16) + R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24) + R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32) + R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40) + R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48) + R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56) +ppc_spe_sha256_16_rounds: + addi rKP,rKP,64 + R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, + rW0, rW1, rW4, rW5, rW7, N, 0) + R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, + rW1, rW2, rW5, rW6, rW0, N, 8) + R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, + rW2, rW3, rW6, rW7, rW1, N, 16) + R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, + rW3, rW4, rW7, rW0, rW2, N, 24) + R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, + rW4, rW5, rW0, rW1, rW3, N, 32) + R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, + rW5, rW6, rW1, rW2, rW4, N, 40) + R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, + rW6, rW7, rW2, rW3, rW5, N, 48) + R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, + rW7, rW0, rW3, rW4, rW6, C, 56) + bt gt,ppc_spe_sha256_16_rounds + + lwz rW0,0(rHP) + NEXT_BLOCK + lwz rW1,4(rHP) + lwz rW2,8(rHP) + lwz rW3,12(rHP) + lwz rW4,16(rHP) + lwz rW5,20(rHP) + lwz rW6,24(rHP) + lwz rW7,28(rHP) + + add rH0,rH0,rW0 + stw rH0,0(rHP) + add rH1,rH1,rW1 + stw rH1,4(rHP) + add rH2,rH2,rW2 + stw rH2,8(rHP) + add rH3,rH3,rW3 + stw rH3,12(rHP) + add rH4,rH4,rW4 + stw rH4,16(rHP) + add rH5,rH5,rW5 + stw rH5,20(rHP) + add rH6,rH6,rW6 + stw rH6,24(rHP) + add rH7,rH7,rW7 + stw rH7,28(rHP) + + bdnz ppc_spe_sha256_main + + FINALIZE + blr + +.data +.align 5 +PPC_SPE_SHA256_K: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 diff --git a/lib/crypto/powerpc/sha256.h b/lib/crypto/powerpc/sha256.h new file mode 100644 index 000000000000..50d355441c7e --- /dev/null +++ b/lib/crypto/powerpc/sha256.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 Secure Hash Algorithm, SPE optimized + * + * Based on generic implementation. The assembler module takes care + * about the SPE registers so it can run from interrupt context. 
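
The R_LOAD_W and R_CALC_W macros above interleave two SHA-256 rounds (and, for R_CALC_W, two message-schedule updates) per invocation. The per-round operations named in their comments are the standard FIPS 180-4 definitions; one schedule step and one round look roughly like this in scalar C (a sketch, not code from the patch; ror32() is the kernel helper):

#include <linux/bitops.h>	/* ror32() */

/* one message-schedule word, as in the s0/s1 steps of R_CALC_W */
static u32 sha256_schedule(const u32 w[16], int t)
{
	u32 s0 = ror32(w[(t - 15) & 15], 7) ^ ror32(w[(t - 15) & 15], 18) ^
		 (w[(t - 15) & 15] >> 3);
	u32 s1 = ror32(w[(t - 2) & 15], 17) ^ ror32(w[(t - 2) & 15], 19) ^
		 (w[(t - 2) & 15] >> 10);

	return w[t & 15] + s0 + w[(t - 7) & 15] + s1;
}

/* one round, matching the S1/ch/S0/maj comments in the macros above */
static void sha256_round(u32 st[8], u32 k, u32 w)
{
	u32 a = st[0], b = st[1], c = st[2], d = st[3];
	u32 e = st[4], f = st[5], g = st[6], h = st[7];
	u32 S1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
	u32 ch = (e & f) ^ (~e & g);
	u32 temp1 = h + S1 + ch + k + w;
	u32 S0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
	u32 maj = (a & b) | ((a | b) & c);	/* equivalent form of majority */
	u32 temp2 = S0 + maj;

	st[7] = g; st[6] = f; st[5] = e; st[4] = d + temp1;
	st[3] = c; st[2] = b; st[1] = a;
	st[0] = temp1 + temp2;
}
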
+ * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + */ + +#include <asm/switch_to.h> +#include <linux/preempt.h> + +/* + * MAX_BYTES defines the number of bytes that are allowed to be processed + * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000 + * operations per 64 bytes. e500 cores can issue two arithmetic instructions + * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). + * Thus 1KB of input data will need an estimated maximum of 18,000 cycles. + * Headroom for cache misses included. Even with the low end model clocked + * at 667 MHz this equals to a critical time window of less than 27us. + * + */ +#define MAX_BYTES 1024 + +extern void ppc_spe_sha256_transform(struct sha256_block_state *state, + const u8 *src, u32 blocks); + +static void spe_begin(void) +{ + /* We just start SPE operations and will save SPE registers later. */ + preempt_disable(); + enable_kernel_spe(); +} + +static void spe_end(void) +{ + disable_kernel_spe(); + /* reenable preemption */ + preempt_enable(); +} + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + /* cut input data into smaller blocks */ + u32 unit = min_t(size_t, nblocks, + MAX_BYTES / SHA256_BLOCK_SIZE); + + spe_begin(); + ppc_spe_sha256_transform(state, data, unit); + spe_end(); + + data += unit * SHA256_BLOCK_SIZE; + nblocks -= unit; + } while (nblocks); +} diff --git a/lib/crypto/riscv/Kconfig b/lib/crypto/riscv/Kconfig new file mode 100644 index 000000000000..bc7a43f33eb3 --- /dev/null +++ b/lib/crypto/riscv/Kconfig @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA_RISCV64 + tristate + depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default CRYPTO_LIB_CHACHA + select CRYPTO_ARCH_HAVE_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC diff --git a/lib/crypto/riscv/Makefile b/lib/crypto/riscv/Makefile new file mode 100644 index 000000000000..e27b78f317fc --- /dev/null +++ b/lib/crypto/riscv/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o +chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o diff --git a/lib/crypto/riscv/chacha-riscv64-glue.c b/lib/crypto/riscv/chacha-riscv64-glue.c new file mode 100644 index 000000000000..8c3f11d79be3 --- /dev/null +++ b/lib/crypto/riscv/chacha-riscv64-glue.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * ChaCha stream cipher (RISC-V optimized) + * + * Copyright (C) 2023 SiFive, Inc. 
+ * Author: Jerry Shih <jerry.shih@sifive.com> + */ + +#include <asm/simd.h> +#include <asm/vector.h> +#include <crypto/chacha.h> +#include <crypto/internal/simd.h> +#include <linux/linkage.h> +#include <linux/module.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_zvkb); + +asmlinkage void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out, + size_t nblocks, int nrounds); + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + hchacha_block_generic(state, out, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + u8 block_buffer[CHACHA_BLOCK_SIZE]; + unsigned int full_blocks = bytes / CHACHA_BLOCK_SIZE; + unsigned int tail_bytes = bytes % CHACHA_BLOCK_SIZE; + + if (!static_branch_likely(&use_zvkb) || !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + kernel_vector_begin(); + if (full_blocks) { + chacha_zvkb(state, src, dst, full_blocks, nrounds); + src += full_blocks * CHACHA_BLOCK_SIZE; + dst += full_blocks * CHACHA_BLOCK_SIZE; + } + if (tail_bytes) { + memcpy(block_buffer, src, tail_bytes); + chacha_zvkb(state, block_buffer, block_buffer, 1, nrounds); + memcpy(dst, block_buffer, tail_bytes); + } + kernel_vector_end(); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return static_key_enabled(&use_zvkb); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init riscv64_chacha_mod_init(void) +{ + if (riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + static_branch_enable(&use_zvkb); + return 0; +} +subsys_initcall(riscv64_chacha_mod_init); + +static void __exit riscv64_chacha_mod_exit(void) +{ +} +module_exit(riscv64_chacha_mod_exit); + +MODULE_DESCRIPTION("ChaCha stream cipher (RISC-V optimized)"); +MODULE_AUTHOR("Jerry Shih <jerry.shih@sifive.com>"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/riscv/chacha-riscv64-zvkb.S b/lib/crypto/riscv/chacha-riscv64-zvkb.S new file mode 100644 index 000000000000..b777d0b4e379 --- /dev/null +++ b/lib/crypto/riscv/chacha-riscv64-zvkb.S @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com> +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include <linux/linkage.h> + +.text +.option arch, +zvkb + +#define STATEP a0 +#define INP a1 +#define OUTP a2 +#define NBLOCKS a3 +#define NROUNDS a4 + +#define CONSTS0 a5 +#define CONSTS1 a6 +#define CONSTS2 a7 +#define CONSTS3 t0 +#define TMP t1 +#define VL t2 +#define STRIDE t3 +#define ROUND_CTR t4 +#define KEY0 s0 +#define KEY1 s1 +#define KEY2 s2 +#define KEY3 s3 +#define KEY4 s4 +#define KEY5 s5 +#define KEY6 s6 +#define KEY7 s7 +#define COUNTER s8 +#define NONCE0 s9 +#define NONCE1 s10 +#define NONCE2 s11 + +.macro chacha_round a0, b0, c0, d0, a1, b1, c1, d1, \ + a2, b2, c2, d2, a3, b3, c3, d3 + // a += b; d ^= a; d = rol(d, 16); + vadd.vv \a0, \a0, \b0 + vadd.vv \a1, \a1, \b1 + vadd.vv \a2, \a2, \b2 + vadd.vv \a3, \a3, \b3 + vxor.vv \d0, \d0, \a0 + vxor.vv \d1, \d1, \a1 + vxor.vv \d2, \d2, \a2 + vxor.vv \d3, \d3, \a3 + vror.vi \d0, \d0, 32 - 16 + vror.vi \d1, \d1, 32 - 16 + vror.vi \d2, \d2, 32 - 16 + vror.vi \d3, \d3, 32 - 16 + + // c += d; b ^= c; b = rol(b, 12); + vadd.vv \c0, \c0, \d0 + vadd.vv \c1, \c1, \d1 + vadd.vv \c2, \c2, \d2 + vadd.vv \c3, \c3, \d3 + vxor.vv \b0, \b0, \c0 + vxor.vv \b1, \b1, \c1 + vxor.vv \b2, \b2, \c2 + vxor.vv \b3, \b3, \c3 + vror.vi \b0, \b0, 32 - 12 + vror.vi \b1, \b1, 32 - 12 + vror.vi \b2, \b2, 32 - 12 + vror.vi \b3, \b3, 32 - 12 + + // a += b; d ^= a; d = rol(d, 8); + vadd.vv \a0, \a0, \b0 + vadd.vv \a1, \a1, \b1 + vadd.vv \a2, \a2, \b2 + vadd.vv \a3, \a3, \b3 + vxor.vv \d0, \d0, \a0 + vxor.vv \d1, \d1, \a1 + vxor.vv \d2, \d2, \a2 + vxor.vv \d3, \d3, \a3 + vror.vi \d0, \d0, 32 - 8 + vror.vi \d1, \d1, 32 - 8 + vror.vi \d2, \d2, 32 - 8 + vror.vi \d3, \d3, 32 - 8 + + // c += d; b ^= c; b = rol(b, 7); + vadd.vv \c0, \c0, \d0 + vadd.vv \c1, \c1, \d1 + vadd.vv \c2, \c2, \d2 + vadd.vv \c3, \c3, \d3 + vxor.vv \b0, \b0, \c0 + vxor.vv \b1, \b1, \c1 + vxor.vv \b2, \b2, \c2 + vxor.vv \b3, \b3, \c3 + vror.vi \b0, \b0, 32 - 7 + vror.vi \b1, \b1, 32 - 7 + vror.vi \b2, \b2, 32 - 7 + vror.vi \b3, \b3, 32 - 7 +.endm + +// void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out, +// size_t nblocks, int nrounds); +// +// |nblocks| is the number of 64-byte blocks to process, and must be nonzero. +// +// |state| gives the ChaCha state matrix, including the 32-bit counter in +// state->x[12] following the RFC7539 convention; note that this differs from +// the original Salsa20 paper which uses a 64-bit counter in state->x[12..13]. +// The updated 32-bit counter is written back to state->x[12] before returning. 
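
chacha_zvkb below keeps one 32-bit state word per vector register, so each chacha_round invocation advances all VL blocks by one round in lockstep. The scalar equivalent of the operations spelled out in the chacha_round macro comments — the RFC 7539 quarter round, applied first to the columns and then to the diagonals of the state matrix — is sketched here for reference (rol32() is the kernel helper; the function names are illustrative):

#include <linux/bitops.h>	/* rol32() */

static void chacha_quarter_round(u32 x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] ^= x[a]; x[d] = rol32(x[d], 16);
	x[c] += x[d]; x[b] ^= x[c]; x[b] = rol32(x[b], 12);
	x[a] += x[b]; x[d] ^= x[a]; x[d] = rol32(x[d], 8);
	x[c] += x[d]; x[b] ^= x[c]; x[b] = rol32(x[b], 7);
}

/* one double round: the column pass then the diagonal pass used below */
static void chacha_double_round(u32 x[16])
{
	chacha_quarter_round(x, 0, 4,  8, 12);
	chacha_quarter_round(x, 1, 5,  9, 13);
	chacha_quarter_round(x, 2, 6, 10, 14);
	chacha_quarter_round(x, 3, 7, 11, 15);

	chacha_quarter_round(x, 0, 5, 10, 15);
	chacha_quarter_round(x, 1, 6, 11, 12);
	chacha_quarter_round(x, 2, 7,  8, 13);
	chacha_quarter_round(x, 3, 4,  9, 14);
}

The function below runs NROUNDS/2 such double rounds, then adds the original state words back with vadd.vx and XORs the keystream into the input with vxor.vv, as its comments describe.
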
+SYM_FUNC_START(chacha_zvkb) + addi sp, sp, -96 + sd s0, 0(sp) + sd s1, 8(sp) + sd s2, 16(sp) + sd s3, 24(sp) + sd s4, 32(sp) + sd s5, 40(sp) + sd s6, 48(sp) + sd s7, 56(sp) + sd s8, 64(sp) + sd s9, 72(sp) + sd s10, 80(sp) + sd s11, 88(sp) + + li STRIDE, 64 + + // Set up the initial state matrix in scalar registers. + lw CONSTS0, 0(STATEP) + lw CONSTS1, 4(STATEP) + lw CONSTS2, 8(STATEP) + lw CONSTS3, 12(STATEP) + lw KEY0, 16(STATEP) + lw KEY1, 20(STATEP) + lw KEY2, 24(STATEP) + lw KEY3, 28(STATEP) + lw KEY4, 32(STATEP) + lw KEY5, 36(STATEP) + lw KEY6, 40(STATEP) + lw KEY7, 44(STATEP) + lw COUNTER, 48(STATEP) + lw NONCE0, 52(STATEP) + lw NONCE1, 56(STATEP) + lw NONCE2, 60(STATEP) + +.Lblock_loop: + // Set vl to the number of blocks to process in this iteration. + vsetvli VL, NBLOCKS, e32, m1, ta, ma + + // Set up the initial state matrix for the next VL blocks in v0-v15. + // v{i} holds the i'th 32-bit word of the state matrix for all blocks. + // Note that only the counter word, at index 12, differs across blocks. + vmv.v.x v0, CONSTS0 + vmv.v.x v1, CONSTS1 + vmv.v.x v2, CONSTS2 + vmv.v.x v3, CONSTS3 + vmv.v.x v4, KEY0 + vmv.v.x v5, KEY1 + vmv.v.x v6, KEY2 + vmv.v.x v7, KEY3 + vmv.v.x v8, KEY4 + vmv.v.x v9, KEY5 + vmv.v.x v10, KEY6 + vmv.v.x v11, KEY7 + vid.v v12 + vadd.vx v12, v12, COUNTER + vmv.v.x v13, NONCE0 + vmv.v.x v14, NONCE1 + vmv.v.x v15, NONCE2 + + // Load the first half of the input data for each block into v16-v23. + // v{16+i} holds the i'th 32-bit word for all blocks. + vlsseg8e32.v v16, (INP), STRIDE + + mv ROUND_CTR, NROUNDS +.Lnext_doubleround: + addi ROUND_CTR, ROUND_CTR, -2 + // column round + chacha_round v0, v4, v8, v12, v1, v5, v9, v13, \ + v2, v6, v10, v14, v3, v7, v11, v15 + // diagonal round + chacha_round v0, v5, v10, v15, v1, v6, v11, v12, \ + v2, v7, v8, v13, v3, v4, v9, v14 + bnez ROUND_CTR, .Lnext_doubleround + + // Load the second half of the input data for each block into v24-v31. + // v{24+i} holds the {8+i}'th 32-bit word for all blocks. + addi TMP, INP, 32 + vlsseg8e32.v v24, (TMP), STRIDE + + // Finalize the first half of the keystream for each block. + vadd.vx v0, v0, CONSTS0 + vadd.vx v1, v1, CONSTS1 + vadd.vx v2, v2, CONSTS2 + vadd.vx v3, v3, CONSTS3 + vadd.vx v4, v4, KEY0 + vadd.vx v5, v5, KEY1 + vadd.vx v6, v6, KEY2 + vadd.vx v7, v7, KEY3 + + // Encrypt/decrypt the first half of the data for each block. + vxor.vv v16, v16, v0 + vxor.vv v17, v17, v1 + vxor.vv v18, v18, v2 + vxor.vv v19, v19, v3 + vxor.vv v20, v20, v4 + vxor.vv v21, v21, v5 + vxor.vv v22, v22, v6 + vxor.vv v23, v23, v7 + + // Store the first half of the output data for each block. + vssseg8e32.v v16, (OUTP), STRIDE + + // Finalize the second half of the keystream for each block. + vadd.vx v8, v8, KEY4 + vadd.vx v9, v9, KEY5 + vadd.vx v10, v10, KEY6 + vadd.vx v11, v11, KEY7 + vid.v v0 + vadd.vx v12, v12, COUNTER + vadd.vx v13, v13, NONCE0 + vadd.vx v14, v14, NONCE1 + vadd.vx v15, v15, NONCE2 + vadd.vv v12, v12, v0 + + // Encrypt/decrypt the second half of the data for each block. + vxor.vv v24, v24, v8 + vxor.vv v25, v25, v9 + vxor.vv v26, v26, v10 + vxor.vv v27, v27, v11 + vxor.vv v29, v29, v13 + vxor.vv v28, v28, v12 + vxor.vv v30, v30, v14 + vxor.vv v31, v31, v15 + + // Store the second half of the output data for each block. + addi TMP, OUTP, 32 + vssseg8e32.v v24, (TMP), STRIDE + + // Update the counter, the remaining number of blocks, and the input and + // output pointers according to the number of blocks processed (VL). 
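+	// (Each block is 64 bytes, so the data pointers advance by VL << 6.)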
+ add COUNTER, COUNTER, VL + sub NBLOCKS, NBLOCKS, VL + slli TMP, VL, 6 + add OUTP, OUTP, TMP + add INP, INP, TMP + bnez NBLOCKS, .Lblock_loop + + sw COUNTER, 48(STATEP) + ld s0, 0(sp) + ld s1, 8(sp) + ld s2, 16(sp) + ld s3, 24(sp) + ld s4, 32(sp) + ld s5, 40(sp) + ld s6, 48(sp) + ld s7, 56(sp) + ld s8, 64(sp) + ld s9, 72(sp) + ld s10, 80(sp) + ld s11, 88(sp) + addi sp, sp, 96 + ret +SYM_FUNC_END(chacha_zvkb) diff --git a/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S new file mode 100644 index 000000000000..1618d1220a6e --- /dev/null +++ b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu> +// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com> +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknha' or 'Zvknhb') +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include <linux/linkage.h> + +.text +.option arch, +zvknha, +zvkb + +#define STATEP a0 +#define DATA a1 +#define NUM_BLOCKS a2 + +#define STATEP_C a3 + +#define MASK v0 +#define INDICES v1 +#define W0 v2 +#define W1 v3 +#define W2 v4 +#define W3 v5 +#define VTMP v6 +#define FEBA v7 +#define HGDC v8 +#define K0 v10 +#define K1 v11 +#define K2 v12 +#define K3 v13 +#define K4 v14 +#define K5 v15 +#define K6 v16 +#define K7 v17 +#define K8 v18 +#define K9 v19 +#define K10 v20 +#define K11 v21 +#define K12 v22 +#define K13 v23 +#define K14 v24 +#define K15 v25 +#define PREV_FEBA v26 +#define PREV_HGDC v27 + +// Do 4 rounds of SHA-256. w0 contains the current 4 message schedule words. +// +// If not all the message schedule words have been computed yet, then this also +// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4 +// message schedule words; this macro computes the group after w3 and writes it +// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3, +// w0), so the caller must cycle through the registers accordingly. +.macro sha256_4rounds last, k, w0, w1, w2, w3 + vadd.vv VTMP, \k, \w0 + vsha2cl.vv HGDC, FEBA, VTMP + vsha2ch.vv FEBA, HGDC, VTMP +.if !\last + vmerge.vvm VTMP, \w2, \w1, MASK + vsha2ms.vv \w0, VTMP, \w3 +.endif +.endm + +.macro sha256_16rounds last, k0, k1, k2, k3 + sha256_4rounds \last, \k0, W0, W1, W2, W3 + sha256_4rounds \last, \k1, W1, W2, W3, W0 + sha256_4rounds \last, \k2, W2, W3, W0, W1 + sha256_4rounds \last, \k3, W3, W0, W1, W2 +.endm + +// void sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state, +// const u8 *data, size_t nblocks); +SYM_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb) + + // Load the round constants into K0-K15. + vsetivli zero, 4, e32, m1, ta, ma + la t0, K256 + vle32.v K0, (t0) + addi t0, t0, 16 + vle32.v K1, (t0) + addi t0, t0, 16 + vle32.v K2, (t0) + addi t0, t0, 16 + vle32.v K3, (t0) + addi t0, t0, 16 + vle32.v K4, (t0) + addi t0, t0, 16 + vle32.v K5, (t0) + addi t0, t0, 16 + vle32.v K6, (t0) + addi t0, t0, 16 + vle32.v K7, (t0) + addi t0, t0, 16 + vle32.v K8, (t0) + addi t0, t0, 16 + vle32.v K9, (t0) + addi t0, t0, 16 + vle32.v K10, (t0) + addi t0, t0, 16 + vle32.v K11, (t0) + addi t0, t0, 16 + vle32.v K12, (t0) + addi t0, t0, 16 + vle32.v K13, (t0) + addi t0, t0, 16 + vle32.v K14, (t0) + addi t0, t0, 16 + vle32.v K15, (t0) + + // Setup mask for the vmerge to replace the first word (idx==0) in + // message scheduling. There are 4 words, so an 8-bit mask suffices. + vsetivli zero, 1, e8, m1, ta, ma + vmv.v.i MASK, 0x01 + + // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we + // need {f,e,b,a},{h,g,d,c}. The dst vtype is e32m1 and the index vtype + // is e8mf4. We use index-load with the i8 indices {20, 16, 4, 0}, + // loaded using the 32-bit little endian value 0x00041014. + li t0, 0x00041014 + vsetivli zero, 1, e32, m1, ta, ma + vmv.v.x INDICES, t0 + addi STATEP_C, STATEP, 8 + vsetivli zero, 4, e32, m1, ta, ma + vluxei8.v FEBA, (STATEP), INDICES + vluxei8.v HGDC, (STATEP_C), INDICES + +.Lnext_block: + addi NUM_BLOCKS, NUM_BLOCKS, -1 + + // Save the previous state, as it's needed later. 
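+	// (It is added back into the working state after the 64 rounds, per the
+	// feed-forward step of the SHA-256 compression function.)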
+ vmv.v.v PREV_FEBA, FEBA + vmv.v.v PREV_HGDC, HGDC + + // Load the next 512-bit message block and endian-swap each 32-bit word. + vle32.v W0, (DATA) + vrev8.v W0, W0 + addi DATA, DATA, 16 + vle32.v W1, (DATA) + vrev8.v W1, W1 + addi DATA, DATA, 16 + vle32.v W2, (DATA) + vrev8.v W2, W2 + addi DATA, DATA, 16 + vle32.v W3, (DATA) + vrev8.v W3, W3 + addi DATA, DATA, 16 + + // Do the 64 rounds of SHA-256. + sha256_16rounds 0, K0, K1, K2, K3 + sha256_16rounds 0, K4, K5, K6, K7 + sha256_16rounds 0, K8, K9, K10, K11 + sha256_16rounds 1, K12, K13, K14, K15 + + // Add the previous state. + vadd.vv FEBA, FEBA, PREV_FEBA + vadd.vv HGDC, HGDC, PREV_HGDC + + // Repeat if more blocks remain. + bnez NUM_BLOCKS, .Lnext_block + + // Store the new state and return. + vsuxei8.v FEBA, (STATEP), INDICES + vsuxei8.v HGDC, (STATEP_C), INDICES + ret +SYM_FUNC_END(sha256_transform_zvknha_or_zvknhb_zvkb) + +.section ".rodata" +.p2align 2 +.type K256, @object +K256: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +.size K256, . - K256 diff --git a/lib/crypto/riscv/sha256.h b/lib/crypto/riscv/sha256.h new file mode 100644 index 000000000000..c0f79c18f119 --- /dev/null +++ b/lib/crypto/riscv/sha256.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 (RISC-V accelerated) + * + * Copyright (C) 2022 VRULL GmbH + * Author: Heiko Stuebner <heiko.stuebner@vrull.eu> + * + * Copyright (C) 2023 SiFive, Inc. + * Author: Jerry Shih <jerry.shih@sifive.com> + */ + +#include <asm/vector.h> +#include <crypto/internal/simd.h> + +asmlinkage void +sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state, + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_extensions) && crypto_simd_usable()) { + kernel_vector_begin(); + sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks); + kernel_vector_end(); + } else { + sha256_blocks_generic(state, data, nblocks); + } +} + +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + /* Both zvknha and zvknhb provide the SHA-256 instructions. 
*/ + if ((riscv_isa_extension_available(NULL, ZVKNHA) || + riscv_isa_extension_available(NULL, ZVKNHB)) && + riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + static_branch_enable(&have_extensions); +} diff --git a/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S b/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S new file mode 100644 index 000000000000..b41eebf60546 --- /dev/null +++ b/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu> +// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com> +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknhb') +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include <linux/linkage.h> + +.text +.option arch, +zvknhb, +zvkb + +#define STATEP a0 +#define DATA a1 +#define NUM_BLOCKS a2 + +#define STATEP_C a3 +#define K a4 + +#define MASK v0 +#define INDICES v1 +#define W0 v10 // LMUL=2 +#define W1 v12 // LMUL=2 +#define W2 v14 // LMUL=2 +#define W3 v16 // LMUL=2 +#define VTMP v20 // LMUL=2 +#define FEBA v22 // LMUL=2 +#define HGDC v24 // LMUL=2 +#define PREV_FEBA v26 // LMUL=2 +#define PREV_HGDC v28 // LMUL=2 + +// Do 4 rounds of SHA-512. w0 contains the current 4 message schedule words. +// +// If not all the message schedule words have been computed yet, then this also +// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4 +// message schedule words; this macro computes the group after w3 and writes it +// to w0. 
This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3, +// w0), so the caller must cycle through the registers accordingly. +.macro sha512_4rounds last, w0, w1, w2, w3 + vle64.v VTMP, (K) + addi K, K, 32 + vadd.vv VTMP, VTMP, \w0 + vsha2cl.vv HGDC, FEBA, VTMP + vsha2ch.vv FEBA, HGDC, VTMP +.if !\last + vmerge.vvm VTMP, \w2, \w1, MASK + vsha2ms.vv \w0, VTMP, \w3 +.endif +.endm + +.macro sha512_16rounds last + sha512_4rounds \last, W0, W1, W2, W3 + sha512_4rounds \last, W1, W2, W3, W0 + sha512_4rounds \last, W2, W3, W0, W1 + sha512_4rounds \last, W3, W0, W1, W2 +.endm + +// void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state, +// const u8 *data, size_t nblocks); +SYM_FUNC_START(sha512_transform_zvknhb_zvkb) + + // Setup mask for the vmerge to replace the first word (idx==0) in + // message scheduling. There are 4 words, so an 8-bit mask suffices. + vsetivli zero, 1, e8, m1, ta, ma + vmv.v.i MASK, 0x01 + + // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we + // need {f,e,b,a},{h,g,d,c}. The dst vtype is e64m2 and the index vtype + // is e8mf4. We use index-load with the i8 indices {40, 32, 8, 0}, + // loaded using the 32-bit little endian value 0x00082028. + li t0, 0x00082028 + vsetivli zero, 1, e32, m1, ta, ma + vmv.v.x INDICES, t0 + addi STATEP_C, STATEP, 16 + vsetivli zero, 4, e64, m2, ta, ma + vluxei8.v FEBA, (STATEP), INDICES + vluxei8.v HGDC, (STATEP_C), INDICES + +.Lnext_block: + la K, K512 + addi NUM_BLOCKS, NUM_BLOCKS, -1 + + // Save the previous state, as it's needed later. + vmv.v.v PREV_FEBA, FEBA + vmv.v.v PREV_HGDC, HGDC + + // Load the next 1024-bit message block and endian-swap each 64-bit word + vle64.v W0, (DATA) + vrev8.v W0, W0 + addi DATA, DATA, 32 + vle64.v W1, (DATA) + vrev8.v W1, W1 + addi DATA, DATA, 32 + vle64.v W2, (DATA) + vrev8.v W2, W2 + addi DATA, DATA, 32 + vle64.v W3, (DATA) + vrev8.v W3, W3 + addi DATA, DATA, 32 + + // Do the 80 rounds of SHA-512. + sha512_16rounds 0 + sha512_16rounds 0 + sha512_16rounds 0 + sha512_16rounds 0 + sha512_16rounds 1 + + // Add the previous state. + vadd.vv FEBA, FEBA, PREV_FEBA + vadd.vv HGDC, HGDC, PREV_HGDC + + // Repeat if more blocks remain. + bnez NUM_BLOCKS, .Lnext_block + + // Store the new state and return. 
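+	// (The indexed store uses the same INDICES as the load, so the words
+	// land back in the {a,b,c,d,e,f,g,h} layout of the state structure.)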
+ vsuxei8.v FEBA, (STATEP), INDICES + vsuxei8.v HGDC, (STATEP_C), INDICES + ret +SYM_FUNC_END(sha512_transform_zvknhb_zvkb) + +.section ".rodata" +.p2align 3 +.type K512, @object +K512: + .dword 0x428a2f98d728ae22, 0x7137449123ef65cd + .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .dword 0x3956c25bf348b538, 0x59f111f1b605d019 + .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .dword 0xd807aa98a3030242, 0x12835b0145706fbe + .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .dword 0x9bdc06a725c71235, 0xc19bf174cf692694 + .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .dword 0x983e5152ee66dfab, 0xa831c66d2db43210 + .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .dword 0x06ca6351e003826f, 0x142929670a0e6e70 + .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .dword 0x81c2c92e47edaee6, 0x92722c851482353b + .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .dword 0xd192e819d6ef5218, 0xd69906245565a910 + .dword 0xf40e35855771202a, 0x106aa07032bbd1b8 + .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .dword 0x90befffa23631e28, 0xa4506cebde82bde9 + .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .dword 0xca273eceea26619c, 0xd186b8c721c0c207 + .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .dword 0x113f9804bef90dae, 0x1b710b35131c471b + .dword 0x28db77f523047d84, 0x32caab7b40c72493 + .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 +.size K512, . - K512 diff --git a/lib/crypto/riscv/sha512.h b/lib/crypto/riscv/sha512.h new file mode 100644 index 000000000000..9d0abede322f --- /dev/null +++ b/lib/crypto/riscv/sha512.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-512 and SHA-384 using the RISC-V vector crypto extensions + * + * Copyright (C) 2023 VRULL GmbH + * Author: Heiko Stuebner <heiko.stuebner@vrull.eu> + * + * Copyright (C) 2023 SiFive, Inc. 
+ * Author: Jerry Shih <jerry.shih@sifive.com> + */ + +#include <asm/simd.h> +#include <asm/vector.h> +#include <crypto/internal/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); + +asmlinkage void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state, + const u8 *data, size_t nblocks); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_extensions) && + likely(crypto_simd_usable())) { + kernel_vector_begin(); + sha512_transform_zvknhb_zvkb(state, data, nblocks); + kernel_vector_end(); + } else { + sha512_blocks_generic(state, data, nblocks); + } +} + +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (riscv_isa_extension_available(NULL, ZVKNHB) && + riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + static_branch_enable(&have_extensions); +} diff --git a/lib/crypto/s390/Kconfig b/lib/crypto/s390/Kconfig new file mode 100644 index 000000000000..069b355fe51a --- /dev/null +++ b/lib/crypto/s390/Kconfig @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA_S390 + tristate + default CRYPTO_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/s390/Makefile b/lib/crypto/s390/Makefile new file mode 100644 index 000000000000..06c2cf77178e --- /dev/null +++ b/lib/crypto/s390/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o +chacha_s390-y := chacha-glue.o chacha-s390.o diff --git a/lib/crypto/s390/chacha-glue.c b/lib/crypto/s390/chacha-glue.c new file mode 100644 index 000000000000..c57dc851214f --- /dev/null +++ b/lib/crypto/s390/chacha-glue.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ChaCha stream cipher (s390 optimized) + * + * Copyright IBM Corp. 
2021 + */ + +#define KMSG_COMPONENT "chacha_s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <crypto/chacha.h> +#include <linux/cpufeature.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sizes.h> +#include <asm/fpu.h> +#include "chacha-s390.h" + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + /* TODO: implement hchacha_block_arch() in assembly */ + hchacha_block_generic(state, out, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + /* s390 chacha20 implementation has 20 rounds hard-coded, + * it cannot handle a block of data or less, but otherwise + * it can handle data of arbitrary size + */ + if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) { + chacha_crypt_generic(state, dst, src, bytes, nrounds); + } else { + DECLARE_KERNEL_FPU_ONSTACK32(vxstate); + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + chacha20_vx(dst, src, bytes, &state->x[4], &state->x[12]); + kernel_fpu_end(&vxstate, KERNEL_VXR); + + state->x[12] += round_up(bytes, CHACHA_BLOCK_SIZE) / + CHACHA_BLOCK_SIZE; + } +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return cpu_has_vx(); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/s390/chacha-s390.S b/lib/crypto/s390/chacha-s390.S new file mode 100644 index 000000000000..63f3102678c0 --- /dev/null +++ b/lib/crypto/s390/chacha-s390.S @@ -0,0 +1,908 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Original implementation written by Andy Polyakov, @dot-asm. + * This is an adaptation of the original code for kernel use. + * + * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. 
+ */ + +#include <linux/linkage.h> +#include <asm/nospec-insn.h> +#include <asm/fpu-insn.h> + +#define SP %r15 +#define FRAME (16 * 8 + 4 * 8) + + .data + .balign 32 + +SYM_DATA_START_LOCAL(sigma) + .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral + .long 1,0,0,0 + .long 2,0,0,0 + .long 3,0,0,0 + .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap + + .long 0,1,2,3 + .long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma + .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e + .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 + .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 +SYM_DATA_END(sigma) + + .previous + + GEN_BR_THUNK %r14 + + .text + +############################################################################# +# void chacha20_vx_4x(u8 *out, counst u8 *inp, size_t len, +# counst u32 *key, const u32 *counter) + +#define OUT %r2 +#define INP %r3 +#define LEN %r4 +#define KEY %r5 +#define COUNTER %r6 + +#define BEPERM %v31 +#define CTR %v26 + +#define K0 %v16 +#define K1 %v17 +#define K2 %v18 +#define K3 %v19 + +#define XA0 %v0 +#define XA1 %v1 +#define XA2 %v2 +#define XA3 %v3 + +#define XB0 %v4 +#define XB1 %v5 +#define XB2 %v6 +#define XB3 %v7 + +#define XC0 %v8 +#define XC1 %v9 +#define XC2 %v10 +#define XC3 %v11 + +#define XD0 %v12 +#define XD1 %v13 +#define XD2 %v14 +#define XD3 %v15 + +#define XT0 %v27 +#define XT1 %v28 +#define XT2 %v29 +#define XT3 %v30 + +SYM_FUNC_START(chacha20_vx_4x) + stmg %r6,%r7,6*8(SP) + + larl %r7,sigma + lhi %r0,10 + lhi %r1,0 + + VL K0,0,,%r7 # load sigma + VL K1,0,,KEY # load key + VL K2,16,,KEY + VL K3,0,,COUNTER # load counter + + VL BEPERM,0x40,,%r7 + VL CTR,0x50,,%r7 + + VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma + + VREPF XB0,K1,0 # smash the key + VREPF XB1,K1,1 + VREPF XB2,K1,2 + VREPF XB3,K1,3 + + VREPF XD0,K3,0 + VREPF XD1,K3,1 + VREPF XD2,K3,2 + VREPF XD3,K3,3 + VAF XD0,XD0,CTR + + VREPF XC0,K2,0 + VREPF XC1,K2,1 + VREPF XC2,K2,2 + VREPF XC3,K2,3 + +.Loop_4x: + VAF XA0,XA0,XB0 + VX XD0,XD0,XA0 + VERLLF XD0,XD0,16 + + VAF XA1,XA1,XB1 + VX XD1,XD1,XA1 + VERLLF XD1,XD1,16 + + VAF XA2,XA2,XB2 + VX XD2,XD2,XA2 + VERLLF XD2,XD2,16 + + VAF XA3,XA3,XB3 + VX XD3,XD3,XA3 + VERLLF XD3,XD3,16 + + VAF XC0,XC0,XD0 + VX XB0,XB0,XC0 + VERLLF XB0,XB0,12 + + VAF XC1,XC1,XD1 + VX XB1,XB1,XC1 + VERLLF XB1,XB1,12 + + VAF XC2,XC2,XD2 + VX XB2,XB2,XC2 + VERLLF XB2,XB2,12 + + VAF XC3,XC3,XD3 + VX XB3,XB3,XC3 + VERLLF XB3,XB3,12 + + VAF XA0,XA0,XB0 + VX XD0,XD0,XA0 + VERLLF XD0,XD0,8 + + VAF XA1,XA1,XB1 + VX XD1,XD1,XA1 + VERLLF XD1,XD1,8 + + VAF XA2,XA2,XB2 + VX XD2,XD2,XA2 + VERLLF XD2,XD2,8 + + VAF XA3,XA3,XB3 + VX XD3,XD3,XA3 + VERLLF XD3,XD3,8 + + VAF XC0,XC0,XD0 + VX XB0,XB0,XC0 + VERLLF XB0,XB0,7 + + VAF XC1,XC1,XD1 + VX XB1,XB1,XC1 + VERLLF XB1,XB1,7 + + VAF XC2,XC2,XD2 + VX XB2,XB2,XC2 + VERLLF XB2,XB2,7 + + VAF XC3,XC3,XD3 + VX XB3,XB3,XC3 + VERLLF XB3,XB3,7 + + VAF XA0,XA0,XB1 + VX XD3,XD3,XA0 + VERLLF XD3,XD3,16 + + VAF XA1,XA1,XB2 + VX XD0,XD0,XA1 + VERLLF XD0,XD0,16 + + VAF XA2,XA2,XB3 + VX XD1,XD1,XA2 + VERLLF XD1,XD1,16 + + VAF XA3,XA3,XB0 + VX XD2,XD2,XA3 + VERLLF XD2,XD2,16 + + VAF XC2,XC2,XD3 + VX XB1,XB1,XC2 + VERLLF XB1,XB1,12 + + VAF XC3,XC3,XD0 + VX XB2,XB2,XC3 + VERLLF XB2,XB2,12 + + VAF XC0,XC0,XD1 + VX XB3,XB3,XC0 + VERLLF XB3,XB3,12 + + VAF XC1,XC1,XD2 + VX XB0,XB0,XC1 + VERLLF XB0,XB0,12 + + VAF XA0,XA0,XB1 + VX XD3,XD3,XA0 + VERLLF XD3,XD3,8 + + VAF XA1,XA1,XB2 + VX XD0,XD0,XA1 + VERLLF XD0,XD0,8 + + VAF XA2,XA2,XB3 + VX XD1,XD1,XA2 + VERLLF XD1,XD1,8 + + VAF XA3,XA3,XB0 + VX XD2,XD2,XA3 + 
VERLLF XD2,XD2,8 + + VAF XC2,XC2,XD3 + VX XB1,XB1,XC2 + VERLLF XB1,XB1,7 + + VAF XC3,XC3,XD0 + VX XB2,XB2,XC3 + VERLLF XB2,XB2,7 + + VAF XC0,XC0,XD1 + VX XB3,XB3,XC0 + VERLLF XB3,XB3,7 + + VAF XC1,XC1,XD2 + VX XB0,XB0,XC1 + VERLLF XB0,XB0,7 + brct %r0,.Loop_4x + + VAF XD0,XD0,CTR + + VMRHF XT0,XA0,XA1 # transpose data + VMRHF XT1,XA2,XA3 + VMRLF XT2,XA0,XA1 + VMRLF XT3,XA2,XA3 + VPDI XA0,XT0,XT1,0b0000 + VPDI XA1,XT0,XT1,0b0101 + VPDI XA2,XT2,XT3,0b0000 + VPDI XA3,XT2,XT3,0b0101 + + VMRHF XT0,XB0,XB1 + VMRHF XT1,XB2,XB3 + VMRLF XT2,XB0,XB1 + VMRLF XT3,XB2,XB3 + VPDI XB0,XT0,XT1,0b0000 + VPDI XB1,XT0,XT1,0b0101 + VPDI XB2,XT2,XT3,0b0000 + VPDI XB3,XT2,XT3,0b0101 + + VMRHF XT0,XC0,XC1 + VMRHF XT1,XC2,XC3 + VMRLF XT2,XC0,XC1 + VMRLF XT3,XC2,XC3 + VPDI XC0,XT0,XT1,0b0000 + VPDI XC1,XT0,XT1,0b0101 + VPDI XC2,XT2,XT3,0b0000 + VPDI XC3,XT2,XT3,0b0101 + + VMRHF XT0,XD0,XD1 + VMRHF XT1,XD2,XD3 + VMRLF XT2,XD0,XD1 + VMRLF XT3,XD2,XD3 + VPDI XD0,XT0,XT1,0b0000 + VPDI XD1,XT0,XT1,0b0101 + VPDI XD2,XT2,XT3,0b0000 + VPDI XD3,XT2,XT3,0b0101 + + VAF XA0,XA0,K0 + VAF XB0,XB0,K1 + VAF XC0,XC0,K2 + VAF XD0,XD0,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + + VAF XA0,XA1,K0 + VAF XB0,XB1,K1 + VAF XC0,XC1,K2 + VAF XD0,XD1,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_4x + + VAF XA0,XA2,K0 + VAF XB0,XB2,K1 + VAF XC0,XC2,K2 + VAF XD0,XD2,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_4x + + VAF XA0,XA3,K0 + VAF XB0,XB3,K1 + VAF XC0,XC3,K2 + VAF XD0,XD3,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + +.Ldone_4x: + lmg %r6,%r7,6*8(SP) + BR_EX %r14 + +.Ltail_4x: + VLR XT0,XC0 + VLR XT1,XD0 + + VST XA0,8*8+0x00,,SP + VST XB0,8*8+0x10,,SP + VST XT0,8*8+0x20,,SP + VST XT1,8*8+0x30,,SP + + lghi %r1,0 + +.Loop_tail_4x: + llgc %r5,0(%r1,INP) + llgc %r6,8*8(%r1,SP) + xr %r6,%r5 + stc %r6,0(%r1,OUT) + la %r1,1(%r1) + brct LEN,.Loop_tail_4x + + lmg %r6,%r7,6*8(SP) + BR_EX %r14 +SYM_FUNC_END(chacha20_vx_4x) + +#undef OUT +#undef INP +#undef LEN +#undef KEY +#undef COUNTER + +#undef BEPERM + +#undef K0 +#undef K1 +#undef K2 +#undef K3 + + +############################################################################# +# void chacha20_vx(u8 *out, counst u8 *inp, size_t len, +# counst u32 *key, const u32 *counter) + +#define OUT %r2 +#define INP %r3 +#define LEN %r4 +#define KEY %r5 +#define COUNTER %r6 + +#define BEPERM %v31 + +#define K0 %v27 +#define K1 %v24 +#define K2 %v25 +#define K3 %v26 + +#define A0 %v0 +#define B0 %v1 +#define C0 %v2 +#define D0 %v3 + +#define A1 %v4 +#define B1 %v5 +#define C1 %v6 
+#define D1 %v7 + +#define A2 %v8 +#define B2 %v9 +#define C2 %v10 +#define D2 %v11 + +#define A3 %v12 +#define B3 %v13 +#define C3 %v14 +#define D3 %v15 + +#define A4 %v16 +#define B4 %v17 +#define C4 %v18 +#define D4 %v19 + +#define A5 %v20 +#define B5 %v21 +#define C5 %v22 +#define D5 %v23 + +#define T0 %v27 +#define T1 %v28 +#define T2 %v29 +#define T3 %v30 + +SYM_FUNC_START(chacha20_vx) + clgfi LEN,256 + jle chacha20_vx_4x + stmg %r6,%r7,6*8(SP) + + lghi %r1,-FRAME + lgr %r0,SP + la SP,0(%r1,SP) + stg %r0,0(SP) # back-chain + + larl %r7,sigma + lhi %r0,10 + + VLM K1,K2,0,KEY,0 # load key + VL K3,0,,COUNTER # load counter + + VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ... + +.Loop_outer_vx: + VLR A0,K0 + VLR B0,K1 + VLR A1,K0 + VLR B1,K1 + VLR A2,K0 + VLR B2,K1 + VLR A3,K0 + VLR B3,K1 + VLR A4,K0 + VLR B4,K1 + VLR A5,K0 + VLR B5,K1 + + VLR D0,K3 + VAF D1,K3,T1 # K[3]+1 + VAF D2,K3,T2 # K[3]+2 + VAF D3,K3,T3 # K[3]+3 + VAF D4,D2,T2 # K[3]+4 + VAF D5,D2,T3 # K[3]+5 + + VLR C0,K2 + VLR C1,K2 + VLR C2,K2 + VLR C3,K2 + VLR C4,K2 + VLR C5,K2 + + VLR T1,D1 + VLR T2,D2 + VLR T3,D3 + +.Loop_vx: + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,16 + VERLLF D1,D1,16 + VERLLF D2,D2,16 + VERLLF D3,D3,16 + VERLLF D4,D4,16 + VERLLF D5,D5,16 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,12 + VERLLF B1,B1,12 + VERLLF B2,B2,12 + VERLLF B3,B3,12 + VERLLF B4,B4,12 + VERLLF B5,B5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,8 + VERLLF D1,D1,8 + VERLLF D2,D2,8 + VERLLF D3,D3,8 + VERLLF D4,D4,8 + VERLLF D5,D5,8 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,7 + VERLLF B1,B1,7 + VERLLF B2,B2,7 + VERLLF B3,B3,7 + VERLLF B4,B4,7 + VERLLF B5,B5,7 + + VSLDB C0,C0,C0,8 + VSLDB C1,C1,C1,8 + VSLDB C2,C2,C2,8 + VSLDB C3,C3,C3,8 + VSLDB C4,C4,C4,8 + VSLDB C5,C5,C5,8 + VSLDB B0,B0,B0,4 + VSLDB B1,B1,B1,4 + VSLDB B2,B2,B2,4 + VSLDB B3,B3,B3,4 + VSLDB B4,B4,B4,4 + VSLDB B5,B5,B5,4 + VSLDB D0,D0,D0,12 + VSLDB D1,D1,D1,12 + VSLDB D2,D2,D2,12 + VSLDB D3,D3,D3,12 + VSLDB D4,D4,D4,12 + VSLDB D5,D5,D5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,16 + VERLLF D1,D1,16 + VERLLF D2,D2,16 + VERLLF D3,D3,16 + VERLLF D4,D4,16 + VERLLF D5,D5,16 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,12 + VERLLF B1,B1,12 + VERLLF B2,B2,12 + VERLLF B3,B3,12 + VERLLF B4,B4,12 + VERLLF B5,B5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,8 + VERLLF D1,D1,8 + VERLLF D2,D2,8 + VERLLF D3,D3,8 + VERLLF D4,D4,8 + VERLLF D5,D5,8 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX 
B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,7 + VERLLF B1,B1,7 + VERLLF B2,B2,7 + VERLLF B3,B3,7 + VERLLF B4,B4,7 + VERLLF B5,B5,7 + + VSLDB C0,C0,C0,8 + VSLDB C1,C1,C1,8 + VSLDB C2,C2,C2,8 + VSLDB C3,C3,C3,8 + VSLDB C4,C4,C4,8 + VSLDB C5,C5,C5,8 + VSLDB B0,B0,B0,12 + VSLDB B1,B1,B1,12 + VSLDB B2,B2,B2,12 + VSLDB B3,B3,B3,12 + VSLDB B4,B4,B4,12 + VSLDB B5,B5,B5,12 + VSLDB D0,D0,D0,4 + VSLDB D1,D1,D1,4 + VSLDB D2,D2,D2,4 + VSLDB D3,D3,D3,4 + VSLDB D4,D4,D4,4 + VSLDB D5,D5,D5,4 + brct %r0,.Loop_vx + + VAF A0,A0,K0 + VAF B0,B0,K1 + VAF C0,C0,K2 + VAF D0,D0,K3 + VAF A1,A1,K0 + VAF D1,D1,T1 # +K[3]+1 + + VPERM A0,A0,A0,BEPERM + VPERM B0,B0,B0,BEPERM + VPERM C0,C0,C0,BEPERM + VPERM D0,D0,D0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VAF D2,D2,T2 # +K[3]+2 + VAF D3,D3,T3 # +K[3]+3 + VLM T0,T3,0,INP,0 + + VX A0,A0,T0 + VX B0,B0,T1 + VX C0,C0,T2 + VX D0,D0,T3 + + VLM K0,T3,0,%r7,4 # re-load sigma and increments + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF B1,B1,K1 + VAF C1,C1,K2 + + VPERM A0,A1,A1,BEPERM + VPERM B0,B1,B1,BEPERM + VPERM C0,C1,C1,BEPERM + VPERM D0,D1,D1,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A2,A2,K0 + VAF B2,B2,K1 + VAF C2,C2,K2 + + VPERM A0,A2,A2,BEPERM + VPERM B0,B2,B2,BEPERM + VPERM C0,C2,C2,BEPERM + VPERM D0,D2,D2,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A3,A3,K0 + VAF B3,B3,K1 + VAF C3,C3,K2 + VAF D2,K3,T3 # K[3]+3 + + VPERM A0,A3,A3,BEPERM + VPERM B0,B3,B3,BEPERM + VPERM C0,C3,C3,BEPERM + VPERM D0,D3,D3,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VAF D3,D2,T1 # K[3]+4 + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A4,A4,K0 + VAF B4,B4,K1 + VAF C4,C4,K2 + VAF D4,D4,D3 # +K[3]+4 + VAF D3,D3,T1 # K[3]+5 + VAF K3,D2,T3 # K[3]+=6 + + VPERM A0,A4,A4,BEPERM + VPERM B0,B4,B4,BEPERM + VPERM C0,C4,C4,BEPERM + VPERM D0,D4,D4,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A5,A5,K0 + VAF B5,B5,K1 + VAF C5,C5,K2 + VAF D5,D5,D3 # +K[3]+5 + + VPERM A0,A5,A5,BEPERM + VPERM B0,B5,B5,BEPERM + VPERM C0,C5,C5,BEPERM + VPERM D0,D5,D5,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + lhi %r0,10 + aghi LEN,-0x40 + jne .Loop_outer_vx + +.Ldone_vx: + lmg %r6,%r7,FRAME+6*8(SP) + la SP,FRAME(SP) + BR_EX %r14 + +.Ltail_vx: + VSTM A0,D0,8*8,SP,3 + lghi %r1,0 + +.Loop_tail_vx: + llgc %r5,0(%r1,INP) + llgc %r6,8*8(%r1,SP) + xr %r6,%r5 + stc %r6,0(%r1,OUT) + la %r1,1(%r1) + brct LEN,.Loop_tail_vx + + lmg %r6,%r7,FRAME+6*8(SP) + la SP,FRAME(SP) + BR_EX %r14 +SYM_FUNC_END(chacha20_vx) + +.previous diff --git a/lib/crypto/s390/chacha-s390.h b/lib/crypto/s390/chacha-s390.h new file mode 100644 index 000000000000..733744ce30f5 --- /dev/null +++ b/lib/crypto/s390/chacha-s390.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ 
+/* + * s390 ChaCha stream cipher. + * + * Copyright IBM Corp. 2021 + */ + +#ifndef _CHACHA_S390_H +#define _CHACHA_S390_H + +void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key, + const u32 *counter); + +#endif /* _CHACHA_S390_H */ diff --git a/lib/crypto/s390/sha1.h b/lib/crypto/s390/sha1.h new file mode 100644 index 000000000000..08bd138e881c --- /dev/null +++ b/lib/crypto/s390/sha1.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized using the CP Assist for Cryptographic Functions (CPACF) + * + * Copyright 2025 Google LLC + */ +#include <asm/cpacf.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha1); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_cpacf_sha1)) + cpacf_kimd(CPACF_KIMD_SHA_1, state, data, + nblocks * SHA1_BLOCK_SIZE); + else + sha1_blocks_generic(state, data, nblocks); +} + +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + if (cpu_have_feature(S390_CPU_FEATURE_MSA) && + cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1)) + static_branch_enable(&have_cpacf_sha1); +} diff --git a/lib/crypto/s390/sha256.h b/lib/crypto/s390/sha256.h new file mode 100644 index 000000000000..70a81cbc06b2 --- /dev/null +++ b/lib/crypto/s390/sha256.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF) + * + * Copyright 2025 Google LLC + */ +#include <asm/cpacf.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_cpacf_sha256)) + cpacf_kimd(CPACF_KIMD_SHA_256, state, data, + nblocks * SHA256_BLOCK_SIZE); + else + sha256_blocks_generic(state, data, nblocks); +} + +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + if (cpu_have_feature(S390_CPU_FEATURE_MSA) && + cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) + static_branch_enable(&have_cpacf_sha256); +} diff --git a/lib/crypto/s390/sha512.h b/lib/crypto/s390/sha512.h new file mode 100644 index 000000000000..24744651550c --- /dev/null +++ b/lib/crypto/s390/sha512.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-512 optimized using the CP Assist for Cryptographic Functions (CPACF) + * + * Copyright 2025 Google LLC + */ +#include <asm/cpacf.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha512); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_cpacf_sha512)) + cpacf_kimd(CPACF_KIMD_SHA_512, state, data, + nblocks * SHA512_BLOCK_SIZE); + else + sha512_blocks_generic(state, data, nblocks); +} + +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (cpu_have_feature(S390_CPU_FEATURE_MSA) && + cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512)) + static_branch_enable(&have_cpacf_sha512); +} diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c index ebb60519ae93..5904e4ae85d2 100644 --- a/lib/crypto/sha1.c +++ b/lib/crypto/sha1.c @@ -1,18 +1,21 @@ // SPDX-License-Identifier: GPL-2.0 /* - * SHA1 routine optimized to do word accesses rather than byte accesses, - * and to avoid unnecessary copies into 
the context array. - * - * This was based on the git SHA1 implementation. + * SHA-1 and HMAC-SHA1 library functions */ -#include <linux/kernel.h> +#include <crypto/hmac.h> +#include <crypto/sha1.h> +#include <linux/bitops.h> #include <linux/export.h> +#include <linux/kernel.h> #include <linux/module.h> -#include <linux/bitops.h> #include <linux/string.h> -#include <crypto/sha1.h> #include <linux/unaligned.h> +#include <linux/wordpart.h> + +static const struct sha1_block_state sha1_iv = { + .h = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, +}; /* * If you have 32 registers or more, the compiler can (and should) @@ -124,10 +127,10 @@ void sha1_transform(__u32 *digest, const char *data, __u32 *array) EXPORT_SYMBOL(sha1_transform); /** - * sha1_init - initialize the vectors for a SHA1 digest + * sha1_init_raw - initialize the vectors for a SHA1 digest * @buf: vector to initialize */ -void sha1_init(__u32 *buf) +void sha1_init_raw(__u32 *buf) { buf[0] = 0x67452301; buf[1] = 0xefcdab89; @@ -135,7 +138,211 @@ void sha1_init(__u32 *buf) buf[3] = 0x10325476; buf[4] = 0xc3d2e1f0; } -EXPORT_SYMBOL(sha1_init); +EXPORT_SYMBOL(sha1_init_raw); + +static void __maybe_unused sha1_blocks_generic(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + u32 workspace[SHA1_WORKSPACE_WORDS]; + + do { + sha1_transform(state->h, data, workspace); + data += SHA1_BLOCK_SIZE; + } while (--nblocks); + + memzero_explicit(workspace, sizeof(workspace)); +} + +#ifdef CONFIG_CRYPTO_LIB_SHA1_ARCH +#include "sha1.h" /* $(SRCARCH)/sha1.h */ +#else +#define sha1_blocks sha1_blocks_generic +#endif + +void sha1_init(struct sha1_ctx *ctx) +{ + ctx->state = sha1_iv; + ctx->bytecount = 0; +} +EXPORT_SYMBOL_GPL(sha1_init); + +void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len) +{ + size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE; + + ctx->bytecount += len; + + if (partial + len >= SHA1_BLOCK_SIZE) { + size_t nblocks; + + if (partial) { + size_t l = SHA1_BLOCK_SIZE - partial; + + memcpy(&ctx->buf[partial], data, l); + data += l; + len -= l; + + sha1_blocks(&ctx->state, ctx->buf, 1); + } + + nblocks = len / SHA1_BLOCK_SIZE; + len %= SHA1_BLOCK_SIZE; + + if (nblocks) { + sha1_blocks(&ctx->state, data, nblocks); + data += nblocks * SHA1_BLOCK_SIZE; + } + partial = 0; + } + if (len) + memcpy(&ctx->buf[partial], data, len); +} +EXPORT_SYMBOL_GPL(sha1_update); + +static void __sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]) +{ + u64 bitcount = ctx->bytecount << 3; + size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE; + + ctx->buf[partial++] = 0x80; + if (partial > SHA1_BLOCK_SIZE - 8) { + memset(&ctx->buf[partial], 0, SHA1_BLOCK_SIZE - partial); + sha1_blocks(&ctx->state, ctx->buf, 1); + partial = 0; + } + memset(&ctx->buf[partial], 0, SHA1_BLOCK_SIZE - 8 - partial); + *(__be64 *)&ctx->buf[SHA1_BLOCK_SIZE - 8] = cpu_to_be64(bitcount); + sha1_blocks(&ctx->state, ctx->buf, 1); + + for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4) + put_unaligned_be32(ctx->state.h[i / 4], out + i); +} + +void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]) +{ + __sha1_final(ctx, out); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(sha1_final); + +void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE]) +{ + struct sha1_ctx ctx; + + sha1_init(&ctx); + sha1_update(&ctx, data, len); + sha1_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha1); + +static void __hmac_sha1_preparekey(struct sha1_block_state *istate, + struct sha1_block_state *ostate, + const u8 *raw_key, size_t raw_key_len) +{ + 
union { + u8 b[SHA1_BLOCK_SIZE]; + unsigned long w[SHA1_BLOCK_SIZE / sizeof(unsigned long)]; + } derived_key = { 0 }; + + if (unlikely(raw_key_len > SHA1_BLOCK_SIZE)) + sha1(raw_key, raw_key_len, derived_key.b); + else + memcpy(derived_key.b, raw_key, raw_key_len); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); + *istate = sha1_iv; + sha1_blocks(istate, derived_key.b, 1); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ + HMAC_IPAD_VALUE); + *ostate = sha1_iv; + sha1_blocks(ostate, derived_key.b, 1); + + memzero_explicit(&derived_key, sizeof(derived_key)); +} + +void hmac_sha1_preparekey(struct hmac_sha1_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha1_preparekey(&key->istate, &key->ostate, + raw_key, raw_key_len); +} +EXPORT_SYMBOL_GPL(hmac_sha1_preparekey); + +void hmac_sha1_init(struct hmac_sha1_ctx *ctx, const struct hmac_sha1_key *key) +{ + ctx->sha_ctx.state = key->istate; + ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE; + ctx->ostate = key->ostate; +} +EXPORT_SYMBOL_GPL(hmac_sha1_init); + +void hmac_sha1_init_usingrawkey(struct hmac_sha1_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha1_preparekey(&ctx->sha_ctx.state, &ctx->ostate, + raw_key, raw_key_len); + ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE; +} +EXPORT_SYMBOL_GPL(hmac_sha1_init_usingrawkey); + +void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]) +{ + /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */ + __sha1_final(&ctx->sha_ctx, ctx->sha_ctx.buf); + memset(&ctx->sha_ctx.buf[SHA1_DIGEST_SIZE], 0, + SHA1_BLOCK_SIZE - SHA1_DIGEST_SIZE); + ctx->sha_ctx.buf[SHA1_DIGEST_SIZE] = 0x80; + *(__be32 *)&ctx->sha_ctx.buf[SHA1_BLOCK_SIZE - 4] = + cpu_to_be32(8 * (SHA1_BLOCK_SIZE + SHA1_DIGEST_SIZE)); + + /* Compute the outer hash, which gives the HMAC value. */ + sha1_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1); + for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4) + put_unaligned_be32(ctx->ostate.h[i / 4], out + i); + + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(hmac_sha1_final); + +void hmac_sha1(const struct hmac_sha1_key *key, + const u8 *data, size_t data_len, u8 out[SHA1_DIGEST_SIZE]) +{ + struct hmac_sha1_ctx ctx; + + hmac_sha1_init(&ctx, key); + hmac_sha1_update(&ctx, data, data_len); + hmac_sha1_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha1); + +void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA1_DIGEST_SIZE]) +{ + struct hmac_sha1_ctx ctx; + + hmac_sha1_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha1_update(&ctx, data, data_len); + hmac_sha1_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha1_usingrawkey); + +#ifdef sha1_mod_init_arch +static int __init sha1_mod_init(void) +{ + sha1_mod_init_arch(); + return 0; +} +subsys_initcall(sha1_mod_init); + +static void __exit sha1_mod_exit(void) +{ +} +module_exit(sha1_mod_exit); +#endif -MODULE_DESCRIPTION("SHA-1 Algorithm"); +MODULE_DESCRIPTION("SHA-1 and HMAC-SHA1 library functions"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 04c1f2557e6c..8fa15165d23e 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -1,55 +1,57 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * SHA-256, as specified in - * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf - * - * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. 
+ * SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions * * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2014 Red Hat Inc. + * Copyright 2025 Google LLC */ -#include <linux/unaligned.h> -#include <crypto/sha256_base.h> +#include <crypto/hmac.h> +#include <crypto/sha2.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> +#include <linux/unaligned.h> +#include <linux/wordpart.h> -static const u32 SHA256_K[] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +static const struct sha256_block_state sha224_iv = { + .h = { + SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3, + SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7, + }, }; -static inline u32 Ch(u32 x, u32 y, u32 z) -{ - return z ^ (x & (y ^ z)); -} +static const struct sha256_block_state sha256_iv = { + .h = { + SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, + SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, + }, +}; -static inline u32 Maj(u32 x, u32 y, u32 z) -{ - return (x & y) | (z & (x | y)); -} +static const u32 sha256_K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, + 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, + 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, + 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, + 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, + 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; -#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) -#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) -#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) -#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) +#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y)))) +#define e0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22)) +#define e1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25)) +#define s0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3)) +#define s1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10)) static inline void LOAD_OP(int I, u32 *W, const u8 *input) { @@ -58,18 +60,20 @@ static inline void LOAD_OP(int I, u32 *W, const u8 *input) static inline void BLEND_OP(int I, u32 *W) { - W[I] = s1(W[I-2]) + W[I-7] + 
s0(W[I-15]) + W[I-16]; + W[I] = s1(W[I - 2]) + W[I - 7] + s0(W[I - 15]) + W[I - 16]; } -#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) do { \ - u32 t1, t2; \ - t1 = h + e1(e) + Ch(e, f, g) + SHA256_K[i] + W[i]; \ - t2 = e0(a) + Maj(a, b, c); \ - d += t1; \ - h = t1 + t2; \ -} while (0) +#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) \ + do { \ + u32 t1, t2; \ + t1 = h + e1(e) + Ch(e, f, g) + sha256_K[i] + W[i]; \ + t2 = e0(a) + Maj(a, b, c); \ + d += t1; \ + h = t1 + t2; \ + } while (0) -static void sha256_transform(u32 *state, const u8 *input, u32 *W) +static void sha256_block_generic(struct sha256_block_state *state, + const u8 *input, u32 W[64]) { u32 a, b, c, d, e, f, g, h; int i; @@ -99,8 +103,14 @@ static void sha256_transform(u32 *state, const u8 *input, u32 *W) } /* load the state into our registers */ - a = state[0]; b = state[1]; c = state[2]; d = state[3]; - e = state[4]; f = state[5]; g = state[6]; h = state[7]; + a = state->h[0]; + b = state->h[1]; + c = state->h[2]; + d = state->h[3]; + e = state->h[4]; + f = state->h[5]; + g = state->h[6]; + h = state->h[7]; /* now iterate */ for (i = 0; i < 64; i += 8) { @@ -114,56 +124,313 @@ static void sha256_transform(u32 *state, const u8 *input, u32 *W) SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); } - state[0] += a; state[1] += b; state[2] += c; state[3] += d; - state[4] += e; state[5] += f; state[6] += g; state[7] += h; + state->h[0] += a; + state->h[1] += b; + state->h[2] += c; + state->h[3] += d; + state->h[4] += e; + state->h[5] += f; + state->h[6] += g; + state->h[7] += h; } -static void sha256_transform_blocks(struct sha256_state *sctx, - const u8 *input, int blocks) +static void __maybe_unused +sha256_blocks_generic(struct sha256_block_state *state, + const u8 *data, size_t nblocks) { u32 W[64]; do { - sha256_transform(sctx->state, input, W); - input += SHA256_BLOCK_SIZE; - } while (--blocks); + sha256_block_generic(state, data, W); + data += SHA256_BLOCK_SIZE; + } while (--nblocks); memzero_explicit(W, sizeof(W)); } -void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len) +#if defined(CONFIG_CRYPTO_LIB_SHA256_ARCH) && !defined(__DISABLE_EXPORTS) +#include "sha256.h" /* $(SRCARCH)/sha256.h */ +#else +#define sha256_blocks sha256_blocks_generic +#endif + +static void __sha256_init(struct __sha256_ctx *ctx, + const struct sha256_block_state *iv, + u64 initial_bytecount) { - lib_sha256_base_do_update(sctx, data, len, sha256_transform_blocks); + ctx->state = *iv; + ctx->bytecount = initial_bytecount; } -EXPORT_SYMBOL(sha256_update); -static void __sha256_final(struct sha256_state *sctx, u8 *out, int digest_size) +void sha224_init(struct sha224_ctx *ctx) { - lib_sha256_base_do_finalize(sctx, sha256_transform_blocks); - lib_sha256_base_finish(sctx, out, digest_size); + __sha256_init(&ctx->ctx, &sha224_iv, 0); } +EXPORT_SYMBOL_GPL(sha224_init); -void sha256_final(struct sha256_state *sctx, u8 *out) +void sha256_init(struct sha256_ctx *ctx) { - __sha256_final(sctx, out, 32); + __sha256_init(&ctx->ctx, &sha256_iv, 0); } -EXPORT_SYMBOL(sha256_final); +EXPORT_SYMBOL_GPL(sha256_init); -void sha224_final(struct sha256_state *sctx, u8 *out) +void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len) { - __sha256_final(sctx, out, 28); + size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE; + + ctx->bytecount += len; + + if (partial + len >= SHA256_BLOCK_SIZE) { + size_t nblocks; + + if (partial) { + size_t l = SHA256_BLOCK_SIZE - partial; + + memcpy(&ctx->buf[partial], data, l); + data += l; + len -= l; 
+ + sha256_blocks(&ctx->state, ctx->buf, 1); + } + + nblocks = len / SHA256_BLOCK_SIZE; + len %= SHA256_BLOCK_SIZE; + + if (nblocks) { + sha256_blocks(&ctx->state, data, nblocks); + data += nblocks * SHA256_BLOCK_SIZE; + } + partial = 0; + } + if (len) + memcpy(&ctx->buf[partial], data, len); +} +EXPORT_SYMBOL(__sha256_update); + +static void __sha256_final(struct __sha256_ctx *ctx, + u8 *out, size_t digest_size) +{ + u64 bitcount = ctx->bytecount << 3; + size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE; + + ctx->buf[partial++] = 0x80; + if (partial > SHA256_BLOCK_SIZE - 8) { + memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - partial); + sha256_blocks(&ctx->state, ctx->buf, 1); + partial = 0; + } + memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - 8 - partial); + *(__be64 *)&ctx->buf[SHA256_BLOCK_SIZE - 8] = cpu_to_be64(bitcount); + sha256_blocks(&ctx->state, ctx->buf, 1); + + for (size_t i = 0; i < digest_size; i += 4) + put_unaligned_be32(ctx->state.h[i / 4], out + i); +} + +void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]) +{ + __sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); } EXPORT_SYMBOL(sha224_final); -void sha256(const u8 *data, unsigned int len, u8 *out) +void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]) { - struct sha256_state sctx; + __sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL(sha256_final); + +void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]) +{ + struct sha224_ctx ctx; - sha256_init(&sctx); - sha256_update(&sctx, data, len); - sha256_final(&sctx, out); + sha224_init(&ctx); + sha224_update(&ctx, data, len); + sha224_final(&ctx, out); +} +EXPORT_SYMBOL(sha224); + +void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) +{ + struct sha256_ctx ctx; + + sha256_init(&ctx); + sha256_update(&ctx, data, len); + sha256_final(&ctx, out); } EXPORT_SYMBOL(sha256); -MODULE_DESCRIPTION("SHA-256 Algorithm"); +/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */ +#ifndef __DISABLE_EXPORTS +static void __hmac_sha256_preparekey(struct sha256_block_state *istate, + struct sha256_block_state *ostate, + const u8 *raw_key, size_t raw_key_len, + const struct sha256_block_state *iv) +{ + union { + u8 b[SHA256_BLOCK_SIZE]; + unsigned long w[SHA256_BLOCK_SIZE / sizeof(unsigned long)]; + } derived_key = { 0 }; + + if (unlikely(raw_key_len > SHA256_BLOCK_SIZE)) { + if (iv == &sha224_iv) + sha224(raw_key, raw_key_len, derived_key.b); + else + sha256(raw_key, raw_key_len, derived_key.b); + } else { + memcpy(derived_key.b, raw_key, raw_key_len); + } + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); + *istate = *iv; + sha256_blocks(istate, derived_key.b, 1); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ + HMAC_IPAD_VALUE); + *ostate = *iv; + sha256_blocks(ostate, derived_key.b, 1); + + memzero_explicit(&derived_key, sizeof(derived_key)); +} + +void hmac_sha224_preparekey(struct hmac_sha224_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate, + raw_key, raw_key_len, &sha224_iv); +} +EXPORT_SYMBOL_GPL(hmac_sha224_preparekey); + +void hmac_sha256_preparekey(struct hmac_sha256_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate, + raw_key, raw_key_len, 
&sha256_iv); +} +EXPORT_SYMBOL_GPL(hmac_sha256_preparekey); + +void __hmac_sha256_init(struct __hmac_sha256_ctx *ctx, + const struct __hmac_sha256_key *key) +{ + __sha256_init(&ctx->sha_ctx, &key->istate, SHA256_BLOCK_SIZE); + ctx->ostate = key->ostate; +} +EXPORT_SYMBOL_GPL(__hmac_sha256_init); + +void hmac_sha224_init_usingrawkey(struct hmac_sha224_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, + raw_key, raw_key_len, &sha224_iv); + ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE; +} +EXPORT_SYMBOL_GPL(hmac_sha224_init_usingrawkey); + +void hmac_sha256_init_usingrawkey(struct hmac_sha256_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, + raw_key, raw_key_len, &sha256_iv); + ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE; +} +EXPORT_SYMBOL_GPL(hmac_sha256_init_usingrawkey); + +static void __hmac_sha256_final(struct __hmac_sha256_ctx *ctx, + u8 *out, size_t digest_size) +{ + /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */ + __sha256_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size); + memset(&ctx->sha_ctx.buf[digest_size], 0, + SHA256_BLOCK_SIZE - digest_size); + ctx->sha_ctx.buf[digest_size] = 0x80; + *(__be32 *)&ctx->sha_ctx.buf[SHA256_BLOCK_SIZE - 4] = + cpu_to_be32(8 * (SHA256_BLOCK_SIZE + digest_size)); + + /* Compute the outer hash, which gives the HMAC value. */ + sha256_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1); + for (size_t i = 0; i < digest_size; i += 4) + put_unaligned_be32(ctx->ostate.h[i / 4], out + i); + + memzero_explicit(ctx, sizeof(*ctx)); +} + +void hmac_sha224_final(struct hmac_sha224_ctx *ctx, + u8 out[SHA224_DIGEST_SIZE]) +{ + __hmac_sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE); +} +EXPORT_SYMBOL_GPL(hmac_sha224_final); + +void hmac_sha256_final(struct hmac_sha256_ctx *ctx, + u8 out[SHA256_DIGEST_SIZE]) +{ + __hmac_sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE); +} +EXPORT_SYMBOL_GPL(hmac_sha256_final); + +void hmac_sha224(const struct hmac_sha224_key *key, + const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE]) +{ + struct hmac_sha224_ctx ctx; + + hmac_sha224_init(&ctx, key); + hmac_sha224_update(&ctx, data, data_len); + hmac_sha224_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha224); + +void hmac_sha256(const struct hmac_sha256_key *key, + const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE]) +{ + struct hmac_sha256_ctx ctx; + + hmac_sha256_init(&ctx, key); + hmac_sha256_update(&ctx, data, data_len); + hmac_sha256_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha256); + +void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA224_DIGEST_SIZE]) +{ + struct hmac_sha224_ctx ctx; + + hmac_sha224_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha224_update(&ctx, data, data_len); + hmac_sha224_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha224_usingrawkey); + +void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA256_DIGEST_SIZE]) +{ + struct hmac_sha256_ctx ctx; + + hmac_sha256_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha256_update(&ctx, data, data_len); + hmac_sha256_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey); +#endif /* !__DISABLE_EXPORTS */ + +#ifdef sha256_mod_init_arch +static int __init sha256_mod_init(void) +{ + sha256_mod_init_arch(); + return 0; +} +subsys_initcall(sha256_mod_init); + +static void __exit 
sha256_mod_exit(void) +{ +} +module_exit(sha256_mod_exit); +#endif + +MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/sha512.c b/lib/crypto/sha512.c new file mode 100644 index 000000000000..d8062188be98 --- /dev/null +++ b/lib/crypto/sha512.c @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2003 Kyle McMartin <kyle@debian.org> + * Copyright 2025 Google LLC + */ + +#include <crypto/hmac.h> +#include <crypto/sha2.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/overflow.h> +#include <linux/string.h> +#include <linux/unaligned.h> +#include <linux/wordpart.h> + +static const struct sha512_block_state sha384_iv = { + .h = { + SHA384_H0, SHA384_H1, SHA384_H2, SHA384_H3, + SHA384_H4, SHA384_H5, SHA384_H6, SHA384_H7, + }, +}; + +static const struct sha512_block_state sha512_iv = { + .h = { + SHA512_H0, SHA512_H1, SHA512_H2, SHA512_H3, + SHA512_H4, SHA512_H5, SHA512_H6, SHA512_H7, + }, +}; + +static const u64 sha512_K[80] = { + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, + 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, + 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL, + 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, + 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, + 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL, + 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, + 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, + 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL, + 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, + 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, + 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL, + 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, + 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, + 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL, + 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, + 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, + 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL, + 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, + 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, + 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, +}; + +#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y)))) +#define e0(x) (ror64((x), 28) ^ ror64((x), 34) ^ ror64((x), 39)) +#define e1(x) (ror64((x), 14) ^ ror64((x), 18) ^ ror64((x), 41)) +#define s0(x) (ror64((x), 1) ^ ror64((x), 8) ^ ((x) >> 7)) +#define s1(x) (ror64((x), 19) ^ ror64((x), 61) ^ ((x) >> 
6)) + +static void sha512_block_generic(struct sha512_block_state *state, + const u8 *data) +{ + u64 a = state->h[0]; + u64 b = state->h[1]; + u64 c = state->h[2]; + u64 d = state->h[3]; + u64 e = state->h[4]; + u64 f = state->h[5]; + u64 g = state->h[6]; + u64 h = state->h[7]; + u64 t1, t2; + u64 W[16]; + + for (int j = 0; j < 16; j++) + W[j] = get_unaligned_be64(data + j * sizeof(u64)); + + for (int i = 0; i < 80; i += 8) { + if ((i & 15) == 0 && i != 0) { + for (int j = 0; j < 16; j++) { + W[j & 15] += s1(W[(j - 2) & 15]) + + W[(j - 7) & 15] + + s0(W[(j - 15) & 15]); + } + } + t1 = h + e1(e) + Ch(e, f, g) + sha512_K[i] + W[(i & 15)]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + sha512_K[i+1] + W[(i & 15) + 1]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + sha512_K[i+2] + W[(i & 15) + 2]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + sha512_K[i+3] + W[(i & 15) + 3]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + sha512_K[i+4] + W[(i & 15) + 4]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + sha512_K[i+5] + W[(i & 15) + 5]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + sha512_K[i+6] + W[(i & 15) + 6]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + sha512_K[i+7] + W[(i & 15) + 7]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + } + + state->h[0] += a; + state->h[1] += b; + state->h[2] += c; + state->h[3] += d; + state->h[4] += e; + state->h[5] += f; + state->h[6] += g; + state->h[7] += h; +} + +static void __maybe_unused +sha512_blocks_generic(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + sha512_block_generic(state, data); + data += SHA512_BLOCK_SIZE; + } while (--nblocks); +} + +#ifdef CONFIG_CRYPTO_LIB_SHA512_ARCH +#include "sha512.h" /* $(SRCARCH)/sha512.h */ +#else +#define sha512_blocks sha512_blocks_generic +#endif + +static void __sha512_init(struct __sha512_ctx *ctx, + const struct sha512_block_state *iv, + u64 initial_bytecount) +{ + ctx->state = *iv; + ctx->bytecount_lo = initial_bytecount; + ctx->bytecount_hi = 0; +} + +void sha384_init(struct sha384_ctx *ctx) +{ + __sha512_init(&ctx->ctx, &sha384_iv, 0); +} +EXPORT_SYMBOL_GPL(sha384_init); + +void sha512_init(struct sha512_ctx *ctx) +{ + __sha512_init(&ctx->ctx, &sha512_iv, 0); +} +EXPORT_SYMBOL_GPL(sha512_init); + +void __sha512_update(struct __sha512_ctx *ctx, const u8 *data, size_t len) +{ + size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE; + + if (check_add_overflow(ctx->bytecount_lo, len, &ctx->bytecount_lo)) + ctx->bytecount_hi++; + + if (partial + len >= SHA512_BLOCK_SIZE) { + size_t nblocks; + + if (partial) { + size_t l = SHA512_BLOCK_SIZE - partial; + + memcpy(&ctx->buf[partial], data, l); + data += l; + len -= l; + + sha512_blocks(&ctx->state, ctx->buf, 1); + } + + nblocks = len / SHA512_BLOCK_SIZE; + len %= SHA512_BLOCK_SIZE; + + if (nblocks) { + sha512_blocks(&ctx->state, data, nblocks); + data += nblocks * SHA512_BLOCK_SIZE; + } + partial = 0; + } + if (len) + memcpy(&ctx->buf[partial], data, len); +} +EXPORT_SYMBOL_GPL(__sha512_update); + +static void __sha512_final(struct __sha512_ctx *ctx, + u8 *out, size_t digest_size) +{ + u64 bitcount_hi = (ctx->bytecount_hi << 3) | (ctx->bytecount_lo >> 61); + u64 bitcount_lo = ctx->bytecount_lo << 3; + size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE; + + 
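+	/*
+	 * Standard SHA-384/512 padding (FIPS 180-4): append a 0x80 byte,
+	 * zero-pad until 16 bytes remain in the block, then append the
+	 * 128-bit message length in bits as two big-endian 64-bit words.
+	 */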
ctx->buf[partial++] = 0x80; + if (partial > SHA512_BLOCK_SIZE - 16) { + memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - partial); + sha512_blocks(&ctx->state, ctx->buf, 1); + partial = 0; + } + memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - 16 - partial); + *(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 16] = cpu_to_be64(bitcount_hi); + *(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 8] = cpu_to_be64(bitcount_lo); + sha512_blocks(&ctx->state, ctx->buf, 1); + + for (size_t i = 0; i < digest_size; i += 8) + put_unaligned_be64(ctx->state.h[i / 8], out + i); +} + +void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]) +{ + __sha512_final(&ctx->ctx, out, SHA384_DIGEST_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(sha384_final); + +void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]) +{ + __sha512_final(&ctx->ctx, out, SHA512_DIGEST_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(sha512_final); + +void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE]) +{ + struct sha384_ctx ctx; + + sha384_init(&ctx); + sha384_update(&ctx, data, len); + sha384_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha384); + +void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE]) +{ + struct sha512_ctx ctx; + + sha512_init(&ctx); + sha512_update(&ctx, data, len); + sha512_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha512); + +static void __hmac_sha512_preparekey(struct sha512_block_state *istate, + struct sha512_block_state *ostate, + const u8 *raw_key, size_t raw_key_len, + const struct sha512_block_state *iv) +{ + union { + u8 b[SHA512_BLOCK_SIZE]; + unsigned long w[SHA512_BLOCK_SIZE / sizeof(unsigned long)]; + } derived_key = { 0 }; + + if (unlikely(raw_key_len > SHA512_BLOCK_SIZE)) { + if (iv == &sha384_iv) + sha384(raw_key, raw_key_len, derived_key.b); + else + sha512(raw_key, raw_key_len, derived_key.b); + } else { + memcpy(derived_key.b, raw_key, raw_key_len); + } + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); + *istate = *iv; + sha512_blocks(istate, derived_key.b, 1); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ + HMAC_IPAD_VALUE); + *ostate = *iv; + sha512_blocks(ostate, derived_key.b, 1); + + memzero_explicit(&derived_key, sizeof(derived_key)); +} + +void hmac_sha384_preparekey(struct hmac_sha384_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha512_preparekey(&key->key.istate, &key->key.ostate, + raw_key, raw_key_len, &sha384_iv); +} +EXPORT_SYMBOL_GPL(hmac_sha384_preparekey); + +void hmac_sha512_preparekey(struct hmac_sha512_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha512_preparekey(&key->key.istate, &key->key.ostate, + raw_key, raw_key_len, &sha512_iv); +} +EXPORT_SYMBOL_GPL(hmac_sha512_preparekey); + +void __hmac_sha512_init(struct __hmac_sha512_ctx *ctx, + const struct __hmac_sha512_key *key) +{ + __sha512_init(&ctx->sha_ctx, &key->istate, SHA512_BLOCK_SIZE); + ctx->ostate = key->ostate; +} +EXPORT_SYMBOL_GPL(__hmac_sha512_init); + +void hmac_sha384_init_usingrawkey(struct hmac_sha384_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, + raw_key, raw_key_len, &sha384_iv); + ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE; + ctx->ctx.sha_ctx.bytecount_hi = 0; +} +EXPORT_SYMBOL_GPL(hmac_sha384_init_usingrawkey); + +void hmac_sha512_init_usingrawkey(struct hmac_sha512_ctx *ctx, + const u8 
*raw_key, size_t raw_key_len) +{ + __hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, + raw_key, raw_key_len, &sha512_iv); + ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE; + ctx->ctx.sha_ctx.bytecount_hi = 0; +} +EXPORT_SYMBOL_GPL(hmac_sha512_init_usingrawkey); + +static void __hmac_sha512_final(struct __hmac_sha512_ctx *ctx, + u8 *out, size_t digest_size) +{ + /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */ + __sha512_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size); + memset(&ctx->sha_ctx.buf[digest_size], 0, + SHA512_BLOCK_SIZE - digest_size); + ctx->sha_ctx.buf[digest_size] = 0x80; + *(__be32 *)&ctx->sha_ctx.buf[SHA512_BLOCK_SIZE - 4] = + cpu_to_be32(8 * (SHA512_BLOCK_SIZE + digest_size)); + + /* Compute the outer hash, which gives the HMAC value. */ + sha512_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1); + for (size_t i = 0; i < digest_size; i += 8) + put_unaligned_be64(ctx->ostate.h[i / 8], out + i); + + memzero_explicit(ctx, sizeof(*ctx)); +} + +void hmac_sha384_final(struct hmac_sha384_ctx *ctx, + u8 out[SHA384_DIGEST_SIZE]) +{ + __hmac_sha512_final(&ctx->ctx, out, SHA384_DIGEST_SIZE); +} +EXPORT_SYMBOL_GPL(hmac_sha384_final); + +void hmac_sha512_final(struct hmac_sha512_ctx *ctx, + u8 out[SHA512_DIGEST_SIZE]) +{ + __hmac_sha512_final(&ctx->ctx, out, SHA512_DIGEST_SIZE); +} +EXPORT_SYMBOL_GPL(hmac_sha512_final); + +void hmac_sha384(const struct hmac_sha384_key *key, + const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE]) +{ + struct hmac_sha384_ctx ctx; + + hmac_sha384_init(&ctx, key); + hmac_sha384_update(&ctx, data, data_len); + hmac_sha384_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha384); + +void hmac_sha512(const struct hmac_sha512_key *key, + const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE]) +{ + struct hmac_sha512_ctx ctx; + + hmac_sha512_init(&ctx, key); + hmac_sha512_update(&ctx, data, data_len); + hmac_sha512_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha512); + +void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA384_DIGEST_SIZE]) +{ + struct hmac_sha384_ctx ctx; + + hmac_sha384_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha384_update(&ctx, data, data_len); + hmac_sha384_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha384_usingrawkey); + +void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA512_DIGEST_SIZE]) +{ + struct hmac_sha512_ctx ctx; + + hmac_sha512_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha512_update(&ctx, data, data_len); + hmac_sha512_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha512_usingrawkey); + +#ifdef sha512_mod_init_arch +static int __init sha512_mod_init(void) +{ + sha512_mod_init_arch(); + return 0; +} +subsys_initcall(sha512_mod_init); + +static void __exit sha512_mod_exit(void) +{ +} +module_exit(sha512_mod_exit); +#endif + +MODULE_DESCRIPTION("SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/sm3.c b/lib/crypto/sm3.c new file mode 100644 index 000000000000..c6b9ad8a3ac6 --- /dev/null +++ b/lib/crypto/sm3.c @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * SM3 secure hash, as specified by OSCCA GM/T 0004-2012 SM3 and described + * at https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02 + * + * Copyright (C) 2017 ARM Limited or its affiliates. 
+ * Copyright (C) 2017 Gilad Ben-Yossef <gilad@benyossef.com> + * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +#include <crypto/sm3.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/unaligned.h> + +static const u32 ____cacheline_aligned K[64] = { + 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb, + 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc, + 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce, + 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6, + 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c, + 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce, + 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec, + 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5, + 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53, + 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d, + 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4, + 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43, + 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c, + 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce, + 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec, + 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5 +}; + +/* + * Transform the message X which consists of 16 32-bit-words. See + * GM/T 004-2012 for details. + */ +#define R(i, a, b, c, d, e, f, g, h, t, w1, w2) \ + do { \ + ss1 = rol32((rol32((a), 12) + (e) + (t)), 7); \ + ss2 = ss1 ^ rol32((a), 12); \ + d += FF ## i(a, b, c) + ss2 + ((w1) ^ (w2)); \ + h += GG ## i(e, f, g) + ss1 + (w1); \ + b = rol32((b), 9); \ + f = rol32((f), 19); \ + h = P0((h)); \ + } while (0) + +#define R1(a, b, c, d, e, f, g, h, t, w1, w2) \ + R(1, a, b, c, d, e, f, g, h, t, w1, w2) +#define R2(a, b, c, d, e, f, g, h, t, w1, w2) \ + R(2, a, b, c, d, e, f, g, h, t, w1, w2) + +#define FF1(x, y, z) (x ^ y ^ z) +#define FF2(x, y, z) ((x & y) | (x & z) | (y & z)) + +#define GG1(x, y, z) FF1(x, y, z) +#define GG2(x, y, z) ((x & y) | (~x & z)) + +/* Message expansion */ +#define P0(x) ((x) ^ rol32((x), 9) ^ rol32((x), 17)) +#define P1(x) ((x) ^ rol32((x), 15) ^ rol32((x), 23)) +#define I(i) (W[i] = get_unaligned_be32(data + i * 4)) +#define W1(i) (W[i & 0x0f]) +#define W2(i) (W[i & 0x0f] = \ + P1(W[i & 0x0f] \ + ^ W[(i-9) & 0x0f] \ + ^ rol32(W[(i-3) & 0x0f], 15)) \ + ^ rol32(W[(i-13) & 0x0f], 7) \ + ^ W[(i-6) & 0x0f]) + +static void sm3_transform(struct sm3_state *sctx, u8 const *data, u32 W[16]) +{ + u32 a, b, c, d, e, f, g, h, ss1, ss2; + + a = sctx->state[0]; + b = sctx->state[1]; + c = sctx->state[2]; + d = sctx->state[3]; + e = sctx->state[4]; + f = sctx->state[5]; + g = sctx->state[6]; + h = sctx->state[7]; + + R1(a, b, c, d, e, f, g, h, K[0], I(0), I(4)); + R1(d, a, b, c, h, e, f, g, K[1], I(1), I(5)); + R1(c, d, a, b, g, h, e, f, K[2], I(2), I(6)); + R1(b, c, d, a, f, g, h, e, K[3], I(3), I(7)); + R1(a, b, c, d, e, f, g, h, K[4], W1(4), I(8)); + R1(d, a, b, c, h, e, f, g, K[5], W1(5), I(9)); + R1(c, d, a, b, g, h, e, f, K[6], W1(6), I(10)); + R1(b, c, d, a, f, g, h, e, K[7], W1(7), I(11)); + R1(a, b, c, d, e, f, g, h, K[8], W1(8), I(12)); + R1(d, a, b, c, h, e, f, g, K[9], W1(9), I(13)); + R1(c, d, a, b, g, h, e, f, K[10], W1(10), I(14)); + R1(b, c, d, a, f, g, h, e, K[11], W1(11), I(15)); + R1(a, b, c, d, e, f, g, h, K[12], W1(12), W2(16)); + R1(d, a, b, c, h, e, f, g, K[13], W1(13), W2(17)); + R1(c, d, a, b, g, h, e, f, K[14], W1(14), W2(18)); + R1(b, c, d, a, f, g, h, e, K[15], W1(15), W2(19)); + + R2(a, b, c, d, e, f, g, h, K[16], W1(16), W2(20)); + R2(d, a, b, c, h, e, f, g, K[17], W1(17), W2(21)); + R2(c, d, a, b, g, h, e, 
f, K[18], W1(18), W2(22)); + R2(b, c, d, a, f, g, h, e, K[19], W1(19), W2(23)); + R2(a, b, c, d, e, f, g, h, K[20], W1(20), W2(24)); + R2(d, a, b, c, h, e, f, g, K[21], W1(21), W2(25)); + R2(c, d, a, b, g, h, e, f, K[22], W1(22), W2(26)); + R2(b, c, d, a, f, g, h, e, K[23], W1(23), W2(27)); + R2(a, b, c, d, e, f, g, h, K[24], W1(24), W2(28)); + R2(d, a, b, c, h, e, f, g, K[25], W1(25), W2(29)); + R2(c, d, a, b, g, h, e, f, K[26], W1(26), W2(30)); + R2(b, c, d, a, f, g, h, e, K[27], W1(27), W2(31)); + R2(a, b, c, d, e, f, g, h, K[28], W1(28), W2(32)); + R2(d, a, b, c, h, e, f, g, K[29], W1(29), W2(33)); + R2(c, d, a, b, g, h, e, f, K[30], W1(30), W2(34)); + R2(b, c, d, a, f, g, h, e, K[31], W1(31), W2(35)); + + R2(a, b, c, d, e, f, g, h, K[32], W1(32), W2(36)); + R2(d, a, b, c, h, e, f, g, K[33], W1(33), W2(37)); + R2(c, d, a, b, g, h, e, f, K[34], W1(34), W2(38)); + R2(b, c, d, a, f, g, h, e, K[35], W1(35), W2(39)); + R2(a, b, c, d, e, f, g, h, K[36], W1(36), W2(40)); + R2(d, a, b, c, h, e, f, g, K[37], W1(37), W2(41)); + R2(c, d, a, b, g, h, e, f, K[38], W1(38), W2(42)); + R2(b, c, d, a, f, g, h, e, K[39], W1(39), W2(43)); + R2(a, b, c, d, e, f, g, h, K[40], W1(40), W2(44)); + R2(d, a, b, c, h, e, f, g, K[41], W1(41), W2(45)); + R2(c, d, a, b, g, h, e, f, K[42], W1(42), W2(46)); + R2(b, c, d, a, f, g, h, e, K[43], W1(43), W2(47)); + R2(a, b, c, d, e, f, g, h, K[44], W1(44), W2(48)); + R2(d, a, b, c, h, e, f, g, K[45], W1(45), W2(49)); + R2(c, d, a, b, g, h, e, f, K[46], W1(46), W2(50)); + R2(b, c, d, a, f, g, h, e, K[47], W1(47), W2(51)); + + R2(a, b, c, d, e, f, g, h, K[48], W1(48), W2(52)); + R2(d, a, b, c, h, e, f, g, K[49], W1(49), W2(53)); + R2(c, d, a, b, g, h, e, f, K[50], W1(50), W2(54)); + R2(b, c, d, a, f, g, h, e, K[51], W1(51), W2(55)); + R2(a, b, c, d, e, f, g, h, K[52], W1(52), W2(56)); + R2(d, a, b, c, h, e, f, g, K[53], W1(53), W2(57)); + R2(c, d, a, b, g, h, e, f, K[54], W1(54), W2(58)); + R2(b, c, d, a, f, g, h, e, K[55], W1(55), W2(59)); + R2(a, b, c, d, e, f, g, h, K[56], W1(56), W2(60)); + R2(d, a, b, c, h, e, f, g, K[57], W1(57), W2(61)); + R2(c, d, a, b, g, h, e, f, K[58], W1(58), W2(62)); + R2(b, c, d, a, f, g, h, e, K[59], W1(59), W2(63)); + R2(a, b, c, d, e, f, g, h, K[60], W1(60), W2(64)); + R2(d, a, b, c, h, e, f, g, K[61], W1(61), W2(65)); + R2(c, d, a, b, g, h, e, f, K[62], W1(62), W2(66)); + R2(b, c, d, a, f, g, h, e, K[63], W1(63), W2(67)); + + sctx->state[0] ^= a; + sctx->state[1] ^= b; + sctx->state[2] ^= c; + sctx->state[3] ^= d; + sctx->state[4] ^= e; + sctx->state[5] ^= f; + sctx->state[6] ^= g; + sctx->state[7] ^= h; +} +#undef R +#undef R1 +#undef R2 +#undef I +#undef W1 +#undef W2 + +void sm3_block_generic(struct sm3_state *sctx, u8 const *data, int blocks) +{ + u32 W[16]; + + do { + sm3_transform(sctx, data, W); + data += SM3_BLOCK_SIZE; + } while (--blocks); + + memzero_explicit(W, sizeof(W)); +} +EXPORT_SYMBOL_GPL(sm3_block_generic); + +MODULE_DESCRIPTION("Generic SM3 library"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/sparc/sha1.h b/lib/crypto/sparc/sha1.h new file mode 100644 index 000000000000..5015f93584b7 --- /dev/null +++ b/lib/crypto/sparc/sha1.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * SHA-1 accelerated using the sparc64 crypto opcodes + * + * Copyright (c) Alan Smithee. 
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> + * Copyright (c) Mathias Krause <minipli@googlemail.com> + */ + +#include <asm/elf.h> +#include <asm/opcodes.h> +#include <asm/pstate.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha1_opcodes); + +asmlinkage void sha1_sparc64_transform(struct sha1_block_state *state, + const u8 *data, size_t nblocks); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_sha1_opcodes)) + sha1_sparc64_transform(state, data, nblocks); + else + sha1_blocks_generic(state, data, nblocks); +} + +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_SHA1)) + return; + + static_branch_enable(&have_sha1_opcodes); + pr_info("Using sparc64 sha1 opcode optimized SHA-1 implementation\n"); +} diff --git a/lib/crypto/sparc/sha1_asm.S b/lib/crypto/sparc/sha1_asm.S new file mode 100644 index 000000000000..00b46bac1b08 --- /dev/null +++ b/lib/crypto/sparc/sha1_asm.S @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/opcodes.h> +#include <asm/visasm.h> + +ENTRY(sha1_sparc64_transform) + /* %o0 = digest, %o1 = data, %o2 = rounds */ + VISEntryHalf + ld [%o0 + 0x00], %f0 + ld [%o0 + 0x04], %f1 + ld [%o0 + 0x08], %f2 + andcc %o1, 0x7, %g0 + ld [%o0 + 0x0c], %f3 + bne,pn %xcc, 10f + ld [%o0 + 0x10], %f4 + +1: + ldd [%o1 + 0x00], %f8 + ldd [%o1 + 0x08], %f10 + ldd [%o1 + 0x10], %f12 + ldd [%o1 + 0x18], %f14 + ldd [%o1 + 0x20], %f16 + ldd [%o1 + 0x28], %f18 + ldd [%o1 + 0x30], %f20 + ldd [%o1 + 0x38], %f22 + + SHA1 + + subcc %o2, 1, %o2 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + +5: + st %f0, [%o0 + 0x00] + st %f1, [%o0 + 0x04] + st %f2, [%o0 + 0x08] + st %f3, [%o0 + 0x0c] + st %f4, [%o0 + 0x10] + retl + VISExitHalf +10: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f10 +1: + ldd [%o1 + 0x08], %f12 + ldd [%o1 + 0x10], %f14 + ldd [%o1 + 0x18], %f16 + ldd [%o1 + 0x20], %f18 + ldd [%o1 + 0x28], %f20 + ldd [%o1 + 0x30], %f22 + ldd [%o1 + 0x38], %f24 + ldd [%o1 + 0x40], %f26 + + faligndata %f10, %f12, %f8 + faligndata %f12, %f14, %f10 + faligndata %f14, %f16, %f12 + faligndata %f16, %f18, %f14 + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + + SHA1 + + subcc %o2, 1, %o2 + fsrc2 %f26, %f10 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + + ba,a,pt %xcc, 5b +ENDPROC(sha1_sparc64_transform) diff --git a/lib/crypto/sparc/sha256.h b/lib/crypto/sparc/sha256.h new file mode 100644 index 000000000000..1d10108eb195 --- /dev/null +++ b/lib/crypto/sparc/sha256.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * SHA-256 accelerated using the sparc64 sha256 opcodes + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> + * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com> + */ + +#include <asm/elf.h> +#include <asm/opcodes.h> +#include <asm/pstate.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes); + +asmlinkage void sha256_sparc64_transform(struct sha256_block_state *state, + const u8 *data, size_t nblocks); + +static void sha256_blocks(struct 
sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_sha256_opcodes)) + sha256_sparc64_transform(state, data, nblocks); + else + sha256_blocks_generic(state, data, nblocks); +} + +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_SHA256)) + return; + + static_branch_enable(&have_sha256_opcodes); + pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n"); +} diff --git a/lib/crypto/sparc/sha256_asm.S b/lib/crypto/sparc/sha256_asm.S new file mode 100644 index 000000000000..ddcdd3daf31e --- /dev/null +++ b/lib/crypto/sparc/sha256_asm.S @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/opcodes.h> +#include <asm/visasm.h> + +ENTRY(sha256_sparc64_transform) + /* %o0 = state, %o1 = data, %o2 = nblocks */ + VISEntryHalf + ld [%o0 + 0x00], %f0 + ld [%o0 + 0x04], %f1 + ld [%o0 + 0x08], %f2 + ld [%o0 + 0x0c], %f3 + ld [%o0 + 0x10], %f4 + ld [%o0 + 0x14], %f5 + andcc %o1, 0x7, %g0 + ld [%o0 + 0x18], %f6 + bne,pn %xcc, 10f + ld [%o0 + 0x1c], %f7 + +1: + ldd [%o1 + 0x00], %f8 + ldd [%o1 + 0x08], %f10 + ldd [%o1 + 0x10], %f12 + ldd [%o1 + 0x18], %f14 + ldd [%o1 + 0x20], %f16 + ldd [%o1 + 0x28], %f18 + ldd [%o1 + 0x30], %f20 + ldd [%o1 + 0x38], %f22 + + SHA256 + + subcc %o2, 1, %o2 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + +5: + st %f0, [%o0 + 0x00] + st %f1, [%o0 + 0x04] + st %f2, [%o0 + 0x08] + st %f3, [%o0 + 0x0c] + st %f4, [%o0 + 0x10] + st %f5, [%o0 + 0x14] + st %f6, [%o0 + 0x18] + st %f7, [%o0 + 0x1c] + retl + VISExitHalf +10: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f10 +1: + ldd [%o1 + 0x08], %f12 + ldd [%o1 + 0x10], %f14 + ldd [%o1 + 0x18], %f16 + ldd [%o1 + 0x20], %f18 + ldd [%o1 + 0x28], %f20 + ldd [%o1 + 0x30], %f22 + ldd [%o1 + 0x38], %f24 + ldd [%o1 + 0x40], %f26 + + faligndata %f10, %f12, %f8 + faligndata %f12, %f14, %f10 + faligndata %f14, %f16, %f12 + faligndata %f16, %f18, %f14 + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + + SHA256 + + subcc %o2, 1, %o2 + fsrc2 %f26, %f10 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + + ba,a,pt %xcc, 5b +ENDPROC(sha256_sparc64_transform) diff --git a/lib/crypto/sparc/sha512.h b/lib/crypto/sparc/sha512.h new file mode 100644 index 000000000000..55303ab6b15f --- /dev/null +++ b/lib/crypto/sparc/sha512.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * SHA-512 accelerated using the sparc64 sha512 opcodes + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2003 Kyle McMartin <kyle@debian.org> + */ + +#include <asm/elf.h> +#include <asm/opcodes.h> +#include <asm/pstate.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_opcodes); + +asmlinkage void sha512_sparc64_transform(struct sha512_block_state *state, + const u8 *data, size_t nblocks); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_sha512_opcodes)) + sha512_sparc64_transform(state, data, nblocks); + else + sha512_blocks_generic(state, data, nblocks); +} + +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & 
HWCAP_SPARC_CRYPTO)) + return; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_SHA512)) + return; + + static_branch_enable(&have_sha512_opcodes); + pr_info("Using sparc64 sha512 opcode optimized SHA-512/SHA-384 implementation\n"); +} diff --git a/lib/crypto/sparc/sha512_asm.S b/lib/crypto/sparc/sha512_asm.S new file mode 100644 index 000000000000..9932b4fe1b59 --- /dev/null +++ b/lib/crypto/sparc/sha512_asm.S @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/opcodes.h> +#include <asm/visasm.h> + +ENTRY(sha512_sparc64_transform) + /* %o0 = digest, %o1 = data, %o2 = rounds */ + VISEntry + ldd [%o0 + 0x00], %f0 + ldd [%o0 + 0x08], %f2 + ldd [%o0 + 0x10], %f4 + ldd [%o0 + 0x18], %f6 + ldd [%o0 + 0x20], %f8 + ldd [%o0 + 0x28], %f10 + andcc %o1, 0x7, %g0 + ldd [%o0 + 0x30], %f12 + bne,pn %xcc, 10f + ldd [%o0 + 0x38], %f14 + +1: + ldd [%o1 + 0x00], %f16 + ldd [%o1 + 0x08], %f18 + ldd [%o1 + 0x10], %f20 + ldd [%o1 + 0x18], %f22 + ldd [%o1 + 0x20], %f24 + ldd [%o1 + 0x28], %f26 + ldd [%o1 + 0x30], %f28 + ldd [%o1 + 0x38], %f30 + ldd [%o1 + 0x40], %f32 + ldd [%o1 + 0x48], %f34 + ldd [%o1 + 0x50], %f36 + ldd [%o1 + 0x58], %f38 + ldd [%o1 + 0x60], %f40 + ldd [%o1 + 0x68], %f42 + ldd [%o1 + 0x70], %f44 + ldd [%o1 + 0x78], %f46 + + SHA512 + + subcc %o2, 1, %o2 + bne,pt %xcc, 1b + add %o1, 0x80, %o1 + +5: + std %f0, [%o0 + 0x00] + std %f2, [%o0 + 0x08] + std %f4, [%o0 + 0x10] + std %f6, [%o0 + 0x18] + std %f8, [%o0 + 0x20] + std %f10, [%o0 + 0x28] + std %f12, [%o0 + 0x30] + std %f14, [%o0 + 0x38] + retl + VISExit +10: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f18 +1: + ldd [%o1 + 0x08], %f20 + ldd [%o1 + 0x10], %f22 + ldd [%o1 + 0x18], %f24 + ldd [%o1 + 0x20], %f26 + ldd [%o1 + 0x28], %f28 + ldd [%o1 + 0x30], %f30 + ldd [%o1 + 0x38], %f32 + ldd [%o1 + 0x40], %f34 + ldd [%o1 + 0x48], %f36 + ldd [%o1 + 0x50], %f38 + ldd [%o1 + 0x58], %f40 + ldd [%o1 + 0x60], %f42 + ldd [%o1 + 0x68], %f44 + ldd [%o1 + 0x70], %f46 + ldd [%o1 + 0x78], %f48 + ldd [%o1 + 0x80], %f50 + + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + faligndata %f26, %f28, %f24 + faligndata %f28, %f30, %f26 + faligndata %f30, %f32, %f28 + faligndata %f32, %f34, %f30 + faligndata %f34, %f36, %f32 + faligndata %f36, %f38, %f34 + faligndata %f38, %f40, %f36 + faligndata %f40, %f42, %f38 + faligndata %f42, %f44, %f40 + faligndata %f44, %f46, %f42 + faligndata %f46, %f48, %f44 + faligndata %f48, %f50, %f46 + + SHA512 + + subcc %o2, 1, %o2 + fsrc2 %f50, %f18 + bne,pt %xcc, 1b + add %o1, 0x80, %o1 + + ba,a,pt %xcc, 5b +ENDPROC(sha512_sparc64_transform) diff --git a/lib/crypto/tests/Kconfig b/lib/crypto/tests/Kconfig new file mode 100644 index 000000000000..de7e8babb6af --- /dev/null +++ b/lib/crypto/tests/Kconfig @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +config CRYPTO_LIB_POLY1305_KUNIT_TEST + tristate "KUnit tests for Poly1305" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_POLY1305 + help + KUnit tests for the Poly1305 library functions. + +config CRYPTO_LIB_SHA1_KUNIT_TEST + tristate "KUnit tests for SHA-1" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_SHA1 + help + KUnit tests for the SHA-1 cryptographic hash function and its + corresponding HMAC. 
+ +# Option is named *_SHA256_KUNIT_TEST, though both SHA-224 and SHA-256 tests are +# included, for consistency with the naming used elsewhere (e.g. CRYPTO_SHA256). +config CRYPTO_LIB_SHA256_KUNIT_TEST + tristate "KUnit tests for SHA-224 and SHA-256" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_SHA256 + help + KUnit tests for the SHA-224 and SHA-256 cryptographic hash functions + and their corresponding HMACs. + +# Option is named *_SHA512_KUNIT_TEST, though both SHA-384 and SHA-512 tests are +# included, for consistency with the naming used elsewhere (e.g. CRYPTO_SHA512). +config CRYPTO_LIB_SHA512_KUNIT_TEST + tristate "KUnit tests for SHA-384 and SHA-512" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_SHA512 + help + KUnit tests for the SHA-384 and SHA-512 cryptographic hash functions + and their corresponding HMACs. + +config CRYPTO_LIB_BENCHMARK_VISIBLE + bool + +config CRYPTO_LIB_BENCHMARK + bool "Include benchmarks in KUnit tests for cryptographic functions" + depends on CRYPTO_LIB_BENCHMARK_VISIBLE + help + Include benchmarks in the KUnit tests for cryptographic functions. + The benchmark results are printed to the kernel log when the + corresponding KUnit test suite runs. + + This is useful for evaluating the performance of the cryptographic + functions. However, it will increase the runtime of the KUnit tests. + + If you're only interested in correctness testing, leave this disabled. diff --git a/lib/crypto/tests/Makefile b/lib/crypto/tests/Makefile new file mode 100644 index 000000000000..8601dccd6fdd --- /dev/null +++ b/lib/crypto/tests/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +obj-$(CONFIG_CRYPTO_LIB_POLY1305_KUNIT_TEST) += poly1305_kunit.o +obj-$(CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST) += sha1_kunit.o +obj-$(CONFIG_CRYPTO_LIB_SHA256_KUNIT_TEST) += sha224_kunit.o sha256_kunit.o +obj-$(CONFIG_CRYPTO_LIB_SHA512_KUNIT_TEST) += sha384_kunit.o sha512_kunit.o diff --git a/lib/crypto/tests/hash-test-template.h b/lib/crypto/tests/hash-test-template.h new file mode 100644 index 000000000000..f437a0a9ac6c --- /dev/null +++ b/lib/crypto/tests/hash-test-template.h @@ -0,0 +1,683 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Test cases for hash functions, including a benchmark. This is included by + * KUnit test suites that want to use it. See sha512_kunit.c for an example. + * + * Copyright 2025 Google LLC + */ +#include <kunit/test.h> +#include <linux/hrtimer.h> +#include <linux/timekeeping.h> +#include <linux/vmalloc.h> +#include <linux/workqueue.h> + +/* test_buf is a guarded buffer, i.e. &test_buf[TEST_BUF_LEN] is not mapped. */ +#define TEST_BUF_LEN 16384 +static u8 *test_buf; + +static u8 *orig_test_buf; + +static u64 random_seed; + +/* + * This is a simple linear congruential generator. It is used only for testing, + * which does not require cryptographically secure random numbers. A hard-coded + * algorithm is used instead of <linux/prandom.h> so that it matches the + * algorithm used by the test vector generation script. This allows the input + * data in random test vectors to be concisely stored as just the seed. 
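+ *
+ * (For reference, the multiplier 25214903917 and increment 11 are the
+ * well-known 48-bit LCG constants used by POSIX drand48 and java.util.Random;
+ * the top 32 of the 48 state bits are returned.  Seeding the state with the
+ * data length, as rand_bytes_seeded_from_len() does, is what lets a test
+ * vector's input data be regenerated from its length alone.)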
+ */ +static u32 rand32(void) +{ + random_seed = (random_seed * 25214903917 + 11) & ((1ULL << 48) - 1); + return random_seed >> 16; +} + +static void rand_bytes(u8 *out, size_t len) +{ + for (size_t i = 0; i < len; i++) + out[i] = rand32(); +} + +static void rand_bytes_seeded_from_len(u8 *out, size_t len) +{ + random_seed = len; + rand_bytes(out, len); +} + +static bool rand_bool(void) +{ + return rand32() % 2; +} + +/* Generate a random length, preferring small lengths. */ +static size_t rand_length(size_t max_len) +{ + size_t len; + + switch (rand32() % 3) { + case 0: + len = rand32() % 128; + break; + case 1: + len = rand32() % 3072; + break; + default: + len = rand32(); + break; + } + return len % (max_len + 1); +} + +static size_t rand_offset(size_t max_offset) +{ + return min(rand32() % 128, max_offset); +} + +static int hash_suite_init(struct kunit_suite *suite) +{ + /* + * Allocate the test buffer using vmalloc() with a page-aligned length + * so that it is immediately followed by a guard page. This allows + * buffer overreads to be detected, even in assembly code. + */ + size_t alloc_len = round_up(TEST_BUF_LEN, PAGE_SIZE); + + orig_test_buf = vmalloc(alloc_len); + if (!orig_test_buf) + return -ENOMEM; + + test_buf = orig_test_buf + alloc_len - TEST_BUF_LEN; + return 0; +} + +static void hash_suite_exit(struct kunit_suite *suite) +{ + vfree(orig_test_buf); + orig_test_buf = NULL; + test_buf = NULL; +} + +/* + * Test the hash function against a list of test vectors. + * + * Note that it's only necessary to run each test vector in one way (e.g., + * one-shot instead of incremental), since consistency between different ways of + * using the APIs is verified by other test cases. + */ +static void test_hash_test_vectors(struct kunit *test) +{ + for (size_t i = 0; i < ARRAY_SIZE(hash_testvecs); i++) { + size_t data_len = hash_testvecs[i].data_len; + u8 actual_hash[HASH_SIZE]; + + KUNIT_ASSERT_LE(test, data_len, TEST_BUF_LEN); + rand_bytes_seeded_from_len(test_buf, data_len); + + HASH(test_buf, data_len, actual_hash); + KUNIT_ASSERT_MEMEQ_MSG( + test, actual_hash, hash_testvecs[i].digest, HASH_SIZE, + "Wrong result with test vector %zu; data_len=%zu", i, + data_len); + } +} + +/* + * Test that the hash function produces correct results for *every* length up to + * 4096 bytes. To do this, generate seeded random data, then calculate a hash + * value for each length 0..4096, then hash the hash values. Verify just the + * final hash value, which should match only when all hash values were correct. + */ +static void test_hash_all_lens_up_to_4096(struct kunit *test) +{ + struct HASH_CTX ctx; + u8 hash[HASH_SIZE]; + + static_assert(TEST_BUF_LEN >= 4096); + rand_bytes_seeded_from_len(test_buf, 4096); + HASH_INIT(&ctx); + for (size_t len = 0; len <= 4096; len++) { + HASH(test_buf, len, hash); + HASH_UPDATE(&ctx, hash, HASH_SIZE); + } + HASH_FINAL(&ctx, hash); + KUNIT_ASSERT_MEMEQ(test, hash, hash_testvec_consolidated, HASH_SIZE); +} + +/* + * Test that the hash function produces the same result with a one-shot + * computation as it does with an incremental computation. 
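+ *
+ * For example, a single iteration might effectively check that
+ *
+ *	HASH(buf, 300)
+ *
+ * matches HASH_INIT(); HASH_UPDATE(buf, 100); HASH_UPDATE(buf + 100, 200);
+ * HASH_FINAL(), with the total length, split points, and buffer offset all
+ * chosen at random.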
+ */ +static void test_hash_incremental_updates(struct kunit *test) +{ + for (int i = 0; i < 1000; i++) { + size_t total_len, offset; + struct HASH_CTX ctx; + u8 hash1[HASH_SIZE]; + u8 hash2[HASH_SIZE]; + size_t num_parts = 0; + size_t remaining_len, cur_offset; + + total_len = rand_length(TEST_BUF_LEN); + offset = rand_offset(TEST_BUF_LEN - total_len); + rand_bytes(&test_buf[offset], total_len); + + /* Compute the hash value in one shot. */ + HASH(&test_buf[offset], total_len, hash1); + + /* + * Compute the hash value incrementally, using a randomly + * selected sequence of update lengths that sum to total_len. + */ + HASH_INIT(&ctx); + remaining_len = total_len; + cur_offset = offset; + while (rand_bool()) { + size_t part_len = rand_length(remaining_len); + + HASH_UPDATE(&ctx, &test_buf[cur_offset], part_len); + num_parts++; + cur_offset += part_len; + remaining_len -= part_len; + } + if (remaining_len != 0 || rand_bool()) { + HASH_UPDATE(&ctx, &test_buf[cur_offset], remaining_len); + num_parts++; + } + HASH_FINAL(&ctx, hash2); + + /* Verify that the two hash values are the same. */ + KUNIT_ASSERT_MEMEQ_MSG( + test, hash1, hash2, HASH_SIZE, + "Incremental test failed with total_len=%zu num_parts=%zu offset=%zu", + total_len, num_parts, offset); + } +} + +/* + * Test that the hash function does not overrun any buffers. Uses a guard page + * to catch buffer overruns even if they occur in assembly code. + */ +static void test_hash_buffer_overruns(struct kunit *test) +{ + const size_t max_tested_len = TEST_BUF_LEN - sizeof(struct HASH_CTX); + void *const buf_end = &test_buf[TEST_BUF_LEN]; + struct HASH_CTX *guarded_ctx = buf_end - sizeof(*guarded_ctx); + + rand_bytes(test_buf, TEST_BUF_LEN); + + for (int i = 0; i < 100; i++) { + size_t len = rand_length(max_tested_len); + struct HASH_CTX ctx; + u8 hash[HASH_SIZE]; + + /* Check for overruns of the data buffer. */ + HASH(buf_end - len, len, hash); + HASH_INIT(&ctx); + HASH_UPDATE(&ctx, buf_end - len, len); + HASH_FINAL(&ctx, hash); + + /* Check for overruns of the hash value buffer. */ + HASH(test_buf, len, buf_end - HASH_SIZE); + HASH_INIT(&ctx); + HASH_UPDATE(&ctx, test_buf, len); + HASH_FINAL(&ctx, buf_end - HASH_SIZE); + + /* Check for overuns of the hash context. */ + HASH_INIT(guarded_ctx); + HASH_UPDATE(guarded_ctx, test_buf, len); + HASH_FINAL(guarded_ctx, hash); + } +} + +/* + * Test that the caller is permitted to alias the output digest and source data + * buffer, and also modify the source data buffer after it has been used. + */ +static void test_hash_overlaps(struct kunit *test) +{ + const size_t max_tested_len = TEST_BUF_LEN - HASH_SIZE; + struct HASH_CTX ctx; + u8 hash[HASH_SIZE]; + + rand_bytes(test_buf, TEST_BUF_LEN); + + for (int i = 0; i < 100; i++) { + size_t len = rand_length(max_tested_len); + size_t offset = HASH_SIZE + rand_offset(max_tested_len - len); + bool left_end = rand_bool(); + u8 *ovl_hash = left_end ? 
&test_buf[offset] : + &test_buf[offset + len - HASH_SIZE]; + + HASH(&test_buf[offset], len, hash); + HASH(&test_buf[offset], len, ovl_hash); + KUNIT_ASSERT_MEMEQ_MSG( + test, hash, ovl_hash, HASH_SIZE, + "Overlap test 1 failed with len=%zu offset=%zu left_end=%d", + len, offset, left_end); + + /* Repeat the above test, but this time use init+update+final */ + HASH(&test_buf[offset], len, hash); + HASH_INIT(&ctx); + HASH_UPDATE(&ctx, &test_buf[offset], len); + HASH_FINAL(&ctx, ovl_hash); + KUNIT_ASSERT_MEMEQ_MSG( + test, hash, ovl_hash, HASH_SIZE, + "Overlap test 2 failed with len=%zu offset=%zu left_end=%d", + len, offset, left_end); + + /* Test modifying the source data after it was used. */ + HASH(&test_buf[offset], len, hash); + HASH_INIT(&ctx); + HASH_UPDATE(&ctx, &test_buf[offset], len); + rand_bytes(&test_buf[offset], len); + HASH_FINAL(&ctx, ovl_hash); + KUNIT_ASSERT_MEMEQ_MSG( + test, hash, ovl_hash, HASH_SIZE, + "Overlap test 3 failed with len=%zu offset=%zu left_end=%d", + len, offset, left_end); + } +} + +/* + * Test that if the same data is hashed at different alignments in memory, the + * results are the same. + */ +static void test_hash_alignment_consistency(struct kunit *test) +{ + u8 hash1[128 + HASH_SIZE]; + u8 hash2[128 + HASH_SIZE]; + + for (int i = 0; i < 100; i++) { + size_t len = rand_length(TEST_BUF_LEN); + size_t data_offs1 = rand_offset(TEST_BUF_LEN - len); + size_t data_offs2 = rand_offset(TEST_BUF_LEN - len); + size_t hash_offs1 = rand_offset(128); + size_t hash_offs2 = rand_offset(128); + + rand_bytes(&test_buf[data_offs1], len); + HASH(&test_buf[data_offs1], len, &hash1[hash_offs1]); + memmove(&test_buf[data_offs2], &test_buf[data_offs1], len); + HASH(&test_buf[data_offs2], len, &hash2[hash_offs2]); + KUNIT_ASSERT_MEMEQ_MSG( + test, &hash1[hash_offs1], &hash2[hash_offs2], HASH_SIZE, + "Alignment consistency test failed with len=%zu data_offs=(%zu,%zu) hash_offs=(%zu,%zu)", + len, data_offs1, data_offs2, hash_offs1, hash_offs2); + } +} + +/* Test that HASH_FINAL zeroizes the context. 
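+ * The context holds message-derived (and, for HMAC, key-derived) state, so
+ * callers rely on finalization wiping it with memzero_explicit() rather than
+ * having to clear it themselves.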
*/ +static void test_hash_ctx_zeroization(struct kunit *test) +{ + static const u8 zeroes[sizeof(struct HASH_CTX)]; + struct HASH_CTX ctx; + + rand_bytes(test_buf, 128); + HASH_INIT(&ctx); + HASH_UPDATE(&ctx, test_buf, 128); + HASH_FINAL(&ctx, test_buf); + KUNIT_ASSERT_MEMEQ_MSG(test, &ctx, zeroes, sizeof(ctx), + "Hash context was not zeroized by finalization"); +} + +#define IRQ_TEST_HRTIMER_INTERVAL us_to_ktime(5) + +struct hash_irq_test_state { + bool (*func)(void *test_specific_state); + void *test_specific_state; + bool task_func_reported_failure; + bool hardirq_func_reported_failure; + bool softirq_func_reported_failure; + unsigned long hardirq_func_calls; + unsigned long softirq_func_calls; + struct hrtimer timer; + struct work_struct bh_work; +}; + +static enum hrtimer_restart hash_irq_test_timer_func(struct hrtimer *timer) +{ + struct hash_irq_test_state *state = + container_of(timer, typeof(*state), timer); + + WARN_ON_ONCE(!in_hardirq()); + state->hardirq_func_calls++; + + if (!state->func(state->test_specific_state)) + state->hardirq_func_reported_failure = true; + + hrtimer_forward_now(&state->timer, IRQ_TEST_HRTIMER_INTERVAL); + queue_work(system_bh_wq, &state->bh_work); + return HRTIMER_RESTART; +} + +static void hash_irq_test_bh_work_func(struct work_struct *work) +{ + struct hash_irq_test_state *state = + container_of(work, typeof(*state), bh_work); + + WARN_ON_ONCE(!in_serving_softirq()); + state->softirq_func_calls++; + + if (!state->func(state->test_specific_state)) + state->softirq_func_reported_failure = true; +} + +/* + * Helper function which repeatedly runs the given @func in task, softirq, and + * hardirq context concurrently, and reports a failure to KUnit if any + * invocation of @func in any context returns false. @func is passed + * @test_specific_state as its argument. At most 3 invocations of @func will + * run concurrently: one in each of task, softirq, and hardirq context. + * + * The main purpose of this interrupt context testing is to validate fallback + * code paths that run in contexts where the normal code path cannot be used, + * typically due to the FPU or vector registers already being in-use in kernel + * mode. These code paths aren't covered when the test code is executed only by + * the KUnit test runner thread in task context. The reason for the concurrency + * is because merely using hardirq context is not sufficient to reach a fallback + * code path on some architectures; the hardirq actually has to occur while the + * FPU or vector unit was already in-use in kernel mode. + * + * Another purpose of this testing is to detect issues with the architecture's + * irq_fpu_usable() and kernel_fpu_begin/end() or equivalent functions, + * especially in softirq context when the softirq may have interrupted a task + * already using kernel-mode FPU or vector (if the arch didn't prevent that). + * Crypto functions are often executed in softirqs, so this is important. + */ +static void run_irq_test(struct kunit *test, bool (*func)(void *), + int max_iterations, void *test_specific_state) +{ + struct hash_irq_test_state state = { + .func = func, + .test_specific_state = test_specific_state, + }; + unsigned long end_jiffies; + + /* + * Set up a hrtimer (the way we access hardirq context) and a work + * struct for the BH workqueue (the way we access softirq context). 
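+	 * The timer callback runs func() in hardirq context and queues the BH
+	 * work, whose handler runs func() again in softirq context, while the
+	 * loop below keeps calling func() from the KUnit task.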
+ */ + hrtimer_setup_on_stack(&state.timer, hash_irq_test_timer_func, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); + INIT_WORK_ONSTACK(&state.bh_work, hash_irq_test_bh_work_func); + + /* Run for up to max_iterations or 1 second, whichever comes first. */ + end_jiffies = jiffies + HZ; + hrtimer_start(&state.timer, IRQ_TEST_HRTIMER_INTERVAL, + HRTIMER_MODE_REL_HARD); + for (int i = 0; i < max_iterations && !time_after(jiffies, end_jiffies); + i++) { + if (!func(test_specific_state)) + state.task_func_reported_failure = true; + } + + /* Cancel the timer and work. */ + hrtimer_cancel(&state.timer); + flush_work(&state.bh_work); + + /* Sanity check: the timer and BH functions should have been run. */ + KUNIT_EXPECT_GT_MSG(test, state.hardirq_func_calls, 0, + "Timer function was not called"); + KUNIT_EXPECT_GT_MSG(test, state.softirq_func_calls, 0, + "BH work function was not called"); + + /* Check for incorrect hash values reported from any context. */ + KUNIT_EXPECT_FALSE_MSG( + test, state.task_func_reported_failure, + "Incorrect hash values reported from task context"); + KUNIT_EXPECT_FALSE_MSG( + test, state.hardirq_func_reported_failure, + "Incorrect hash values reported from hardirq context"); + KUNIT_EXPECT_FALSE_MSG( + test, state.softirq_func_reported_failure, + "Incorrect hash values reported from softirq context"); +} + +#define IRQ_TEST_DATA_LEN 256 +#define IRQ_TEST_NUM_BUFFERS 3 /* matches max concurrency level */ + +struct hash_irq_test1_state { + u8 expected_hashes[IRQ_TEST_NUM_BUFFERS][HASH_SIZE]; + atomic_t seqno; +}; + +/* + * Compute the hash of one of the test messages and verify that it matches the + * expected hash from @state->expected_hashes. To increase the chance of + * detecting problems, cycle through multiple messages. + */ +static bool hash_irq_test1_func(void *state_) +{ + struct hash_irq_test1_state *state = state_; + u32 i = (u32)atomic_inc_return(&state->seqno) % IRQ_TEST_NUM_BUFFERS; + u8 actual_hash[HASH_SIZE]; + + HASH(&test_buf[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN, actual_hash); + return memcmp(actual_hash, state->expected_hashes[i], HASH_SIZE) == 0; +} + +/* + * Test that if hashes are computed in task, softirq, and hardirq context + * concurrently, then all results are as expected. + */ +static void test_hash_interrupt_context_1(struct kunit *test) +{ + struct hash_irq_test1_state state = {}; + + /* Prepare some test messages and compute the expected hash of each. */ + rand_bytes(test_buf, IRQ_TEST_NUM_BUFFERS * IRQ_TEST_DATA_LEN); + for (int i = 0; i < IRQ_TEST_NUM_BUFFERS; i++) + HASH(&test_buf[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN, + state.expected_hashes[i]); + + run_irq_test(test, hash_irq_test1_func, 100000, &state); +} + +struct hash_irq_test2_hash_ctx { + struct HASH_CTX hash_ctx; + atomic_t in_use; + int offset; + int step; +}; + +struct hash_irq_test2_state { + struct hash_irq_test2_hash_ctx ctxs[IRQ_TEST_NUM_BUFFERS]; + u8 expected_hash[HASH_SIZE]; + u16 update_lens[32]; + int num_steps; +}; + +static bool hash_irq_test2_func(void *state_) +{ + struct hash_irq_test2_state *state = state_; + struct hash_irq_test2_hash_ctx *ctx; + bool ret = true; + + for (ctx = &state->ctxs[0]; ctx < &state->ctxs[ARRAY_SIZE(state->ctxs)]; + ctx++) { + if (atomic_cmpxchg(&ctx->in_use, 0, 1) == 0) + break; + } + if (WARN_ON_ONCE(ctx == &state->ctxs[ARRAY_SIZE(state->ctxs)])) { + /* + * This should never happen, as the number of contexts is equal + * to the maximum concurrency level of run_irq_test(). 
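+	 * run_irq_test() runs at most one invocation of func() in each of
+	 * task, softirq, and hardirq context, and IRQ_TEST_NUM_BUFFERS is
+	 * sized to match, so the search above always finds a free slot.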
+ */ + return false; + } + + if (ctx->step == 0) { + /* Init step */ + HASH_INIT(&ctx->hash_ctx); + ctx->offset = 0; + ctx->step++; + } else if (ctx->step < state->num_steps - 1) { + /* Update step */ + HASH_UPDATE(&ctx->hash_ctx, &test_buf[ctx->offset], + state->update_lens[ctx->step - 1]); + ctx->offset += state->update_lens[ctx->step - 1]; + ctx->step++; + } else { + /* Final step */ + u8 actual_hash[HASH_SIZE]; + + if (WARN_ON_ONCE(ctx->offset != TEST_BUF_LEN)) + ret = false; + HASH_FINAL(&ctx->hash_ctx, actual_hash); + if (memcmp(actual_hash, state->expected_hash, HASH_SIZE) != 0) + ret = false; + ctx->step = 0; + } + atomic_set_release(&ctx->in_use, 0); + return ret; +} + +/* + * Test that if hashes are computed in task, softirq, and hardirq context + * concurrently, *including doing different parts of the same incremental + * computation in different contexts*, then all results are as expected. + * Besides detecting bugs similar to those that test_hash_interrupt_context_1 + * can detect, this test case can also detect bugs where hash function + * implementations don't correctly handle these mixed incremental computations. + */ +static void test_hash_interrupt_context_2(struct kunit *test) +{ + struct hash_irq_test2_state *state; + int remaining = TEST_BUF_LEN; + + state = kunit_kzalloc(test, sizeof(*state), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, state); + + rand_bytes(test_buf, TEST_BUF_LEN); + HASH(test_buf, TEST_BUF_LEN, state->expected_hash); + + /* + * Generate a list of update lengths to use. Ensure that it contains + * multiple entries but is limited to a maximum length. + */ + static_assert(TEST_BUF_LEN / 4096 > 1); + for (state->num_steps = 0; + state->num_steps < ARRAY_SIZE(state->update_lens) - 1 && remaining; + state->num_steps++) { + state->update_lens[state->num_steps] = + rand_length(min(remaining, 4096)); + remaining -= state->update_lens[state->num_steps]; + } + if (remaining) + state->update_lens[state->num_steps++] = remaining; + state->num_steps += 2; /* for init and final */ + + run_irq_test(test, hash_irq_test2_func, 250000, state); +} + +#define UNKEYED_HASH_KUNIT_CASES \ + KUNIT_CASE(test_hash_test_vectors), \ + KUNIT_CASE(test_hash_all_lens_up_to_4096), \ + KUNIT_CASE(test_hash_incremental_updates), \ + KUNIT_CASE(test_hash_buffer_overruns), \ + KUNIT_CASE(test_hash_overlaps), \ + KUNIT_CASE(test_hash_alignment_consistency), \ + KUNIT_CASE(test_hash_ctx_zeroization), \ + KUNIT_CASE(test_hash_interrupt_context_1), \ + KUNIT_CASE(test_hash_interrupt_context_2) +/* benchmark_hash is omitted so that the suites can put it last. */ + +#ifdef HMAC +/* + * Test the corresponding HMAC variant. + * + * This test case is fairly short, since HMAC is just a simple C wrapper around + * the underlying unkeyed hash function, which is already well-tested by the + * other test cases. It's not useful to test things like data alignment or + * interrupt context again for HMAC, nor to have a long list of test vectors. + * + * Thus, just do a single consolidated test, which covers all data lengths up to + * 4096 bytes and all key lengths up to 292 bytes. For each data length, select + * a key length, generate the inputs from a seed, and compute the HMAC value. + * Concatenate all these HMAC values together, and compute the HMAC of that. + * Verify that value. If this fails, then the HMAC implementation is wrong. + * This won't show which specific input failed, but that should be fine. Any + * failure would likely be non-input-specific or also show in the unkeyed tests. 
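+ *
+ * As a side effect, each iteration also checks that the one-shot
+ * HMAC_USINGRAWKEY() path produces the same MAC as HMAC_PREPAREKEY()
+ * followed by HMAC() with the prepared key, keeping the raw-key and
+ * prepared-key entry points consistent with each other.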
+ */ +static void test_hmac(struct kunit *test) +{ + static const u8 zeroes[sizeof(struct HMAC_CTX)]; + u8 *raw_key; + struct HMAC_KEY key; + struct HMAC_CTX ctx; + u8 mac[HASH_SIZE]; + u8 mac2[HASH_SIZE]; + + static_assert(TEST_BUF_LEN >= 4096 + 293); + rand_bytes_seeded_from_len(test_buf, 4096); + raw_key = &test_buf[4096]; + + rand_bytes_seeded_from_len(raw_key, 32); + HMAC_PREPAREKEY(&key, raw_key, 32); + HMAC_INIT(&ctx, &key); + for (size_t data_len = 0; data_len <= 4096; data_len++) { + /* + * Cycle through key lengths as well. Somewhat arbitrarily go + * up to 293, which is somewhat larger than the largest hash + * block size (which is the size at which the key starts being + * hashed down to one block); going higher would not be useful. + * To reduce correlation with data_len, use a prime number here. + */ + size_t key_len = data_len % 293; + + HMAC_UPDATE(&ctx, test_buf, data_len); + + rand_bytes_seeded_from_len(raw_key, key_len); + HMAC_USINGRAWKEY(raw_key, key_len, test_buf, data_len, mac); + HMAC_UPDATE(&ctx, mac, HASH_SIZE); + + /* Verify that HMAC() is consistent with HMAC_USINGRAWKEY(). */ + HMAC_PREPAREKEY(&key, raw_key, key_len); + HMAC(&key, test_buf, data_len, mac2); + KUNIT_ASSERT_MEMEQ_MSG( + test, mac, mac2, HASH_SIZE, + "HMAC gave different results with raw and prepared keys"); + } + HMAC_FINAL(&ctx, mac); + KUNIT_EXPECT_MEMEQ_MSG(test, mac, hmac_testvec_consolidated, HASH_SIZE, + "HMAC gave wrong result"); + KUNIT_EXPECT_MEMEQ_MSG(test, &ctx, zeroes, sizeof(ctx), + "HMAC context was not zeroized by finalization"); +} +#define HASH_KUNIT_CASES UNKEYED_HASH_KUNIT_CASES, KUNIT_CASE(test_hmac) +#else +#define HASH_KUNIT_CASES UNKEYED_HASH_KUNIT_CASES +#endif + +/* Benchmark the hash function on various data lengths. */ +static void benchmark_hash(struct kunit *test) +{ + static const size_t lens_to_test[] = { + 1, 16, 64, 127, 128, 200, 256, + 511, 512, 1024, 3173, 4096, 16384, + }; + u8 hash[HASH_SIZE]; + + if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK)) + kunit_skip(test, "not enabled"); + + /* Warm-up */ + for (size_t i = 0; i < 10000000; i += TEST_BUF_LEN) + HASH(test_buf, TEST_BUF_LEN, hash); + + for (size_t i = 0; i < ARRAY_SIZE(lens_to_test); i++) { + size_t len = lens_to_test[i]; + /* The '+ 128' tries to account for per-message overhead. 
*/ + size_t num_iters = 10000000 / (len + 128); + u64 t; + + KUNIT_ASSERT_LE(test, len, TEST_BUF_LEN); + preempt_disable(); + t = ktime_get_ns(); + for (size_t j = 0; j < num_iters; j++) + HASH(test_buf, len, hash); + t = ktime_get_ns() - t; + preempt_enable(); + kunit_info(test, "len=%zu: %llu MB/s", len, + div64_u64((u64)len * num_iters * 1000, t ?: 1)); + } +} diff --git a/lib/crypto/tests/poly1305-testvecs.h b/lib/crypto/tests/poly1305-testvecs.h new file mode 100644 index 000000000000..ecf8662225c8 --- /dev/null +++ b/lib/crypto/tests/poly1305-testvecs.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py poly1305 */ + +static const struct { + size_t data_len; + u8 digest[POLY1305_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0xe8, 0x2d, 0x67, 0x2c, 0x01, 0x48, 0xf9, 0xb7, + 0x87, 0x85, 0x3f, 0xcf, 0x18, 0x66, 0x8c, 0xd3, + }, + }, + { + .data_len = 1, + .digest = { + 0xb8, 0xad, 0xca, 0x6b, 0x32, 0xba, 0x34, 0x42, + 0x54, 0x10, 0x28, 0xf5, 0x0f, 0x7e, 0x8e, 0xe3, + }, + }, + { + .data_len = 2, + .digest = { + 0xb8, 0xf7, 0xf4, 0xc2, 0x85, 0x33, 0x80, 0x63, + 0xd1, 0x45, 0xda, 0xf8, 0x7c, 0x79, 0x42, 0xd1, + }, + }, + { + .data_len = 3, + .digest = { + 0xf3, 0x73, 0x7b, 0x60, 0x24, 0xcc, 0x5d, 0x3e, + 0xd1, 0x95, 0x86, 0xce, 0x89, 0x0a, 0x33, 0xba, + }, + }, + { + .data_len = 16, + .digest = { + 0x0a, 0x1a, 0x2d, 0x39, 0xea, 0x49, 0x8f, 0xb7, + 0x90, 0xb6, 0x74, 0x3b, 0x41, 0x3b, 0x96, 0x11, + }, + }, + { + .data_len = 32, + .digest = { + 0x99, 0x05, 0xe3, 0xa7, 0x9e, 0x2a, 0xd2, 0x42, + 0xb9, 0x45, 0x0c, 0x08, 0xe7, 0x10, 0xe4, 0xe1, + }, + }, + { + .data_len = 48, + .digest = { + 0xe1, 0xb2, 0x15, 0xee, 0xa2, 0xf3, 0x04, 0xac, + 0xdd, 0x27, 0x57, 0x95, 0x2f, 0x45, 0xa8, 0xd3, + }, + }, + { + .data_len = 49, + .digest = { + 0x1c, 0xf3, 0xab, 0x39, 0xc0, 0x69, 0x49, 0x69, + 0x89, 0x6f, 0x1f, 0x03, 0x16, 0xe7, 0xc0, 0xf0, + }, + }, + { + .data_len = 63, + .digest = { + 0x30, 0xb0, 0x32, 0x87, 0x51, 0x55, 0x9c, 0x39, + 0x38, 0x42, 0x06, 0xe9, 0x2a, 0x3e, 0x2c, 0x92, + }, + }, + { + .data_len = 64, + .digest = { + 0x2c, 0x04, 0x16, 0x36, 0x55, 0x25, 0x2d, 0xc6, + 0x3d, 0x70, 0x5b, 0x88, 0x46, 0xb6, 0x71, 0x77, + }, + }, + { + .data_len = 65, + .digest = { + 0x03, 0x87, 0xdd, 0xbe, 0xe8, 0x30, 0xf2, 0x15, + 0x40, 0x44, 0x29, 0x7b, 0xb1, 0xe9, 0x9d, 0xe7, + }, + }, + { + .data_len = 127, + .digest = { + 0x29, 0x83, 0x4f, 0xcb, 0x5a, 0x93, 0x25, 0xad, + 0x05, 0xa4, 0xb3, 0x24, 0x77, 0x62, 0x2d, 0x3d, + }, + }, + { + .data_len = 128, + .digest = { + 0x20, 0x0e, 0x2c, 0x05, 0xe2, 0x0b, 0x85, 0xa0, + 0x24, 0x73, 0x7f, 0x65, 0x70, 0x6c, 0x3e, 0xb0, + }, + }, + { + .data_len = 129, + .digest = { + 0xef, 0x2f, 0x98, 0x42, 0xc2, 0x90, 0x55, 0xea, + 0xba, 0x28, 0x76, 0xfd, 0x9e, 0x3e, 0x4d, 0x53, + }, + }, + { + .data_len = 256, + .digest = { + 0x9e, 0x75, 0x4b, 0xc7, 0x69, 0x68, 0x51, 0x90, + 0xdc, 0x29, 0xc8, 0xfa, 0x86, 0xf1, 0xc9, 0xb3, + }, + }, + { + .data_len = 511, + .digest = { + 0x9d, 0x13, 0xf5, 0x54, 0xe6, 0xe3, 0x45, 0x38, + 0x8b, 0x6d, 0x5c, 0xc4, 0x50, 0xeb, 0x90, 0xcb, + }, + }, + { + .data_len = 513, + .digest = { + 0xaa, 0xb2, 0x3e, 0x3c, 0x2a, 0xfc, 0x62, 0x0e, + 0xd4, 0xe6, 0xe5, 0x5c, 0x6b, 0x9f, 0x3d, 0xc7, + }, + }, + { + .data_len = 1000, + .digest = { + 0xd6, 0x8c, 0xea, 0x8a, 0x0f, 0x68, 0xa9, 0xa8, + 0x67, 0x86, 0xf9, 0xc1, 0x4c, 0x26, 0x10, 0x6d, + }, + }, + { + .data_len = 3333, + .digest = { + 0xdc, 0xc1, 0x54, 0xe7, 0x38, 0xc6, 0xdb, 0x24, + 0xa7, 0x0b, 
0xff, 0xd3, 0x1b, 0x93, 0x01, 0xa6, + }, + }, + { + .data_len = 4096, + .digest = { + 0x8f, 0x88, 0x3e, 0x9c, 0x7b, 0x2e, 0x82, 0x5a, + 0x1d, 0x31, 0x82, 0xcc, 0x69, 0xb4, 0x16, 0x26, + }, + }, + { + .data_len = 4128, + .digest = { + 0x23, 0x45, 0x94, 0xa8, 0x11, 0x54, 0x9d, 0xf2, + 0xa1, 0x9a, 0xca, 0xf9, 0x3e, 0x65, 0x52, 0xfd, + }, + }, + { + .data_len = 4160, + .digest = { + 0x7b, 0xfc, 0xa9, 0x1e, 0x03, 0xad, 0xef, 0x03, + 0xe2, 0x20, 0x92, 0xc7, 0x54, 0x83, 0xfa, 0x37, + }, + }, + { + .data_len = 4224, + .digest = { + 0x46, 0xab, 0x8c, 0x75, 0xb3, 0x10, 0xa6, 0x3f, + 0x74, 0x55, 0x42, 0x6d, 0x69, 0x35, 0xd2, 0xf5, + }, + }, + { + .data_len = 16384, + .digest = { + 0xd0, 0xfe, 0x26, 0xc2, 0xca, 0x94, 0x2d, 0x52, + 0x2d, 0xe1, 0x11, 0xdd, 0x42, 0x28, 0x83, 0xa6, + }, + }, +}; + +static const u8 hash_testvec_consolidated[POLY1305_DIGEST_SIZE] = { + 0x9d, 0x07, 0x5d, 0xc9, 0x6c, 0xeb, 0x62, 0x5d, + 0x02, 0x5f, 0xe1, 0xe3, 0xd1, 0x71, 0x69, 0x34, +}; + +static const u8 poly1305_allones_macofmacs[POLY1305_DIGEST_SIZE] = { + 0x0c, 0x26, 0x6b, 0x45, 0x87, 0x06, 0xcf, 0xc4, + 0x3f, 0x70, 0x7d, 0xb3, 0x50, 0xdd, 0x81, 0x25, +}; diff --git a/lib/crypto/tests/poly1305_kunit.c b/lib/crypto/tests/poly1305_kunit.c new file mode 100644 index 000000000000..7ac191bd96b6 --- /dev/null +++ b/lib/crypto/tests/poly1305_kunit.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include <crypto/poly1305.h> +#include "poly1305-testvecs.h" + +/* + * A fixed key used when presenting Poly1305 as an unkeyed hash function in + * order to reuse hash-test-template.h. At the beginning of the test suite, + * this is initialized to bytes generated from a fixed seed. + */ +static u8 test_key[POLY1305_KEY_SIZE]; + +/* This probably should be in the actual API, but just define it here for now */ +static void poly1305(const u8 key[POLY1305_KEY_SIZE], const u8 *data, + size_t len, u8 out[POLY1305_DIGEST_SIZE]) +{ + struct poly1305_desc_ctx ctx; + + poly1305_init(&ctx, key); + poly1305_update(&ctx, data, len); + poly1305_final(&ctx, out); +} + +static void poly1305_init_withtestkey(struct poly1305_desc_ctx *ctx) +{ + poly1305_init(ctx, test_key); +} + +static void poly1305_withtestkey(const u8 *data, size_t len, + u8 out[POLY1305_DIGEST_SIZE]) +{ + poly1305(test_key, data, len, out); +} + +/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */ +#define HASH poly1305_withtestkey +#define HASH_CTX poly1305_desc_ctx +#define HASH_SIZE POLY1305_DIGEST_SIZE +#define HASH_INIT poly1305_init_withtestkey +#define HASH_UPDATE poly1305_update +#define HASH_FINAL poly1305_final +#include "hash-test-template.h" + +static int poly1305_suite_init(struct kunit_suite *suite) +{ + rand_bytes_seeded_from_len(test_key, POLY1305_KEY_SIZE); + return hash_suite_init(suite); +} + +static void poly1305_suite_exit(struct kunit_suite *suite) +{ + hash_suite_exit(suite); +} + +/* + * Poly1305 test case which uses a key and message consisting only of one bits: + * + * - Using an all-one-bits r_key tests the key clamping. + * - Using an all-one-bits s_key tests carries in implementations of the + * addition mod 2**128 during finalization. + * - Using all-one-bits message, and to a lesser extent r_key, tends to maximize + * any intermediate accumulator values. 
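(Each full all-ones block, once the 2**128 pad bit is added, encodes the value 2**129 - 1, and an all-ones r_key clamps to the largest multiplier that Poly1305 permits.)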
This increases the chance of
+ * detecting bugs that occur only in rare cases where the accumulator values
+ * get very large, for example the bug fixed by commit 678cce4019d746da
+ * ("crypto: x86/poly1305 - fix overflow during partial reduction").
+ *
+ * Accumulator overflow bugs may be specific to particular update lengths (in
+ * blocks) and/or particular values of the previous accumulator. Note that the
+ * accumulator starts at 0 which gives the lowest chance of an overflow. Thus,
+ * a single all-one-bits test vector may be insufficient.
+ *
+ * Considering that, do the following test: continuously update a single
+ * Poly1305 context with all-one-bits data of varying lengths (0, 16, 32, ...,
+ * 4096 bytes). After each update, generate the MAC from the current context,
+ * and feed that MAC into a separate Poly1305 context. Repeat that entire
+ * sequence of updates 32 times without re-initializing either context,
+ * resulting in a total of 8224 MAC computations from a long-running, cumulative
+ * context. Finally, generate and verify the MAC of all the MACs.
+ */
+static void test_poly1305_allones_keys_and_message(struct kunit *test)
+{
+        struct poly1305_desc_ctx mac_ctx, macofmacs_ctx;
+        u8 mac[POLY1305_DIGEST_SIZE];
+
+        static_assert(TEST_BUF_LEN >= 4096);
+        memset(test_buf, 0xff, 4096);
+
+        poly1305_init(&mac_ctx, test_buf);
+        poly1305_init(&macofmacs_ctx, test_buf);
+        for (int i = 0; i < 32; i++) {
+                for (size_t len = 0; len <= 4096; len += 16) {
+                        struct poly1305_desc_ctx tmp_ctx;
+
+                        poly1305_update(&mac_ctx, test_buf, len);
+                        tmp_ctx = mac_ctx;
+                        poly1305_final(&tmp_ctx, mac);
+                        poly1305_update(&macofmacs_ctx, mac,
+                                        POLY1305_DIGEST_SIZE);
+                }
+        }
+        poly1305_final(&macofmacs_ctx, mac);
+        KUNIT_ASSERT_MEMEQ(test, mac, poly1305_allones_macofmacs,
+                           POLY1305_DIGEST_SIZE);
+}
+
+/*
+ * Poly1305 test case which uses r_key=1, s_key=0, and a 48-byte message
+ * consisting of three blocks with integer values [2**128 - i, 0, 0]. In this
+ * case, the result of the polynomial evaluation is 2**130 - i. For small
+ * values of i, this is very close to the modulus 2**130 - 5, which helps catch
+ * edge case bugs in the modular reduction logic.
+ */
+static void test_poly1305_reduction_edge_cases(struct kunit *test)
+{
+        static const u8 key[POLY1305_KEY_SIZE] = { 1 }; /* r_key=1, s_key=0 */
+        u8 data[3 * POLY1305_BLOCK_SIZE] = {};
+        u8 expected_mac[POLY1305_DIGEST_SIZE];
+        u8 actual_mac[POLY1305_DIGEST_SIZE];
+
+        for (int i = 1; i <= 10; i++) {
+                /* Set the first data block to 2**128 - i. */
+                data[0] = -i;
+                memset(&data[1], 0xff, POLY1305_BLOCK_SIZE - 1);
+
+                /*
+                 * Assuming s_key=0, the expected MAC as an integer is
+                 * ((2**130 - i) mod (2**130 - 5)) + 0 mod 2**128. If
+                 * 1 <= i <= 5, that's 5 - i. If 6 <= i <= 10, that's 2**128 - i.
+                 */
+                if (i <= 5) {
+                        expected_mac[0] = 5 - i;
+                        memset(&expected_mac[1], 0, POLY1305_DIGEST_SIZE - 1);
+                } else {
+                        expected_mac[0] = -i;
+                        memset(&expected_mac[1], 0xff,
+                               POLY1305_DIGEST_SIZE - 1);
+                }
+
+                /* Compute and verify the MAC.
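+                 * For example, i=3 wraps the modulus once
+                 * ((2**130 - 3) mod (2**130 - 5) = 2), whereas i=6 stays below
+                 * it and only the final reduction mod 2**128 applies, giving
+                 * 2**128 - 6.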
*/ + poly1305(key, data, sizeof(data), actual_mac); + KUNIT_ASSERT_MEMEQ(test, actual_mac, expected_mac, + POLY1305_DIGEST_SIZE); + } +} + +static struct kunit_case poly1305_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(test_poly1305_allones_keys_and_message), + KUNIT_CASE(test_poly1305_reduction_edge_cases), + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite poly1305_test_suite = { + .name = "poly1305", + .test_cases = poly1305_test_cases, + .suite_init = poly1305_suite_init, + .suite_exit = poly1305_suite_exit, +}; +kunit_test_suite(poly1305_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for Poly1305"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/tests/sha1-testvecs.h b/lib/crypto/tests/sha1-testvecs.h new file mode 100644 index 000000000000..f5d050122e84 --- /dev/null +++ b/lib/crypto/tests/sha1-testvecs.h @@ -0,0 +1,212 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha1 */ + +static const struct { + size_t data_len; + u8 digest[SHA1_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d, + 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90, + 0xaf, 0xd8, 0x07, 0x09, + }, + }, + { + .data_len = 1, + .digest = { + 0x0a, 0xd0, 0x52, 0xdd, 0x9f, 0x32, 0x40, 0x55, + 0x21, 0xe4, 0x3c, 0x6e, 0xbd, 0xc5, 0x2f, 0x5a, + 0x02, 0x54, 0x93, 0xb2, + }, + }, + { + .data_len = 2, + .digest = { + 0x13, 0x83, 0x82, 0x03, 0x23, 0xff, 0x46, 0xd6, + 0x12, 0x7f, 0xad, 0x05, 0x2b, 0xc3, 0x4a, 0x42, + 0x49, 0x6a, 0xf8, 0x84, + }, + }, + { + .data_len = 3, + .digest = { + 0xe4, 0xdf, 0x7b, 0xdc, 0xe8, 0x6e, 0x81, 0x97, + 0x1e, 0x0f, 0xe8, 0x8b, 0x76, 0xa8, 0x59, 0x04, + 0xae, 0x92, 0x1a, 0x7c, + }, + }, + { + .data_len = 16, + .digest = { + 0x8c, 0x1c, 0x30, 0xd8, 0xbc, 0xc4, 0xc3, 0xf5, + 0xf8, 0x83, 0x0d, 0x1e, 0x04, 0x5d, 0x29, 0xb5, + 0x68, 0x89, 0xc1, 0xe9, + }, + }, + { + .data_len = 32, + .digest = { + 0x6c, 0x1d, 0x72, 0x31, 0xa5, 0x03, 0x4f, 0xdc, + 0xff, 0x2d, 0x06, 0x3e, 0x24, 0x26, 0x34, 0x8d, + 0x60, 0xa4, 0x67, 0x16, + }, + }, + { + .data_len = 48, + .digest = { + 0x37, 0x53, 0x33, 0xfa, 0xd0, 0x21, 0xad, 0xe7, + 0xa5, 0x43, 0xf1, 0x94, 0x64, 0x11, 0x47, 0x9c, + 0x72, 0xb5, 0x78, 0xb4, + }, + }, + { + .data_len = 49, + .digest = { + 0x51, 0x5c, 0xd8, 0x5a, 0xa9, 0xde, 0x7b, 0x2a, + 0xa2, 0xff, 0x70, 0x09, 0x56, 0x88, 0x40, 0x2b, + 0x50, 0x93, 0x82, 0x47, + }, + }, + { + .data_len = 63, + .digest = { + 0xbc, 0x9c, 0xab, 0x93, 0x06, 0xd5, 0xdb, 0xac, + 0x2c, 0x33, 0x15, 0x83, 0x56, 0xf6, 0x91, 0x20, + 0x09, 0xc7, 0xb2, 0x6b, + }, + }, + { + .data_len = 64, + .digest = { + 0x26, 0x90, 0x3b, 0x47, 0xe1, 0x92, 0x42, 0xd0, + 0x85, 0x63, 0x2e, 0x6b, 0x68, 0xa4, 0xc4, 0x4c, + 0xe6, 0xf4, 0xb0, 0x52, + }, + }, + { + .data_len = 65, + .digest = { + 0x55, 0x6f, 0x87, 0xdc, 0x34, 0x3d, 0xe2, 0x4f, + 0xc3, 0x81, 0xa4, 0x82, 0x79, 0x84, 0x64, 0x01, + 0x55, 0xa0, 0x1e, 0x36, + }, + }, + { + .data_len = 127, + .digest = { + 0xb7, 0xd5, 0x5f, 0xa4, 0xef, 0xbf, 0x4f, 0x96, + 0x01, 0xc1, 0x06, 0xe3, 0x75, 0xa8, 0x90, 0x92, + 0x4c, 0x5f, 0xf1, 0x21, + }, + }, + { + .data_len = 128, + .digest = { + 0x70, 0x0c, 0xea, 0xa4, 0x93, 0xd0, 0x56, 0xf0, + 0x6f, 0xbb, 0x53, 0x42, 0x5b, 0xe3, 0xf2, 0xb0, + 0x30, 0x66, 0x8e, 0x75, + }, + }, + { + .data_len = 129, + .digest = { + 0x15, 0x01, 0xbc, 0xb0, 0xee, 0xd8, 0xeb, 0xa8, + 0x7d, 0xd9, 0x4d, 0x50, 0x2e, 0x41, 0x30, 0xba, + 0x41, 0xaa, 0x7b, 0x02, + }, + }, + { + .data_len = 256, + .digest = { + 
0x98, 0x05, 0x52, 0xf5, 0x0f, 0xf0, 0xd3, 0x97, + 0x15, 0x8c, 0xa3, 0x9a, 0x2b, 0x4d, 0x67, 0x57, + 0x29, 0xa0, 0xac, 0x61, + }, + }, + { + .data_len = 511, + .digest = { + 0x1f, 0x47, 0xf0, 0xcc, 0xd7, 0xda, 0xa5, 0x3b, + 0x39, 0xb4, 0x5b, 0xa8, 0x33, 0xd4, 0xca, 0x2f, + 0xdd, 0xf2, 0x39, 0x89, + }, + }, + { + .data_len = 513, + .digest = { + 0xb9, 0x75, 0xe6, 0x57, 0x42, 0x7f, 0x8b, 0x0a, + 0xcc, 0x53, 0x10, 0x69, 0x45, 0xac, 0xfd, 0x11, + 0xf7, 0x1f, 0x4e, 0x6f, + }, + }, + { + .data_len = 1000, + .digest = { + 0x63, 0x66, 0xcb, 0x44, 0xc1, 0x2c, 0xa2, 0x06, + 0x5d, 0xb9, 0x8e, 0x31, 0xcb, 0x4f, 0x4e, 0x49, + 0xe0, 0xfb, 0x3c, 0x4e, + }, + }, + { + .data_len = 3333, + .digest = { + 0x35, 0xbc, 0x74, 0xfb, 0x31, 0x9c, 0xd4, 0xdd, + 0xe8, 0x87, 0xa7, 0x56, 0x3b, 0x08, 0xe5, 0x49, + 0xe1, 0xe9, 0xc9, 0xa8, + }, + }, + { + .data_len = 4096, + .digest = { + 0x43, 0x00, 0xea, 0xcd, 0x4e, 0x7c, 0xe9, 0xe4, + 0x32, 0xce, 0x25, 0xa8, 0xcd, 0x20, 0xa8, 0xaa, + 0x7b, 0x63, 0x2c, 0x3c, + }, + }, + { + .data_len = 4128, + .digest = { + 0xd0, 0x67, 0x26, 0x0e, 0x22, 0x72, 0xaa, 0x63, + 0xfc, 0x34, 0x55, 0x07, 0xab, 0xc8, 0x64, 0xb6, + 0xc4, 0xea, 0xd5, 0x7c, + }, + }, + { + .data_len = 4160, + .digest = { + 0x6b, 0xc9, 0x5e, 0xb9, 0x41, 0x19, 0x50, 0x35, + 0xf1, 0x39, 0xfe, 0xd9, 0x72, 0x6d, 0xd0, 0x55, + 0xb8, 0x1f, 0x1a, 0x95, + }, + }, + { + .data_len = 4224, + .digest = { + 0x70, 0x5d, 0x10, 0x2e, 0x4e, 0x44, 0xc9, 0x80, + 0x8f, 0xba, 0x13, 0xbc, 0xd0, 0x77, 0x78, 0xc7, + 0x84, 0xe3, 0x24, 0x43, + }, + }, + { + .data_len = 16384, + .digest = { + 0xa8, 0x82, 0xca, 0x08, 0xb4, 0x84, 0x09, 0x13, + 0xc0, 0x9c, 0x26, 0x18, 0xcf, 0x0f, 0xf3, 0x08, + 0xff, 0xa1, 0xe4, 0x5d, + }, + }, +}; + +static const u8 hash_testvec_consolidated[SHA1_DIGEST_SIZE] = { + 0xe1, 0x72, 0xa5, 0x3c, 0xda, 0xf2, 0xe5, 0x56, + 0xb8, 0xb5, 0x35, 0x6e, 0xce, 0xc8, 0x37, 0x57, + 0x31, 0xb4, 0x05, 0xdd, +}; + +static const u8 hmac_testvec_consolidated[SHA1_DIGEST_SIZE] = { + 0x9d, 0xe5, 0xb1, 0x43, 0x97, 0x95, 0x16, 0x52, + 0xa0, 0x7a, 0xc0, 0xe2, 0xc1, 0x60, 0x64, 0x7c, + 0x24, 0xf9, 0x34, 0xd7, +}; diff --git a/lib/crypto/tests/sha1_kunit.c b/lib/crypto/tests/sha1_kunit.c new file mode 100644 index 000000000000..24ba8d5669c8 --- /dev/null +++ b/lib/crypto/tests/sha1_kunit.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include <crypto/sha1.h> +#include "sha1-testvecs.h" + +#define HASH sha1 +#define HASH_CTX sha1_ctx +#define HASH_SIZE SHA1_DIGEST_SIZE +#define HASH_INIT sha1_init +#define HASH_UPDATE sha1_update +#define HASH_FINAL sha1_final +#define HMAC_KEY hmac_sha1_key +#define HMAC_CTX hmac_sha1_ctx +#define HMAC_PREPAREKEY hmac_sha1_preparekey +#define HMAC_INIT hmac_sha1_init +#define HMAC_UPDATE hmac_sha1_update +#define HMAC_FINAL hmac_sha1_final +#define HMAC hmac_sha1 +#define HMAC_USINGRAWKEY hmac_sha1_usingrawkey +#include "hash-test-template.h" + +static struct kunit_case hash_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite hash_test_suite = { + .name = "sha1", + .test_cases = hash_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(hash_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-1 and HMAC-SHA1"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/tests/sha224-testvecs.h b/lib/crypto/tests/sha224-testvecs.h new file mode 100644 index 000000000000..523adc178e1a --- /dev/null +++ b/lib/crypto/tests/sha224-testvecs.h @@ -0,0 
+1,238 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha224 */ + +static const struct { + size_t data_len; + u8 digest[SHA224_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0xd1, 0x4a, 0x02, 0x8c, 0x2a, 0x3a, 0x2b, 0xc9, + 0x47, 0x61, 0x02, 0xbb, 0x28, 0x82, 0x34, 0xc4, + 0x15, 0xa2, 0xb0, 0x1f, 0x82, 0x8e, 0xa6, 0x2a, + 0xc5, 0xb3, 0xe4, 0x2f, + }, + }, + { + .data_len = 1, + .digest = { + 0xe3, 0x4d, 0x79, 0x17, 0x75, 0x35, 0xdc, 0xd2, + 0x27, 0xc9, 0x9d, 0x0b, 0x90, 0x0f, 0x21, 0x5d, + 0x95, 0xfb, 0x9c, 0x6d, 0xa8, 0xec, 0x19, 0x15, + 0x12, 0xef, 0xf5, 0x0f, + }, + }, + { + .data_len = 2, + .digest = { + 0x81, 0xc7, 0x60, 0x0d, 0x6d, 0x13, 0x75, 0x70, + 0x4b, 0xc0, 0xab, 0xea, 0x04, 0xe3, 0x78, 0x7e, + 0x73, 0xb9, 0x0f, 0xb6, 0xae, 0x90, 0xf3, 0x94, + 0xb2, 0x56, 0xda, 0xc8, + }, + }, + { + .data_len = 3, + .digest = { + 0x24, 0xf0, 0x8c, 0x6e, 0x9d, 0xd6, 0x06, 0x80, + 0x0a, 0x03, 0xee, 0x9b, 0x33, 0xec, 0x83, 0x42, + 0x2c, 0x8b, 0xe7, 0xc7, 0xc6, 0x04, 0xfb, 0xc6, + 0xa3, 0x3a, 0x4d, 0xc9, + }, + }, + { + .data_len = 16, + .digest = { + 0x1c, 0x08, 0xa8, 0x55, 0x8f, 0xc6, 0x0a, 0xea, + 0x2f, 0x1b, 0x54, 0xff, 0x8d, 0xd2, 0xa3, 0xc7, + 0x42, 0xc2, 0x93, 0x3d, 0x73, 0x18, 0x84, 0xba, + 0x75, 0x49, 0x34, 0xfd, + }, + }, + { + .data_len = 32, + .digest = { + 0x45, 0xdd, 0xb5, 0xf0, 0x3c, 0xda, 0xe6, 0xd4, + 0x6c, 0x86, 0x91, 0x29, 0x11, 0x2f, 0x88, 0x7d, + 0xd8, 0x3c, 0xa3, 0xd6, 0xdd, 0x1e, 0xac, 0x98, + 0xff, 0xf0, 0x14, 0x69, + }, + }, + { + .data_len = 48, + .digest = { + 0x0b, 0xfb, 0x71, 0x4c, 0x06, 0x7a, 0xd5, 0x89, + 0x76, 0x0a, 0x43, 0x8b, 0x2b, 0x47, 0x12, 0x56, + 0xa7, 0x64, 0x33, 0x1d, 0xd3, 0x44, 0x17, 0x95, + 0x23, 0xe7, 0x53, 0x01, + }, + }, + { + .data_len = 49, + .digest = { + 0xc4, 0xae, 0x9c, 0x33, 0xd5, 0x1d, 0xf4, 0xa7, + 0xfd, 0xb7, 0xd4, 0x6b, 0xc3, 0xeb, 0xa8, 0xbf, + 0xfb, 0x07, 0x89, 0x4b, 0x07, 0x15, 0x22, 0xec, + 0xe1, 0x45, 0x84, 0xba, + }, + }, + { + .data_len = 63, + .digest = { + 0xad, 0x01, 0x34, 0x2a, 0xe2, 0x3b, 0x58, 0x06, + 0x9f, 0x20, 0xc8, 0xfb, 0xf3, 0x20, 0x82, 0xa6, + 0x9f, 0xee, 0x7a, 0xbe, 0xdf, 0xf3, 0x5d, 0x57, + 0x9b, 0xce, 0x79, 0x96, + }, + }, + { + .data_len = 64, + .digest = { + 0xa7, 0xa6, 0x47, 0xf7, 0xed, 0x2a, 0xe5, 0xe3, + 0xc0, 0x1e, 0x7b, 0x40, 0xe4, 0xf7, 0x40, 0x65, + 0x42, 0xc1, 0x6f, 0x7d, 0x8d, 0x0d, 0x17, 0x4f, + 0xd3, 0xbc, 0x0d, 0x85, + }, + }, + { + .data_len = 65, + .digest = { + 0xc4, 0x9c, 0xb5, 0x6a, 0x01, 0x2d, 0x10, 0xa9, + 0x5f, 0xa4, 0x5a, 0xe1, 0xba, 0x40, 0x12, 0x09, + 0x7b, 0xea, 0xdb, 0xa6, 0x7b, 0xcb, 0x56, 0xf0, + 0xfd, 0x5b, 0xe2, 0xe7, + }, + }, + { + .data_len = 127, + .digest = { + 0x14, 0xda, 0x0e, 0x01, 0xca, 0x78, 0x7d, 0x2d, + 0x85, 0xa3, 0xca, 0x0e, 0x80, 0xf9, 0x95, 0x10, + 0xa1, 0x7b, 0xa5, 0xaa, 0xfc, 0x95, 0x05, 0x08, + 0x53, 0xda, 0x52, 0xee, + }, + }, + { + .data_len = 128, + .digest = { + 0xa5, 0x24, 0xc4, 0x54, 0xe1, 0x50, 0xab, 0xee, + 0x22, 0xc1, 0xa7, 0x27, 0x15, 0x2c, 0x6f, 0xf7, + 0x4c, 0x31, 0xe5, 0x15, 0x25, 0x4e, 0x71, 0xc6, + 0x7e, 0xa0, 0x11, 0x5d, + }, + }, + { + .data_len = 129, + .digest = { + 0x73, 0xd0, 0x8c, 0xce, 0xed, 0xed, 0x9f, 0xaa, + 0x21, 0xaf, 0xa2, 0x08, 0x80, 0x16, 0x15, 0x59, + 0x3f, 0x1d, 0x7f, 0x0a, 0x79, 0x3d, 0x7b, 0x58, + 0xf8, 0xc8, 0x5c, 0x27, + }, + }, + { + .data_len = 256, + .digest = { + 0x31, 0xa7, 0xa1, 0xca, 0x49, 0x72, 0x75, 0xcc, + 0x6e, 0x02, 0x9e, 0xad, 0xea, 0x86, 0x5c, 0x91, + 0x02, 0xe4, 0xc9, 0xf9, 0xd3, 0x9e, 0x74, 0x50, + 0xd8, 0x43, 0x6b, 
0x85, + }, + }, + { + .data_len = 511, + .digest = { + 0x40, 0x60, 0x8b, 0xb0, 0x03, 0xa9, 0x75, 0xab, + 0x2d, 0x5b, 0x20, 0x9a, 0x05, 0x72, 0xb7, 0xa8, + 0xce, 0xf2, 0x4f, 0x66, 0x62, 0xe3, 0x7e, 0x24, + 0xd6, 0xe2, 0xea, 0xfa, + }, + }, + { + .data_len = 513, + .digest = { + 0x4f, 0x5f, 0x9f, 0x1e, 0xb3, 0x66, 0x81, 0xdb, + 0x41, 0x5d, 0x65, 0x97, 0x00, 0x8d, 0xdc, 0x62, + 0x03, 0xb0, 0x4d, 0x6b, 0x5c, 0x7f, 0x1e, 0xa0, + 0xfe, 0xfc, 0x0e, 0xb8, + }, + }, + { + .data_len = 1000, + .digest = { + 0x08, 0xa8, 0xa1, 0xc0, 0xd8, 0xf9, 0xb4, 0xaa, + 0x53, 0x22, 0xa1, 0x73, 0x0b, 0x45, 0xa0, 0x20, + 0x72, 0xf3, 0xa9, 0xbc, 0x51, 0xd0, 0x20, 0x79, + 0x69, 0x97, 0xf7, 0xe3, + }, + }, + { + .data_len = 3333, + .digest = { + 0xe8, 0x60, 0x5f, 0xb9, 0x12, 0xe1, 0x6b, 0x24, + 0xc5, 0xe8, 0x43, 0xa9, 0x5c, 0x3f, 0x65, 0xed, + 0xbe, 0xfd, 0x77, 0xf5, 0x47, 0xf2, 0x75, 0x21, + 0xc2, 0x8f, 0x54, 0x8f, + }, + }, + { + .data_len = 4096, + .digest = { + 0xc7, 0xdf, 0x50, 0x16, 0x10, 0x01, 0xb7, 0xdf, + 0x34, 0x1d, 0x18, 0xa2, 0xd5, 0xad, 0x1f, 0x50, + 0xf7, 0xa8, 0x9a, 0x72, 0xfb, 0xfd, 0xd9, 0x1c, + 0x57, 0xac, 0x08, 0x97, + }, + }, + { + .data_len = 4128, + .digest = { + 0xdf, 0x16, 0x76, 0x7f, 0xc0, 0x16, 0x84, 0x63, + 0xac, 0xcf, 0xd0, 0x78, 0x1e, 0x96, 0x67, 0xc5, + 0x3c, 0x06, 0xe9, 0xdb, 0x6e, 0x7d, 0xd0, 0x07, + 0xaa, 0xb1, 0x56, 0xc9, + }, + }, + { + .data_len = 4160, + .digest = { + 0x49, 0xec, 0x5c, 0x18, 0xd7, 0x5b, 0xda, 0xed, + 0x5b, 0x59, 0xde, 0x09, 0x34, 0xb2, 0x49, 0x43, + 0x62, 0x6a, 0x0a, 0x63, 0x6a, 0x51, 0x08, 0x37, + 0x8c, 0xb6, 0x29, 0x84, + }, + }, + { + .data_len = 4224, + .digest = { + 0x3d, 0xc2, 0xc8, 0x43, 0xcf, 0xb7, 0x33, 0x14, + 0x04, 0x93, 0xed, 0xe2, 0xcd, 0x8a, 0x69, 0x5c, + 0x5a, 0xd5, 0x9b, 0x52, 0xdf, 0x48, 0xa7, 0xaa, + 0x28, 0x2b, 0x5d, 0x27, + }, + }, + { + .data_len = 16384, + .digest = { + 0xa7, 0xaf, 0xda, 0x92, 0xe2, 0xe7, 0x61, 0xdc, + 0xa1, 0x32, 0x53, 0x2a, 0x3f, 0x41, 0x5c, 0x7e, + 0xc9, 0x89, 0xda, 0x1c, 0xf7, 0x8d, 0x00, 0xbd, + 0x21, 0x73, 0xb1, 0x69, + }, + }, +}; + +static const u8 hash_testvec_consolidated[SHA224_DIGEST_SIZE] = { + 0x9e, 0xb8, 0x82, 0xab, 0x83, 0x37, 0xe4, 0x63, + 0x84, 0xee, 0x21, 0x15, 0xc2, 0xbb, 0xa3, 0x17, + 0x8f, 0xc4, 0x99, 0x33, 0xa0, 0x2c, 0x9f, 0xec, + 0xca, 0xd0, 0xf3, 0x73, +}; + +static const u8 hmac_testvec_consolidated[SHA224_DIGEST_SIZE] = { + 0x66, 0x34, 0x79, 0x92, 0x47, 0x0e, 0xcd, 0x70, + 0xb0, 0x8b, 0x91, 0xcb, 0x94, 0x2f, 0x67, 0x65, + 0x2f, 0xc9, 0xd2, 0x91, 0x32, 0xaf, 0xf7, 0x5f, + 0xb6, 0x01, 0x5b, 0xf2, +}; diff --git a/lib/crypto/tests/sha224_kunit.c b/lib/crypto/tests/sha224_kunit.c new file mode 100644 index 000000000000..962ad46b9c99 --- /dev/null +++ b/lib/crypto/tests/sha224_kunit.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include <crypto/sha2.h> +#include "sha224-testvecs.h" + +#define HASH sha224 +#define HASH_CTX sha224_ctx +#define HASH_SIZE SHA224_DIGEST_SIZE +#define HASH_INIT sha224_init +#define HASH_UPDATE sha224_update +#define HASH_FINAL sha224_final +#define HMAC_KEY hmac_sha224_key +#define HMAC_CTX hmac_sha224_ctx +#define HMAC_PREPAREKEY hmac_sha224_preparekey +#define HMAC_INIT hmac_sha224_init +#define HMAC_UPDATE hmac_sha224_update +#define HMAC_FINAL hmac_sha224_final +#define HMAC hmac_sha224 +#define HMAC_USINGRAWKEY hmac_sha224_usingrawkey +#include "hash-test-template.h" + +static struct kunit_case hash_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite 
hash_test_suite = { + .name = "sha224", + .test_cases = hash_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(hash_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-224 and HMAC-SHA224"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/tests/sha256-testvecs.h b/lib/crypto/tests/sha256-testvecs.h new file mode 100644 index 000000000000..2db7b2042034 --- /dev/null +++ b/lib/crypto/tests/sha256-testvecs.h @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha256 */ + +static const struct { + size_t data_len; + u8 digest[SHA256_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, + 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24, + 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, + 0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55, + }, + }, + { + .data_len = 1, + .digest = { + 0x45, 0xf8, 0x3d, 0x17, 0xe1, 0x0b, 0x34, 0xfc, + 0xa0, 0x1e, 0xb8, 0xf4, 0x45, 0x4d, 0xac, 0x34, + 0xa7, 0x77, 0xd9, 0x40, 0x4a, 0x46, 0x4e, 0x73, + 0x2c, 0xf4, 0xab, 0xf2, 0xc0, 0xda, 0x94, 0xc4, + }, + }, + { + .data_len = 2, + .digest = { + 0xf9, 0xd3, 0x52, 0x2f, 0xd5, 0xe0, 0x99, 0x15, + 0x1c, 0xd6, 0xa9, 0x24, 0x4f, 0x40, 0xba, 0x25, + 0x33, 0x43, 0x3e, 0xe1, 0x78, 0x6a, 0xfe, 0x7d, + 0x07, 0xe2, 0x29, 0x7b, 0x6d, 0xc5, 0x73, 0xf5, + }, + }, + { + .data_len = 3, + .digest = { + 0x71, 0xf7, 0xa1, 0xef, 0x69, 0x86, 0x0e, 0xe4, + 0x87, 0x25, 0x58, 0x4c, 0x07, 0x2c, 0xfc, 0x60, + 0xc5, 0xf6, 0xe2, 0x44, 0xaa, 0xfb, 0x41, 0xc7, + 0x2b, 0xc5, 0x01, 0x8c, 0x39, 0x98, 0x30, 0x37, + }, + }, + { + .data_len = 16, + .digest = { + 0x09, 0x95, 0x9a, 0xfa, 0x25, 0x18, 0x86, 0x06, + 0xfe, 0x65, 0xc9, 0x2f, 0x91, 0x15, 0x74, 0x06, + 0x6c, 0xbf, 0xef, 0x7b, 0x0b, 0xc7, 0x2c, 0x05, + 0xdd, 0x17, 0x5d, 0x6f, 0x8a, 0xa5, 0xde, 0x3c, + }, + }, + { + .data_len = 32, + .digest = { + 0xe5, 0x52, 0x3c, 0x85, 0xea, 0x1b, 0xe1, 0x6c, + 0xe0, 0xdb, 0xc3, 0xef, 0xf0, 0xca, 0xc2, 0xe1, + 0xb9, 0x36, 0xa1, 0x28, 0xb6, 0x9e, 0xf5, 0x6e, + 0x70, 0xf7, 0xf9, 0xa7, 0x1c, 0xd3, 0x22, 0xd0, + }, + }, + { + .data_len = 48, + .digest = { + 0x5f, 0x84, 0xd4, 0xd7, 0x2e, 0x80, 0x09, 0xef, + 0x1c, 0x77, 0x7c, 0x25, 0x59, 0x63, 0x88, 0x64, + 0xfd, 0x56, 0xea, 0x23, 0xf4, 0x4f, 0x2e, 0x49, + 0xcd, 0xb4, 0xaa, 0xc7, 0x5c, 0x8b, 0x75, 0x84, + }, + }, + { + .data_len = 49, + .digest = { + 0x22, 0x6e, 0xca, 0xda, 0x00, 0x2d, 0x90, 0x96, + 0x24, 0xf8, 0x55, 0x17, 0x11, 0xda, 0x42, 0x1c, + 0x78, 0x4e, 0xbf, 0xd9, 0xc5, 0xcf, 0xf3, 0xe3, + 0xaf, 0xd3, 0x60, 0xcd, 0xaa, 0xe2, 0xc7, 0x22, + }, + }, + { + .data_len = 63, + .digest = { + 0x97, 0xe2, 0x74, 0xdc, 0x6b, 0xa4, 0xaf, 0x32, + 0x3b, 0x50, 0x6d, 0x80, 0xb5, 0xd3, 0x0c, 0x36, + 0xea, 0x3f, 0x5d, 0x36, 0xa7, 0x49, 0x51, 0xf3, + 0xbd, 0x69, 0x68, 0x60, 0x9b, 0xde, 0x73, 0xf5, + }, + }, + { + .data_len = 64, + .digest = { + 0x13, 0x74, 0xb1, 0x72, 0xd6, 0x53, 0x48, 0x28, + 0x42, 0xd8, 0xba, 0x64, 0x20, 0x60, 0xb6, 0x4c, + 0xc3, 0xac, 0x5d, 0x93, 0x8c, 0xb9, 0xd4, 0xcc, + 0xb4, 0x9f, 0x31, 0x1f, 0xeb, 0x68, 0x35, 0x58, + }, + }, + { + .data_len = 65, + .digest = { + 0xda, 0xbe, 0xd7, 0xbc, 0x6e, 0xe6, 0x5a, 0x57, + 0xeb, 0x9a, 0x93, 0xaa, 0x66, 0xd0, 0xe0, 0xc4, + 0x29, 0x7f, 0xe9, 0x3b, 0x8e, 0xdf, 0x81, 0x82, + 0x8d, 0x15, 0x11, 0x59, 0x4e, 0x13, 0xa5, 0x58, + }, + }, + { + .data_len = 127, + .digest = { + 0x8c, 0x1a, 0xba, 0x40, 0x66, 0x94, 0x19, 0xf4, + 0x2e, 0xa2, 0xae, 0x94, 0x53, 0x18, 
0xb6, 0xfd, + 0xa0, 0x12, 0xc5, 0xef, 0xd5, 0xd6, 0x1b, 0xa1, + 0x37, 0xea, 0x19, 0x44, 0x35, 0x54, 0x85, 0x74, + }, + }, + { + .data_len = 128, + .digest = { + 0xfd, 0x07, 0xd8, 0x77, 0x7d, 0x8b, 0x4f, 0xee, + 0x60, 0x60, 0x26, 0xef, 0x2a, 0x86, 0xfb, 0x67, + 0xeb, 0x31, 0x27, 0x03, 0x99, 0x3c, 0xde, 0xe5, + 0x84, 0x72, 0x71, 0x4c, 0x33, 0x7b, 0x87, 0x13, + }, + }, + { + .data_len = 129, + .digest = { + 0x97, 0xc5, 0x58, 0x38, 0x20, 0xc7, 0xde, 0xfa, + 0xdd, 0x9b, 0x10, 0xc6, 0xc2, 0x2f, 0x94, 0xb5, + 0xc0, 0x33, 0xc0, 0x20, 0x1c, 0x2f, 0xb4, 0x28, + 0x5e, 0x36, 0xfa, 0x8c, 0x24, 0x1c, 0x18, 0x27, + }, + }, + { + .data_len = 256, + .digest = { + 0x62, 0x17, 0x84, 0x26, 0x98, 0x30, 0x57, 0xca, + 0x4f, 0x32, 0xd9, 0x09, 0x09, 0x34, 0xe2, 0xcb, + 0x92, 0x45, 0xd5, 0xeb, 0x8b, 0x9b, 0x3c, 0xd8, + 0xaa, 0xc7, 0xd2, 0x2b, 0x04, 0xab, 0xb3, 0x35, + }, + }, + { + .data_len = 511, + .digest = { + 0x7f, 0xe1, 0x09, 0x78, 0x5d, 0x61, 0xfa, 0x5e, + 0x9b, 0x8c, 0xb1, 0xa9, 0x09, 0x69, 0xb4, 0x24, + 0x54, 0xf2, 0x1c, 0xc9, 0x5f, 0xfb, 0x59, 0x9d, + 0x36, 0x1b, 0x37, 0x44, 0xfc, 0x64, 0x79, 0xb6, + }, + }, + { + .data_len = 513, + .digest = { + 0xd2, 0x3b, 0x3a, 0xe7, 0x13, 0x4f, 0xbd, 0x29, + 0x6b, 0xd2, 0x79, 0x26, 0x6c, 0xd2, 0x22, 0x43, + 0x25, 0x34, 0x9b, 0x9b, 0x22, 0xb0, 0x9f, 0x61, + 0x1d, 0xf4, 0xe2, 0x65, 0x68, 0x95, 0x02, 0x6c, + }, + }, + { + .data_len = 1000, + .digest = { + 0x0c, 0x34, 0x53, 0x3f, 0x0f, 0x8a, 0x39, 0x8d, + 0x63, 0xe4, 0x83, 0x6e, 0x11, 0x7d, 0x14, 0x8e, + 0x5b, 0xf0, 0x4d, 0xca, 0x23, 0x24, 0xb5, 0xd2, + 0x13, 0x3f, 0xd9, 0xde, 0x84, 0x74, 0x26, 0x59, + }, + }, + { + .data_len = 3333, + .digest = { + 0xa8, 0xb8, 0x83, 0x01, 0x1b, 0x38, 0x7a, 0xca, + 0x59, 0xe9, 0x5b, 0x37, 0x6a, 0xab, 0xb4, 0x85, + 0x94, 0x73, 0x26, 0x04, 0xef, 0xed, 0xf4, 0x0d, + 0xd6, 0x09, 0x21, 0x09, 0x96, 0x78, 0xe3, 0xcf, + }, + }, + { + .data_len = 4096, + .digest = { + 0x0b, 0x12, 0x66, 0x96, 0x78, 0x4f, 0x2c, 0x35, + 0xa4, 0xed, 0xbc, 0xb8, 0x30, 0xa6, 0x37, 0x9b, + 0x94, 0x13, 0xae, 0x86, 0xf0, 0x20, 0xfb, 0x49, + 0x8f, 0x5d, 0x20, 0x70, 0x60, 0x2b, 0x02, 0x70, + }, + }, + { + .data_len = 4128, + .digest = { + 0xe4, 0xbd, 0xe4, 0x3b, 0x85, 0xf4, 0x6f, 0x11, + 0xad, 0xc4, 0x79, 0xcc, 0x8e, 0x6d, 0x8b, 0x15, + 0xbb, 0xf9, 0xd3, 0x65, 0xe1, 0xf8, 0x8d, 0x22, + 0x65, 0x66, 0x66, 0xb3, 0xf5, 0xd0, 0x9c, 0xaf, + }, + }, + { + .data_len = 4160, + .digest = { + 0x90, 0x5f, 0xe0, 0xfc, 0xb1, 0xdc, 0x38, 0x1b, + 0xe5, 0x37, 0x3f, 0xd2, 0xcc, 0x48, 0xc4, 0xbc, + 0xb4, 0xfd, 0xf7, 0x71, 0x5f, 0x6b, 0xf4, 0xc4, + 0xa6, 0x08, 0x7e, 0xfc, 0x4e, 0x96, 0xf7, 0xc2, + }, + }, + { + .data_len = 4224, + .digest = { + 0x1f, 0x34, 0x0a, 0x3b, 0xdb, 0xf7, 0x7a, 0xdb, + 0x3d, 0x89, 0x85, 0x0c, 0xd2, 0xf0, 0x0c, 0xbd, + 0x25, 0x39, 0x14, 0x06, 0x28, 0x0f, 0x6b, 0x5f, + 0xe3, 0x1f, 0x2a, 0xb6, 0xca, 0x56, 0x41, 0xa1, + }, + }, + { + .data_len = 16384, + .digest = { + 0x7b, 0x01, 0x2d, 0x84, 0x70, 0xee, 0xe0, 0x77, + 0x3c, 0x17, 0x63, 0xfe, 0x40, 0xd7, 0xfd, 0xa1, + 0x75, 0x90, 0xb8, 0x3e, 0x50, 0xcd, 0x06, 0xb7, + 0xb9, 0xb9, 0x2b, 0x91, 0x4f, 0xba, 0xe4, 0x4c, + }, + }, +}; + +static const u8 hash_testvec_consolidated[SHA256_DIGEST_SIZE] = { + 0x78, 0x1c, 0xb1, 0x9f, 0xe5, 0xe1, 0xcb, 0x41, + 0x8b, 0x34, 0x00, 0x33, 0x57, 0xc3, 0x1c, 0x8f, + 0x5c, 0x84, 0xc7, 0x8b, 0x87, 0x6a, 0x13, 0x29, + 0x2f, 0xc4, 0x1a, 0x70, 0x5e, 0x40, 0xf2, 0xfe, +}; + +static const u8 hmac_testvec_consolidated[SHA256_DIGEST_SIZE] = { + 0x56, 0x96, 0x2e, 0x23, 0x3f, 0x94, 0x89, 0x0d, + 0x0f, 0x24, 0x36, 0x2e, 0x19, 0x3d, 0xb5, 0xac, + 0xb8, 0xcd, 
0xf1, 0xc9, 0xca, 0xac, 0xee, 0x9d, + 0x62, 0xe6, 0x81, 0xe5, 0x96, 0xf9, 0x38, 0xf5, +}; diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c new file mode 100644 index 000000000000..1cd4caee6010 --- /dev/null +++ b/lib/crypto/tests/sha256_kunit.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include <crypto/sha2.h> +#include "sha256-testvecs.h" + +#define HASH sha256 +#define HASH_CTX sha256_ctx +#define HASH_SIZE SHA256_DIGEST_SIZE +#define HASH_INIT sha256_init +#define HASH_UPDATE sha256_update +#define HASH_FINAL sha256_final +#define HMAC_KEY hmac_sha256_key +#define HMAC_CTX hmac_sha256_ctx +#define HMAC_PREPAREKEY hmac_sha256_preparekey +#define HMAC_INIT hmac_sha256_init +#define HMAC_UPDATE hmac_sha256_update +#define HMAC_FINAL hmac_sha256_final +#define HMAC hmac_sha256 +#define HMAC_USINGRAWKEY hmac_sha256_usingrawkey +#include "hash-test-template.h" + +static struct kunit_case hash_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite hash_test_suite = { + .name = "sha256", + .test_cases = hash_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(hash_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-256 and HMAC-SHA256"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/tests/sha384-testvecs.h b/lib/crypto/tests/sha384-testvecs.h new file mode 100644 index 000000000000..6e2f572dd792 --- /dev/null +++ b/lib/crypto/tests/sha384-testvecs.h @@ -0,0 +1,290 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha384 */ + +static const struct { + size_t data_len; + u8 digest[SHA384_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0x38, 0xb0, 0x60, 0xa7, 0x51, 0xac, 0x96, 0x38, + 0x4c, 0xd9, 0x32, 0x7e, 0xb1, 0xb1, 0xe3, 0x6a, + 0x21, 0xfd, 0xb7, 0x11, 0x14, 0xbe, 0x07, 0x43, + 0x4c, 0x0c, 0xc7, 0xbf, 0x63, 0xf6, 0xe1, 0xda, + 0x27, 0x4e, 0xde, 0xbf, 0xe7, 0x6f, 0x65, 0xfb, + 0xd5, 0x1a, 0xd2, 0xf1, 0x48, 0x98, 0xb9, 0x5b, + }, + }, + { + .data_len = 1, + .digest = { + 0x07, 0x34, 0x9d, 0x74, 0x48, 0x76, 0xa5, 0x72, + 0x78, 0x02, 0xb8, 0x6e, 0x21, 0x59, 0xb0, 0x75, + 0x09, 0x68, 0x11, 0x39, 0x53, 0x61, 0xee, 0x8d, + 0xf2, 0x01, 0xf3, 0x90, 0x53, 0x7c, 0xd3, 0xde, + 0x13, 0x9f, 0xd2, 0x74, 0x28, 0xfe, 0xe1, 0xc8, + 0x2e, 0x95, 0xc6, 0x7d, 0x69, 0x4d, 0x04, 0xc6, + }, + }, + { + .data_len = 2, + .digest = { + 0xc4, 0xef, 0x6e, 0x8c, 0x19, 0x1c, 0xaa, 0x0e, + 0x86, 0xf2, 0x68, 0xa1, 0xa0, 0x2d, 0x2e, 0xb2, + 0x84, 0xbc, 0x5d, 0x53, 0x31, 0xf8, 0x03, 0x75, + 0x56, 0xf4, 0x8b, 0x23, 0x1a, 0x68, 0x15, 0x9a, + 0x60, 0xb2, 0xec, 0x05, 0xe1, 0xd4, 0x5e, 0x9e, + 0xe8, 0x7c, 0x9d, 0xe4, 0x0f, 0x9c, 0x3a, 0xdd, + }, + }, + { + .data_len = 3, + .digest = { + 0x29, 0xd2, 0x02, 0xa2, 0x77, 0x24, 0xc7, 0xa7, + 0x23, 0x0c, 0x3e, 0x30, 0x56, 0x47, 0xdb, 0x75, + 0xd4, 0x41, 0xf8, 0xb3, 0x8e, 0x26, 0xf6, 0x92, + 0xbc, 0x20, 0x2e, 0x96, 0xcc, 0x81, 0x5f, 0x32, + 0x82, 0x60, 0xe2, 0xcf, 0x23, 0xd7, 0x3c, 0x90, + 0xb2, 0x56, 0x8f, 0xb6, 0x0f, 0xf0, 0x6b, 0x80, + }, + }, + { + .data_len = 16, + .digest = { + 0x21, 0x4c, 0xac, 0xfe, 0xbd, 0x40, 0x74, 0x1f, + 0xa2, 0x2d, 0x2f, 0x35, 0x91, 0xfd, 0xc9, 0x97, + 0x88, 0x12, 0x6c, 0x0c, 0x6e, 0xd8, 0x50, 0x0b, + 0x4b, 0x2c, 0x89, 0xa6, 0xa6, 0x4a, 0xad, 0xd7, + 0x72, 0x62, 0x2c, 0x62, 0x81, 0xcd, 0x24, 0x74, + 0xf5, 0x44, 0x05, 0xa0, 0x97, 0xea, 0xf1, 0x78, + }, + }, + { 
+ .data_len = 32, + .digest = { + 0x06, 0x8b, 0x92, 0x9f, 0x8b, 0x64, 0xb2, 0x80, + 0xde, 0xcc, 0xde, 0xc3, 0x2f, 0x22, 0x27, 0xe8, + 0x3b, 0x6e, 0x16, 0x21, 0x14, 0x81, 0xbe, 0x5b, + 0xa7, 0xa7, 0x14, 0x8a, 0x00, 0x8f, 0x0d, 0x38, + 0x11, 0x63, 0xe8, 0x3e, 0xb9, 0xf1, 0xcf, 0x87, + 0xb1, 0x28, 0xe5, 0xa1, 0x89, 0xa8, 0x7a, 0xde, + }, + }, + { + .data_len = 48, + .digest = { + 0x9e, 0x37, 0x76, 0x62, 0x98, 0x39, 0xbe, 0xfd, + 0x2b, 0x91, 0x20, 0x54, 0x8f, 0x21, 0xe7, 0x30, + 0x0a, 0x01, 0x7a, 0x65, 0x0b, 0xc9, 0xb3, 0x89, + 0x3c, 0xb6, 0xd3, 0xa8, 0xff, 0xc9, 0x1b, 0x5c, + 0xd4, 0xac, 0xb4, 0x7e, 0xba, 0x94, 0xc3, 0x8a, + 0x26, 0x41, 0xf6, 0xd5, 0xed, 0x6f, 0x27, 0xa7, + }, + }, + { + .data_len = 49, + .digest = { + 0x03, 0x1f, 0xef, 0x5a, 0x16, 0x28, 0x78, 0x10, + 0x29, 0xe8, 0xe2, 0xe4, 0x84, 0x36, 0x19, 0x10, + 0xaa, 0xea, 0xde, 0x06, 0x39, 0x5f, 0xb2, 0x36, + 0xca, 0x24, 0x4f, 0x7b, 0x66, 0xf7, 0xe7, 0x31, + 0xf3, 0x9b, 0x74, 0x1e, 0x17, 0x20, 0x88, 0x62, + 0x50, 0xeb, 0x5f, 0x9a, 0xa7, 0x2c, 0xf4, 0xc9, + }, + }, + { + .data_len = 63, + .digest = { + 0x10, 0xce, 0xed, 0x26, 0xb8, 0xac, 0xc1, 0x1b, + 0xe6, 0xb9, 0xeb, 0x7c, 0xae, 0xcd, 0x55, 0x5a, + 0x20, 0x2a, 0x7b, 0x43, 0xe6, 0x3e, 0xf0, 0x3f, + 0xd9, 0x2f, 0x8c, 0x52, 0xe2, 0xf0, 0xb6, 0x24, + 0x2e, 0xa4, 0xac, 0x24, 0x3a, 0x54, 0x99, 0x71, + 0x65, 0xab, 0x97, 0x2d, 0xb6, 0xe6, 0x94, 0x20, + }, + }, + { + .data_len = 64, + .digest = { + 0x24, 0x6d, 0x9f, 0x59, 0x42, 0x36, 0xca, 0x34, + 0x36, 0x41, 0xa2, 0xcd, 0x69, 0xdf, 0x3d, 0xcb, + 0x64, 0x94, 0x54, 0xb2, 0xed, 0xc1, 0x1c, 0x31, + 0xe3, 0x26, 0xcb, 0x71, 0xe6, 0x98, 0xb2, 0x56, + 0x74, 0x30, 0xa9, 0x15, 0x98, 0x9d, 0xb3, 0x07, + 0xcc, 0xa8, 0xcc, 0x6f, 0x42, 0xb0, 0x9d, 0x2b, + }, + }, + { + .data_len = 65, + .digest = { + 0x85, 0x1f, 0xbc, 0x5e, 0x2a, 0x00, 0x7d, 0xc2, + 0x21, 0x4c, 0x28, 0x14, 0xc5, 0xd8, 0x0c, 0xe8, + 0x55, 0xa5, 0xa0, 0x77, 0xda, 0x8f, 0xce, 0xd4, + 0xf0, 0xcb, 0x30, 0xb8, 0x9c, 0x47, 0xe1, 0x33, + 0x92, 0x18, 0xc5, 0x1f, 0xf2, 0xef, 0xb5, 0xe5, + 0xbc, 0x63, 0xa6, 0xe5, 0x9a, 0xc9, 0xcc, 0xf1, + }, + }, + { + .data_len = 127, + .digest = { + 0x26, 0xd2, 0x4c, 0xb6, 0xce, 0xd8, 0x22, 0x2b, + 0x44, 0x10, 0x6f, 0x59, 0xf7, 0x0d, 0xb9, 0x3f, + 0x7d, 0x29, 0x75, 0xf1, 0x71, 0xb2, 0x71, 0x23, + 0xef, 0x68, 0xb7, 0x25, 0xae, 0xb8, 0x45, 0xf8, + 0xa3, 0xb2, 0x2d, 0x7a, 0x83, 0x0a, 0x05, 0x61, + 0xbc, 0x73, 0xf1, 0xf9, 0xba, 0xfb, 0x3d, 0xc2, + }, + }, + { + .data_len = 128, + .digest = { + 0x7c, 0xe5, 0x7f, 0x5e, 0xea, 0xd9, 0x7e, 0x54, + 0x14, 0x30, 0x6f, 0x37, 0x02, 0x71, 0x0f, 0xf1, + 0x14, 0x16, 0xfa, 0xeb, 0x6e, 0x1e, 0xf0, 0xbe, + 0x10, 0xed, 0x01, 0xbf, 0xa0, 0x9d, 0xcb, 0x07, + 0x5f, 0x8b, 0x7f, 0x44, 0xe1, 0xd9, 0x13, 0xf0, + 0x29, 0xa2, 0x54, 0x32, 0xd9, 0xb0, 0x69, 0x69, + }, + }, + { + .data_len = 129, + .digest = { + 0xc5, 0x54, 0x1f, 0xcb, 0x9d, 0x8f, 0xdf, 0xbf, + 0xab, 0x55, 0x92, 0x1d, 0x3b, 0x93, 0x79, 0x26, + 0xdf, 0xba, 0x9a, 0x28, 0xff, 0xa0, 0x6c, 0xae, + 0x7b, 0x53, 0x8d, 0xfa, 0xef, 0x35, 0x88, 0x19, + 0x16, 0xb8, 0x72, 0x86, 0x76, 0x2a, 0xf5, 0xe6, + 0xec, 0xb2, 0xd7, 0xd4, 0xbe, 0x1a, 0xe4, 0x9f, + }, + }, + { + .data_len = 256, + .digest = { + 0x74, 0x9d, 0x77, 0xfb, 0xe8, 0x0f, 0x0c, 0x2d, + 0x86, 0x0d, 0x49, 0xea, 0x2b, 0xd0, 0x13, 0xd1, + 0xe8, 0xb8, 0xe1, 0xa3, 0x7b, 0x48, 0xab, 0x6a, + 0x21, 0x2b, 0x4c, 0x48, 0x32, 0xb5, 0xdc, 0x31, + 0x7f, 0xd0, 0x32, 0x67, 0x9a, 0xc0, 0x85, 0x53, + 0xef, 0xe9, 0xfb, 0xe1, 0x8b, 0xd8, 0xcc, 0xc2, + }, + }, + { + .data_len = 511, + .digest = { + 0x7b, 0xa9, 0xde, 0xa3, 0x07, 0x5c, 0x4c, 0xaa, + 0x31, 
0xc6, 0x9e, 0x55, 0xd4, 0x3f, 0x52, 0xdd, + 0xde, 0x36, 0x70, 0x96, 0x59, 0x6e, 0x90, 0x78, + 0x4c, 0x6a, 0x27, 0xde, 0x83, 0x84, 0xc3, 0x35, + 0x53, 0x76, 0x1d, 0xbf, 0x83, 0x64, 0xcf, 0xf2, + 0xb0, 0x3e, 0x07, 0x27, 0xe4, 0x25, 0x6c, 0x56, + }, + }, + { + .data_len = 513, + .digest = { + 0x53, 0x50, 0xf7, 0x3b, 0x86, 0x1d, 0x7a, 0xe2, + 0x5d, 0x9b, 0x71, 0xfa, 0x25, 0x23, 0x5a, 0xfe, + 0x8c, 0xb9, 0xac, 0x8a, 0x9d, 0x6c, 0x99, 0xbc, + 0x01, 0x9e, 0xa0, 0xd6, 0x3c, 0x03, 0x46, 0x21, + 0xb6, 0xd0, 0xb0, 0xb3, 0x23, 0x23, 0x58, 0xf1, + 0xea, 0x4e, 0xf2, 0x1a, 0x2f, 0x14, 0x2b, 0x5a, + }, + }, + { + .data_len = 1000, + .digest = { + 0x06, 0x03, 0xb3, 0xba, 0x14, 0xe0, 0x28, 0x07, + 0xd5, 0x15, 0x97, 0x1f, 0x87, 0xef, 0x80, 0xba, + 0x48, 0x03, 0xb6, 0xc5, 0x47, 0xca, 0x8c, 0x95, + 0xed, 0x95, 0xfd, 0x27, 0xb6, 0x83, 0xda, 0x6d, + 0xa7, 0xb2, 0x1a, 0xd2, 0xb5, 0x89, 0xbb, 0xb4, + 0x00, 0xbc, 0x86, 0x54, 0x7d, 0x5a, 0x91, 0x63, + }, + }, + { + .data_len = 3333, + .digest = { + 0xd3, 0xe0, 0x6e, 0x7d, 0x80, 0x08, 0x53, 0x07, + 0x8c, 0x0f, 0xc2, 0xce, 0x9f, 0x09, 0x86, 0x31, + 0x28, 0x24, 0x3c, 0x3e, 0x2d, 0x36, 0xb4, 0x28, + 0xc7, 0x1b, 0x70, 0xf9, 0x35, 0x9b, 0x10, 0xfa, + 0xc8, 0x5e, 0x2b, 0x32, 0x7f, 0x65, 0xd2, 0x68, + 0xb2, 0x84, 0x90, 0xf6, 0xc8, 0x6e, 0xb8, 0xdb, + }, + }, + { + .data_len = 4096, + .digest = { + 0x39, 0xeb, 0xc4, 0xb3, 0x08, 0xe2, 0xdd, 0xf3, + 0x9f, 0x5e, 0x44, 0x93, 0x63, 0x8b, 0x39, 0x57, + 0xd7, 0xe8, 0x7e, 0x3d, 0x74, 0xf8, 0xf6, 0xab, + 0xfe, 0x74, 0x51, 0xe4, 0x1b, 0x4a, 0x23, 0xbc, + 0x69, 0xfc, 0xbb, 0xa7, 0x71, 0xa7, 0x86, 0x24, + 0xcc, 0x85, 0x70, 0xf2, 0x31, 0x0d, 0x47, 0xc0, + }, + }, + { + .data_len = 4128, + .digest = { + 0x23, 0xc3, 0x97, 0x06, 0x79, 0xbe, 0x8a, 0xe9, + 0x1f, 0x1a, 0x43, 0xad, 0xe6, 0x76, 0x23, 0x13, + 0x64, 0xae, 0xda, 0xe7, 0x8b, 0x88, 0x96, 0xb6, + 0xa9, 0x1a, 0xb7, 0x80, 0x8e, 0x1c, 0x94, 0x98, + 0x09, 0x08, 0xdb, 0x8e, 0x4d, 0x0a, 0x09, 0x65, + 0xe5, 0x21, 0x1c, 0xd9, 0xab, 0x64, 0xbb, 0xea, + }, + }, + { + .data_len = 4160, + .digest = { + 0x4f, 0x4a, 0x88, 0x9f, 0x40, 0x89, 0xfe, 0xb6, + 0xda, 0x9d, 0xcd, 0xa5, 0x27, 0xd2, 0x29, 0x71, + 0x58, 0x60, 0xd4, 0x55, 0xfe, 0x92, 0xcd, 0x51, + 0x8b, 0xec, 0x3b, 0xd3, 0xd1, 0x3e, 0x8d, 0x36, + 0x7b, 0xb1, 0x41, 0xef, 0xec, 0x9d, 0xdf, 0xcd, + 0x4e, 0xde, 0x5a, 0xe5, 0xe5, 0x16, 0x14, 0x54, + }, + }, + { + .data_len = 4224, + .digest = { + 0xb5, 0xa5, 0x3e, 0x86, 0x39, 0x20, 0x49, 0x4c, + 0xcd, 0xb6, 0xdd, 0x03, 0xfe, 0x36, 0x6e, 0xa6, + 0xfc, 0xff, 0x19, 0x33, 0x0c, 0x52, 0xea, 0x37, + 0x94, 0xda, 0x5b, 0x27, 0xd1, 0x99, 0x5a, 0x89, + 0x40, 0x78, 0xfa, 0x96, 0xb9, 0x2f, 0xa0, 0x48, + 0xc9, 0xf8, 0x5c, 0xf0, 0x95, 0xf4, 0xea, 0x61, + }, + }, + { + .data_len = 16384, + .digest = { + 0x6f, 0x48, 0x6f, 0x21, 0xb9, 0xc1, 0xcc, 0x92, + 0x4e, 0xed, 0x6b, 0xef, 0x51, 0x88, 0xdf, 0xfd, + 0xcb, 0x3d, 0x44, 0x9c, 0x37, 0x85, 0xb4, 0xc5, + 0xeb, 0x60, 0x55, 0x58, 0x01, 0x47, 0xbf, 0x75, + 0x9b, 0xa8, 0x82, 0x8c, 0xec, 0xe8, 0x0e, 0x58, + 0xc1, 0x26, 0xa2, 0x45, 0x87, 0x3e, 0xfb, 0x8d, + }, + }, +}; + +static const u8 hash_testvec_consolidated[SHA384_DIGEST_SIZE] = { + 0xfc, 0xcb, 0xe6, 0x42, 0xf0, 0x9e, 0x2b, 0x77, + 0x7b, 0x62, 0xe8, 0x70, 0x86, 0xbf, 0xaf, 0x3c, + 0xbb, 0x02, 0xd9, 0x86, 0xdc, 0xba, 0xd3, 0xa4, + 0x0d, 0x8d, 0xb3, 0x2d, 0x0b, 0xa3, 0x84, 0x04, + 0x7c, 0x16, 0x37, 0xaf, 0xba, 0x1e, 0xd4, 0x2f, + 0x4c, 0x57, 0x55, 0x86, 0x52, 0x47, 0x9a, 0xec, +}; + +static const u8 hmac_testvec_consolidated[SHA384_DIGEST_SIZE] = { + 0x82, 0xcf, 0x7d, 0x80, 0x71, 0xdb, 0x91, 0x09, + 0x67, 0xe8, 0x44, 
0x4a, 0x0d, 0x03, 0xb1, 0xf9, + 0x62, 0xde, 0x4e, 0xbb, 0x1f, 0x41, 0xcd, 0x62, + 0x39, 0x6b, 0x2d, 0x44, 0x0e, 0xde, 0x98, 0x73, + 0xdd, 0xeb, 0x9d, 0x53, 0xfb, 0xee, 0xd1, 0xc3, + 0x96, 0xdb, 0xfc, 0x2a, 0x38, 0x90, 0x02, 0x53, +}; diff --git a/lib/crypto/tests/sha384_kunit.c b/lib/crypto/tests/sha384_kunit.c new file mode 100644 index 000000000000..e1ef5c995bb6 --- /dev/null +++ b/lib/crypto/tests/sha384_kunit.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include <crypto/sha2.h> +#include "sha384-testvecs.h" + +#define HASH sha384 +#define HASH_CTX sha384_ctx +#define HASH_SIZE SHA384_DIGEST_SIZE +#define HASH_INIT sha384_init +#define HASH_UPDATE sha384_update +#define HASH_FINAL sha384_final +#define HMAC_KEY hmac_sha384_key +#define HMAC_CTX hmac_sha384_ctx +#define HMAC_PREPAREKEY hmac_sha384_preparekey +#define HMAC_INIT hmac_sha384_init +#define HMAC_UPDATE hmac_sha384_update +#define HMAC_FINAL hmac_sha384_final +#define HMAC hmac_sha384 +#define HMAC_USINGRAWKEY hmac_sha384_usingrawkey +#include "hash-test-template.h" + +static struct kunit_case hash_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite hash_test_suite = { + .name = "sha384", + .test_cases = hash_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(hash_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-384 and HMAC-SHA384"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/tests/sha512-testvecs.h b/lib/crypto/tests/sha512-testvecs.h new file mode 100644 index 000000000000..c506aaf72ba7 --- /dev/null +++ b/lib/crypto/tests/sha512-testvecs.h @@ -0,0 +1,342 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha512 */ + +static const struct { + size_t data_len; + u8 digest[SHA512_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0xcf, 0x83, 0xe1, 0x35, 0x7e, 0xef, 0xb8, 0xbd, + 0xf1, 0x54, 0x28, 0x50, 0xd6, 0x6d, 0x80, 0x07, + 0xd6, 0x20, 0xe4, 0x05, 0x0b, 0x57, 0x15, 0xdc, + 0x83, 0xf4, 0xa9, 0x21, 0xd3, 0x6c, 0xe9, 0xce, + 0x47, 0xd0, 0xd1, 0x3c, 0x5d, 0x85, 0xf2, 0xb0, + 0xff, 0x83, 0x18, 0xd2, 0x87, 0x7e, 0xec, 0x2f, + 0x63, 0xb9, 0x31, 0xbd, 0x47, 0x41, 0x7a, 0x81, + 0xa5, 0x38, 0x32, 0x7a, 0xf9, 0x27, 0xda, 0x3e, + }, + }, + { + .data_len = 1, + .digest = { + 0x12, 0xf2, 0xb6, 0xec, 0x84, 0xa0, 0x8e, 0xcf, + 0x0d, 0xec, 0x17, 0xfd, 0x1f, 0x91, 0x15, 0x69, + 0xd2, 0xb9, 0x89, 0xff, 0x89, 0x9d, 0xd9, 0x0b, + 0x7a, 0x0f, 0x82, 0x94, 0x57, 0x5b, 0xf3, 0x08, + 0x42, 0x45, 0x23, 0x08, 0x44, 0x54, 0x35, 0x36, + 0xed, 0x4b, 0xb3, 0xa5, 0xbf, 0x17, 0xc1, 0x3c, + 0xdd, 0x25, 0x4a, 0x30, 0x64, 0xed, 0x66, 0x06, + 0x72, 0x05, 0xc2, 0x71, 0x5a, 0x6c, 0xd0, 0x75, + }, + }, + { + .data_len = 2, + .digest = { + 0x01, 0x37, 0x97, 0xcc, 0x0a, 0xcb, 0x61, 0xa4, + 0x93, 0x26, 0x36, 0x4b, 0xd2, 0x27, 0xea, 0xaf, + 0xda, 0xfa, 0x8f, 0x86, 0x12, 0x99, 0x7b, 0xc8, + 0x94, 0xa9, 0x1c, 0x70, 0x3f, 0x43, 0xf3, 0x9a, + 0x02, 0xc5, 0x0d, 0x8e, 0x01, 0xf8, 0x3a, 0xa6, + 0xec, 0xaa, 0xc5, 0xc7, 0x9d, 0x3d, 0x7f, 0x9d, + 0x47, 0x0e, 0x58, 0x2d, 0x9a, 0x2d, 0x51, 0x1d, + 0xc3, 0x77, 0xb2, 0x7f, 0x69, 0x9a, 0xc3, 0x50, + }, + }, + { + .data_len = 3, + .digest = { + 0xd4, 0xa3, 0xc2, 0x50, 0xa0, 0x33, 0xc6, 0xe4, + 0x50, 0x08, 0xea, 0xc9, 0xb8, 0x35, 0x55, 0x34, + 0x61, 0xb8, 0x2e, 0xa2, 0xe5, 0xdc, 0x04, 0x70, + 0x99, 0x86, 0x5f, 0xee, 0x2e, 0x1e, 0xd4, 0x40, + 0xf8, 0x88, 
0x01, 0x84, 0x97, 0x16, 0x6a, 0xd3, + 0x0a, 0xb4, 0xe2, 0x34, 0xca, 0x1f, 0xfc, 0x6b, + 0x61, 0xf3, 0x7f, 0x72, 0xfa, 0x8d, 0x22, 0xcf, + 0x7f, 0x2d, 0x87, 0x9d, 0xbb, 0x59, 0xac, 0x53, + }, + }, + { + .data_len = 16, + .digest = { + 0x3b, 0xae, 0x66, 0x48, 0x0f, 0x35, 0x3c, 0xcd, + 0x92, 0xa9, 0xb6, 0xb9, 0xfe, 0x19, 0x90, 0x92, + 0x52, 0xa5, 0x02, 0xd1, 0x89, 0xcf, 0x98, 0x86, + 0x29, 0x28, 0xab, 0xc4, 0x9e, 0xcc, 0x75, 0x38, + 0x95, 0xa7, 0x59, 0x43, 0xef, 0x8c, 0x3a, 0xeb, + 0x40, 0xa3, 0xbe, 0x2b, 0x75, 0x0d, 0xfd, 0xc3, + 0xaf, 0x69, 0x08, 0xad, 0x9f, 0xc9, 0xf4, 0x96, + 0xa9, 0xc2, 0x2b, 0x1b, 0x66, 0x6f, 0x1d, 0x28, + }, + }, + { + .data_len = 32, + .digest = { + 0x9c, 0xfb, 0x3c, 0x40, 0xd5, 0x3b, 0xc4, 0xff, + 0x07, 0xa7, 0xf0, 0x24, 0xb7, 0xd6, 0x5e, 0x12, + 0x5b, 0x85, 0xb5, 0xa5, 0xe0, 0x82, 0xa6, 0xda, + 0x30, 0x13, 0x2f, 0x1a, 0xe3, 0xd0, 0x55, 0xcb, + 0x14, 0x19, 0xe2, 0x09, 0x91, 0x96, 0x26, 0xf9, + 0x38, 0xd7, 0xfa, 0x4a, 0xfb, 0x2f, 0x6f, 0xc0, + 0xf4, 0x95, 0xc3, 0x40, 0xf6, 0xdb, 0xe7, 0xc2, + 0x79, 0x23, 0xa4, 0x20, 0x96, 0x3a, 0x00, 0xbb, + }, + }, + { + .data_len = 48, + .digest = { + 0x92, 0x1a, 0x21, 0x06, 0x6e, 0x08, 0x84, 0x09, + 0x23, 0x8d, 0x63, 0xec, 0xd6, 0x72, 0xd3, 0x21, + 0x51, 0xe8, 0x65, 0x94, 0xf8, 0x1f, 0x5f, 0xa7, + 0xab, 0x6b, 0xae, 0x1c, 0x2c, 0xaf, 0xf9, 0x0c, + 0x7c, 0x5a, 0x74, 0x1d, 0x90, 0x26, 0x4a, 0xc3, + 0xa1, 0x60, 0xf4, 0x1d, 0xd5, 0x3c, 0x86, 0xe8, + 0x00, 0xb3, 0x99, 0x27, 0xb8, 0x9d, 0x3e, 0x17, + 0x32, 0x5a, 0x34, 0x3e, 0xc2, 0xb2, 0x6e, 0xbd, + }, + }, + { + .data_len = 49, + .digest = { + 0x5a, 0x1f, 0x40, 0x5f, 0xee, 0xf2, 0xdd, 0x67, + 0x01, 0xcb, 0x26, 0x58, 0xf5, 0x1b, 0xe8, 0x7e, + 0xeb, 0x7d, 0x9d, 0xef, 0xd3, 0x55, 0xd6, 0x89, + 0x2e, 0xfc, 0x14, 0xe2, 0x98, 0x4c, 0x31, 0xaa, + 0x69, 0x00, 0xf9, 0x4e, 0xb0, 0x75, 0x1b, 0x71, + 0x93, 0x60, 0xdf, 0xa1, 0xaf, 0xba, 0xc2, 0xd3, + 0x6a, 0x22, 0xa0, 0xff, 0xb5, 0x66, 0x15, 0x66, + 0xd2, 0x24, 0x9a, 0x7e, 0xe4, 0xe5, 0x84, 0xdb, + }, + }, + { + .data_len = 63, + .digest = { + 0x32, 0xbd, 0xcf, 0x72, 0xa9, 0x74, 0x87, 0xe6, + 0x2a, 0x53, 0x7e, 0x6d, 0xac, 0xc2, 0xdd, 0x2c, + 0x87, 0xb3, 0xf7, 0x90, 0x29, 0xc9, 0x16, 0x59, + 0xd2, 0x7e, 0x6e, 0x84, 0x1d, 0xa6, 0xaf, 0x3c, + 0xca, 0xd6, 0x1a, 0x24, 0xa4, 0xcd, 0x59, 0x44, + 0x20, 0xd7, 0xd2, 0x5b, 0x97, 0xda, 0xd5, 0xa9, + 0x23, 0xb1, 0xa4, 0x60, 0xb8, 0x05, 0x98, 0xdc, + 0xef, 0x89, 0x81, 0xe3, 0x3a, 0xf9, 0x24, 0x37, + }, + }, + { + .data_len = 64, + .digest = { + 0x96, 0x3a, 0x1a, 0xdd, 0x1b, 0xeb, 0x1a, 0x55, + 0x24, 0x52, 0x3d, 0xec, 0x9d, 0x52, 0x2e, 0xa6, + 0xfe, 0x81, 0xd6, 0x98, 0xac, 0xcc, 0x60, 0x56, + 0x04, 0x9d, 0xa3, 0xf3, 0x56, 0x05, 0xe4, 0x8a, + 0x61, 0xaf, 0x6f, 0x6e, 0x8e, 0x75, 0x67, 0x3a, + 0xd2, 0xb0, 0x85, 0x2d, 0x17, 0xd2, 0x86, 0x8c, + 0x50, 0x4b, 0xdd, 0xef, 0x35, 0x00, 0xde, 0x29, + 0x3d, 0x4b, 0x04, 0x12, 0x8a, 0x81, 0xe2, 0xcc, + }, + }, + { + .data_len = 65, + .digest = { + 0x9c, 0x6e, 0xf0, 0x6f, 0x71, 0x77, 0xd5, 0xd0, + 0xbb, 0x70, 0x1f, 0xcb, 0xbd, 0xd3, 0xfe, 0x23, + 0x71, 0x78, 0xad, 0x3a, 0xd2, 0x1e, 0x34, 0xf4, + 0x6d, 0xb4, 0xa2, 0x0a, 0x24, 0xcb, 0xe1, 0x99, + 0x07, 0xd0, 0x79, 0x8f, 0x7e, 0x69, 0x31, 0x68, + 0x29, 0xb5, 0x85, 0x82, 0x67, 0xdc, 0x4a, 0x8d, + 0x44, 0x04, 0x02, 0xc0, 0xfb, 0xd2, 0x19, 0x66, + 0x1e, 0x25, 0x8b, 0xd2, 0x5a, 0x59, 0x68, 0xc0, + }, + }, + { + .data_len = 127, + .digest = { + 0xb8, 0x8f, 0xa8, 0x29, 0x4d, 0xcf, 0x5f, 0x73, + 0x3c, 0x55, 0x43, 0xd9, 0x1c, 0xbc, 0x0c, 0x17, + 0x75, 0x0b, 0xc7, 0xb1, 0x1d, 0x9f, 0x7b, 0x2f, + 0x4c, 0x3d, 0x2a, 0x71, 0xfb, 0x1b, 0x0e, 0xca, 
+ 0x4e, 0x96, 0xa0, 0x95, 0xee, 0xf4, 0x93, 0x76, + 0x36, 0xfb, 0x5d, 0xd3, 0x46, 0xc4, 0x1d, 0x41, + 0x32, 0x92, 0x9d, 0xed, 0xdb, 0x7f, 0xfa, 0xb3, + 0x91, 0x61, 0x3e, 0xd6, 0xb2, 0xca, 0x8d, 0x81, + }, + }, + { + .data_len = 128, + .digest = { + 0x54, 0xac, 0x1a, 0xa1, 0xa6, 0xa3, 0x47, 0x2a, + 0x5a, 0xac, 0x1a, 0x3a, 0x4b, 0xa1, 0x11, 0x08, + 0xa7, 0x90, 0xec, 0x52, 0xcb, 0xaf, 0x68, 0x41, + 0x38, 0x44, 0x53, 0x94, 0x93, 0x30, 0xaf, 0x3a, + 0xec, 0x99, 0x3a, 0x7d, 0x2a, 0xd5, 0xb6, 0x05, + 0xf5, 0xa6, 0xbb, 0x9b, 0x82, 0xc2, 0xbd, 0x98, + 0x28, 0x62, 0x98, 0x3e, 0xe4, 0x27, 0x9b, 0xaa, + 0xce, 0x0a, 0x6f, 0xab, 0x1b, 0x16, 0xf3, 0xdd, + }, + }, + { + .data_len = 129, + .digest = { + 0x04, 0x37, 0x60, 0xbc, 0xb3, 0xb1, 0xc6, 0x2d, + 0x42, 0xc5, 0xd7, 0x7e, 0xd9, 0x86, 0x82, 0xe0, + 0xf4, 0x62, 0xad, 0x75, 0x68, 0x0b, 0xc7, 0xa8, + 0xd6, 0x9a, 0x76, 0xe5, 0x29, 0xb8, 0x37, 0x30, + 0x0f, 0xc0, 0xbc, 0x81, 0x94, 0x7c, 0x13, 0xf4, + 0x9c, 0x27, 0xbc, 0x59, 0xa1, 0x70, 0x6a, 0x87, + 0x20, 0x12, 0x0a, 0x2a, 0x62, 0x5e, 0x6f, 0xca, + 0x91, 0x6b, 0x34, 0x7e, 0x4c, 0x0d, 0xf0, 0x6c, + }, + }, + { + .data_len = 256, + .digest = { + 0x4b, 0x7c, 0x1f, 0x53, 0x52, 0xcc, 0x30, 0xed, + 0x91, 0x44, 0x6f, 0x0d, 0xb5, 0x41, 0x79, 0x99, + 0xaf, 0x82, 0x65, 0x52, 0x03, 0xf8, 0x55, 0x74, + 0x7c, 0xd9, 0x41, 0xd6, 0xe8, 0x91, 0xa4, 0x85, + 0xcb, 0x0a, 0x60, 0x08, 0x76, 0x07, 0x60, 0x99, + 0x89, 0x76, 0xba, 0x84, 0xbd, 0x0b, 0xf2, 0xb3, + 0xdc, 0xf3, 0x33, 0xd1, 0x9b, 0x0b, 0x2e, 0x5d, + 0xf6, 0x9d, 0x0f, 0x67, 0xf4, 0x86, 0xb3, 0xd5, + }, + }, + { + .data_len = 511, + .digest = { + 0x7d, 0x83, 0x78, 0x6a, 0x5d, 0x52, 0x42, 0x2a, + 0xb1, 0x97, 0xc6, 0x62, 0xa2, 0x2a, 0x7c, 0x8b, + 0xcd, 0x4f, 0xa4, 0x86, 0x19, 0xa4, 0x5b, 0x1d, + 0xc7, 0x6f, 0x2f, 0x9c, 0x03, 0xc3, 0x45, 0x2e, + 0xa7, 0x8e, 0x38, 0xf2, 0x57, 0x55, 0x89, 0x47, + 0xed, 0xeb, 0x81, 0xe2, 0xe0, 0x55, 0x9f, 0xe6, + 0xca, 0x03, 0x59, 0xd3, 0xd4, 0xba, 0xc9, 0x2d, + 0xaf, 0xbb, 0x62, 0x2e, 0xe6, 0x89, 0xe4, 0x11, + }, + }, + { + .data_len = 513, + .digest = { + 0xe9, 0x14, 0xe7, 0x01, 0xd0, 0x81, 0x09, 0x51, + 0x78, 0x1c, 0x8e, 0x6c, 0x00, 0xd3, 0x28, 0xa0, + 0x2a, 0x7b, 0xd6, 0x25, 0xca, 0xd0, 0xf9, 0xb8, + 0xd8, 0xcf, 0xd0, 0xb7, 0x48, 0x25, 0xb7, 0x6a, + 0x53, 0x8e, 0xf8, 0x52, 0x9c, 0x1f, 0x7d, 0xae, + 0x4c, 0x22, 0xd5, 0x9d, 0xf0, 0xaf, 0x98, 0x91, + 0x19, 0x1f, 0x99, 0xbd, 0xa6, 0xc2, 0x0f, 0x05, + 0xa5, 0x9f, 0x3e, 0x87, 0xed, 0xc3, 0xab, 0x92, + }, + }, + { + .data_len = 1000, + .digest = { + 0x2e, 0xf4, 0x72, 0xd2, 0xd9, 0x4a, 0xd5, 0xf9, + 0x20, 0x03, 0x4a, 0xad, 0xed, 0xa9, 0x1b, 0x64, + 0x73, 0x38, 0xc6, 0x30, 0xa8, 0x7f, 0xf9, 0x3b, + 0x8c, 0xbc, 0xa1, 0x2d, 0x22, 0x7b, 0x84, 0x37, + 0xf5, 0xba, 0xee, 0xf0, 0x80, 0x9d, 0xe3, 0x82, + 0xbd, 0x07, 0x68, 0x15, 0x01, 0x22, 0xf6, 0x88, + 0x07, 0x0b, 0xfd, 0xb7, 0xb1, 0xc0, 0x68, 0x4b, + 0x8d, 0x05, 0xec, 0xfb, 0xcd, 0xde, 0xa4, 0x2a, + }, + }, + { + .data_len = 3333, + .digest = { + 0x73, 0xe3, 0xe5, 0x87, 0x01, 0x0a, 0x29, 0x4d, + 0xad, 0x92, 0x67, 0x64, 0xc7, 0x71, 0x0b, 0x22, + 0x80, 0x8a, 0x6e, 0x8b, 0x20, 0x73, 0xb2, 0xd7, + 0x98, 0x20, 0x35, 0x42, 0x42, 0x5d, 0x85, 0x12, + 0xb0, 0x06, 0x69, 0x63, 0x5f, 0x5b, 0xe7, 0x63, + 0x6f, 0xe6, 0x18, 0xa6, 0xc1, 0xa6, 0xae, 0x27, + 0xa7, 0x6a, 0x73, 0x6b, 0x27, 0xd5, 0x47, 0xe1, + 0xa2, 0x7d, 0xe4, 0x0d, 0xbd, 0x23, 0x7b, 0x7a, + }, + }, + { + .data_len = 4096, + .digest = { + 0x11, 0x5b, 0x77, 0x36, 0x6b, 0x3b, 0xe4, 0x42, + 0xe4, 0x92, 0x23, 0xcb, 0x0c, 0x06, 0xff, 0xb7, + 0x0c, 0x71, 0x64, 0xd9, 0x8a, 0x57, 0x75, 0x7b, + 0xa2, 0xd2, 0x17, 0x19, 
0xbb, 0xb5, 0x3c, 0xb3, + 0x5f, 0xae, 0x35, 0x75, 0x8e, 0xa8, 0x97, 0x43, + 0xce, 0xfe, 0x41, 0x84, 0xfe, 0xcb, 0x18, 0x70, + 0x68, 0x2e, 0x16, 0x19, 0xd5, 0x10, 0x0d, 0x2f, + 0x61, 0x87, 0x79, 0xee, 0x5f, 0x24, 0xdd, 0x76, + }, + }, + { + .data_len = 4128, + .digest = { + 0x9e, 0x96, 0xe1, 0x0a, 0xb2, 0xd5, 0xba, 0xcf, + 0x27, 0xba, 0x6f, 0x85, 0xe9, 0xbf, 0x96, 0xb9, + 0x5a, 0x00, 0x00, 0x06, 0xdc, 0xb7, 0xaf, 0x0a, + 0x8f, 0x1d, 0x31, 0xf6, 0xce, 0xc3, 0x50, 0x2e, + 0x61, 0x3a, 0x8b, 0x28, 0xaf, 0xb2, 0x50, 0x0d, + 0x00, 0x98, 0x02, 0x11, 0x6b, 0xfa, 0x51, 0xc1, + 0xde, 0xe1, 0x34, 0x9f, 0xda, 0x11, 0x63, 0xfa, + 0x0a, 0xa0, 0xa2, 0x67, 0x39, 0xeb, 0x9b, 0xf1, + }, + }, + { + .data_len = 4160, + .digest = { + 0x46, 0x4e, 0x81, 0xd1, 0x08, 0x2a, 0x46, 0x12, + 0x4e, 0xae, 0x1f, 0x5d, 0x57, 0xe5, 0x19, 0xbc, + 0x76, 0x38, 0xb6, 0xa7, 0xe3, 0x72, 0x6d, 0xaf, + 0x80, 0x3b, 0xd0, 0xbc, 0x06, 0xe8, 0xab, 0xab, + 0x86, 0x4b, 0x0b, 0x7a, 0x61, 0xa6, 0x13, 0xff, + 0x64, 0x47, 0x89, 0xb7, 0x63, 0x8a, 0xa5, 0x4c, + 0x9f, 0x52, 0x70, 0xeb, 0x21, 0xe5, 0x2d, 0xe9, + 0xe7, 0xab, 0x1c, 0x0e, 0x74, 0xf5, 0x72, 0xec, + }, + }, + { + .data_len = 4224, + .digest = { + 0xfa, 0x6e, 0xff, 0x3c, 0xc1, 0x98, 0x49, 0x42, + 0x34, 0x67, 0xd4, 0xd3, 0xfa, 0xae, 0x27, 0xe4, + 0x77, 0x11, 0x84, 0xd2, 0x57, 0x99, 0xf8, 0xfd, + 0x41, 0x50, 0x84, 0x80, 0x7f, 0xf7, 0xb2, 0xd3, + 0x88, 0x21, 0x9c, 0xe8, 0xb9, 0x05, 0xd3, 0x48, + 0x64, 0xc5, 0xb7, 0x29, 0xd9, 0x21, 0x17, 0xad, + 0x89, 0x9c, 0x79, 0x55, 0x51, 0x0b, 0x96, 0x3e, + 0x10, 0x40, 0xe1, 0xdd, 0x7b, 0x39, 0x40, 0x86, + }, + }, + { + .data_len = 16384, + .digest = { + 0x41, 0xb3, 0xd2, 0x93, 0xcd, 0x79, 0x84, 0xc2, + 0xf5, 0xea, 0xf3, 0xb3, 0x94, 0x23, 0xaa, 0x76, + 0x87, 0x5f, 0xe3, 0xd2, 0x03, 0xd8, 0x00, 0xbb, + 0xa1, 0x55, 0xe4, 0xcb, 0x16, 0x04, 0x5b, 0xdf, + 0xf8, 0xd2, 0x63, 0x51, 0x02, 0x22, 0xc6, 0x0f, + 0x98, 0x2b, 0x12, 0x52, 0x25, 0x64, 0x93, 0xd9, + 0xab, 0xe9, 0x4d, 0x16, 0x4b, 0xf6, 0x09, 0x83, + 0x5c, 0x63, 0x1c, 0x41, 0x19, 0xf6, 0x76, 0xe3, + }, + }, +}; + +static const u8 hash_testvec_consolidated[SHA512_DIGEST_SIZE] = { + 0x5b, 0x9d, 0xf9, 0xab, 0x8c, 0x8e, 0x52, 0xdb, + 0x02, 0xa0, 0x4c, 0x24, 0x2d, 0xc4, 0xa8, 0x4e, + 0x9c, 0x93, 0x2f, 0x72, 0xa8, 0x75, 0xfb, 0xb5, + 0xdb, 0xef, 0x52, 0xc6, 0xa3, 0xfe, 0xeb, 0x6b, + 0x92, 0x79, 0x18, 0x05, 0xf6, 0xd7, 0xaf, 0x7b, + 0x36, 0xfc, 0x83, 0x2c, 0x7e, 0x7b, 0x59, 0x8b, + 0xf9, 0x81, 0xaa, 0x98, 0x38, 0x11, 0x97, 0x56, + 0x34, 0xe5, 0x2a, 0x4b, 0xf2, 0x9e, 0xf3, 0xdb, +}; + +static const u8 hmac_testvec_consolidated[SHA512_DIGEST_SIZE] = { + 0x40, 0xe7, 0xbc, 0x03, 0xdf, 0x22, 0xd4, 0x76, + 0x66, 0x45, 0xf8, 0x1d, 0x25, 0xdf, 0xbe, 0xa2, + 0x93, 0x06, 0x8c, 0x1d, 0x14, 0x23, 0x9b, 0x5c, + 0xfa, 0xac, 0xdf, 0xbd, 0xa2, 0x24, 0xe5, 0xf7, + 0xdc, 0xf7, 0xae, 0x96, 0xc1, 0x34, 0xe5, 0x24, + 0x16, 0x24, 0xdc, 0xee, 0x4f, 0x62, 0x1c, 0x67, + 0x4e, 0x02, 0x31, 0x4b, 0x9b, 0x65, 0x25, 0xeb, + 0x32, 0x2e, 0x24, 0xfb, 0xcd, 0x2b, 0x59, 0xd8, +}; diff --git a/lib/crypto/tests/sha512_kunit.c b/lib/crypto/tests/sha512_kunit.c new file mode 100644 index 000000000000..8923e2d7d3d4 --- /dev/null +++ b/lib/crypto/tests/sha512_kunit.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include <crypto/sha2.h> +#include "sha512-testvecs.h" + +#define HASH sha512 +#define HASH_CTX sha512_ctx +#define HASH_SIZE SHA512_DIGEST_SIZE +#define HASH_INIT sha512_init +#define HASH_UPDATE sha512_update +#define HASH_FINAL sha512_final +#define HMAC_KEY hmac_sha512_key +#define 
HMAC_CTX hmac_sha512_ctx +#define HMAC_PREPAREKEY hmac_sha512_preparekey +#define HMAC_INIT hmac_sha512_init +#define HMAC_UPDATE hmac_sha512_update +#define HMAC_FINAL hmac_sha512_final +#define HMAC hmac_sha512 +#define HMAC_USINGRAWKEY hmac_sha512_usingrawkey +#include "hash-test-template.h" + +static struct kunit_case hash_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite hash_test_suite = { + .name = "sha512", + .test_cases = hash_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(hash_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-512 and HMAC-SHA512"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/utils.c b/lib/crypto/utils.c index 87da2a6dd161..dec381d5e906 100644 --- a/lib/crypto/utils.c +++ b/lib/crypto/utils.c @@ -5,9 +5,10 @@ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ -#include <linux/unaligned.h> #include <crypto/utils.h> +#include <linux/export.h> #include <linux/module.h> +#include <linux/unaligned.h> /* * XOR @len bytes from @src1 and @src2 together, writing the result to @dst diff --git a/lib/crypto/x86/.gitignore b/lib/crypto/x86/.gitignore new file mode 100644 index 000000000000..580c839bb177 --- /dev/null +++ b/lib/crypto/x86/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +poly1305-x86_64-cryptogams.S diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig new file mode 100644 index 000000000000..546fe2afe0b5 --- /dev/null +++ b/lib/crypto/x86/Kconfig @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_BLAKE2S_X86 + bool "Hash functions: BLAKE2s (SSSE3/AVX-512)" + depends on 64BIT + select CRYPTO_LIB_BLAKE2S_GENERIC + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S + help + BLAKE2s cryptographic hash function (RFC 7693) + + Architecture: x86_64 using: + - SSSE3 (Supplemental SSE3) + - AVX-512 (Advanced Vector Extensions-512) + +config CRYPTO_CHACHA20_X86_64 + tristate + depends on 64BIT + default CRYPTO_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_X86_64 + tristate + depends on 64BIT + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/x86/Makefile b/lib/crypto/x86/Makefile new file mode 100644 index 000000000000..c2ff8c5f1046 --- /dev/null +++ b/lib/crypto/x86/Makefile @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o +libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o + +obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o +chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o + +obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o +poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o +targets += poly1305-x86_64-cryptogams.S + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $< > $@ + +$(obj)/%.S: $(src)/%.pl FORCE + $(call if_changed,perlasm) diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S new file mode 100644 index 000000000000..ac1c845445a4 --- /dev/null +++ b/lib/crypto/x86/blake2s-core.S @@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. 
+ */ + +#include <linux/linkage.h> + +.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32 +.align 32 +IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 + .octa 0x5BE0CD191F83D9AB9B05688C510E527F +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 +ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 +.section .rodata.cst16.ROR328, "aM", @progbits, 16 +.align 16 +ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 +.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 +.align 64 +SIGMA: +.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 +.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 +.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0 +.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8 +.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14 +.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2 +.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 +.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 +.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 +.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 +.align 64 +SIGMA2: +.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 +.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 +.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 +.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 +.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 +.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 +.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 +.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 +.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 + +.text +SYM_FUNC_START(blake2s_compress_ssse3) + testq %rdx,%rdx + je .Lendofloop + movdqu (%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqa ROT16(%rip),%xmm12 + movdqa ROR328(%rip),%xmm13 + movdqu 0x20(%rdi),%xmm14 + movq %rcx,%xmm15 + leaq SIGMA+0xa0(%rip),%r8 + jmp .Lbeginofloop + .align 32 +.Lbeginofloop: + movdqa %xmm0,%xmm10 + movdqa %xmm1,%xmm11 + paddq %xmm15,%xmm14 + movdqa IV(%rip),%xmm2 + movdqa %xmm14,%xmm3 + pxor IV+0x10(%rip),%xmm3 + leaq SIGMA(%rip),%rcx +.Lroundloop: + movzbl (%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0x1(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0x2(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x3(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + punpckldq %xmm5,%xmm4 + punpckldq %xmm7,%xmm6 + punpcklqdq %xmm6,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movzbl 0x4(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0x5(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x6(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0x7(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + punpckldq %xmm6,%xmm5 + punpckldq %xmm4,%xmm7 + punpcklqdq %xmm7,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movzbl 0x8(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x9(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0xa(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0xb(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + punpckldq %xmm7,%xmm6 + punpckldq %xmm5,%xmm4 + 
punpcklqdq %xmm4,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movzbl 0xc(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0xd(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0xe(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0xf(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + punpckldq %xmm4,%xmm7 + punpckldq %xmm6,%xmm5 + punpcklqdq %xmm5,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + addq $0x10,%rcx + cmpq %r8,%rcx + jnz .Lroundloop + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm1 + pxor %xmm10,%xmm0 + pxor %xmm11,%xmm1 + addq $0x40,%rsi + decq %rdx + jnz .Lbeginofloop + movdqu %xmm0,(%rdi) + movdqu %xmm1,0x10(%rdi) + movdqu %xmm14,0x20(%rdi) +.Lendofloop: + RET +SYM_FUNC_END(blake2s_compress_ssse3) + +SYM_FUNC_START(blake2s_compress_avx512) + vmovdqu (%rdi),%xmm0 + vmovdqu 0x10(%rdi),%xmm1 + vmovdqu 0x20(%rdi),%xmm4 + vmovq %rcx,%xmm5 + vmovdqa IV(%rip),%xmm14 + vmovdqa IV+16(%rip),%xmm15 + jmp .Lblake2s_compress_avx512_mainloop +.align 32 +.Lblake2s_compress_avx512_mainloop: + vmovdqa %xmm0,%xmm10 + vmovdqa %xmm1,%xmm11 + vpaddq %xmm5,%xmm4,%xmm4 + vmovdqa %xmm14,%xmm2 + vpxor %xmm15,%xmm4,%xmm3 + vmovdqu (%rsi),%ymm6 + vmovdqu 0x20(%rsi),%ymm7 + addq $0x40,%rsi + leaq SIGMA2(%rip),%rax + movb $0xa,%cl +.Lblake2s_compress_avx512_roundloop: + addq $0x40,%rax + vmovdqa -0x40(%rax),%ymm8 + vmovdqa -0x20(%rax),%ymm9 + vpermi2d %ymm7,%ymm6,%ymm8 + vpermi2d %ymm7,%ymm6,%ymm9 + vmovdqa %ymm8,%ymm6 + vmovdqa %ymm9,%ymm7 + vpaddd %xmm8,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x10,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0xc,%xmm1,%xmm1 + vextracti128 $0x1,%ymm8,%xmm8 + vpaddd %xmm8,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x8,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0x7,%xmm1,%xmm1 + vpshufd $0x93,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x39,%xmm2,%xmm2 + vpaddd %xmm9,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x10,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0xc,%xmm1,%xmm1 + vextracti128 $0x1,%ymm9,%xmm9 + vpaddd %xmm9,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x8,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0x7,%xmm1,%xmm1 + vpshufd $0x39,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x93,%xmm2,%xmm2 + decb %cl + jne .Lblake2s_compress_avx512_roundloop + vpxor %xmm10,%xmm0,%xmm0 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm2,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + decq %rdx + jne .Lblake2s_compress_avx512_mainloop + vmovdqu %xmm0,(%rdi) + vmovdqu %xmm1,0x10(%rdi) + vmovdqu %xmm4,0x20(%rdi) + vzeroupper + RET +SYM_FUNC_END(blake2s_compress_avx512) diff --git a/lib/crypto/x86/blake2s-glue.c b/lib/crypto/x86/blake2s-glue.c new file mode 100644 index 000000000000..adc296cd17c9 --- /dev/null +++ b/lib/crypto/x86/blake2s-glue.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
+ */ + +#include <asm/cpufeature.h> +#include <asm/fpu/api.h> +#include <asm/processor.h> +#include <asm/simd.h> +#include <crypto/internal/blake2s.h> +#include <linux/init.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/sizes.h> + +asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); +asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); + +void blake2s_compress(struct blake2s_state *state, const u8 *block, + size_t nblocks, const u32 inc) +{ + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); + + if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { + blake2s_compress_generic(state, block, nblocks, inc); + return; + } + + do { + const size_t blocks = min_t(size_t, nblocks, + SZ_4K / BLAKE2S_BLOCK_SIZE); + + kernel_fpu_begin(); + if (static_branch_likely(&blake2s_use_avx512)) + blake2s_compress_avx512(state, block, blocks, inc); + else + blake2s_compress_ssse3(state, block, blocks, inc); + kernel_fpu_end(); + + nblocks -= blocks; + block += blocks * BLAKE2S_BLOCK_SIZE; + } while (nblocks); +} +EXPORT_SYMBOL(blake2s_compress); + +static int __init blake2s_mod_init(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + static_branch_enable(&blake2s_use_ssse3); + + if (boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, NULL)) + static_branch_enable(&blake2s_use_avx512); + + return 0; +} + +subsys_initcall(blake2s_mod_init); diff --git a/lib/crypto/x86/chacha-avx2-x86_64.S b/lib/crypto/x86/chacha-avx2-x86_64.S new file mode 100644 index 000000000000..f3d8fc018249 --- /dev/null +++ b/lib/crypto/x86/chacha-avx2-x86_64.S @@ -0,0 +1,1021 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ChaCha 256-bit cipher algorithm, x64 AVX2 functions + * + * Copyright (C) 2015 Martin Willi + */ + +#include <linux/linkage.h> + +.section .rodata.cst32.ROT8, "aM", @progbits, 32 +.align 32 +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 + .octa 0x0e0d0c0f0a09080b0605040702010003 + +.section .rodata.cst32.ROT16, "aM", @progbits, 32 +.align 32 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 + .octa 0x0d0c0f0e09080b0a0504070601000302 + +.section .rodata.cst32.CTRINC, "aM", @progbits, 32 +.align 32 +CTRINC: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + +.text + +SYM_FUNC_START(chacha_2block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts two ChaCha blocks by loading the state + # matrix twice across four AVX registers. 
It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. + + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + + vmovdqa ROT8(%rip),%ymm4 + vmovdqa ROT16(%rip),%ymm5 + + mov %rcx,%rax + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + sub $2,%r8d + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rax + jl .Lxorpart2 + vpxor 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rax + jl .Lxorpart2 + vpxor 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rax + jl .Lxorpart2 + vpxor 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rax + jl .Lxorpart2 + vpxor 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rax + jl .Lxorpart2 + vpxor 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rax + jl .Lxorpart2 + vpxor 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rax + jl .Lxorpart2 + vpxor 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rax + jl .Lxorpart2 + vpxor 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + +.Ldone2: + vzeroupper + RET + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and 
$0x0f,%r9 + jz .Ldone2 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm7,%xmm7 + vmovdqa %xmm7,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone2 + +SYM_FUNC_END(chacha_2block_xor_avx2) + +SYM_FUNC_START(chacha_4block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four ChaCha blocks by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. + + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + + vmovdqa ROT8(%rip),%ymm8 + vmovdqa ROT16(%rip),%ymm9 + + mov %rcx,%rax + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd 
%ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + sub $2,%r8d + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rax + jl .Lxorpart4 + vpxor 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rax + jl .Lxorpart4 + vpxor 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rax + jl .Lxorpart4 + vpxor 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rax + jl .Lxorpart4 + vpxor 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rax + jl .Lxorpart4 + vpxor 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rax + jl .Lxorpart4 + vpxor 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rax + jl .Lxorpart4 + vpxor 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rax + jl .Lxorpart4 + vpxor 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rax + jl .Lxorpart4 + vpxor 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rax + jl .Lxorpart4 + vpxor 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rax + jl .Lxorpart4 + vpxor 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rax + jl .Lxorpart4 + vpxor 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rax + jl .Lxorpart4 + vpxor 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rax + jl .Lxorpart4 + vpxor 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rax + jl .Lxorpart4 + vpxor 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rax + jl .Lxorpart4 + vpxor 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + RET + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + 
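+	# %r10 = entry %rsp + 8; the sub/and below carve out a 32-byte-aligned
+	# scratch slot through which the trailing partial block is copied,
+	# xored with the keystream held in %xmm10, and copied back to the
+	# output, after which "lea -8(%r10),%rsp" restores the stack pointer.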
sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm10,%xmm10 + vmovdqa %xmm10,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone4 + +SYM_FUNC_END(chacha_4block_xor_avx2) + +SYM_FUNC_START(chacha_8block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts eight consecutive ChaCha blocks by loading + # the state matrix in AVX registers eight times. As we need some + # scratch registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32-, 64- and then 128-bit + # words, which allows us to do XOR in AVX registers. 8/16-bit word + # rotation is done with the slightly better performing byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + vzeroupper + # 4 * 32 byte stack, 32-byte aligned + lea 8(%rsp),%r10 + and $~31, %rsp + sub $0x80, %rsp + mov %rcx,%rax + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + # x0..3 on stack + vmovdqa %ymm0,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm3,0x60(%rsp) + + vmovdqa CTRINC(%rip),%ymm1 + vmovdqa ROT8(%rip),%ymm2 + vmovdqa ROT16(%rip),%ymm3 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 
+ vpshufb %ymm2,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + 
vpxor %ymm8,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + sub $2,%r8d + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpaddd 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpbroadcastd 0x04(%rdi),%ymm0 + vpaddd 0x20(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpbroadcastd 0x08(%rdi),%ymm0 + vpaddd 0x40(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpbroadcastd 0x0c(%rdi),%ymm0 + vpaddd 0x60(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpbroadcastd 0x10(%rdi),%ymm0 + vpaddd %ymm0,%ymm4,%ymm4 + vpbroadcastd 0x14(%rdi),%ymm0 + vpaddd %ymm0,%ymm5,%ymm5 + vpbroadcastd 0x18(%rdi),%ymm0 + vpaddd %ymm0,%ymm6,%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm0 + vpaddd %ymm0,%ymm7,%ymm7 + vpbroadcastd 0x20(%rdi),%ymm0 + vpaddd %ymm0,%ymm8,%ymm8 + vpbroadcastd 0x24(%rdi),%ymm0 + vpaddd %ymm0,%ymm9,%ymm9 + vpbroadcastd 0x28(%rdi),%ymm0 + vpaddd %ymm0,%ymm10,%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm0 + vpaddd %ymm0,%ymm11,%ymm11 + vpbroadcastd 0x30(%rdi),%ymm0 + vpaddd %ymm0,%ymm12,%ymm12 + vpbroadcastd 0x34(%rdi),%ymm0 + vpaddd %ymm0,%ymm13,%ymm13 + vpbroadcastd 0x38(%rdi),%ymm0 + vpaddd %ymm0,%ymm14,%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm0 + vpaddd %ymm0,%ymm15,%ymm15 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + + # interleave 32-bit words in state n, n+1 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x20(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa 0x40(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm1,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpckldq %ymm5,%ymm0,%ymm4 + vpunpckhdq %ymm5,%ymm0,%ymm5 + vmovdqa %ymm6,%ymm0 + vpunpckldq %ymm7,%ymm0,%ymm6 + vpunpckhdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpckldq %ymm9,%ymm0,%ymm8 + vpunpckhdq %ymm9,%ymm0,%ymm9 + vmovdqa %ymm10,%ymm0 + vpunpckldq %ymm11,%ymm0,%ymm10 + vpunpckhdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpckldq %ymm13,%ymm0,%ymm12 + vpunpckhdq %ymm13,%ymm0,%ymm13 + vmovdqa %ymm14,%ymm0 + vpunpckldq %ymm15,%ymm0,%ymm14 + vpunpckhdq %ymm15,%ymm0,%ymm15 + + # interleave 64-bit words in state n, n+2 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x40(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x00(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa 0x20(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpcklqdq %ymm6,%ymm0,%ymm4 + vpunpckhqdq %ymm6,%ymm0,%ymm6 + vmovdqa %ymm5,%ymm0 + vpunpcklqdq %ymm7,%ymm0,%ymm5 + vpunpckhqdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpcklqdq %ymm10,%ymm0,%ymm8 + vpunpckhqdq %ymm10,%ymm0,%ymm10 + vmovdqa %ymm9,%ymm0 + vpunpcklqdq %ymm11,%ymm0,%ymm9 + vpunpckhqdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpcklqdq %ymm14,%ymm0,%ymm12 + vpunpckhqdq %ymm14,%ymm0,%ymm14 + vmovdqa %ymm13,%ymm0 + vpunpcklqdq %ymm15,%ymm0,%ymm13 + vpunpckhqdq %ymm15,%ymm0,%ymm15 + + # interleave 128-bit words in state n, n+4 + # xor/write first four blocks + vmovdqa 0x00(%rsp),%ymm1 + vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 + cmp $0x0020,%rax + jl .Lxorpart8 + vpxor 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0000(%rsi) + 
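+	# Each $0x20 permute combines the low 128-bit lanes of its two sources
+	# (a 32-byte chunk belonging to blocks 0-3); the matching $0x31 keeps
+	# the high lanes, which hold the corresponding chunk of blocks 4-7 and
+	# are xored and written out further below.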
vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 + + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rax + jl .Lxorpart8 + vpxor 0x0020(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0020(%rsi) + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + + vmovdqa 0x40(%rsp),%ymm1 + vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 + cmp $0x0060,%rax + jl .Lxorpart8 + vpxor 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 + + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rax + jl .Lxorpart8 + vpxor 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0060(%rsi) + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + + vmovdqa 0x20(%rsp),%ymm1 + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rax + jl .Lxorpart8 + vpxor 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0080(%rsi) + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rax + jl .Lxorpart8 + vpxor 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vmovdqa 0x60(%rsp),%ymm1 + vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 + cmp $0x00e0,%rax + jl .Lxorpart8 + vpxor 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00c0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rax + jl .Lxorpart8 + vpxor 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa %ymm4,%ymm0 + cmp $0x0120,%rax + jl .Lxorpart8 + vpxor 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0100(%rsi) + + vmovdqa %ymm12,%ymm0 + cmp $0x0140,%rax + jl .Lxorpart8 + vpxor 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0120(%rsi) + + vmovdqa %ymm6,%ymm0 + cmp $0x0160,%rax + jl .Lxorpart8 + vpxor 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0140(%rsi) + + vmovdqa %ymm14,%ymm0 + cmp $0x0180,%rax + jl .Lxorpart8 + vpxor 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0160(%rsi) + + vmovdqa %ymm5,%ymm0 + cmp $0x01a0,%rax + jl .Lxorpart8 + vpxor 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0180(%rsi) + + vmovdqa %ymm13,%ymm0 + cmp $0x01c0,%rax + jl .Lxorpart8 + vpxor 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01a0(%rsi) + + vmovdqa %ymm7,%ymm0 + cmp $0x01e0,%rax + jl .Lxorpart8 + vpxor 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01c0(%rsi) + + vmovdqa %ymm15,%ymm0 + cmp $0x0200,%rax + jl .Lxorpart8 + vpxor 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01e0(%rsi) + +.Ldone8: + vzeroupper + lea -8(%r10),%rsp + RET + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x1f,%r9 + jz .Ldone8 + and $~0x1f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone8 + +SYM_FUNC_END(chacha_8block_xor_avx2) diff --git a/lib/crypto/x86/chacha-avx512vl-x86_64.S b/lib/crypto/x86/chacha-avx512vl-x86_64.S new file mode 100644 index 000000000000..259383e1ad44 --- /dev/null +++ b/lib/crypto/x86/chacha-avx512vl-x86_64.S @@ -0,0 +1,836 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions + * + * Copyright (C) 2018 Martin Willi + */ + +#include <linux/linkage.h> + +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + +.section 
.rodata.cst32.CTR8BL, "aM", @progbits, 32 +.align 32 +CTR8BL: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.text + +SYM_FUNC_START(chacha_2block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts two ChaCha blocks by loading the state + # matrix twice across four AVX registers. It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. + + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + sub $2,%r8d + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rcx + jl .Lxorpart2 + vpxord 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rcx + jl .Lxorpart2 + vpxord 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rcx + jl .Lxorpart2 + vpxord 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rcx + jl .Lxorpart2 + vpxord 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rcx + jl .Lxorpart2 + vpxord 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rcx + jl .Lxorpart2 + vpxord 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rcx + jl .Lxorpart2 + vpxord 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rcx + jl .Lxorpart2 + vpxord 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + 
+.Ldone2: + vzeroupper + RET + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone2 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm7,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone2 + +SYM_FUNC_END(chacha_2block_xor_avx512vl) + +SYM_FUNC_START(chacha_4block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four ChaCha blocks by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. + + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 
+ vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + sub $2,%r8d + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rcx + jl .Lxorpart4 + vpxord 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rcx + jl .Lxorpart4 + vpxord 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rcx + jl .Lxorpart4 + vpxord 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rcx + jl .Lxorpart4 + vpxord 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rcx + jl .Lxorpart4 + vpxord 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rcx + jl .Lxorpart4 + vpxord 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rcx + jl .Lxorpart4 + vpxord 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rcx + jl .Lxorpart4 + vpxord 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rcx + jl .Lxorpart4 + vpxord 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rcx + jl .Lxorpart4 + vpxord 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rcx + jl .Lxorpart4 + vpxord 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rcx + jl .Lxorpart4 + vpxord 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rcx + jl .Lxorpart4 + vpxord 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rcx + jl .Lxorpart4 + vpxord 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rcx + jl .Lxorpart4 + vpxord 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rcx + jl .Lxorpart4 + vpxord 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + RET + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone4 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm10,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone4 + +SYM_FUNC_END(chacha_4block_xor_avx512vl) + +SYM_FUNC_START(chacha_8block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts eight consecutive ChaCha blocks by loading + # the state matrix in AVX registers eight times. Compared to AVX2, this + # mostly benefits from the new rotate instructions in VL and the + # additional registers. 
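+	#
+	# For reference, each vpaddd/vpxord/vprold triple below carries out one
+	# line of the scalar ChaCha quarter-round
+	#	a += b; d = rol32(d ^ a, 16);
+	#	c += d; b = rol32(b ^ c, 12);
+	#	a += b; d = rol32(d ^ a, 8);
+	#	c += d; b = rol32(b ^ c, 7);
+	# on the corresponding state word of all eight blocks at once.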
+ + vzeroupper + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + + # x12 += counter values 0-3 + vpaddd CTR8BL(%rip),%ymm12,%ymm12 + + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm1,%ymm17 + vmovdqa64 %ymm2,%ymm18 + vmovdqa64 %ymm3,%ymm19 + vmovdqa64 %ymm4,%ymm20 + vmovdqa64 %ymm5,%ymm21 + vmovdqa64 %ymm6,%ymm22 + vmovdqa64 %ymm7,%ymm23 + vmovdqa64 %ymm8,%ymm24 + vmovdqa64 %ymm9,%ymm25 + vmovdqa64 %ymm10,%ymm26 + vmovdqa64 %ymm11,%ymm27 + vmovdqa64 %ymm12,%ymm28 + vmovdqa64 %ymm13,%ymm29 + vmovdqa64 %ymm14,%ymm30 + vmovdqa64 %ymm15,%ymm31 + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ 
x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + + sub $2,%r8d + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpaddd %ymm16,%ymm0,%ymm0 + vpaddd %ymm17,%ymm1,%ymm1 + vpaddd %ymm18,%ymm2,%ymm2 + vpaddd %ymm19,%ymm3,%ymm3 + vpaddd %ymm20,%ymm4,%ymm4 + vpaddd %ymm21,%ymm5,%ymm5 + vpaddd %ymm22,%ymm6,%ymm6 + vpaddd %ymm23,%ymm7,%ymm7 + vpaddd %ymm24,%ymm8,%ymm8 + vpaddd %ymm25,%ymm9,%ymm9 + vpaddd %ymm26,%ymm10,%ymm10 + vpaddd %ymm27,%ymm11,%ymm11 + vpaddd %ymm28,%ymm12,%ymm12 + vpaddd %ymm29,%ymm13,%ymm13 + vpaddd %ymm30,%ymm14,%ymm14 + vpaddd %ymm31,%ymm15,%ymm15 + + # interleave 32-bit words in state n, n+1 + vpunpckldq %ymm1,%ymm0,%ymm16 + vpunpckhdq %ymm1,%ymm0,%ymm17 + vpunpckldq %ymm3,%ymm2,%ymm18 + vpunpckhdq %ymm3,%ymm2,%ymm19 + vpunpckldq %ymm5,%ymm4,%ymm20 + vpunpckhdq %ymm5,%ymm4,%ymm21 + vpunpckldq %ymm7,%ymm6,%ymm22 + vpunpckhdq %ymm7,%ymm6,%ymm23 + vpunpckldq %ymm9,%ymm8,%ymm24 + vpunpckhdq %ymm9,%ymm8,%ymm25 + vpunpckldq %ymm11,%ymm10,%ymm26 + vpunpckhdq %ymm11,%ymm10,%ymm27 + vpunpckldq %ymm13,%ymm12,%ymm28 + vpunpckhdq %ymm13,%ymm12,%ymm29 + vpunpckldq %ymm15,%ymm14,%ymm30 + vpunpckhdq %ymm15,%ymm14,%ymm31 + + # interleave 64-bit words in state n, n+2 + vpunpcklqdq %ymm18,%ymm16,%ymm0 + vpunpcklqdq %ymm19,%ymm17,%ymm1 + vpunpckhqdq %ymm18,%ymm16,%ymm2 + vpunpckhqdq %ymm19,%ymm17,%ymm3 + vpunpcklqdq %ymm22,%ymm20,%ymm4 + vpunpcklqdq %ymm23,%ymm21,%ymm5 + vpunpckhqdq %ymm22,%ymm20,%ymm6 + vpunpckhqdq %ymm23,%ymm21,%ymm7 + vpunpcklqdq %ymm26,%ymm24,%ymm8 + vpunpcklqdq %ymm27,%ymm25,%ymm9 + vpunpckhqdq %ymm26,%ymm24,%ymm10 + vpunpckhqdq %ymm27,%ymm25,%ymm11 + vpunpcklqdq %ymm30,%ymm28,%ymm12 + vpunpcklqdq %ymm31,%ymm29,%ymm13 + vpunpckhqdq %ymm30,%ymm28,%ymm14 + vpunpckhqdq %ymm31,%ymm29,%ymm15 + + # interleave 128-bit words in state n, n+4 + # xor/write first four blocks + vmovdqa64 %ymm0,%ymm16 + vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 + cmp $0x0020,%rcx + jl .Lxorpart8 + vpxord 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0000(%rsi) + vmovdqa64 %ymm16,%ymm0 + vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 + + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rcx + jl .Lxorpart8 + vpxord 0x0020(%rdx),%ymm0,%ymm0 + vmovdqu64 
%ymm0,0x0020(%rsi) + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + + vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 + cmp $0x0060,%rcx + jl .Lxorpart8 + vpxord 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 + + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rcx + jl .Lxorpart8 + vpxord 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0060(%rsi) + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rcx + jl .Lxorpart8 + vpxord 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0080(%rsi) + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rcx + jl .Lxorpart8 + vpxord 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 + cmp $0x00e0,%rcx + jl .Lxorpart8 + vpxord 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00c0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rcx + jl .Lxorpart8 + vpxord 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa64 %ymm4,%ymm0 + cmp $0x0120,%rcx + jl .Lxorpart8 + vpxord 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0100(%rsi) + + vmovdqa64 %ymm12,%ymm0 + cmp $0x0140,%rcx + jl .Lxorpart8 + vpxord 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0120(%rsi) + + vmovdqa64 %ymm6,%ymm0 + cmp $0x0160,%rcx + jl .Lxorpart8 + vpxord 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0140(%rsi) + + vmovdqa64 %ymm14,%ymm0 + cmp $0x0180,%rcx + jl .Lxorpart8 + vpxord 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0160(%rsi) + + vmovdqa64 %ymm5,%ymm0 + cmp $0x01a0,%rcx + jl .Lxorpart8 + vpxord 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0180(%rsi) + + vmovdqa64 %ymm13,%ymm0 + cmp $0x01c0,%rcx + jl .Lxorpart8 + vpxord 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01a0(%rsi) + + vmovdqa64 %ymm7,%ymm0 + cmp $0x01e0,%rcx + jl .Lxorpart8 + vpxord 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01c0(%rsi) + + vmovdqa64 %ymm15,%ymm0 + cmp $0x0200,%rcx + jl .Lxorpart8 + vpxord 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01e0(%rsi) + +.Ldone8: + vzeroupper + RET + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0x1f,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0x1f,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} + vpxord %ymm0,%ymm1,%ymm1 + vmovdqu8 %ymm1,(%rsi,%r9){%k1} + + jmp .Ldone8 + +SYM_FUNC_END(chacha_8block_xor_avx512vl) diff --git a/lib/crypto/x86/chacha-ssse3-x86_64.S b/lib/crypto/x86/chacha-ssse3-x86_64.S new file mode 100644 index 000000000000..7111949cd5b9 --- /dev/null +++ b/lib/crypto/x86/chacha-ssse3-x86_64.S @@ -0,0 +1,791 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +.section .rodata.cst16.ROT8, "aM", @progbits, 16 +.align 16 +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 +.section .rodata.cst16.CTRINC, "aM", @progbits, 16 +.align 16 +CTRINC: .octa 0x00000003000000020000000100000000 + +.text + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. 
This + * function performs matrix operations on four words in parallel, but requires + * shuffling to rearrange the words after each round. 8/16-bit word rotation is + * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word + * rotation uses traditional shift+OR. + * + * The round count is given in %r8d. + * + * Clobbers: %r8d, %xmm4-%xmm7 + */ +SYM_FUNC_START_LOCAL(chacha_permute) + + movdqa ROT8(%rip),%xmm4 + movdqa ROT16(%rip),%xmm5 + +.Ldoubleround: + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm3,%xmm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm3,%xmm3 + + sub $2,%r8d + jnz .Ldoubleround + + RET +SYM_FUNC_END(chacha_permute) + +SYM_FUNC_START(chacha_block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 1 data block output, o + # %rdx: up to 1 data block input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + FRAME_BEGIN + + # x0..3 = s0..3 + movdqu 0x00(%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqu 0x20(%rdi),%xmm2 + movdqu 0x30(%rdi),%xmm3 + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + movdqa %xmm2,%xmm10 + movdqa %xmm3,%xmm11 + + mov %rcx,%rax + call chacha_permute + + # o0 = i0 ^ (x0 + s0) + paddd %xmm8,%xmm0 + cmp $0x10,%rax + jl .Lxorpart + movdqu 0x00(%rdx),%xmm4 + pxor %xmm4,%xmm0 + movdqu %xmm0,0x00(%rsi) + # o1 = i1 ^ (x1 + s1) + paddd %xmm9,%xmm1 + movdqa %xmm1,%xmm0 + cmp $0x20,%rax + jl .Lxorpart + movdqu 0x10(%rdx),%xmm0 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x10(%rsi) + # o2 = i2 ^ (x2 + s2) + paddd %xmm10,%xmm2 + movdqa %xmm2,%xmm0 + cmp $0x30,%rax + jl .Lxorpart + movdqu 0x20(%rdx),%xmm0 + pxor %xmm2,%xmm0 + movdqu %xmm0,0x20(%rsi) + # o3 = i3 ^ (x3 + s3) + paddd %xmm11,%xmm3 + movdqa %xmm3,%xmm0 + cmp $0x40,%rax + jl .Lxorpart + movdqu 0x30(%rdx),%xmm0 + pxor %xmm3,%xmm0 + movdqu %xmm0,0x30(%rsi) + +.Ldone: + FRAME_END + RET + +.Lxorpart: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp 
+ jmp .Ldone + +SYM_FUNC_END(chacha_block_xor_ssse3) + +SYM_FUNC_START(hchacha_block_ssse3) + # %rdi: Input state matrix, s + # %rsi: output (8 32-bit words) + # %edx: nrounds + FRAME_BEGIN + + movdqu 0x00(%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqu 0x20(%rdi),%xmm2 + movdqu 0x30(%rdi),%xmm3 + + mov %edx,%r8d + call chacha_permute + + movdqu %xmm0,0x00(%rsi) + movdqu %xmm3,0x10(%rsi) + + FRAME_END + RET +SYM_FUNC_END(hchacha_block_ssse3) + +SYM_FUNC_START(chacha_4block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four consecutive ChaCha blocks by loading the + # the state matrix in SSE registers four times. As we need some scratch + # registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32- and then 64-bit words, + # which allows us to do XOR in SSE registers. 8/16-bit word rotation is + # done with the slightly better performing SSSE3 byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + lea 8(%rsp),%r10 + sub $0x80,%rsp + and $~63,%rsp + mov %rcx,%rax + + # x0..15[0-3] = s0..3[0..3] + movq 0x00(%rdi),%xmm1 + pshufd $0x00,%xmm1,%xmm0 + pshufd $0x55,%xmm1,%xmm1 + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + movq 0x10(%rdi),%xmm5 + pshufd $0x00,%xmm5,%xmm4 + pshufd $0x55,%xmm5,%xmm5 + movq 0x18(%rdi),%xmm7 + pshufd $0x00,%xmm7,%xmm6 + pshufd $0x55,%xmm7,%xmm7 + movq 0x20(%rdi),%xmm9 + pshufd $0x00,%xmm9,%xmm8 + pshufd $0x55,%xmm9,%xmm9 + movq 0x28(%rdi),%xmm11 + pshufd $0x00,%xmm11,%xmm10 + pshufd $0x55,%xmm11,%xmm11 + movq 0x30(%rdi),%xmm13 + pshufd $0x00,%xmm13,%xmm12 + pshufd $0x55,%xmm13,%xmm13 + movq 0x38(%rdi),%xmm15 + pshufd $0x00,%xmm15,%xmm14 + pshufd $0x55,%xmm15,%xmm15 + # x0..3 on stack + movdqa %xmm0,0x00(%rsp) + movdqa %xmm1,0x10(%rsp) + movdqa %xmm2,0x20(%rsp) + movdqa %xmm3,0x30(%rsp) + + movdqa CTRINC(%rip),%xmm1 + movdqa ROT8(%rip),%xmm2 + movdqa ROT16(%rip),%xmm3 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + +.Ldoubleround4: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + 
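The add/xor/rotate groups above and below are the vectorized form of the standard ChaCha quarter-round, applied to whole columns and then diagonals of four states at once. As a reference, a scalar C sketch of one double round; chacha_qr and chacha_doubleround are illustrative names, not the kernel's helpers:

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
        return (v << n) | (v >> (32 - n));
}

/* One ChaCha quarter-round on four 32-bit state words. */
static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
        *a += *b; *d = rotl32(*d ^ *a, 16);
        *c += *d; *b = rotl32(*b ^ *c, 12);
        *a += *b; *d = rotl32(*d ^ *a, 8);
        *c += *d; *b = rotl32(*b ^ *c, 7);
}

/* One double round: four column quarter-rounds, then four diagonal ones. */
static void chacha_doubleround(uint32_t x[16])
{
        chacha_qr(&x[0], &x[4], &x[8],  &x[12]);
        chacha_qr(&x[1], &x[5], &x[9],  &x[13]);
        chacha_qr(&x[2], &x[6], &x[10], &x[14]);
        chacha_qr(&x[3], &x[7], &x[11], &x[15]);
        chacha_qr(&x[0], &x[5], &x[10], &x[15]);
        chacha_qr(&x[1], &x[6], &x[11], &x[12]);
        chacha_qr(&x[2], &x[7], &x[8],  &x[13]);
        chacha_qr(&x[3], &x[4], &x[9],  &x[14]);
}

The assembly does the same arithmetic on four (SSSE3) or eight (AVX2/AVX-512VL) states in parallel, one vector register per state word, so no in-register word shuffling is needed until the final transpose.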
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld 
$25,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + + sub $2,%r8d + jnz .Ldoubleround4 + + # x0[0-3] += s0[0] + # x1[0-3] += s0[1] + movq 0x00(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x00(%rsp),%xmm2 + movdqa %xmm2,0x00(%rsp) + paddd 0x10(%rsp),%xmm3 + movdqa %xmm3,0x10(%rsp) + # x2[0-3] += s0[2] + # x3[0-3] += s0[3] + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x20(%rsp),%xmm2 + movdqa %xmm2,0x20(%rsp) + paddd 0x30(%rsp),%xmm3 + movdqa %xmm3,0x30(%rsp) + + # x4[0-3] += s1[0] + # x5[0-3] += s1[1] + movq 0x10(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + # x6[0-3] += s1[2] + # x7[0-3] += s1[3] + movq 0x18(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + + # x8[0-3] += s2[0] + # x9[0-3] += s2[1] + movq 0x20(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm8 + paddd %xmm3,%xmm9 + # x10[0-3] += s2[2] + # x11[0-3] += s2[3] + movq 0x28(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm10 + paddd %xmm3,%xmm11 + + # x12[0-3] += s3[0] + # x13[0-3] += s3[1] + movq 0x30(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm12 + paddd %xmm3,%xmm13 + # x14[0-3] += s3[2] + # x15[0-3] += s3[3] + movq 0x38(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm14 + paddd %xmm3,%xmm15 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + + # interleave 32-bit words in state n, n+1 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x10(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x10(%rsp) + movdqa 0x20(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x20(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpckldq %xmm5,%xmm4 + punpckhdq %xmm5,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm6,%xmm0 + punpckldq %xmm7,%xmm6 + punpckhdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm9,%xmm0 + movdqa %xmm0,%xmm9 + movdqa %xmm10,%xmm0 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpckldq %xmm13,%xmm12 + punpckhdq %xmm13,%xmm0 + movdqa %xmm0,%xmm13 + movdqa %xmm14,%xmm0 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # interleave 64-bit words in state n, n+2 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x20(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x20(%rsp) + movdqa 0x10(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x10(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpcklqdq %xmm6,%xmm4 + punpckhqdq %xmm6,%xmm0 + movdqa %xmm0,%xmm6 + movdqa %xmm5,%xmm0 + punpcklqdq %xmm7,%xmm5 + punpckhqdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpcklqdq %xmm10,%xmm8 + punpckhqdq %xmm10,%xmm0 + movdqa %xmm0,%xmm10 + movdqa %xmm9,%xmm0 + punpcklqdq %xmm11,%xmm9 + punpckhqdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpcklqdq %xmm14,%xmm12 + punpckhqdq %xmm14,%xmm0 + movdqa %xmm0,%xmm14 + movdqa %xmm13,%xmm0 + punpcklqdq 
%xmm15,%xmm13 + punpckhqdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # xor with corresponding input, write to output + movdqa 0x00(%rsp),%xmm0 + cmp $0x10,%rax + jl .Lxorpart4 + movdqu 0x00(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x00(%rsi) + + movdqu %xmm4,%xmm0 + cmp $0x20,%rax + jl .Lxorpart4 + movdqu 0x10(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x10(%rsi) + + movdqu %xmm8,%xmm0 + cmp $0x30,%rax + jl .Lxorpart4 + movdqu 0x20(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x20(%rsi) + + movdqu %xmm12,%xmm0 + cmp $0x40,%rax + jl .Lxorpart4 + movdqu 0x30(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x30(%rsi) + + movdqa 0x20(%rsp),%xmm0 + cmp $0x50,%rax + jl .Lxorpart4 + movdqu 0x40(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x40(%rsi) + + movdqu %xmm6,%xmm0 + cmp $0x60,%rax + jl .Lxorpart4 + movdqu 0x50(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x50(%rsi) + + movdqu %xmm10,%xmm0 + cmp $0x70,%rax + jl .Lxorpart4 + movdqu 0x60(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x60(%rsi) + + movdqu %xmm14,%xmm0 + cmp $0x80,%rax + jl .Lxorpart4 + movdqu 0x70(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x70(%rsi) + + movdqa 0x10(%rsp),%xmm0 + cmp $0x90,%rax + jl .Lxorpart4 + movdqu 0x80(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x80(%rsi) + + movdqu %xmm5,%xmm0 + cmp $0xa0,%rax + jl .Lxorpart4 + movdqu 0x90(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x90(%rsi) + + movdqu %xmm9,%xmm0 + cmp $0xb0,%rax + jl .Lxorpart4 + movdqu 0xa0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xa0(%rsi) + + movdqu %xmm13,%xmm0 + cmp $0xc0,%rax + jl .Lxorpart4 + movdqu 0xb0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xb0(%rsi) + + movdqa 0x30(%rsp),%xmm0 + cmp $0xd0,%rax + jl .Lxorpart4 + movdqu 0xc0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xc0(%rsi) + + movdqu %xmm7,%xmm0 + cmp $0xe0,%rax + jl .Lxorpart4 + movdqu 0xd0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xd0(%rsi) + + movdqu %xmm11,%xmm0 + cmp $0xf0,%rax + jl .Lxorpart4 + movdqu 0xe0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xe0(%rsi) + + movdqu %xmm15,%xmm0 + cmp $0x100,%rax + jl .Lxorpart4 + movdqu 0xf0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xf0(%rsi) + +.Ldone4: + lea -8(%r10),%rsp + RET + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone4 + +SYM_FUNC_END(chacha_4block_xor_ssse3) diff --git a/lib/crypto/x86/chacha_glue.c b/lib/crypto/x86/chacha_glue.c new file mode 100644 index 000000000000..10b2c945f541 --- /dev/null +++ b/lib/crypto/x86/chacha_glue.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ChaCha and HChaCha functions (x86_64 optimized) + * + * Copyright (C) 2015 Martin Willi + */ + +#include <asm/simd.h> +#include <crypto/chacha.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sizes.h> + +asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void hchacha_block_ssse3(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); + +asmlinkage void chacha_2block_xor_avx2(const struct 
chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); + +asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); + +static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) +{ + len = min(len, maxblocks * CHACHA_BLOCK_SIZE); + return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; +} + +static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (static_branch_likely(&chacha_use_avx512vl)) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state->x[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_2block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 2); + return; + } + } + + if (static_branch_likely(&chacha_use_avx2)) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state->x[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 2); + return; + } + } + + while (bytes >= CHACHA_BLOCK_SIZE * 4) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; + state->x[12] += 4; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); + state->x[12]++; + } +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + if (!static_branch_likely(&chacha_use_simd)) { + hchacha_block_generic(state, out, nrounds); + } else { + kernel_fpu_begin(); + hchacha_block_ssse3(state, out, nrounds); + 
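Worth noting from chacha_advance()/chacha_dosimd() above: when a tail shorter than the chosen routine's width is handed to a multi-block routine, the block counter in state->x[12] must still advance by the number of whole keystream blocks actually generated, i.e. the length rounded up to a block, capped at the routine's width. A small userspace mirror of that calculation, as a sketch (blocks_consumed is an illustrative name, not a kernel helper):

#include <stdio.h>

#define CHACHA_BLOCK_SIZE 64

static unsigned int blocks_consumed(unsigned int len, unsigned int maxblocks)
{
        if (len > maxblocks * CHACHA_BLOCK_SIZE)
                len = maxblocks * CHACHA_BLOCK_SIZE;
        return (len + CHACHA_BLOCK_SIZE - 1) / CHACHA_BLOCK_SIZE;
}

int main(void)
{
        printf("%u\n", blocks_consumed(3 * 64 + 10, 4));        /* 4: three blocks plus a tail */
        printf("%u\n", blocks_consumed(130, 8));                /* 3: short input, 8-block routine */
        return 0;
}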
kernel_fpu_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!static_branch_likely(&chacha_use_simd) || + bytes <= CHACHA_BLOCK_SIZE) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_fpu_begin(); + chacha_dosimd(state, dst, src, todo, nrounds); + kernel_fpu_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return static_key_enabled(&chacha_use_simd); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_simd_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_SSSE3)) + return 0; + + static_branch_enable(&chacha_use_simd); + + if (boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + static_branch_enable(&chacha_use_avx2); + + if (boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ + static_branch_enable(&chacha_use_avx512vl); + } + return 0; +} +subsys_initcall(chacha_simd_mod_init); + +static void __exit chacha_simd_mod_exit(void) +{ +} +module_exit(chacha_simd_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); +MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)"); diff --git a/lib/crypto/x86/poly1305-x86_64-cryptogams.pl b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl new file mode 100644 index 000000000000..501827254fed --- /dev/null +++ b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl @@ -0,0 +1,4253 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. +# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. +# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. +# +# This code is taken from the OpenSSL project but the author, Andy Polyakov, +# has relicensed it under the licenses specified in the SPDX header above. +# The original headers, including the original license headers, are +# included below for completeness. +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for x86_64. +# +# March 2015 +# +# Initial release. +# +# December 2016 +# +# Add AVX512F+VL+BW code path. +# +# November 2017 +# +# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be +# executed even on Knights Landing. Trigger for modification was +# observation that AVX512 code paths can negatively affect overall +# Skylake-X system performance. Since we are likely to suppress +# AVX512F capability flag [at least on Skylake-X], conversion serves +# as kind of "investment protection". Note that next *lake processor, +# Cannonlake, has AVX512IFMA code path to execute... +# +# Numbers are cycles per processed byte with poly1305_blocks alone, +# measured with rdtsc at fixed clock frequency. 
+# +# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 +# P4 4.46/+120% - +# Core 2 2.41/+90% - +# Westmere 1.88/+120% - +# Sandy Bridge 1.39/+140% 1.10 +# Haswell 1.14/+175% 1.11 0.65 +# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] +# Silvermont 2.83/+95% - +# Knights L 3.60/? 1.65 1.10 0.41(***) +# Goldmont 1.70/+180% - +# VIA Nano 1.82/+150% - +# Sledgehammer 1.38/+160% - +# Bulldozer 2.30/+130% 0.97 +# Ryzen 1.15/+200% 1.08 1.18 +# +# (*) improvement coefficients relative to clang are more modest and +# are ~50% on most processors, in both cases we are comparing to +# __int128 code; +# (**) SSE2 implementation was attempted, but among non-AVX processors +# it was faster than integer-only code only on older Intel P4 and +# Core processors, 50-30%, less newer processor is, but slower on +# contemporary ones, for example almost 2x slower on Atom, and as +# former are naturally disappearing, SSE2 is deemed unnecessary; +# (***) strangely enough performance seems to vary from core to core, +# listed result is best case; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); +$kernel=0; $kernel=1 if (!$flavour && !$output); + +if (!$kernel) { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or + die "can't locate x86_64-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; + + if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); + } + + if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { + $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); + $avx += 1 if ($1==2.11 && $2>=8); + } + + if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); + } + + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); + } +} else { + $avx = 4; # The kernel uses ifdefs for this. 
+} + +sub declare_function() { + my ($name, $align, $nargs) = @_; + if($kernel) { + $code .= "SYM_FUNC_START($name)\n"; + $code .= ".L$name:\n"; + } else { + $code .= ".globl $name\n"; + $code .= ".type $name,\@function,$nargs\n"; + $code .= ".align $align\n"; + $code .= "$name:\n"; + } +} + +sub declare_typed_function() { + my ($name, $align, $nargs) = @_; + if($kernel) { + $code .= "SYM_TYPED_FUNC_START($name)\n"; + $code .= ".L$name:\n"; + } else { + $code .= ".globl $name\n"; + $code .= ".type $name,\@function,$nargs\n"; + $code .= ".align $align\n"; + $code .= "$name:\n"; + } +} + +sub end_function() { + my ($name) = @_; + if($kernel) { + $code .= "SYM_FUNC_END($name)\n"; + } else { + $code .= ".size $name,.-$name\n"; + } +} + +$code.=<<___ if $kernel; +#include <linux/cfi_types.h> +___ + +if ($avx) { +$code.=<<___ if $kernel; +.section .rodata +___ +$code.=<<___; +.align 64 +.Lconst: +.Lmask24: +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +.L129: +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 +.Lmask26: +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +.Lpermd_avx2: +.long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.L2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +.L2_44_inp_shift: +.quad 0,12,24,64 +.L2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +.L2_44_shift_rgt: +.quad 44,44,42,64 +.L2_44_shift_lft: +.quad 8,8,10,64 + +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +___ +} +$code.=<<___ if (!$kernel); +.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align 16 +___ + +my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); +my ($mac,$nonce)=($inp,$len); # *_emit arguments +my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); +my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); + +sub poly1305_iteration { +# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 +# output: $h0-$h2 *= $r0-$r1 +$code.=<<___; + mulq $h0 # h0*r1 + mov %rax,$d2 + mov $r0,%rax + mov %rdx,$d3 + + mulq $h0 # h0*r0 + mov %rax,$h0 # future $h0 + mov $r0,%rax + mov %rdx,$d1 + + mulq $h1 # h1*r0 + add %rax,$d2 + mov $s1,%rax + adc %rdx,$d3 + + mulq $h1 # h1*s1 + mov $h2,$h1 # borrow $h1 + add %rax,$h0 + adc %rdx,$d1 + + imulq $s1,$h1 # h2*s1 + add $h1,$d2 + mov $d1,$h1 + adc \$0,$d3 + + imulq $r0,$h2 # h2*r0 + add $d2,$h1 + mov \$-4,%rax # mask value + adc $h2,$d3 + + and $d3,%rax # last reduction step + mov $d3,$h2 + shr \$2,$d3 + and \$3,$h2 + add $d3,%rax + add %rax,$h0 + adc \$0,$h1 + adc \$0,$h2 +___ +} + +######################################################################## +# Layout of opaque area is following. 
+# +# unsigned __int64 h[3]; # current hash value base 2^64 +# unsigned __int64 r[2]; # key value base 2^64 + +$code.=<<___; +.text +___ +$code.=<<___ if (!$kernel); +.extern OPENSSL_ia32cap_P + +.globl poly1305_block_init_arch +.hidden poly1305_block_init_arch +.globl poly1305_blocks_x86_64 +.hidden poly1305_blocks_x86_64 +.globl poly1305_emit_x86_64 +.hidden poly1305_emit_x86_64 +___ +&declare_typed_function("poly1305_block_init_arch", 32, 3); +$code.=<<___; + xor %eax,%eax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + + test $inp,$inp + je .Lno_key +___ +$code.=<<___ if (!$kernel); + lea poly1305_blocks_x86_64(%rip),%r10 + lea poly1305_emit_x86_64(%rip),%r11 +___ +$code.=<<___ if (!$kernel && $avx); + mov OPENSSL_ia32cap_P+4(%rip),%r9 + lea poly1305_blocks_avx(%rip),%rax + lea poly1305_emit_avx(%rip),%rcx + bt \$`60-32`,%r9 # AVX? + cmovc %rax,%r10 + cmovc %rcx,%r11 +___ +$code.=<<___ if (!$kernel && $avx>1); + lea poly1305_blocks_avx2(%rip),%rax + bt \$`5+32`,%r9 # AVX2? + cmovc %rax,%r10 +___ +$code.=<<___ if (!$kernel && $avx>3); + mov \$`(1<<31|1<<21|1<<16)`,%rax + shr \$32,%r9 + and %rax,%r9 + cmp %rax,%r9 + je .Linit_base2_44 +___ +$code.=<<___; + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + and 8($inp),%rcx + mov %rax,24($ctx) + mov %rcx,32($ctx) +___ +$code.=<<___ if (!$kernel && $flavour !~ /elf32/); + mov %r10,0(%rdx) + mov %r11,8(%rdx) +___ +$code.=<<___ if (!$kernel && $flavour =~ /elf32/); + mov %r10d,0(%rdx) + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax +.Lno_key: + RET +___ +&end_function("poly1305_block_init_arch"); + +&declare_function("poly1305_blocks_x86_64", 32, 4); +$code.=<<___; +.cfi_startproc +.Lblocks: + shr \$4,$len + jz .Lno_data # too short + + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $ctx +.cfi_push $ctx +.Lblocks_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2 + + mov $s1,$r1 + shr \$2,$s1 + mov $r1,%rax + add $r1,$s1 # s1 = r1 + (r1 >> 2) + jmp .Loop + +.align 32 +.Loop: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 +___ + + &poly1305_iteration(); + +$code.=<<___; + mov $r1,%rax + dec %r15 # len-=16 + jnz .Loop + + mov 0(%rsp),$ctx +.cfi_restore $ctx + + mov $h0,0($ctx) # store hash value + mov $h1,8($ctx) + mov $h2,16($ctx) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data: +.Lblocks_epilogue: + RET +.cfi_endproc +___ +&end_function("poly1305_blocks_x86_64"); + +&declare_function("poly1305_emit_x86_64", 32, 3); +$code.=<<___; +.Lemit: + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + RET +___ +&end_function("poly1305_emit_x86_64"); +if ($avx) { + +######################################################################## +# Layout of opaque area is following. 
+# +# unsigned __int32 h[5]; # current hash value base 2^26 +# unsigned __int32 is_base2_26; +# unsigned __int64 r[2]; # key value base 2^64 +# unsigned __int64 pad; +# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; +# +# where r^n are base 2^26 digits of degrees of multiplier key. There are +# 5 digits, but last four are interleaved with multiples of 5, totalling +# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. + +my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = + map("%xmm$_",(0..15)); + +$code.=<<___; +.type __poly1305_block,\@abi-omnipotent +.align 32 +__poly1305_block: + push $ctx +___ + &poly1305_iteration(); +$code.=<<___; + pop $ctx + RET +.size __poly1305_block,.-__poly1305_block + +.type __poly1305_init_avx,\@abi-omnipotent +.align 32 +__poly1305_init_avx: + push %rbp + mov %rsp,%rbp + mov $r0,$h0 + mov $r1,$h1 + xor $h2,$h2 + + lea 48+64($ctx),$ctx # size optimization + + mov $r1,%rax + call __poly1305_block # r^2 + + mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 + mov \$0x3ffffff,%edx + mov $h0,$d1 + and $h0#d,%eax + mov $r0,$d2 + and $r0#d,%edx + mov %eax,`16*0+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*0+4-64`($ctx) + shr \$26,$d2 + + mov \$0x3ffffff,%eax + mov \$0x3ffffff,%edx + and $d1#d,%eax + and $d2#d,%edx + mov %eax,`16*1+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*1+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*2+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*2+4-64`($ctx) + shr \$26,$d2 + + mov $h1,%rax + mov $r1,%rdx + shl \$12,%rax + shl \$12,%rdx + or $d1,%rax + or $d2,%rdx + and \$0x3ffffff,%eax + and \$0x3ffffff,%edx + mov %eax,`16*3+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*3+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*4+0-64`($ctx) + mov $h1,$d1 + mov %edx,`16*4+4-64`($ctx) + mov $r1,$d2 + + mov \$0x3ffffff,%eax + mov \$0x3ffffff,%edx + shr \$14,$d1 + shr \$14,$d2 + and $d1#d,%eax + and $d2#d,%edx + mov %eax,`16*5+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*5+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*6+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*6+4-64`($ctx) + shr \$26,$d2 + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+0-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d2#d,`16*7+4-64`($ctx) + lea ($d2,$d2,4),$d2 # *5 + mov $d1#d,`16*8+0-64`($ctx) + mov $d2#d,`16*8+4-64`($ctx) + + mov $r1,%rax + call __poly1305_block # r^3 + + mov \$0x3ffffff,%eax # save r^3 base 2^26 + mov $h0,$d1 + and $h0#d,%eax + shr \$26,$d1 + mov %eax,`16*0+12-64`($ctx) + + mov \$0x3ffffff,%edx + and $d1#d,%edx + mov %edx,`16*1+12-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*2+12-64`($ctx) + + mov $h1,%rax + shl \$12,%rax + or $d1,%rax + and \$0x3ffffff,%eax + mov %eax,`16*3+12-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov $h1,$d1 + mov %eax,`16*4+12-64`($ctx) + + mov \$0x3ffffff,%edx + shr \$14,$d1 + and $d1#d,%edx + mov %edx,`16*5+12-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*6+12-64`($ctx) + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+12-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d1#d,`16*8+12-64`($ctx) + + mov $r1,%rax + call __poly1305_block # r^4 + + mov \$0x3ffffff,%eax # save r^4 base 2^26 + mov $h0,$d1 + and $h0#d,%eax + shr \$26,$d1 + mov %eax,`16*0+8-64`($ctx) + + mov \$0x3ffffff,%edx + and $d1#d,%edx + mov %edx,`16*1+8-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*2+8-64`($ctx) + + mov $h1,%rax + shl \$12,%rax + or $d1,%rax + and 
\$0x3ffffff,%eax + mov %eax,`16*3+8-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov $h1,$d1 + mov %eax,`16*4+8-64`($ctx) + + mov \$0x3ffffff,%edx + shr \$14,$d1 + and $d1#d,%edx + mov %edx,`16*5+8-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*6+8-64`($ctx) + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+8-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d1#d,`16*8+8-64`($ctx) + + lea -48-64($ctx),$ctx # size [de-]optimization + pop %rbp + RET +.size __poly1305_init_avx,.-__poly1305_init_avx +___ + +&declare_function("poly1305_blocks_avx", 32, 4); +$code.=<<___; +.cfi_startproc + mov 20($ctx),%r8d # is_base2_26 + cmp \$128,$len + jae .Lblocks_avx + test %r8d,%r8d + jz .Lblocks + +.Lblocks_avx: + and \$-16,$len + jz .Lno_data_avx + + vzeroupper + + test %r8d,%r8d + jz .Lbase2_64_avx + + test \$31,$len + jz .Leven_avx + + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_avx_body: + + mov $len,%r15 # reassign $len + + mov 0($ctx),$d1 # load hash value + mov 8($ctx),$d2 + mov 16($ctx),$h2#d + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + ################################# base 2^26 -> base 2^64 + mov $d1#d,$h0#d + and \$`-1*(1<<31)`,$d1 + mov $d2,$r1 # borrow $r1 + mov $d2#d,$h1#d + and \$`-1*(1<<31)`,$d2 + + shr \$6,$d1 + shl \$52,$r1 + add $d1,$h0 + shr \$12,$h1 + shr \$18,$d2 + add $r1,$h0 + adc $d2,$h1 + + mov $h2,$d1 + shl \$40,$d1 + shr \$24,$h2 + add $d1,$h1 + adc \$0,$h2 # can be partially reduced... + + mov \$-4,$d2 # ... so reduce + mov $h2,$d1 + and $h2,$d2 + shr \$2,$d1 + and \$3,$h2 + add $d2,$d1 # =*5 + add $d1,$h0 + adc \$0,$h1 + adc \$0,$h2 + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + + call __poly1305_block + + test $padbit,$padbit # if $padbit is zero, + jz .Lstore_base2_64_avx # store hash in base 2^64 format + + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$r0 + mov $h1,$r1 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$r0 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $r0,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$r1 + and \$0x3ffffff,$h1 # h[3] + or $r1,$h2 # h[4] + + sub \$16,%r15 + jz .Lstore_base2_26_avx + + vmovd %rax#d,$H0 + vmovd %rdx#d,$H1 + vmovd $h0#d,$H2 + vmovd $h1#d,$H3 + vmovd $h2#d,$H4 + jmp .Lproceed_avx + +.align 32 +.Lstore_base2_64_avx: + mov $h0,0($ctx) + mov $h1,8($ctx) + mov $h2,16($ctx) # note that is_base2_26 is zeroed + jmp .Ldone_avx + +.align 16 +.Lstore_base2_26_avx: + mov %rax#d,0($ctx) # store hash value base 2^26 + mov %rdx#d,4($ctx) + mov $h0#d,8($ctx) + mov $h1#d,12($ctx) + mov $h2#d,16($ctx) +.align 16 +.Ldone_avx: + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lno_data_avx: +.Lblocks_avx_epilogue: + RET +.cfi_endproc + +.align 32 +.Lbase2_64_avx: +.cfi_startproc + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lbase2_64_avx_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + 
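The base 2^64 -> base 2^26 conversion above splits the 130-bit accumulator (h0 and h1 as full 64-bit limbs plus a few top bits in h2) into the five 26-bit digits used by the vector code; the earlier hunk performs the reverse packing. The same split in plain C, as a sketch (types and the function name are illustrative):

#include <stdint.h>

static void base2_64_to_base2_26(uint64_t h0, uint64_t h1, uint64_t h2,
                                 uint32_t d[5])
{
        d[0] = h0 & 0x3ffffff;                          /* bits   0..25  */
        d[1] = (h0 >> 26) & 0x3ffffff;                  /* bits  26..51  */
        d[2] = ((h0 >> 52) | (h1 << 12)) & 0x3ffffff;   /* bits  52..77  */
        d[3] = (h1 >> 14) & 0x3ffffff;                  /* bits  78..103 */
        d[4] = (uint32_t)((h1 >> 40) | (h2 << 24));     /* bits 104..129 */
}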
mov 16($ctx),$h2#d + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + test \$31,$len + jz .Linit_avx + + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + +.Linit_avx: + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$d1 + mov $h1,$d2 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$d1 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $d1,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$d2 + and \$0x3ffffff,$h1 # h[3] + or $d2,$h2 # h[4] + + vmovd %rax#d,$H0 + vmovd %rdx#d,$H1 + vmovd $h0#d,$H2 + vmovd $h1#d,$H3 + vmovd $h2#d,$H4 + movl \$1,20($ctx) # set is_base2_26 + + call __poly1305_init_avx + +.Lproceed_avx: + mov %r15,$len + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lbase2_64_avx_epilogue: + jmp .Ldo_avx +.cfi_endproc + +.align 32 +.Leven_avx: +.cfi_startproc + vmovd 4*0($ctx),$H0 # load hash value + vmovd 4*1($ctx),$H1 + vmovd 4*2($ctx),$H2 + vmovd 4*3($ctx),$H3 + vmovd 4*4($ctx),$H4 + +.Ldo_avx: +___ +$code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + and \$-32,%rsp + sub \$-8,%rsp + lea -0x58(%rsp),%r11 + sub \$0x178,%rsp +___ +$code.=<<___ if ($win64); + lea -0xf8(%rsp),%r11 + sub \$0x218,%rsp + vmovdqa %xmm6,0x50(%r11) + vmovdqa %xmm7,0x60(%r11) + vmovdqa %xmm8,0x70(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) +.Ldo_avx_body: +___ +$code.=<<___; + sub \$64,$len + lea -32($inp),%rax + cmovc %rax,$inp + + vmovdqu `16*3`($ctx),$D4 # preload r0^2 + lea `16*3+64`($ctx),$ctx # size optimization + lea .Lconst(%rip),%rcx + + ################################################################ + # load input + vmovdqu 16*2($inp),$T0 + vmovdqu 16*3($inp),$T1 + vmovdqa 64(%rcx),$MASK # .Lmask26 + + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + + vpsrlq \$40,$T4,$T4 # 4 + vpsrlq \$26,$T0,$T1 + vpand $MASK,$T0,$T0 # 0 + vpsrlq \$4,$T3,$T2 + vpand $MASK,$T1,$T1 # 1 + vpsrlq \$30,$T3,$T3 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + jbe .Lskip_loop_avx + + # expand and copy pre-calculated table to stack + vmovdqu `16*1-64`($ctx),$D1 + vmovdqu `16*2-64`($ctx),$D2 + vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 + vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 + vmovdqa $D3,-0x90(%r11) + vmovdqa $D0,0x00(%rsp) + vpshufd \$0xEE,$D1,$D4 + vmovdqu `16*3-64`($ctx),$D0 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D4,-0x80(%r11) + vmovdqa $D1,0x10(%rsp) + vpshufd \$0xEE,$D2,$D3 + vmovdqu `16*4-64`($ctx),$D1 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D3,-0x70(%r11) + vmovdqa $D2,0x20(%rsp) + vpshufd \$0xEE,$D0,$D4 + vmovdqu `16*5-64`($ctx),$D2 + vpshufd \$0x44,$D0,$D0 + vmovdqa $D4,-0x60(%r11) + vmovdqa $D0,0x30(%rsp) + vpshufd \$0xEE,$D1,$D3 + vmovdqu `16*6-64`($ctx),$D0 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D3,-0x50(%r11) + vmovdqa $D1,0x40(%rsp) + vpshufd \$0xEE,$D2,$D4 + vmovdqu `16*7-64`($ctx),$D1 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D4,-0x40(%r11) + vmovdqa $D2,0x50(%rsp) + vpshufd \$0xEE,$D0,$D3 + vmovdqu `16*8-64`($ctx),$D2 + vpshufd \$0x44,$D0,$D0 + vmovdqa 
$D3,-0x30(%r11) + vmovdqa $D0,0x60(%rsp) + vpshufd \$0xEE,$D1,$D4 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D4,-0x20(%r11) + vmovdqa $D1,0x70(%rsp) + vpshufd \$0xEE,$D2,$D3 + vmovdqa 0x00(%rsp),$D4 # preload r0^2 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D3,-0x10(%r11) + vmovdqa $D2,0x80(%rsp) + + jmp .Loop_avx + +.align 32 +.Loop_avx: + ################################################################ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + # \___________________/ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + # \___________________/ \____________________/ + # + # Note that we start with inp[2:3]*r^2. This is because it + # doesn't depend on reduction in previous iteration. + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # though note that $Tx and $Hx are "reversed" in this section, + # and $D4 is preloaded with r0^2... + + vpmuludq $T0,$D4,$D0 # d0 = h0*r0 + vpmuludq $T1,$D4,$D1 # d1 = h1*r0 + vmovdqa $H2,0x20(%r11) # offload hash + vpmuludq $T2,$D4,$D2 # d3 = h2*r0 + vmovdqa 0x10(%rsp),$H2 # r1^2 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0 + + vmovdqa $H0,0x00(%r11) # + vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 + vmovdqa $H1,0x10(%r11) # + vpmuludq $T3,$H2,$H1 # h3*r1 + vpaddq $H0,$D0,$D0 # d0 += h4*s1 + vpaddq $H1,$D4,$D4 # d4 += h3*r1 + vmovdqa $H3,0x30(%r11) # + vpmuludq $T2,$H2,$H0 # h2*r1 + vpmuludq $T1,$H2,$H1 # h1*r1 + vpaddq $H0,$D3,$D3 # d3 += h2*r1 + vmovdqa 0x30(%rsp),$H3 # r2^2 + vpaddq $H1,$D2,$D2 # d2 += h1*r1 + vmovdqa $H4,0x40(%r11) # + vpmuludq $T0,$H2,$H2 # h0*r1 + vpmuludq $T2,$H3,$H0 # h2*r2 + vpaddq $H2,$D1,$D1 # d1 += h0*r1 + + vmovdqa 0x40(%rsp),$H4 # s2^2 + vpaddq $H0,$D4,$D4 # d4 += h2*r2 + vpmuludq $T1,$H3,$H1 # h1*r2 + vpmuludq $T0,$H3,$H3 # h0*r2 + vpaddq $H1,$D3,$D3 # d3 += h1*r2 + vmovdqa 0x50(%rsp),$H2 # r3^2 + vpaddq $H3,$D2,$D2 # d2 += h0*r2 + vpmuludq $T4,$H4,$H0 # h4*s2 + vpmuludq $T3,$H4,$H4 # h3*s2 + vpaddq $H0,$D1,$D1 # d1 += h4*s2 + vmovdqa 0x60(%rsp),$H3 # s3^2 + vpaddq $H4,$D0,$D0 # d0 += h3*s2 + + vmovdqa 0x80(%rsp),$H4 # s4^2 + vpmuludq $T1,$H2,$H1 # h1*r3 + vpmuludq $T0,$H2,$H2 # h0*r3 + vpaddq $H1,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $T4,$H3,$H0 # h4*s3 + vpmuludq $T3,$H3,$H1 # h3*s3 + vpaddq $H0,$D2,$D2 # d2 += h4*s3 + vmovdqu 16*0($inp),$H0 # load input + vpaddq $H1,$D1,$D1 # d1 += h3*s3 + vpmuludq $T2,$H3,$H3 # h2*s3 + vpmuludq $T2,$H4,$T2 # h2*s4 + vpaddq $H3,$D0,$D0 # d0 += h2*s3 + + vmovdqu 16*1($inp),$H1 # + vpaddq $T2,$D1,$D1 # d1 += h2*s4 + vpmuludq $T3,$H4,$T3 # h3*s4 + vpmuludq $T4,$H4,$T4 # h4*s4 + vpsrldq \$6,$H0,$H2 # splat input + vpaddq $T3,$D2,$D2 # d2 += h3*s4 + vpaddq $T4,$D3,$D3 # d3 += h4*s4 + vpsrldq \$6,$H1,$H3 # + vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 + vpmuludq $T1,$H4,$T0 # h1*s4 + vpunpckhqdq $H1,$H0,$H4 # 4 + vpaddq $T4,$D4,$D4 # d4 += h0*r4 + vmovdqa -0x90(%r11),$T4 # r0^4 + vpaddq $T0,$D0,$D0 # d0 += h1*s4 + + vpunpcklqdq $H1,$H0,$H0 # 0:1 + vpunpcklqdq $H3,$H2,$H3 # 2:3 + + #vpsrlq \$40,$H4,$H4 # 4 + vpsrldq \$`40/8`,$H4,$H4 # 4 + vpsrlq \$26,$H0,$H1 + vpand $MASK,$H0,$H0 # 0 + vpsrlq \$4,$H3,$H2 + vpand $MASK,$H1,$H1 # 1 + vpand 0(%rcx),$H4,$H4 # .Lmask24 + 
vpsrlq \$30,$H3,$H3 + vpand $MASK,$H2,$H2 # 2 + vpand $MASK,$H3,$H3 # 3 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always + + vpaddq 0x00(%r11),$H0,$H0 # add hash value + vpaddq 0x10(%r11),$H1,$H1 + vpaddq 0x20(%r11),$H2,$H2 + vpaddq 0x30(%r11),$H3,$H3 + vpaddq 0x40(%r11),$H4,$H4 + + lea 16*2($inp),%rax + lea 16*4($inp),$inp + sub \$64,$len + cmovc %rax,$inp + + ################################################################ + # Now we accumulate (inp[0:1]+hash)*r^4 + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + vpmuludq $H0,$T4,$T0 # h0*r0 + vpmuludq $H1,$T4,$T1 # h1*r0 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vmovdqa -0x80(%r11),$T2 # r1^4 + vpmuludq $H2,$T4,$T0 # h2*r0 + vpmuludq $H3,$T4,$T1 # h3*r0 + vpaddq $T0,$D2,$D2 + vpaddq $T1,$D3,$D3 + vpmuludq $H4,$T4,$T4 # h4*r0 + vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 + vpaddq $T4,$D4,$D4 + + vpaddq $T0,$D0,$D0 # d0 += h4*s1 + vpmuludq $H2,$T2,$T1 # h2*r1 + vpmuludq $H3,$T2,$T0 # h3*r1 + vpaddq $T1,$D3,$D3 # d3 += h2*r1 + vmovdqa -0x60(%r11),$T3 # r2^4 + vpaddq $T0,$D4,$D4 # d4 += h3*r1 + vpmuludq $H1,$T2,$T1 # h1*r1 + vpmuludq $H0,$T2,$T2 # h0*r1 + vpaddq $T1,$D2,$D2 # d2 += h1*r1 + vpaddq $T2,$D1,$D1 # d1 += h0*r1 + + vmovdqa -0x50(%r11),$T4 # s2^4 + vpmuludq $H2,$T3,$T0 # h2*r2 + vpmuludq $H1,$T3,$T1 # h1*r2 + vpaddq $T0,$D4,$D4 # d4 += h2*r2 + vpaddq $T1,$D3,$D3 # d3 += h1*r2 + vmovdqa -0x40(%r11),$T2 # r3^4 + vpmuludq $H0,$T3,$T3 # h0*r2 + vpmuludq $H4,$T4,$T0 # h4*s2 + vpaddq $T3,$D2,$D2 # d2 += h0*r2 + vpaddq $T0,$D1,$D1 # d1 += h4*s2 + vmovdqa -0x30(%r11),$T3 # s3^4 + vpmuludq $H3,$T4,$T4 # h3*s2 + vpmuludq $H1,$T2,$T1 # h1*r3 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + + vmovdqa -0x10(%r11),$T4 # s4^4 + vpaddq $T1,$D4,$D4 # d4 += h1*r3 + vpmuludq $H0,$T2,$T2 # h0*r3 + vpmuludq $H4,$T3,$T0 # h4*s3 + vpaddq $T2,$D3,$D3 # d3 += h0*r3 + vpaddq $T0,$D2,$D2 # d2 += h4*s3 + vmovdqu 16*2($inp),$T0 # load input + vpmuludq $H3,$T3,$T2 # h3*s3 + vpmuludq $H2,$T3,$T3 # h2*s3 + vpaddq $T2,$D1,$D1 # d1 += h3*s3 + vmovdqu 16*3($inp),$T1 # + vpaddq $T3,$D0,$D0 # d0 += h2*s3 + + vpmuludq $H2,$T4,$H2 # h2*s4 + vpmuludq $H3,$T4,$H3 # h3*s4 + vpsrldq \$6,$T0,$T2 # splat input + vpaddq $H2,$D1,$D1 # d1 += h2*s4 + vpmuludq $H4,$T4,$H4 # h4*s4 + vpsrldq \$6,$T1,$T3 # + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 + vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 + vpmuludq $H1,$T4,$H0 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + + #vpsrlq \$40,$T4,$T4 # 4 + vpsrldq \$`40/8`,$T4,$T4 # 4 + vpsrlq \$26,$T0,$T1 + vmovdqa 0x00(%rsp),$D4 # preload r0^2 + vpand $MASK,$T0,$T0 # 0 + vpsrlq \$4,$T3,$T2 + vpand $MASK,$T1,$T1 # 1 + vpand 0(%rcx),$T4,$T4 # .Lmask24 + vpsrlq \$30,$T3,$T3 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + ################################################################ + # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + # and P. 
Schwabe + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D0 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D0,$H0,$H0 + vpsllq \$2,$D0,$D0 + vpaddq $D0,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + ja .Loop_avx + +.Lskip_loop_avx: + ################################################################ + # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 + add \$32,$len + jnz .Long_tail_avx + + vpaddq $H2,$T2,$T2 + vpaddq $H0,$T0,$T0 + vpaddq $H1,$T1,$T1 + vpaddq $H3,$T3,$T3 + vpaddq $H4,$T4,$T4 + +.Long_tail_avx: + vmovdqa $H2,0x20(%r11) + vmovdqa $H0,0x00(%r11) + vmovdqa $H1,0x10(%r11) + vmovdqa $H3,0x30(%r11) + vmovdqa $H4,0x40(%r11) + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + vpmuludq $T2,$D4,$D2 # d2 = h2*r0 + vpmuludq $T0,$D4,$D0 # d0 = h0*r0 + vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n + vpmuludq $T1,$D4,$D1 # d1 = h1*r0 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0 + + vpmuludq $T3,$H2,$H0 # h3*r1 + vpaddq $H0,$D4,$D4 # d4 += h3*r1 + vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n + vpmuludq $T2,$H2,$H1 # h2*r1 + vpaddq $H1,$D3,$D3 # d3 += h2*r1 + vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n + vpmuludq $T1,$H2,$H0 # h1*r1 + vpaddq $H0,$D2,$D2 # d2 += h1*r1 + vpmuludq $T0,$H2,$H2 # h0*r1 + vpaddq $H2,$D1,$D1 # d1 += h0*r1 + vpmuludq $T4,$H3,$H3 # h4*s1 + vpaddq $H3,$D0,$D0 # d0 += h4*s1 + + vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n + vpmuludq $T2,$H4,$H1 # h2*r2 + vpaddq $H1,$D4,$D4 # d4 += h2*r2 + vpmuludq $T1,$H4,$H0 # h1*r2 + vpaddq $H0,$D3,$D3 # d3 += h1*r2 + vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n + vpmuludq $T0,$H4,$H4 # h0*r2 + vpaddq $H4,$D2,$D2 # d2 += h0*r2 + vpmuludq $T4,$H2,$H1 # h4*s2 + vpaddq $H1,$D1,$D1 # d1 += h4*s2 + vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n + vpmuludq $T3,$H2,$H2 # h3*s2 + vpaddq $H2,$D0,$D0 # d0 += h3*s2 + + vpmuludq $T1,$H3,$H0 # h1*r3 + vpaddq $H0,$D4,$D4 # d4 += h1*r3 + vpmuludq $T0,$H3,$H3 # h0*r3 + vpaddq $H3,$D3,$D3 # d3 += h0*r3 + vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n + vpmuludq $T4,$H4,$H1 # h4*s3 + vpaddq $H1,$D2,$D2 # d2 += h4*s3 + vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n + vpmuludq $T3,$H4,$H0 # h3*s3 + vpaddq $H0,$D1,$D1 # d1 += h3*s3 + vpmuludq $T2,$H4,$H4 # h2*s3 + vpaddq $H4,$D0,$D0 # d0 += h2*s3 + + vpmuludq $T0,$H2,$H2 # h0*r4 + vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 + vpmuludq $T4,$H3,$H1 # h4*s4 + vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 + vpmuludq $T3,$H3,$H0 # h3*s4 + vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 + vpmuludq $T2,$H3,$H1 # h2*s4 + vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 + vpmuludq $T1,$H3,$H3 # h1*s4 + vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 + + jz .Lshort_tail_avx + + vmovdqu 16*0($inp),$H0 # load input + vmovdqu 16*1($inp),$H1 + + vpsrldq \$6,$H0,$H2 # splat input + vpsrldq \$6,$H1,$H3 + vpunpckhqdq $H1,$H0,$H4 # 4 + vpunpcklqdq $H1,$H0,$H0 # 0:1 + vpunpcklqdq $H3,$H2,$H3 # 2:3 + + vpsrlq \$40,$H4,$H4 # 4 + vpsrlq \$26,$H0,$H1 + vpand $MASK,$H0,$H0 # 0 + 
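The d0..d4 formulas in the comments above are a schoolbook multiply in base 2^26 with the wrap-around folded into the precomputed s_n = 5*r_n table entries, since 2^130 = 5 (mod 2^130 - 5). A scalar sketch of one such multiply, producing unreduced 64-bit limbs that still need the carry pass shown after the next hunk (names and 64-bit limb types are illustrative):

#include <stdint.h>

static void poly1305_mul_base2_26(uint64_t d[5], const uint64_t h[5],
                                  const uint64_t r[5])
{
        const uint64_t s1 = r[1] * 5, s2 = r[2] * 5, s3 = r[3] * 5, s4 = r[4] * 5;

        d[0] = h[0]*r[0] + h[1]*s4   + h[2]*s3   + h[3]*s2   + h[4]*s1;
        d[1] = h[0]*r[1] + h[1]*r[0] + h[2]*s4   + h[3]*s3   + h[4]*s2;
        d[2] = h[0]*r[2] + h[1]*r[1] + h[2]*r[0] + h[3]*s4   + h[4]*s3;
        d[3] = h[0]*r[3] + h[1]*r[2] + h[2]*r[1] + h[3]*r[0] + h[4]*s4;
        d[4] = h[0]*r[4] + h[1]*r[3] + h[2]*r[2] + h[3]*r[1] + h[4]*r[0];
}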
vpsrlq \$4,$H3,$H2 + vpand $MASK,$H1,$H1 # 1 + vpsrlq \$30,$H3,$H3 + vpand $MASK,$H2,$H2 # 2 + vpand $MASK,$H3,$H3 # 3 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always + + vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 + vpaddq 0x00(%r11),$H0,$H0 + vpaddq 0x10(%r11),$H1,$H1 + vpaddq 0x20(%r11),$H2,$H2 + vpaddq 0x30(%r11),$H3,$H3 + vpaddq 0x40(%r11),$H4,$H4 + + ################################################################ + # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate + + vpmuludq $H0,$T4,$T0 # h0*r0 + vpaddq $T0,$D0,$D0 # d0 += h0*r0 + vpmuludq $H1,$T4,$T1 # h1*r0 + vpaddq $T1,$D1,$D1 # d1 += h1*r0 + vpmuludq $H2,$T4,$T0 # h2*r0 + vpaddq $T0,$D2,$D2 # d2 += h2*r0 + vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n + vpmuludq $H3,$T4,$T1 # h3*r0 + vpaddq $T1,$D3,$D3 # d3 += h3*r0 + vpmuludq $H4,$T4,$T4 # h4*r0 + vpaddq $T4,$D4,$D4 # d4 += h4*r0 + + vpmuludq $H3,$T2,$T0 # h3*r1 + vpaddq $T0,$D4,$D4 # d4 += h3*r1 + vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 + vpmuludq $H2,$T2,$T1 # h2*r1 + vpaddq $T1,$D3,$D3 # d3 += h2*r1 + vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 + vpmuludq $H1,$T2,$T0 # h1*r1 + vpaddq $T0,$D2,$D2 # d2 += h1*r1 + vpmuludq $H0,$T2,$T2 # h0*r1 + vpaddq $T2,$D1,$D1 # d1 += h0*r1 + vpmuludq $H4,$T3,$T3 # h4*s1 + vpaddq $T3,$D0,$D0 # d0 += h4*s1 + + vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 + vpmuludq $H2,$T4,$T1 # h2*r2 + vpaddq $T1,$D4,$D4 # d4 += h2*r2 + vpmuludq $H1,$T4,$T0 # h1*r2 + vpaddq $T0,$D3,$D3 # d3 += h1*r2 + vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 + vpmuludq $H0,$T4,$T4 # h0*r2 + vpaddq $T4,$D2,$D2 # d2 += h0*r2 + vpmuludq $H4,$T2,$T1 # h4*s2 + vpaddq $T1,$D1,$D1 # d1 += h4*s2 + vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 + vpmuludq $H3,$T2,$T2 # h3*s2 + vpaddq $T2,$D0,$D0 # d0 += h3*s2 + + vpmuludq $H1,$T3,$T0 # h1*r3 + vpaddq $T0,$D4,$D4 # d4 += h1*r3 + vpmuludq $H0,$T3,$T3 # h0*r3 + vpaddq $T3,$D3,$D3 # d3 += h0*r3 + vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 + vpmuludq $H4,$T4,$T1 # h4*s3 + vpaddq $T1,$D2,$D2 # d2 += h4*s3 + vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 + vpmuludq $H3,$T4,$T0 # h3*s3 + vpaddq $T0,$D1,$D1 # d1 += h3*s3 + vpmuludq $H2,$T4,$T4 # h2*s3 + vpaddq $T4,$D0,$D0 # d0 += h2*s3 + + vpmuludq $H0,$T2,$T2 # h0*r4 + vpaddq $T2,$D4,$D4 # d4 += h0*r4 + vpmuludq $H4,$T3,$T1 # h4*s4 + vpaddq $T1,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$T3,$T0 # h3*s4 + vpaddq $T0,$D2,$D2 # d2 += h3*s4 + vpmuludq $H2,$T3,$T1 # h2*s4 + vpaddq $T1,$D1,$D1 # d1 += h2*s4 + vpmuludq $H1,$T3,$T3 # h1*s4 + vpaddq $T3,$D0,$D0 # d0 += h1*s4 + +.Lshort_tail_avx: + ################################################################ + # horizontal addition + + vpsrldq \$8,$D4,$T4 + vpsrldq \$8,$D3,$T3 + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$D0,$T0 + vpsrldq \$8,$D2,$T2 + vpaddq $T3,$D3,$D3 + vpaddq $T4,$D4,$D4 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$D2,$D2 + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$H3 + vpand $MASK,$D3,$D3 + vpaddq $H3,$D4,$D4 # h3 -> h4 + + vpsrlq \$26,$D0,$H0 + vpand $MASK,$D0,$D0 + vpaddq $H0,$D1,$D1 # h0 -> h1 + + vpsrlq \$26,$D4,$H4 + vpand $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$H1 + vpand $MASK,$D1,$D1 + vpaddq $H1,$D2,$D2 # h1 -> h2 + + vpaddq $H4,$D0,$D0 + vpsllq \$2,$H4,$H4 + vpaddq $H4,$D0,$D0 # h4 -> h0 + + vpsrlq \$26,$D2,$H2 + vpand $MASK,$D2,$D2 + vpaddq $H2,$D3,$D3 # h2 -> h3 + + vpsrlq \$26,$D0,$H0 + vpand $MASK,$D0,$D0 + vpaddq $H0,$D1,$D1 # h0 -> h1 + + vpsrlq \$26,$D3,$H3 + vpand $MASK,$D3,$D3 + vpaddq $H3,$D4,$D4 # h3 -> h4 + + vmovd $D0,`4*0-48-64`($ctx) # save partially reduced + 
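The "lazy reduction" passes above only normalise the limbs back below 2^26 (carries out of the top limb re-enter at the bottom multiplied by 5); full reduction mod 2^130 - 5 is deferred, which is why the vmovd stores are commented "save partially reduced". A plain C sketch of one such carry pass, as a logical mirror rather than an exact copy of the vector instruction scheduling (function name illustrative):

#include <stdint.h>

static void poly1305_carry_base2_26(uint64_t h[5])
{
        uint64_t c;

        c = h[3] >> 26;  h[3] &= 0x3ffffff;  h[4] += c;         /* h3 -> h4      */
        c = h[0] >> 26;  h[0] &= 0x3ffffff;  h[1] += c;         /* h0 -> h1      */
        c = h[4] >> 26;  h[4] &= 0x3ffffff;  h[0] += c * 5;     /* h4 -> h0, *5  */
        c = h[1] >> 26;  h[1] &= 0x3ffffff;  h[2] += c;         /* h1 -> h2      */
        c = h[2] >> 26;  h[2] &= 0x3ffffff;  h[3] += c;         /* h2 -> h3      */
        c = h[0] >> 26;  h[0] &= 0x3ffffff;  h[1] += c;         /* h0 -> h1      */
        c = h[3] >> 26;  h[3] &= 0x3ffffff;  h[4] += c;         /* h3 -> h4      */
}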
vmovd $D1,`4*1-48-64`($ctx) + vmovd $D2,`4*2-48-64`($ctx) + vmovd $D3,`4*3-48-64`($ctx) + vmovd $D4,`4*4-48-64`($ctx) +___ +$code.=<<___ if ($win64); + vmovdqa 0x50(%r11),%xmm6 + vmovdqa 0x60(%r11),%xmm7 + vmovdqa 0x70(%r11),%xmm8 + vmovdqa 0x80(%r11),%xmm9 + vmovdqa 0x90(%r11),%xmm10 + vmovdqa 0xa0(%r11),%xmm11 + vmovdqa 0xb0(%r11),%xmm12 + vmovdqa 0xc0(%r11),%xmm13 + vmovdqa 0xd0(%r11),%xmm14 + vmovdqa 0xe0(%r11),%xmm15 + lea 0xf8(%r11),%rsp +.Ldo_avx_epilogue: +___ +$code.=<<___ if (!$win64); + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp +___ +$code.=<<___; + vzeroupper + RET +.cfi_endproc +___ +&end_function("poly1305_blocks_avx"); + +&declare_function("poly1305_emit_avx", 32, 3); +$code.=<<___; + cmpl \$0,20($ctx) # is_base2_26? + je .Lemit + + mov 0($ctx),%eax # load hash value base 2^26 + mov 4($ctx),%ecx + mov 8($ctx),%r8d + mov 12($ctx),%r11d + mov 16($ctx),%r10d + + shl \$26,%rcx # base 2^26 -> base 2^64 + mov %r8,%r9 + shl \$52,%r8 + add %rcx,%rax + shr \$12,%r9 + add %rax,%r8 # h0 + adc \$0,%r9 + + shl \$14,%r11 + mov %r10,%rax + shr \$24,%r10 + add %r11,%r9 + shl \$40,%rax + add %rax,%r9 # h1 + adc \$0,%r10 # h2 + + mov %r10,%rax # could be partially reduced, so reduce + mov %r10,%rcx + and \$3,%r10 + shr \$2,%rax + and \$-4,%rcx + add %rcx,%rax + add %rax,%r8 + adc \$0,%r9 + adc \$0,%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + RET +___ +&end_function("poly1305_emit_avx"); + +if ($avx>1) { + +my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = + map("%ymm$_",(0..15)); +my $S4=$MASK; + +sub poly1305_blocks_avxN { + my ($avx512) = @_; + my $suffix = $avx512 ? "_avx512" : ""; +$code.=<<___; +.cfi_startproc + mov 20($ctx),%r8d # is_base2_26 + cmp \$128,$len + jae .Lblocks_avx2$suffix + test %r8d,%r8d + jz .Lblocks + +.Lblocks_avx2$suffix: + and \$-16,$len + jz .Lno_data_avx2$suffix + + vzeroupper + + test %r8d,%r8d + jz .Lbase2_64_avx2$suffix + + test \$63,$len + jz .Leven_avx2$suffix + + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_avx2_body$suffix: + + mov $len,%r15 # reassign $len + + mov 0($ctx),$d1 # load hash value + mov 8($ctx),$d2 + mov 16($ctx),$h2#d + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + ################################# base 2^26 -> base 2^64 + mov $d1#d,$h0#d + and \$`-1*(1<<31)`,$d1 + mov $d2,$r1 # borrow $r1 + mov $d2#d,$h1#d + and \$`-1*(1<<31)`,$d2 + + shr \$6,$d1 + shl \$52,$r1 + add $d1,$h0 + shr \$12,$h1 + shr \$18,$d2 + add $r1,$h0 + adc $d2,$h1 + + mov $h2,$d1 + shl \$40,$d1 + shr \$24,$h2 + add $d1,$h1 + adc \$0,$h2 # can be partially reduced... + + mov \$-4,$d2 # ... 
so reduce + mov $h2,$d1 + and $h2,$d2 + shr \$2,$d1 + and \$3,$h2 + add $d2,$d1 # =*5 + add $d1,$h0 + adc \$0,$h1 + adc \$0,$h2 + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + +.Lbase2_26_pre_avx2$suffix: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + mov $r1,%rax + + test \$63,%r15 + jnz .Lbase2_26_pre_avx2$suffix + + test $padbit,$padbit # if $padbit is zero, + jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format + + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$r0 + mov $h1,$r1 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$r0 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $r0,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$r1 + and \$0x3ffffff,$h1 # h[3] + or $r1,$h2 # h[4] + + test %r15,%r15 + jz .Lstore_base2_26_avx2$suffix + + vmovd %rax#d,%x#$H0 + vmovd %rdx#d,%x#$H1 + vmovd $h0#d,%x#$H2 + vmovd $h1#d,%x#$H3 + vmovd $h2#d,%x#$H4 + jmp .Lproceed_avx2$suffix + +.align 32 +.Lstore_base2_64_avx2$suffix: + mov $h0,0($ctx) + mov $h1,8($ctx) + mov $h2,16($ctx) # note that is_base2_26 is zeroed + jmp .Ldone_avx2$suffix + +.align 16 +.Lstore_base2_26_avx2$suffix: + mov %rax#d,0($ctx) # store hash value base 2^26 + mov %rdx#d,4($ctx) + mov $h0#d,8($ctx) + mov $h1#d,12($ctx) + mov $h2#d,16($ctx) +.align 16 +.Ldone_avx2$suffix: + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lno_data_avx2$suffix: +.Lblocks_avx2_epilogue$suffix: + RET +.cfi_endproc + +.align 32 +.Lbase2_64_avx2$suffix: +.cfi_startproc + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lbase2_64_avx2_body$suffix: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2#d + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + test \$63,$len + jz .Linit_avx2$suffix + +.Lbase2_64_pre_avx2$suffix: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + mov $r1,%rax + + test \$63,%r15 + jnz .Lbase2_64_pre_avx2$suffix + +.Linit_avx2$suffix: + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$d1 + mov $h1,$d2 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$d1 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $d1,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$d2 + and \$0x3ffffff,$h1 # h[3] + or $d2,$h2 # h[4] + + vmovd %rax#d,%x#$H0 + vmovd %rdx#d,%x#$H1 + vmovd $h0#d,%x#$H2 + vmovd $h1#d,%x#$H3 + vmovd $h2#d,%x#$H4 + movl \$1,20($ctx) # set is_base2_26 + + call __poly1305_init_avx + +.Lproceed_avx2$suffix: + mov %r15,$len # restore $len +___ +$code.=<<___ if (!$kernel); + mov OPENSSL_ia32cap_P+8(%rip),%r9d + mov \$`(1<<31|1<<30|1<<16)`,%r11d +___ +$code.=<<___; + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lbase2_64_avx2_epilogue$suffix: + jmp .Ldo_avx2$suffix +.cfi_endproc + +.align 32 +.Leven_avx2$suffix: 
+.cfi_startproc +___ +$code.=<<___ if (!$kernel); + mov OPENSSL_ia32cap_P+8(%rip),%r9d +___ +$code.=<<___; + vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 + vmovd 4*1($ctx),%x#$H1 + vmovd 4*2($ctx),%x#$H2 + vmovd 4*3($ctx),%x#$H3 + vmovd 4*4($ctx),%x#$H4 + +.Ldo_avx2$suffix: +___ +$code.=<<___ if (!$kernel && $avx>2); + cmp \$512,$len + jb .Lskip_avx512 + and %r11d,%r9d + test \$`1<<16`,%r9d # check for AVX512F + jnz .Lblocks_avx512 +.Lskip_avx512$suffix: +___ +$code.=<<___ if ($avx > 2 && $avx512 && $kernel); + cmp \$512,$len + jae .Lblocks_avx512 +___ +$code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea 8(%rsp),%r10 + sub \$0x1c8,%rsp + vmovdqa %xmm6,-0xb0(%r10) + vmovdqa %xmm7,-0xa0(%r10) + vmovdqa %xmm8,-0x90(%r10) + vmovdqa %xmm9,-0x80(%r10) + vmovdqa %xmm10,-0x70(%r10) + vmovdqa %xmm11,-0x60(%r10) + vmovdqa %xmm12,-0x50(%r10) + vmovdqa %xmm13,-0x40(%r10) + vmovdqa %xmm14,-0x30(%r10) + vmovdqa %xmm15,-0x20(%r10) +.Ldo_avx2_body$suffix: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 + + # expand and copy pre-calculated table to stack + vmovdqu `16*0-64`($ctx),%x#$T2 + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$T3 + vmovdqu `16*2-64`($ctx),%x#$T4 + vmovdqu `16*3-64`($ctx),%x#$D0 + vmovdqu `16*4-64`($ctx),%x#$D1 + vmovdqu `16*5-64`($ctx),%x#$D2 + lea 0x90(%rsp),%rax # size optimization + vmovdqu `16*6-64`($ctx),%x#$D3 + vpermd $T2,$T0,$T2 # 00003412 -> 14243444 + vmovdqu `16*7-64`($ctx),%x#$D4 + vpermd $T3,$T0,$T3 + vmovdqu `16*8-64`($ctx),%x#$MASK + vpermd $T4,$T0,$T4 + vmovdqa $T2,0x00(%rsp) + vpermd $D0,$T0,$D0 + vmovdqa $T3,0x20-0x90(%rax) + vpermd $D1,$T0,$D1 + vmovdqa $T4,0x40-0x90(%rax) + vpermd $D2,$T0,$D2 + vmovdqa $D0,0x60-0x90(%rax) + vpermd $D3,$T0,$D3 + vmovdqa $D1,0x80-0x90(%rax) + vpermd $D4,$T0,$D4 + vmovdqa $D2,0xa0-0x90(%rax) + vpermd $MASK,$T0,$MASK + vmovdqa $D3,0xc0-0x90(%rax) + vmovdqa $D4,0xe0-0x90(%rax) + vmovdqa $MASK,0x100-0x90(%rax) + vmovdqa 64(%rcx),$MASK # .Lmask26 + + ################################################################ + # load input + vmovdqu 16*0($inp),%x#$T0 + vmovdqu 16*1($inp),%x#$T1 + vinserti128 \$1,16*2($inp),$T0,$T0 + vinserti128 \$1,16*3($inp),$T1,$T1 + lea 16*4($inp),$inp + + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpunpcklqdq $T3,$T2,$T2 # 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 + vpand $MASK,$T1,$T1 # 1 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$64,$len + jz .Ltail_avx2$suffix + jmp .Loop_avx2$suffix + +.align 32 +.Loop_avx2$suffix: + ################################################################ + # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 + # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 + # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 + # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 + # \________/\__________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + vmovdqa `32*0`(%rsp),$T0 # r0^4 + vpaddq $H1,$T1,$H1 + vmovdqa `32*1`(%rsp),$T1 # r1^4 + vpaddq $H3,$T3,$H3 + vmovdqa `32*3`(%rsp),$T2 # r2^4 + vpaddq $H4,$T4,$H4 + vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 + vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # 
d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 + # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 + + vpmuludq $H2,$T0,$D2 # d2 = h2*r0 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + + vpmuludq $H0,$T1,$T4 # h0*r1 + vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp + vpaddq $T4,$D1,$D1 # d1 += h0*r1 + vpaddq $H2,$D2,$D2 # d2 += h1*r1 + vpmuludq $H3,$T1,$T4 # h3*r1 + vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 + vpaddq $T4,$D4,$D4 # d4 += h3*r1 + vpaddq $H2,$D0,$D0 # d0 += h4*s1 + vmovdqa `32*4-0x90`(%rax),$T1 # s2 + + vpmuludq $H0,$T0,$T4 # h0*r0 + vpmuludq $H1,$T0,$H2 # h1*r0 + vpaddq $T4,$D0,$D0 # d0 += h0*r0 + vpaddq $H2,$D1,$D1 # d1 += h1*r0 + vpmuludq $H3,$T0,$T4 # h3*r0 + vpmuludq $H4,$T0,$H2 # h4*r0 + vmovdqu 16*0($inp),%x#$T0 # load input + vpaddq $T4,$D3,$D3 # d3 += h3*r0 + vpaddq $H2,$D4,$D4 # d4 += h4*r0 + vinserti128 \$1,16*2($inp),$T0,$T0 + + vpmuludq $H3,$T1,$T4 # h3*s2 + vpmuludq $H4,$T1,$H2 # h4*s2 + vmovdqu 16*1($inp),%x#$T1 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + vpaddq $H2,$D1,$D1 # d1 += h4*s2 + vmovdqa `32*5-0x90`(%rax),$H2 # r3 + vpmuludq $H1,$T2,$T4 # h1*r2 + vpmuludq $H0,$T2,$T2 # h0*r2 + vpaddq $T4,$D3,$D3 # d3 += h1*r2 + vpaddq $T2,$D2,$D2 # d2 += h0*r2 + vinserti128 \$1,16*3($inp),$T1,$T1 + lea 16*4($inp),$inp + + vpmuludq $H1,$H2,$T4 # h1*r3 + vpmuludq $H0,$H2,$H2 # h0*r3 + vpsrldq \$6,$T0,$T2 # splat input + vpaddq $T4,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $H3,$T3,$T4 # h3*s3 + vpmuludq $H4,$T3,$H2 # h4*s3 + vpsrldq \$6,$T1,$T3 + vpaddq $T4,$D1,$D1 # d1 += h3*s3 + vpaddq $H2,$D2,$D2 # d2 += h4*s3 + vpunpckhqdq $T1,$T0,$T4 # 4 + + vpmuludq $H3,$S4,$H3 # h3*s4 + vpmuludq $H4,$S4,$H4 # h4*s4 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 + vpmuludq $H1,$S4,$H0 # h1*s4 + vmovdqa 64(%rcx),$MASK # .Lmask26 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + ################################################################ + # lazy reduction (interleaved with tail of input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$4,$T3,$T2 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpand $MASK,$T2,$T2 # 2 + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$30,$T3,$T3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpand $MASK,$T0,$T0 # 0 + vpand $MASK,$T1,$T1 # 1 + vpand $MASK,$T3,$T3 # 3 + vpor 
32(%rcx),$T4,$T4 # padbit, yes, always + + sub \$64,$len + jnz .Loop_avx2$suffix + + .byte 0x66,0x90 +.Ltail_avx2$suffix: + ################################################################ + # while above multiplications were by r^4 in all lanes, in last + # iteration we multiply least significant lane by r^4 and most + # significant one by r, so copy of above except that references + # to the precomputed table are displaced by 4... + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + vmovdqu `32*0+4`(%rsp),$T0 # r0^4 + vpaddq $H1,$T1,$H1 + vmovdqu `32*1+4`(%rsp),$T1 # r1^4 + vpaddq $H3,$T3,$H3 + vmovdqu `32*3+4`(%rsp),$T2 # r2^4 + vpaddq $H4,$T4,$H4 + vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 + vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 + + vpmuludq $H2,$T0,$D2 # d2 = h2*r0 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + + vpmuludq $H0,$T1,$T4 # h0*r1 + vpmuludq $H1,$T1,$H2 # h1*r1 + vpaddq $T4,$D1,$D1 # d1 += h0*r1 + vpaddq $H2,$D2,$D2 # d2 += h1*r1 + vpmuludq $H3,$T1,$T4 # h3*r1 + vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 + vpaddq $T4,$D4,$D4 # d4 += h3*r1 + vpaddq $H2,$D0,$D0 # d0 += h4*s1 + + vpmuludq $H0,$T0,$T4 # h0*r0 + vpmuludq $H1,$T0,$H2 # h1*r0 + vpaddq $T4,$D0,$D0 # d0 += h0*r0 + vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 + vpaddq $H2,$D1,$D1 # d1 += h1*r0 + vpmuludq $H3,$T0,$T4 # h3*r0 + vpmuludq $H4,$T0,$H2 # h4*r0 + vpaddq $T4,$D3,$D3 # d3 += h3*r0 + vpaddq $H2,$D4,$D4 # d4 += h4*r0 + + vpmuludq $H3,$T1,$T4 # h3*s2 + vpmuludq $H4,$T1,$H2 # h4*s2 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + vpaddq $H2,$D1,$D1 # d1 += h4*s2 + vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 + vpmuludq $H1,$T2,$T4 # h1*r2 + vpmuludq $H0,$T2,$T2 # h0*r2 + vpaddq $T4,$D3,$D3 # d3 += h1*r2 + vpaddq $T2,$D2,$D2 # d2 += h0*r2 + + vpmuludq $H1,$H2,$T4 # h1*r3 + vpmuludq $H0,$H2,$H2 # h0*r3 + vpaddq $T4,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $H3,$T3,$T4 # h3*s3 + vpmuludq $H4,$T3,$H2 # h4*s3 + vpaddq $T4,$D1,$D1 # d1 += h3*s3 + vpaddq $H2,$D2,$D2 # d2 += h4*s3 + + vpmuludq $H3,$S4,$H3 # h3*s4 + vpmuludq $H4,$S4,$H4 # h4*s4 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 + vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 + vpmuludq $H1,$S4,$H0 # h1*s4 + vmovdqa 64(%rcx),$MASK # .Lmask26 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + ################################################################ + # horizontal addition + + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$H2,$T2 + vpsrldq \$8,$H3,$T3 + vpsrldq \$8,$H4,$T4 + vpsrldq \$8,$H0,$T0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + + vpermq \$0x2,$H3,$T3 + vpermq \$0x2,$H4,$T4 + vpermq \$0x2,$H0,$T0 + vpermq \$0x2,$D1,$T1 + vpermq \$0x2,$H2,$T2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + + ################################################################ + # lazy reduction + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 
+ + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) + vmovd %x#$H4,`4*4-48-64`($ctx) +___ +$code.=<<___ if ($win64); + vmovdqa -0xb0(%r10),%xmm6 + vmovdqa -0xa0(%r10),%xmm7 + vmovdqa -0x90(%r10),%xmm8 + vmovdqa -0x80(%r10),%xmm9 + vmovdqa -0x70(%r10),%xmm10 + vmovdqa -0x60(%r10),%xmm11 + vmovdqa -0x50(%r10),%xmm12 + vmovdqa -0x40(%r10),%xmm13 + vmovdqa -0x30(%r10),%xmm14 + vmovdqa -0x20(%r10),%xmm15 + lea -8(%r10),%rsp +.Ldo_avx2_epilogue$suffix: +___ +$code.=<<___ if (!$win64); + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp +___ +$code.=<<___; + vzeroupper + RET +.cfi_endproc +___ +if($avx > 2 && $avx512) { +my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); +my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); +my $PADBIT="%zmm30"; + +map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain +map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); +map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); +map(s/%y/%z/,($MASK)); + +$code.=<<___; +.cfi_startproc +.Lblocks_avx512: + mov \$15,%eax + kmovw %eax,%k2 +___ +$code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea 8(%rsp),%r10 + sub \$0x1c8,%rsp + vmovdqa %xmm6,-0xb0(%r10) + vmovdqa %xmm7,-0xa0(%r10) + vmovdqa %xmm8,-0x90(%r10) + vmovdqa %xmm9,-0x80(%r10) + vmovdqa %xmm10,-0x70(%r10) + vmovdqa %xmm11,-0x60(%r10) + vmovdqa %xmm12,-0x50(%r10) + vmovdqa %xmm13,-0x40(%r10) + vmovdqa %xmm14,-0x30(%r10) + vmovdqa %xmm15,-0x20(%r10) +.Ldo_avx512_body: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 + + # expand pre-calculated table + vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} + mov \$0x20,%rax + vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} + vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} + vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} + vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} + vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} + vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} + vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} + vpermd $D0,$T2,$R0 # 00003412 -> 14243444 + vpbroadcastq 64(%rcx),$MASK # .Lmask26 + vpermd $D1,$T2,$R1 + vpermd $T0,$T2,$S1 + vpermd $D2,$T2,$R2 + vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 + vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 + vpermd $T1,$T2,$S2 + vmovdqu64 $R1,0x00(%rsp,%rax){%k2} + vpsrlq \$32,$R1,$T1 + vpermd $D3,$T2,$R3 + vmovdqa64 $S1,0x40(%rsp){%k2} + vpermd $T3,$T2,$S3 + vpermd $D4,$T2,$R4 + vmovdqu64 $R2,0x40(%rsp,%rax){%k2} + vpermd $T4,$T2,$S4 + vmovdqa64 $S2,0x80(%rsp){%k2} + vmovdqu64 $R3,0x80(%rsp,%rax){%k2} + vmovdqa64 $S3,0xc0(%rsp){%k2} + vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} + vmovdqa64 $S4,0x100(%rsp){%k2} + + ################################################################ + # calculate 5th through 8th powers of the key + # + # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 + # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 + # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 + # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 + # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 + + vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 + vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 + vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 + vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 + vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 + vpsrlq \$32,$R2,$T2 + + vpmuludq $T1,$S4,$M0 + vpmuludq $T1,$R0,$M1 + vpmuludq $T1,$R1,$M2 + vpmuludq $T1,$R2,$M3 + vpmuludq $T1,$R3,$M4 + vpsrlq \$32,$R3,$T3 + vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 + vpaddq $M1,$D1,$D1 # d1 += r1'*r0 + vpaddq $M2,$D2,$D2 # d2 += r1'*r1 + vpaddq $M3,$D3,$D3 # d3 += r1'*r2 + vpaddq $M4,$D4,$D4 # d4 += r1'*r3 + + vpmuludq $T2,$S3,$M0 + vpmuludq $T2,$S4,$M1 + vpmuludq $T2,$R1,$M3 + vpmuludq $T2,$R2,$M4 + vpmuludq $T2,$R0,$M2 + vpsrlq \$32,$R4,$T4 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 + vpaddq $M3,$D3,$D3 # d3 += r2'*r1 + vpaddq $M4,$D4,$D4 # d4 += r2'*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*r0 + + vpmuludq $T3,$S2,$M0 + vpmuludq $T3,$R0,$M3 + vpmuludq $T3,$R1,$M4 + vpmuludq $T3,$S3,$M1 + vpmuludq $T3,$S4,$M2 + vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 + vpaddq $M3,$D3,$D3 # d3 += r3'*r0 + vpaddq $M4,$D4,$D4 # d4 += r3'*r1 + vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 + vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 + + vpmuludq $T4,$S4,$M3 + vpmuludq $T4,$R0,$M4 + vpmuludq $T4,$S1,$M0 + vpmuludq $T4,$S2,$M1 + vpmuludq $T4,$S3,$M2 + vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 + vpaddq $M4,$D4,$D4 # d4 += r2'*r0 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 + + ################################################################ + # load input + vmovdqu64 16*0($inp),%z#$T3 + vmovdqu64 16*4($inp),%z#$T4 + lea 16*8($inp),$inp + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D4,$M4 + vpandq $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$M1 + vpandq $MASK,$D1,$D1 + vpaddq $M1,$D2,$D2 # d1 -> d2 + + vpaddq $M4,$D0,$D0 + vpsllq \$2,$M4,$M4 + vpaddq $M4,$D0,$D0 # d4 -> d0 + + vpsrlq \$26,$D2,$M2 + vpandq $MASK,$D2,$D2 + vpaddq $M2,$D3,$D3 # d2 -> d3 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + ################################################################ + # at this point we have 14243444 in $R0-$S4 and 05060708 in + # $D0-$D4, ... 
+ + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + # ... since input 64-bit lanes are ordered as 73625140, we could + # "vperm" it to 76543210 (here and in each loop iteration), *or* + # we could just flow along, hence the goal for $R0-$S4 is + # 1858286838784888 ... + + vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: + mov \$0x7777,%eax + kmovw %eax,%k1 + + vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- + vpermd $R1,$M0,$R1 + vpermd $R2,$M0,$R2 + vpermd $R3,$M0,$R3 + vpermd $R4,$M0,$R4 + + vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 + vpermd $D1,$M0,${R1}{%k1} + vpermd $D2,$M0,${R2}{%k1} + vpermd $D3,$M0,${R3}{%k1} + vpermd $D4,$M0,${R4}{%k1} + + vpslld \$2,$R1,$S1 # *5 + vpslld \$2,$R2,$S2 + vpslld \$2,$R3,$S3 + vpslld \$2,$R4,$S4 + vpaddd $R1,$S1,$S1 + vpaddd $R2,$S2,$S2 + vpaddd $R3,$S3,$S3 + vpaddd $R4,$S4,$S4 + + vpbroadcastq 32(%rcx),$PADBIT # .L129 + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + vporq $T3,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$14,$T4,$T3 + vpsrlq \$40,$T4,$T4 # 4 + vpandq $MASK,$T2,$T2 # 2 + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$192,$len + jbe .Ltail_avx512 + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: + ################################################################ + # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 + # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 + # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 + # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 + # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 + # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 + # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 + # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 + # \________/\___________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 + # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 + # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 + # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 + # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpaddq $H0,$T0,$H0 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu64 16*0($inp),$T3 # load input + vmovdqu64 16*4($inp),$T4 + lea 16*8($inp),$inp + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + vpmuludq $H3,$R0,$M3 + 
vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpaddq $M3,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$S4,$M2 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 + + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + + vpsrlq \$26,$D3,$H3 + vpandq $MASK,$D3,$D3 + vpaddq $H3,$D4,$H4 # h3 -> h4 + + vporq $T3,$T2,$T2 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpandq $MASK,$T2,$T2 # 2 + + vpsrlq \$26,$H4,$D4 + vpandq $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpandq $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpandq $MASK,$H2,$H2 + vpaddq $D2,$D3,$H3 # h2 -> h3 + + vpsrlq \$14,$T4,$T3 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpandq $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + sub \$128,$len + ja .Loop_avx512 + +.Ltail_avx512: + ################################################################ + # while above multiplications were by r^8 in all lanes, in last + # iteration we multiply least significant lane by r^8 and most + # significant one by r, that's why table gets shifted... 
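The tail handling just described is easier to see in scalar form: the main loop evaluates the Poly1305 polynomial in eight interleaved lanes, multiplying every lane by r^8 per iteration, and only the last iteration multiplies lane i by r^(8-i) before the lanes are summed, which is exactly why the power table is rotated here. Below is a small Python reference checked against the straightforward serial evaluation; the block values and key are arbitrary examples, not taken from the patch.

# Illustrative sketch; not part of the patch.
P = (1 << 130) - 5

def poly_serial(blocks, r):
    h = 0
    for m in blocks:
        h = (h + m) * r % P
    return h

def poly_8way(blocks, r):
    # Same result via the 8-lane schedule of the AVX-512 loop: every round
    # multiplies each lane by r^8; the tail applies the shifted powers
    # r^8, r^7, ..., r^1, one per lane, before summing the lanes.
    assert len(blocks) % 8 == 0 and len(blocks) >= 8
    r8 = pow(r, 8, P)
    lanes = list(blocks[:8])                    # first 8 blocks seed the lanes
    for off in range(8, len(blocks), 8):        # full rounds: multiply by r^8
        lanes = [(lanes[i] * r8 + blocks[off + i]) % P for i in range(8)]
    return sum(lanes[i] * pow(r, 8 - i, P) for i in range(8)) % P

blocks = [(i * 0x123456789 + 1) | (1 << 128) for i in range(24)]  # 3 rounds of 8, padbit set
r = 0x0123456789abcdef0123456789abcdef & 0x0ffffffc0ffffffc0ffffffc0fffffff
assert poly_8way(blocks, r) == poly_serial(blocks, r)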
+ + vpsrlq \$32,$R0,$R0 # 0105020603070408 + vpsrlq \$32,$R1,$R1 + vpsrlq \$32,$R2,$R2 + vpsrlq \$32,$S3,$S3 + vpsrlq \$32,$S4,$S4 + vpsrlq \$32,$R3,$R3 + vpsrlq \$32,$R4,$R4 + vpsrlq \$32,$S1,$S1 + vpsrlq \$32,$S2,$S2 + + ################################################################ + # load either next or last 64 byte of input + lea ($inp,$len),$inp + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu 16*0($inp),%x#$T0 + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vmovdqu 16*1($inp),%x#$T1 + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 + vpmuludq $H3,$R0,$M3 + vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpmuludq $H3,$S4,$M2 + vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 + + ################################################################ + # horizontal addition + + mov \$1,%eax + vpermq \$0xb1,$H3,$D3 + vpermq \$0xb1,$D4,$H4 + vpermq \$0xb1,$H0,$D0 + vpermq \$0xb1,$H1,$D1 + vpermq \$0xb1,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + kmovw %eax,%k3 + vpermq \$0x2,$H3,$D3 + vpermq \$0x2,$H4,$D4 + vpermq \$0x2,$H0,$D0 + vpermq \$0x2,$H1,$D1 + vpermq \$0x2,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + vextracti64x4 \$0x1,$H3,%y#$D3 + vextracti64x4 \$0x1,$H4,%y#$D4 + vextracti64x4 \$0x1,$H0,%y#$D0 + vextracti64x4 \$0x1,$H1,%y#$D1 + vextracti64x4 \$0x1,$H2,%y#$D2 + vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case + vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 + vpaddq $D0,$H0,${H0}{%k3}{z} + vpaddq $D1,$H1,${H1}{%k3}{z} + vpaddq $D2,$H2,${H2}{%k3}{z} +___ +map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); +map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); +$code.=<<___; + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpunpcklqdq $T3,$T2,$T2 
# 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 + vpand $MASK,$T1,$T1 # 1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + vpaddq $D3,$H4,$H4 # h3 -> h4 + + lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 + add \$64,$len + jnz .Ltail_avx2$suffix + + vpsubq $T2,$H2,$H2 # undo input accumulation + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) + vmovd %x#$H4,`4*4-48-64`($ctx) + vzeroall +___ +$code.=<<___ if ($win64); + movdqa -0xb0(%r10),%xmm6 + movdqa -0xa0(%r10),%xmm7 + movdqa -0x90(%r10),%xmm8 + movdqa -0x80(%r10),%xmm9 + movdqa -0x70(%r10),%xmm10 + movdqa -0x60(%r10),%xmm11 + movdqa -0x50(%r10),%xmm12 + movdqa -0x40(%r10),%xmm13 + movdqa -0x30(%r10),%xmm14 + movdqa -0x20(%r10),%xmm15 + lea -8(%r10),%rsp +.Ldo_avx512_epilogue: +___ +$code.=<<___ if (!$win64); + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp +___ +$code.=<<___; + RET +.cfi_endproc +___ + +} + +} + +&declare_function("poly1305_blocks_avx2", 32, 4); +poly1305_blocks_avxN(0); +&end_function("poly1305_blocks_avx2"); + +####################################################################### +if ($avx>2) { +# On entry we have input length divisible by 64. But since inner loop +# processes 128 bytes per iteration, cases when length is not divisible +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this +# reason stack layout is kept identical to poly1305_blocks_avx2. If not +# for this tail, we wouldn't have to even allocate stack frame... + +&declare_function("poly1305_blocks_avx512", 32, 4); +poly1305_blocks_avxN(1); +&end_function("poly1305_blocks_avx512"); + +if (!$kernel && $avx>3) { +######################################################################## +# VPMADD52 version using 2^44 radix. +# +# One can argue that base 2^52 would be more natural. Well, even though +# some operations would be more natural, one has to recognize couple of +# things. Base 2^52 doesn't provide advantage over base 2^44 if you look +# at amount of multiply-n-accumulate operations. Secondly, it makes it +# impossible to pre-compute multiples of 5 [referred to as s[]/sN in +# reference implementations], which means that more such operations +# would have to be performed in inner loop, which in turn makes critical +# path longer. In other words, even though base 2^44 reduction might +# look less elegant, overall critical path is actually shorter... + +######################################################################## +# Layout of opaque area is following. 
+# +# unsigned __int64 h[3]; # current hash value base 2^44 +# unsigned __int64 s[2]; # key value*20 base 2^44 +# unsigned __int64 r[3]; # key value base 2^44 +# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; +# # r^n positions reflect +# # placement in register, not +# # memory, R[3] is R[1]*20 + +$code.=<<___; +.type poly1305_init_base2_44,\@function,3 +.align 32 +poly1305_init_base2_44: + xor %eax,%eax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + +.Linit_base2_44: + lea poly1305_blocks_vpmadd52(%rip),%r10 + lea poly1305_emit_base2_44(%rip),%r11 + + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + mov \$0x00000fffffffffff,%r8 + and 8($inp),%rcx + mov \$0x00000fffffffffff,%r9 + and %rax,%r8 + shrd \$44,%rcx,%rax + mov %r8,40($ctx) # r0 + and %r9,%rax + shr \$24,%rcx + mov %rax,48($ctx) # r1 + lea (%rax,%rax,4),%rax # *5 + mov %rcx,56($ctx) # r2 + shl \$2,%rax # magic <<2 + lea (%rcx,%rcx,4),%rcx # *5 + shl \$2,%rcx # magic <<2 + mov %rax,24($ctx) # s1 + mov %rcx,32($ctx) # s2 + movq \$-1,64($ctx) # write impossible value +___ +$code.=<<___ if ($flavour !~ /elf32/); + mov %r10,0(%rdx) + mov %r11,8(%rdx) +___ +$code.=<<___ if ($flavour =~ /elf32/); + mov %r10d,0(%rdx) + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax + RET +.size poly1305_init_base2_44,.-poly1305_init_base2_44 +___ +{ +my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); +my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); +my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52,\@function,4 +.align 32 +poly1305_blocks_vpmadd52: + shr \$4,$len + jz .Lno_data_vpmadd52 # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + # if powers of the key are not calculated yet, process up to 3 + # blocks with this single-block subroutine, otherwise ensure that + # length is divisible by 2 blocks and pass the rest down to next + # subroutine... + + mov \$3,%rax + mov \$1,%r10 + cmp \$4,$len # is input long + cmovae %r10,%rax + test %r8,%r8 # is power value impossible? + cmovns %r10,%rax + + and $len,%rax # is input of favourable length? 
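For the VPMADD52 path introduced above, r and h live in three limbs of 44, 44 and 42 bits, and poly1305_init_base2_44 stores s1 = 20*r1 and s2 = 20*r2 (the "*5" plus "magic <<2" sequence) because product terms that land at weight 2^132 wrap around as 2^132 mod (2^130 - 5) = 4*5 = 20, while the carry out of the 42-bit top limb (weight 2^130) wraps as 5. The following Python sketch of one multiply-and-partially-reduce step in that radix is an illustration only, not part of the patch, and its names are placeholders.

# Illustrative sketch; not part of the patch.
P = (1 << 130) - 5
MASK44 = (1 << 44) - 1
MASK42 = (1 << 42) - 1

def split44(x):
    # 44/44/42-bit limbs, the radix used by the VPMADD52 path
    return [x & MASK44, (x >> 44) & MASK44, x >> 88]

def join44(l):
    return l[0] | (l[1] << 44) | (l[2] << 88)

def mul_base2_44(h, r):
    # One h *= r step in base 2^44.  s1 = 20*r1 and s2 = 20*r2 fold the
    # terms of weight 2^132 back to weight 1; the carry out of the 42-bit
    # top limb (weight 2^130) folds back multiplied by 5.
    r0, r1, r2 = r
    s1, s2 = 20 * r1, 20 * r2
    h0, h1, h2 = h
    d0 = h0 * r0 + h1 * s2 + h2 * s1
    d1 = h0 * r1 + h1 * r0 + h2 * s2
    d2 = h0 * r2 + h1 * r1 + h2 * r0
    # partial reduction, as in the "partial reduction" blocks of the patch
    d1 += d0 >> 44; d0 &= MASK44
    d2 += d1 >> 44; d1 &= MASK44
    c = d2 >> 42;  d2 &= MASK42
    d0 += 5 * c                      # 2^130 wraps as 5
    d1 += d0 >> 44; d0 &= MASK44     # the "additional step"
    return [d0, d1, d2]

h = split44(0x3141592653589793238462643383)
r = split44(0x0123456789abcdef0123456789abcdef & 0x0ffffffc0ffffffc0ffffffc0fffffff)
assert join44(mul_base2_44(h, r)) % P == (join44(h) * join44(r)) % P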
+ jz .Lblocks_vpmadd52_4x + + sub %rax,$len + mov \$7,%r10d + mov \$1,%r11d + kmovw %r10d,%k7 + lea .L2_44_inp_permd(%rip),%r10 + kmovw %r11d,%k1 + + vmovq $padbit,%x#$PAD + vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd + vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift + vpermq \$0xcf,$PAD,$PAD + vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask + + vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value + vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys + vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} + vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} + + vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt + vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft + + jmp .Loop_vpmadd52 + +.align 32 +.Loop_vpmadd52: + vmovdqu32 0($inp),%x#$T0 # load input as ----3210 + lea 16($inp),$inp + + vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 + vpsrlvq $inp_shift,$T0,$T0 + vpandq $reduc_mask,$T0,$T0 + vporq $PAD,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo # accumulate input + + vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value + vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} + vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} + + vpxord $Dlo,$Dlo,$Dlo + vpxord $Dhi,$Dhi,$Dhi + + vpmadd52luq $r2r1r0,$H0,$Dlo + vpmadd52huq $r2r1r0,$H0,$Dhi + + vpmadd52luq $r1r0s2,$H1,$Dlo + vpmadd52huq $r1r0s2,$H1,$Dhi + + vpmadd52luq $r0s2s1,$H2,$Dlo + vpmadd52huq $r0s2s1,$H2,$Dhi + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword + vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword + vpandq $reduc_mask,$Dlo,$Dlo + + vpaddq $T0,$Dhi,$Dhi + + vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword + + vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word + vpandq $reduc_mask,$Dlo,$Dlo + + vpermq \$0b10010011,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} + + vpaddq $T0,$Dlo,$Dlo + vpsllq \$2,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + dec %rax # len-=16 + jnz .Loop_vpmadd52 + + vmovdqu64 $Dlo,0($ctx){%k7} # store hash value + + test $len,$len + jnz .Lblocks_vpmadd52_4x + +.Lno_data_vpmadd52: + RET +.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 +___ +} +{ +######################################################################## +# As implied by its name 4x subroutine processes 4 blocks in parallel +# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power +# and is handled in 256-bit %ymm registers. + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_4x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_4x: + shr \$4,$len + jz .Lno_data_vpmadd52_4x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + +.Lblocks_vpmadd52_4x: + vpbroadcastq $padbit,$PAD + + vmovdqa64 .Lx_mask44(%rip),$mask44 + mov \$5,%eax + vmovdqa64 .Lx_mask42(%rip),$mask42 + kmovw %eax,%k1 # used in 2x path + + test %r8,%r8 # is power value impossible? + js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? + jnz .Lblocks_vpmadd52_2x_do + +.Lblocks_vpmadd52_4x_do: + vpbroadcastq 64($ctx),$R0 # load 4th power of the key + vpbroadcastq 96($ctx),$R1 + vpbroadcastq 128($ctx),$R2 + vpbroadcastq 160($ctx),$S1 + +.Lblocks_vpmadd52_4x_key_loaded: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + test \$7,$len # is len 8*n? 
+ jz .Lblocks_vpmadd52_8x + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 3-1-2-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$4,$len + jz .Ltail_vpmadd52_4x + jmp .Loop_vpmadd52_4x + ud2 + +.align 32 +.Linit_vpmadd52: + vmovq 24($ctx),%x#$S1 # load key + vmovq 56($ctx),%x#$H2 + vmovq 32($ctx),%x#$S2 + vmovq 40($ctx),%x#$R0 + vmovq 48($ctx),%x#$R1 + + vmovdqa $R0,$H0 + vmovdqa $R1,$H1 + vmovdqa $H2,$R2 + + mov \$2,%eax + +.Lmul_init_vpmadd52: + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + dec %eax + jz .Ldone_init_vpmadd52 + + vpunpcklqdq $R1,$H1,$R1 # 1,2 + vpbroadcastq %x#$H1,%x#$H1 # 2,2 + vpunpcklqdq $R2,$H2,$R2 + vpbroadcastq %x#$H2,%x#$H2 + vpunpcklqdq $R0,$H0,$R0 + vpbroadcastq %x#$H0,%x#$H0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R1,$S1,$S1 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S1,$S1 + vpsllq \$2,$S2,$S2 + + jmp .Lmul_init_vpmadd52 + ud2 + +.align 32 +.Ldone_init_vpmadd52: + vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 + vinserti128 \$1,%x#$R2,$H2,$R2 + vinserti128 \$1,%x#$R0,$H0,$R0 + + vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 + vpermq \$0b11011000,$R2,$R2 + vpermq \$0b11011000,$R0,$R0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpaddq $R1,$S1,$S1 + vpsllq \$2,$S1,$S1 + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? 
+ jnz .Ldone_init_vpmadd52_2x + + vmovdqu64 $R0,64($ctx) # save key powers + vpbroadcastq %x#$R0,$R0 # broadcast 4th power + vmovdqu64 $R1,96($ctx) + vpbroadcastq %x#$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpbroadcastq %x#$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpbroadcastq %x#$S1,$S1 + + jmp .Lblocks_vpmadd52_4x_key_loaded + ud2 + +.align 32 +.Ldone_init_vpmadd52_2x: + vmovdqu64 $R0,64($ctx) # save key powers + vpsrldq \$8,$R0,$R0 # 0-1-0-2 + vmovdqu64 $R1,96($ctx) + vpsrldq \$8,$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpsrldq \$8,$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpsrldq \$8,$S1,$S1 + jmp .Lblocks_vpmadd52_2x_key_loaded + ud2 + +.align 32 +.Lblocks_vpmadd52_2x_do: + vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers + vmovdqu64 160+8($ctx),${S1}{%k1}{z} + vmovdqu64 64+8($ctx),${R0}{%k1}{z} + vmovdqu64 96+8($ctx),${R1}{%k1}{z} + +.Lblocks_vpmadd52_2x_key_loaded: + vmovdqu64 16*0($inp),$T2 # load data + vpxorq $T3,$T3,$T3 + lea 16*2($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as x-1-x-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + jmp .Ltail_vpmadd52_2x + ud2 + +.align 32 +.Loop_vpmadd52_4x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$4,$len # len-=64 + jnz .Loop_vpmadd52_4x + +.Ltail_vpmadd52_4x: + vmovdqu64 128($ctx),$R2 # load all key powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + +.Ltail_vpmadd52_2x: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + 
vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + # at this point $len is + # either 4*n+2 or 0... + sub \$2,$len # len-=32 + ja .Lblocks_vpmadd52_4x_do + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_4x: + RET +.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x +___ +} +{ +######################################################################## +# As implied by its name 8x subroutine processes 8 blocks in parallel... +# This is intermediate version, as it's used only in cases when input +# length is either 8*n, 8*n+1 or 8*n+2... + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); +my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_8x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_8x: + shr \$4,$len + jz .Lno_data_vpmadd52_8x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + vmovdqa64 .Lx_mask44(%rip),$mask44 + vmovdqa64 .Lx_mask42(%rip),$mask42 + + test %r8,%r8 # is power value impossible? 
+ js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + +.Lblocks_vpmadd52_8x: + ################################################################ + # fist we calculate more key powers + + vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + vpbroadcastq %x#$R2,$RR2 # broadcast 4th power + vpbroadcastq %x#$R0,$RR0 + vpbroadcastq %x#$R1,$RR1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $RR2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $RR2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $RR2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $RR2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $RR2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $RR2,$R0,$D2hi + + vpmadd52luq $RR0,$R0,$D0lo + vpmadd52huq $RR0,$R0,$D0hi + vpmadd52luq $RR0,$R1,$D1lo + vpmadd52huq $RR0,$R1,$D1hi + vpmadd52luq $RR0,$R2,$D2lo + vpmadd52huq $RR0,$R2,$D2hi + + vpmadd52luq $RR1,$S2,$D0lo + vpmadd52huq $RR1,$S2,$D0hi + vpmadd52luq $RR1,$R0,$D1lo + vpmadd52huq $RR1,$R0,$D1hi + vpmadd52luq $RR1,$R1,$D2lo + vpmadd52huq $RR1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$RR0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$RR1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$RR2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + + vpsrlq \$44,$RR0,$tmp # additional step + vpandq $mask44,$RR0,$RR0 + + vpaddq $tmp,$RR1,$RR1 + + ################################################################ + # At this point Rx holds 1324 powers, RRx - 5768, and the goal + # is 15263748, which reflects how data is loaded... 
+ + vpunpcklqdq $R2,$RR2,$T2 # 3748 + vpunpckhqdq $R2,$RR2,$R2 # 1526 + vpunpcklqdq $R0,$RR0,$T0 + vpunpckhqdq $R0,$RR0,$R0 + vpunpcklqdq $R1,$RR1,$T1 + vpunpckhqdq $R1,$RR1,$R1 +___ +######## switch to %zmm +map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); +map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); + +$code.=<<___; + vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 + vshufi64x2 \$0x44,$R0,$T0,$RR0 + vshufi64x2 \$0x44,$R1,$T1,$RR1 + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + + vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 + vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 + vpaddq $RR2,$SS2,$SS2 + vpaddq $RR1,$SS1,$SS1 + vpsllq \$2,$SS2,$SS2 + vpsllq \$2,$SS1,$SS1 + + vpbroadcastq $padbit,$PAD + vpbroadcastq %x#$mask44,$mask44 + vpbroadcastq %x#$mask42,$mask42 + + vpbroadcastq %x#$SS1,$S1 # broadcast 8th power + vpbroadcastq %x#$SS2,$S2 + vpbroadcastq %x#$RR0,$R0 + vpbroadcastq %x#$RR1,$R1 + vpbroadcastq %x#$RR2,$R2 + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 73625140 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$8,$len + jz .Ltail_vpmadd52_8x + jmp .Loop_vpmadd52_8x + +.align 32 +.Loop_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$8,$len # len-=128 + jnz .Loop_vpmadd52_8x + +.Ltail_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$SS1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$SS1,$D0hi + vpxorq 
$D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$SS2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$SS2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$RR0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$RR0,$D2hi + + vpmadd52luq $H0,$RR0,$D0lo + vpmadd52huq $H0,$RR0,$D0hi + vpmadd52luq $H0,$RR1,$D1lo + vpmadd52huq $H0,$RR1,$D1hi + vpmadd52luq $H0,$RR2,$D2lo + vpmadd52huq $H0,$RR2,$D2hi + + vpmadd52luq $H1,$SS2,$D0lo + vpmadd52huq $H1,$SS2,$D0hi + vpmadd52luq $H1,$RR0,$D1lo + vpmadd52huq $H1,$RR0,$D1hi + vpmadd52luq $H1,$RR1,$D2lo + vpmadd52huq $H1,$RR1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vextracti64x4 \$1,$D0lo,%y#$T0 + vextracti64x4 \$1,$D0hi,%y#$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vextracti64x4 \$1,$D1lo,%y#$T1 + vextracti64x4 \$1,$D1hi,%y#$H1 + vextracti64x4 \$1,$D2lo,%y#$T2 + vextracti64x4 \$1,$D2hi,%y#$H2 +___ +######## switch back to %ymm +map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); + +$code.=<<___; + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + ################################################################ + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_8x: + RET +.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x +___ +} +$code.=<<___; +.type poly1305_emit_base2_44,\@function,3 +.align 32 +poly1305_emit_base2_44: + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r9,%rax + shr \$20,%r9 + shl \$44,%rax + mov %r10,%rcx + shr \$40,%r10 + shl \$24,%rcx + + add %rax,%r8 + adc %rcx,%r9 + adc \$0,%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? 
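The emit code around this point (like the base 2^26 emit earlier) performs the final freeze: it adds 5 and tests whether bit 130 carried out (the shr \$2 check above) to choose between h and h - p, then adds the 128-bit nonce and keeps the low 128 bits as the tag. A short Python sketch of that last step follows, for illustration only and not part of the patch.

# Illustrative sketch; not part of the patch.
P = (1 << 130) - 5
M128 = (1 << 128) - 1

def poly1305_finish(h, nonce):
    # Freeze h modulo 2^130 - 5, then add the nonce modulo 2^128.
    # h may still be slightly above P (but below 2*P), so one conditional
    # subtraction is enough: h + 5 sets bit 130 exactly when h >= P.
    g = h + 5                            # the "add $5 / adc ..." sequence
    if g >> 130:                         # "did 130-bit value overflow?"
        h = g & ((1 << 130) - 1)         # i.e. h - P; same low 128 bits as h + 5
    return (h + nonce) & M128            # accumulate nonce, keep 16-byte tag

assert poly1305_finish(P + 3, 0) == 3                 # just above P: freezes to 3
assert poly1305_finish(P - 1, 1) == (1 << 128) - 5    # below P: untouched before the nonce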
+ cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + RET +.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 +___ +} } } +} + +if (!$kernel) +{ # chacha20-poly1305 helpers +my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order +$code.=<<___; +.globl xor128_encrypt_n_pad +.type xor128_encrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_encrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_enc + nop +.Loop_enc_xmm: + movdqu ($inp,$otp),%xmm0 + pxor ($otp),%xmm0 + movdqu %xmm0,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_enc_xmm + + and \$15,%r10 # len % 16 + jz .Ldone_enc + +.Ltail_enc: + mov \$16,$len + sub %r10,$len + xor %eax,%eax +.Loop_enc_byte: + mov ($inp,$otp),%al + xor ($otp),%al + mov %al,($out,$otp) + mov %al,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_enc_byte + + xor %eax,%eax +.Loop_enc_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_enc_pad + +.Ldone_enc: + mov $otp,%rax + RET +.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad + +.globl xor128_decrypt_n_pad +.type xor128_decrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_decrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_dec + nop +.Loop_dec_xmm: + movdqu ($inp,$otp),%xmm0 + movdqa ($otp),%xmm1 + pxor %xmm0,%xmm1 + movdqu %xmm1,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_dec_xmm + + pxor %xmm1,%xmm1 + and \$15,%r10 # len % 16 + jz .Ldone_dec + +.Ltail_dec: + mov \$16,$len + sub %r10,$len + xor %eax,%eax + xor %r11d,%r11d +.Loop_dec_byte: + mov ($inp,$otp),%r11b + mov ($otp),%al + xor %r11b,%al + mov %al,($out,$otp) + mov %r11b,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_dec_byte + + xor %eax,%eax +.Loop_dec_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_dec_pad + +.Ldone_dec: + mov $otp,%rax + RET +.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad +___ +} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lcommon_seh_tail + + lea 48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R14 + + jmp 
.Lcommon_seh_tail +.size se_handler,.-se_handler + +.type avx_handler,\@abi-omnipotent +.align 16 +avx_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 208($context),%rax # pull context->R11 + + lea 0x50(%rax),%rsi + lea 0xf8(%rax),%rax + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + RET +.size avx_handler,.-avx_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_poly1305_block_init_arch + .rva .LSEH_end_poly1305_block_init_arch + .rva .LSEH_info_poly1305_block_init_arch + + .rva .LSEH_begin_poly1305_blocks_x86_64 + .rva .LSEH_end_poly1305_blocks_x86_64 + .rva .LSEH_info_poly1305_blocks_x86_64 + + .rva .LSEH_begin_poly1305_emit_x86_64 + .rva .LSEH_end_poly1305_emit_x86_64 + .rva .LSEH_info_poly1305_emit_x86_64 +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_poly1305_blocks_avx + .rva .Lbase2_64_avx + .rva .LSEH_info_poly1305_blocks_avx_1 + + .rva .Lbase2_64_avx + .rva .Leven_avx + .rva .LSEH_info_poly1305_blocks_avx_2 + + .rva .Leven_avx + .rva .LSEH_end_poly1305_blocks_avx + .rva .LSEH_info_poly1305_blocks_avx_3 + + .rva .LSEH_begin_poly1305_emit_avx + .rva .LSEH_end_poly1305_emit_avx + .rva .LSEH_info_poly1305_emit_avx +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_poly1305_blocks_avx2 + .rva .Lbase2_64_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_1 + + .rva .Lbase2_64_avx2 + .rva .Leven_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_2 + + .rva .Leven_avx2 + .rva .LSEH_end_poly1305_blocks_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_3 +___ +$code.=<<___ if ($avx>2); + .rva .LSEH_begin_poly1305_blocks_avx512 + .rva .LSEH_end_poly1305_blocks_avx512 + .rva .LSEH_info_poly1305_blocks_avx512 +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_poly1305_block_init_arch: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch + +.LSEH_info_poly1305_blocks_x86_64: + .byte 9,0,0,0 + .rva se_handler + .rva 
.Lblocks_body,.Lblocks_epilogue + +.LSEH_info_poly1305_emit_x86_64: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 +___ +$code.=<<___ if ($avx); +.LSEH_info_poly1305_blocks_avx_1: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx_2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx_3: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_emit_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx +___ +$code.=<<___ if ($avx>1); +.LSEH_info_poly1305_blocks_avx2_1: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx2_2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx2_3: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] +___ +$code.=<<___ if ($avx>2); +.LSEH_info_poly1305_blocks_avx512: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] +___ +} + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach (split('\n',$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + s/%r([a-z]+)#d/%e$1/g; + s/%r([0-9]+)#d/%r$1d/g; + s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; + + if ($kernel) { + s/(^\.type.*),[0-9]+$/\1/; + s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; + next if /^\.cfi.*/; + } + + print $_,"\n"; +} +close STDOUT; diff --git a/lib/crypto/x86/poly1305_glue.c b/lib/crypto/x86/poly1305_glue.c new file mode 100644 index 000000000000..856d48fd422b --- /dev/null +++ b/lib/crypto/x86/poly1305_glue.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <asm/cpu_device_id.h> +#include <asm/fpu/api.h> +#include <crypto/internal/poly1305.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sizes.h> +#include <linux/unaligned.h> + +struct poly1305_arch_internal { + union { + struct { + u32 h[5]; + u32 is_base2_26; + }; + u64 hs[3]; + }; + u64 r[2]; + u64 pad; + struct { u32 r2, r1, r4, r3; } rn[9]; +}; + +/* + * The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit + * the unfortunate situation of using AVX and then having to go back to scalar + * -- because the user is silly and has called the update function from two + * separate contexts -- then we need to convert back to the original base before + * proceeding. It is possible to reason that the initial reduction below is + * sufficient given the implementation invariants. However, for an avoidance of + * doubt and because this is not performance critical, we do the full reduction + * anyway. 
Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py + */ +static void convert_to_base2_64(void *ctx) +{ + struct poly1305_arch_internal *state = ctx; + u32 cy; + + if (!state->is_base2_26) + return; + + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; + state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; + state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); + state->hs[2] = state->h[4] >> 24; + /* Unsigned Less Than: branchlessly produces 1 if a < b, else 0. */ +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) + cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); + state->hs[2] &= 3; + state->hs[0] += cy; + state->hs[1] += (cy = ULT(state->hs[0], cy)); + state->hs[2] += ULT(state->hs[1], cy); +#undef ULT + state->is_base2_26 = 0; +} + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx, + const u8 *inp, + const size_t len, const u32 padbit); +asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx, + const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx, + const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx, + const u8 *inp, + const size_t len, const u32 padbit); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp, + unsigned int len, u32 padbit) +{ + struct poly1305_arch_internal *ctx = + container_of(&state->h.h, struct poly1305_arch_internal, h); + + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || + SZ_4K % POLY1305_BLOCK_SIZE); + + /* + * The AVX implementations have significant setup overhead (e.g. key + * power computation, kernel FPU enabling) which makes them slower for + * short messages. Fall back to the scalar implementation for messages + * shorter than 288 bytes, unless the AVX-specific key setup has already + * been performed (indicated by ctx->is_base2_26). 
+ */ + if (!static_branch_likely(&poly1305_use_avx) || + (len < POLY1305_BLOCK_SIZE * 18 && !ctx->is_base2_26) || + unlikely(!irq_fpu_usable())) { + convert_to_base2_64(ctx); + poly1305_blocks_x86_64(ctx, inp, len, padbit); + return; + } + + do { + const unsigned int bytes = min(len, SZ_4K); + + kernel_fpu_begin(); + if (static_branch_likely(&poly1305_use_avx512)) + poly1305_blocks_avx512(ctx, inp, bytes, padbit); + else if (static_branch_likely(&poly1305_use_avx2)) + poly1305_blocks_avx2(ctx, inp, bytes, padbit); + else + poly1305_blocks_avx(ctx, inp, bytes, padbit); + kernel_fpu_end(); + + len -= bytes; + inp += bytes; + } while (len); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +void poly1305_emit_arch(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) +{ + if (!static_branch_likely(&poly1305_use_avx)) + poly1305_emit_x86_64(ctx, mac, nonce); + else + poly1305_emit_avx(ctx, mac, nonce); +} +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +bool poly1305_is_arch_optimized(void) +{ + return static_key_enabled(&poly1305_use_avx); +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init poly1305_simd_mod_init(void) +{ + if (boot_cpu_has(X86_FEATURE_AVX) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx); + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx2); + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && + /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ + boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) + static_branch_enable(&poly1305_use_avx512); + return 0; +} +subsys_initcall(poly1305_simd_mod_init); + +static void __exit poly1305_simd_mod_exit(void) +{ +} +module_exit(poly1305_simd_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); +MODULE_DESCRIPTION("Poly1305 authenticator"); diff --git a/lib/crypto/x86/sha1-avx2-asm.S b/lib/crypto/x86/sha1-avx2-asm.S new file mode 100644 index 000000000000..91b2dc6db179 --- /dev/null +++ b/lib/crypto/x86/sha1-avx2-asm.S @@ -0,0 +1,697 @@ +/* + * Implement fast SHA-1 with AVX2 instructions. (x86_64) + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Ilya Albrekht <ilya.albrekht@intel.com> + * Maxim Locktyukhin <maxim.locktyukhin@intel.com> + * Ronen Zohar <ronen.zohar@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * SHA-1 implementation with Intel(R) AVX2 instruction set extensions. + * + *This implementation is based on the previous SSSE3 release: + *Visit http://software.intel.com/en-us/articles/ + *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ + * + * void sha1_transform_avx2(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ + +#include <linux/linkage.h> + +#define CTX %rdi /* arg1 */ +#define BUF %rsi /* arg2 */ +#define CNT %rdx /* arg3 */ + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %eax +#define REG_E %edx +#define REG_TB %ebx +#define REG_TA %r12d +#define REG_RA %rcx +#define REG_RB %rsi +#define REG_RC %rdi +#define REG_RD %rax +#define REG_RE %rdx +#define REG_RTA %r12 +#define REG_RTB %rbx +#define REG_T1 %r11d +#define xmm_mov vmovups +#define avx2_zeroupper vzeroupper +#define RND_F1 1 +#define RND_F2 2 +#define RND_F3 3 + +.macro REGALLOC + .set A, REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set TB, REG_TB + .set TA, REG_TA + + .set RA, REG_RA + .set RB, REG_RB + .set RC, REG_RC + .set RD, REG_RD + .set RE, REG_RE + + .set RTA, REG_RTA + .set RTB, REG_RTB + + .set T1, REG_T1 +.endm + +#define HASH_PTR %r9 +#define BLOCKS_CTR %r8 +#define BUFFER_PTR %r10 +#define BUFFER_PTR2 %r13 + +#define PRECALC_BUF %r14 +#define WK_BUF %r15 + +#define W_TMP %xmm0 +#define WY_TMP %ymm0 +#define WY_TMP2 %ymm9 + +# AVX2 variables +#define WY0 %ymm3 +#define WY4 %ymm5 +#define WY08 %ymm7 +#define WY12 %ymm8 +#define WY16 %ymm12 +#define WY20 %ymm13 +#define WY24 %ymm14 +#define WY28 %ymm15 + +#define YMM_SHUFB_BSWAP %ymm10 + +/* + * Keep 2 iterations precalculated at a time: + * - 80 DWORDs per iteration * 2 + */ +#define W_SIZE (80*2*2 +16) + +#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) +#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) + + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +.macro PRECALC_RESET_WY + .set WY_00, WY0 + .set 
WY_04, WY4 + .set WY_08, WY08 + .set WY_12, WY12 + .set WY_16, WY16 + .set WY_20, WY20 + .set WY_24, WY24 + .set WY_28, WY28 + .set WY_32, WY_00 +.endm + +.macro PRECALC_ROTATE_WY + /* Rotate macros */ + .set WY_32, WY_28 + .set WY_28, WY_24 + .set WY_24, WY_20 + .set WY_20, WY_16 + .set WY_16, WY_12 + .set WY_12, WY_08 + .set WY_08, WY_04 + .set WY_04, WY_00 + .set WY_00, WY_32 + + /* Define register aliases */ + .set WY, WY_00 + .set WY_minus_04, WY_04 + .set WY_minus_08, WY_08 + .set WY_minus_12, WY_12 + .set WY_minus_16, WY_16 + .set WY_minus_20, WY_20 + .set WY_minus_24, WY_24 + .set WY_minus_28, WY_28 + .set WY_minus_32, WY +.endm + +.macro PRECALC_00_15 + .if (i == 0) # Initialize and rotate registers + PRECALC_RESET_WY + PRECALC_ROTATE_WY + .endif + + /* message scheduling pre-compute for rounds 0-15 */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vmovdqu (i * 2)(BUFFER_PTR), W_TMP + .elseif ((i & 7) == 1) + vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ + WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY + .elseif ((i & 7) == 4) + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP + .elseif ((i & 7) == 7) + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_16_31 + /* + * message scheduling pre-compute for rounds 16-31 + * calculating last 32 w[i] values in 8 XMM registers + * pre-calculate K+w[i] values and store to mem + * for later load by ALU add instruction + * + * "brute force" vectorization for rounds 16-31 only + * due to w[i]->w[i-3] dependency + */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + /* w[i-14] */ + vpalignr $8, WY_minus_16, WY_minus_12, WY + vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ + .elseif ((i & 7) == 1) + vpxor WY_minus_08, WY, WY + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpxor WY_TMP, WY, WY + vpslldq $12, WY, WY_TMP2 + .elseif ((i & 7) == 3) + vpslld $1, WY, WY_TMP + vpsrld $31, WY, WY + .elseif ((i & 7) == 4) + vpor WY, WY_TMP, WY_TMP + vpslld $2, WY_TMP2, WY + .elseif ((i & 7) == 5) + vpsrld $30, WY_TMP2, WY_TMP2 + vpxor WY, WY_TMP, WY_TMP + .elseif ((i & 7) == 7) + vpxor WY_TMP2, WY_TMP, WY + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_32_79 + /* + * in SHA-1 specification: + * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + * instead we do equal: + * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization + * since w[i]=>w[i-3] dependency is broken + */ + + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP + .elseif ((i & 7) == 1) + /* W is W_minus_32 before xor */ + vpxor WY_minus_28, WY, WY + .elseif ((i & 7) == 2) + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 3) + vpxor WY_TMP, WY, WY + .elseif ((i & 7) == 4) + vpslld $2, WY, WY_TMP + .elseif ((i & 7) == 5) + vpsrld $30, WY, WY + vpor WY, WY_TMP, WY + .elseif ((i & 7) == 7) + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC r, s + .set i, \r + + .if (i < 40) + .set K_XMM, 32*0 + .elseif (i < 80) + .set K_XMM, 32*1 + .elseif (i < 120) + .set K_XMM, 32*2 + .else + .set K_XMM, 32*3 + .endif + + .if (i<32) + PRECALC_00_15 \s + .elseif (i<64) + 
PRECALC_16_31 \s + .elseif (i < 160) + PRECALC_32_79 \s + .endif +.endm + +.macro ROTATE_STATE + .set T_REG, E + .set E, D + .set D, C + .set C, B + .set B, TB + .set TB, A + .set A, T_REG + + .set T_REG, RE + .set RE, RD + .set RD, RC + .set RC, RB + .set RB, RTB + .set RTB, RA + .set RA, T_REG +.endm + +/* Macro relies on saved ROUND_Fx */ + +.macro RND_FUN f, r + .if (\f == RND_F1) + ROUND_F1 \r + .elseif (\f == RND_F2) + ROUND_F2 \r + .elseif (\f == RND_F3) + ROUND_F3 \r + .endif +.endm + +.macro RR r + .set round_id, (\r % 80) + + .if (round_id == 0) /* Precalculate F for first round */ + .set ROUND_FUNC, RND_F1 + mov B, TB + + rorx $(32-30), B, B /* b>>>2 */ + andn D, TB, T1 + and C, TB + xor T1, TB + .endif + + RND_FUN ROUND_FUNC, \r + ROTATE_STATE + + .if (round_id == 18) + .set ROUND_FUNC, RND_F2 + .elseif (round_id == 38) + .set ROUND_FUNC, RND_F3 + .elseif (round_id == 58) + .set ROUND_FUNC, RND_F2 + .endif + + .set round_id, ( (\r+1) % 80) + + RND_FUN ROUND_FUNC, (\r+1) + ROTATE_STATE +.endm + +.macro ROUND_F1 r + add WK(\r), E + + andn C, A, T1 /* ~b&d */ + lea (RE,RTB), E /* Add F from the previous round */ + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30),A, TB /* b>>>2 for next round */ + + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + /* + * Calculate F for the next round + * (b & c) ^ andn[b, d] + */ + and B, A /* b&c */ + xor T1, A /* F1 = (b&c) ^ (~b&d) */ + + lea (RE,RTA), E /* E += A >>> 5 */ +.endm + +.macro ROUND_F2 r + add WK(\r), E + lea (RE,RTB), E /* Add F from the previous round */ + + /* Calculate F for the next round */ + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + .if ((round_id) < 79) + rorx $(32-30), A, TB /* b>>>2 for next round */ + .endif + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + .if ((round_id) < 79) + xor B, A + .endif + + add TA, E /* E += A >>> 5 */ + + .if ((round_id) < 79) + xor C, A + .endif +.endm + +.macro ROUND_F3 r + add WK(\r), E + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + lea (RE,RTB), E /* Add F from the previous round */ + + mov B, T1 + or A, T1 + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30), A, TB /* b>>>2 for next round */ + + /* Calculate F for the next round + * (b and c) or (d and (b or c)) + */ + and C, T1 + and B, A + or T1, A + + add TA, E /* E += A >>> 5 */ + +.endm + +/* Add constant only if (%2 > %3) condition met (uses RTA as temp) + * %1 + %2 >= %3 ? 
%4 : 0 + */ +.macro ADD_IF_GE a, b, c, d + mov \a, RTA + add $\d, RTA + cmp $\c, \b + cmovge RTA, \a +.endm + +/* + * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining + */ +.macro SHA1_PIPELINED_MAIN_BODY + + REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + mov %rsp, PRECALC_BUF + lea (2*4*80+32)(%rsp), WK_BUF + + # Precalc WK for first 2 blocks + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 + .set i, 0 + .rept 160 + PRECALC i + .set i, i + 1 + .endr + + /* Go to next block if needed */ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 + xchg WK_BUF, PRECALC_BUF + + .align 32 +.L_loop: + /* + * code loops through more than one block + * we use K_BASE value as a signal of a last block, + * it is set below by: cmovae BUFFER_PTR, K_BASE + */ + test BLOCKS_CTR, BLOCKS_CTR + jnz .L_begin + .align 32 + jmp .L_end + .align 32 +.L_begin: + + /* + * Do first block + * rounds: 0,2,4,6,8 + */ + .set j, 0 + .rept 5 + RR j + .set j, j+2 + .endr + + /* + * rounds: + * 10,12,14,16,18 + * 20,22,24,26,28 + * 30,32,34,36,38 + * 40,42,44,46,48 + * 50,52,54,56,58 + */ + .rept 25 + RR j + .set j, j+2 + .endr + + /* Update Counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 + /* + * rounds + * 60,62,64,66,68 + * 70,72,74,76,78 + */ + .rept 10 + RR j + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + test BLOCKS_CTR, BLOCKS_CTR + jz .L_loop + + mov TB, B + + /* Process second block */ + /* + * rounds + * 0+80, 2+80, 4+80, 6+80, 8+80 + * 10+80,12+80,14+80,16+80,18+80 + */ + + .set j, 0 + .rept 10 + RR j+80 + .set j, j+2 + .endr + + /* + * rounds + * 20+80,22+80,24+80,26+80,28+80 + * 30+80,32+80,34+80,36+80,38+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + /* + * rounds + * 40+80,42+80,44+80,46+80,48+80 + * 50+80,52+80,54+80,56+80,58+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + /* update counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 + + /* + * rounds + * 60+80,62+80,64+80,66+80,68+80 + * 70+80,72+80,74+80,76+80,78+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + /* Reset state for AVX2 reg permutation */ + mov A, TA + mov TB, A + mov C, TB + mov E, C + mov D, B + mov TA, D + + REGALLOC + + xchg WK_BUF, PRECALC_BUF + + jmp .L_loop + + .align 32 +.L_end: + +.endm +/* + * macro implements SHA-1 function's body for several 64-byte blocks + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + SYM_FUNC_START(\name) + + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + RESERVE_STACK = (W_SIZE*4 + 8+24) + + /* Align stack */ + push %rbp + mov %rsp, %rbp + and $~(0x20-1), %rsp + sub $RESERVE_STACK, %rsp + + avx2_zeroupper + + /* Setup initial values */ + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + + mov BUF, BUFFER_PTR2 + mov CNT, BLOCKS_CTR + + xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + avx2_zeroupper + + mov %rbp, %rsp + pop %rbp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + RET + + SYM_FUNC_END(\name) +.endm + +.section .rodata + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 
0x8f1bbcdc +#define K4 0xca62c1d6 + +.align 128 +K_XMM_AR: + .long K1, K1, K1, K1 + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f +.text + +SHA1_VECTOR_ASM sha1_transform_avx2 diff --git a/lib/crypto/x86/sha1-ni-asm.S b/lib/crypto/x86/sha1-ni-asm.S new file mode 100644 index 000000000000..428f9b960594 --- /dev/null +++ b/lib/crypto/x86/sha1-ni-asm.S @@ -0,0 +1,152 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-1 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley <sean.m.gulley@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include <linux/linkage.h> + +#define STATE_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 +#define SHUF_MASK %xmm7 +#define ABCD_SAVED %xmm8 +#define E0_SAVED %xmm9 + +.macro do_4rounds i, m0, m1, m2, m3, e0, e1 +.if \i < 16 + movdqu \i*4(DATA_PTR), \m0 + pshufb SHUF_MASK, \m0 +.endif +.if \i == 0 + paddd \m0, \e0 +.else + sha1nexte \m0, \e0 +.endif + movdqa ABCD, \e1 +.if \i >= 12 && \i < 76 + sha1msg2 \m0, \m1 +.endif + sha1rnds4 $\i / 20, \e0, ABCD +.if \i >= 4 && \i < 68 + sha1msg1 \m0, \m3 +.endif +.if \i >= 8 && \i < 72 + pxor \m0, \m2 +.endif +.endm + +/* + * Intel SHA Extensions optimized implementation of a SHA-1 block function + * + * This function takes a pointer to the current SHA-1 state, a pointer to the + * input data, and the number of 64-byte blocks to process. The number of + * blocks to process is assumed to be nonzero. Once all blocks have been + * processed, the state is updated with the new state. This function only + * processes complete blocks. State initialization, buffering of partial + * blocks, and digest finalization are expected to be handled elsewhere. + * + * void sha1_ni_transform(struct sha1_block_state *state, + * const u8 *data, size_t nblocks) + */ +.text +SYM_FUNC_START(sha1_ni_transform) + + /* Load the initial state from STATE_PTR. */ + pxor E0, E0 + pinsrd $3, 16(STATE_PTR), E0 + movdqu (STATE_PTR), ABCD + pshufd $0x1B, ABCD, ABCD + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + +.Lnext_block: + /* Save the state for addition after the rounds. */ + movdqa E0, E0_SAVED + movdqa ABCD, ABCD_SAVED + +.irp i, 0, 16, 32, 48, 64 + do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3, E0, E1 + do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0, E1, E0 + do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1, E0, E1 + do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2, E1, E0 +.endr + + /* Add the previous state (before the rounds) to the current state. */ + sha1nexte E0_SAVED, E0 + paddd ABCD_SAVED, ABCD + + /* Advance to the next block, or break if there are no more blocks. */ + add $64, DATA_PTR + dec NUM_BLKS + jnz .Lnext_block + + /* Store the new state to STATE_PTR. */ + pextrd $3, E0, 16(STATE_PTR) + pshufd $0x1B, ABCD, ABCD + movdqu ABCD, (STATE_PTR) + + RET +SYM_FUNC_END(sha1_ni_transform) + +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f diff --git a/lib/crypto/x86/sha1-ssse3-and-avx.S b/lib/crypto/x86/sha1-ssse3-and-avx.S new file mode 100644 index 000000000000..78b48cb09c5b --- /dev/null +++ b/lib/crypto/x86/sha1-ssse3-and-avx.S @@ -0,0 +1,551 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental + * SSE3 instruction set extensions introduced in Intel Core Microarchitecture + * processors. CPUs supporting Intel(R) AVX extensions will get an additional + * boost. + * + * This work was inspired by the vectorized implementation of Dean Gaudet. + * Additional information on it can be found at: + * http://www.arctic.org/~dean/crypto/sha1.html + * + * It was improved upon with more efficient vectorization of the message + * scheduling. This implementation has also been optimized for all current and + * several future generations of Intel CPUs. 
+ * + * See this article for more information about the implementation details: + * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ + * + * Copyright (C) 2010, Intel Corp. + * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com> + * Ronen Zohar <ronen.zohar@intel.com> + * + * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: + * Author: Mathias Krause <minipli@googlemail.com> + */ + +#include <linux/linkage.h> + +#define CTX %rdi // arg1 +#define BUF %rsi // arg2 +#define CNT %rdx // arg3 + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %r12d +#define REG_E %edx + +#define REG_T1 %eax +#define REG_T2 %ebx + +#define K_BASE %r8 +#define HASH_PTR %r9 +#define BUFFER_PTR %r10 +#define BUFFER_END %r11 + +#define W_TMP1 %xmm0 +#define W_TMP2 %xmm9 + +#define W0 %xmm1 +#define W4 %xmm2 +#define W8 %xmm3 +#define W12 %xmm4 +#define W16 %xmm5 +#define W20 %xmm6 +#define W24 %xmm7 +#define W28 %xmm8 + +#define XMM_SHUFB_BSWAP %xmm10 + +/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ +#define WK(t) (((t) & 15) * 4)(%rsp) +#define W_PRECALC_AHEAD 16 + +/* + * This macro implements the SHA-1 function's body for single 64-byte block + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + SYM_FUNC_START(\name) + + push %rbx + push %r12 + push %rbp + mov %rsp, %rbp + + sub $64, %rsp # allocate workspace + and $~15, %rsp # align stack + + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + + shl $6, CNT # multiply by 64 + add BUF, CNT + mov CNT, BUFFER_END + + lea K_XMM_AR(%rip), K_BASE + xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + # cleanup workspace + mov $8, %ecx + mov %rsp, %rdi + xor %eax, %eax + rep stosq + + mov %rbp, %rsp # deallocate workspace + pop %rbp + pop %r12 + pop %rbx + RET + + SYM_FUNC_END(\name) +.endm + +/* + * This macro implements 80 rounds of SHA-1 for one 64-byte block + */ +.macro SHA1_PIPELINED_MAIN_BODY + INIT_REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + .set i, 0 + .rept W_PRECALC_AHEAD + W_PRECALC i + .set i, (i+1) + .endr + +.align 4 +1: + RR F1,A,B,C,D,E,0 + RR F1,D,E,A,B,C,2 + RR F1,B,C,D,E,A,4 + RR F1,E,A,B,C,D,6 + RR F1,C,D,E,A,B,8 + + RR F1,A,B,C,D,E,10 + RR F1,D,E,A,B,C,12 + RR F1,B,C,D,E,A,14 + RR F1,E,A,B,C,D,16 + RR F1,C,D,E,A,B,18 + + RR F2,A,B,C,D,E,20 + RR F2,D,E,A,B,C,22 + RR F2,B,C,D,E,A,24 + RR F2,E,A,B,C,D,26 + RR F2,C,D,E,A,B,28 + + RR F2,A,B,C,D,E,30 + RR F2,D,E,A,B,C,32 + RR F2,B,C,D,E,A,34 + RR F2,E,A,B,C,D,36 + RR F2,C,D,E,A,B,38 + + RR F3,A,B,C,D,E,40 + RR F3,D,E,A,B,C,42 + RR F3,B,C,D,E,A,44 + RR F3,E,A,B,C,D,46 + RR F3,C,D,E,A,B,48 + + RR F3,A,B,C,D,E,50 + RR F3,D,E,A,B,C,52 + RR F3,B,C,D,E,A,54 + RR F3,E,A,B,C,D,56 + RR F3,C,D,E,A,B,58 + + add $64, BUFFER_PTR # move to the next 64-byte block + cmp BUFFER_END, BUFFER_PTR # if the current is the last one use + cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun + + RR F4,A,B,C,D,E,60 + RR F4,D,E,A,B,C,62 + RR F4,B,C,D,E,A,64 + RR F4,E,A,B,C,D,66 + RR F4,C,D,E,A,B,68 + + RR F4,A,B,C,D,E,70 + RR F4,D,E,A,B,C,72 + RR F4,B,C,D,E,A,74 + RR F4,E,A,B,C,D,76 + RR F4,C,D,E,A,B,78 + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), B + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + RESTORE_RENAMED_REGS + cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end + jne 1b +.endm + +.macro INIT_REGALLOC + .set A, 
REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set T1, REG_T1 + .set T2, REG_T2 +.endm + +.macro RESTORE_RENAMED_REGS + # order is important (REG_C is where it should be) + mov B, REG_B + mov D, REG_D + mov A, REG_A + mov E, REG_E +.endm + +.macro SWAP_REG_NAMES a, b + .set _T, \a + .set \a, \b + .set \b, _T +.endm + +.macro F1 b, c, d + mov \c, T1 + SWAP_REG_NAMES \c, T1 + xor \d, T1 + and \b, T1 + xor \d, T1 +.endm + +.macro F2 b, c, d + mov \d, T1 + SWAP_REG_NAMES \d, T1 + xor \c, T1 + xor \b, T1 +.endm + +.macro F3 b, c ,d + mov \c, T1 + SWAP_REG_NAMES \c, T1 + mov \b, T2 + or \b, T1 + and \c, T2 + and \d, T1 + or T2, T1 +.endm + +.macro F4 b, c, d + F2 \b, \c, \d +.endm + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +/* + * RR does two rounds of SHA-1 back to back with W[] pre-calc + * t1 = F(b, c, d); e += w(i) + * e += t1; b <<= 30; d += w(i+1); + * t1 = F(a, b, c); + * d += t1; a <<= 5; + * e += a; + * t1 = e; a >>= 7; + * t1 <<= 5; + * d += t1; + */ +.macro RR F, a, b, c, d, e, round + add WK(\round), \e + \F \b, \c, \d # t1 = F(b, c, d); + W_PRECALC (\round + W_PRECALC_AHEAD) + rol $30, \b + add T1, \e + add WK(\round + 1), \d + + \F \a, \b, \c + W_PRECALC (\round + W_PRECALC_AHEAD + 1) + rol $5, \a + add \a, \e + add T1, \d + ror $7, \a # (a <<r 5) >>r 7) => a <<r 30) + + mov \e, T1 + SWAP_REG_NAMES \e, T1 + + rol $5, T1 + add T1, \d + + # write: \a, \b + # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c +.endm + +.macro W_PRECALC r + .set i, \r + + .if (i < 20) + .set K_XMM, 0 + .elseif (i < 40) + .set K_XMM, 16 + .elseif (i < 60) + .set K_XMM, 32 + .elseif (i < 80) + .set K_XMM, 48 + .endif + + .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD)))) + .set i, ((\r) % 80) # pre-compute for the next iteration + .if (i == 0) + W_PRECALC_RESET + .endif + W_PRECALC_00_15 + .elseif (i<32) + W_PRECALC_16_31 + .elseif (i < 80) // rounds 32-79 + W_PRECALC_32_79 + .endif +.endm + +.macro W_PRECALC_RESET + .set W, W0 + .set W_minus_04, W4 + .set W_minus_08, W8 + .set W_minus_12, W12 + .set W_minus_16, W16 + .set W_minus_20, W20 + .set W_minus_24, W24 + .set W_minus_28, W28 + .set W_minus_32, W +.endm + +.macro W_PRECALC_ROTATE + .set W_minus_32, W_minus_28 + .set W_minus_28, W_minus_24 + .set W_minus_24, W_minus_20 + .set W_minus_20, W_minus_16 + .set W_minus_16, W_minus_12 + .set W_minus_12, W_minus_08 + .set W_minus_08, W_minus_04 + .set W_minus_04, W + .set W, W_minus_32 +.endm + +.macro W_PRECALC_SSSE3 + +.macro W_PRECALC_00_15 + W_PRECALC_00_15_SSSE3 +.endm +.macro W_PRECALC_16_31 + W_PRECALC_16_31_SSSE3 +.endm +.macro W_PRECALC_32_79 + W_PRECALC_32_79_SSSE3 +.endm + +/* message scheduling pre-compute for rounds 0-15 */ +.macro W_PRECALC_00_15_SSSE3 + .if ((i & 3) == 0) + movdqu (i*4)(BUFFER_PTR), W_TMP1 + .elseif ((i & 3) == 1) + pshufb XMM_SHUFB_BSWAP, W_TMP1 + movdqa W_TMP1, W + .elseif ((i & 3) == 2) + paddd (K_BASE), W_TMP1 + .elseif ((i & 3) == 3) + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +/* message scheduling pre-compute for rounds 16-31 + * + * - calculating last 32 w[i] values in 8 XMM registers + * - pre-calculate K+w[i] values and store to mem, for later load by ALU add + * instruction + * + * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] + * dependency, but improves for 32-79 + */ +.macro W_PRECALC_16_31_SSSE3 + # blended scheduling of vector and scalar instruction streams, one 4-wide + # vector iteration / 4 scalar rounds + .if ((i & 3) == 0) + movdqa W_minus_12, 
W + palignr $8, W_minus_16, W # w[i-14] + movdqa W_minus_04, W_TMP1 + psrldq $4, W_TMP1 # w[i-3] + pxor W_minus_08, W + .elseif ((i & 3) == 1) + pxor W_minus_16, W_TMP1 + pxor W_TMP1, W + movdqa W, W_TMP2 + movdqa W, W_TMP1 + pslldq $12, W_TMP2 + .elseif ((i & 3) == 2) + psrld $31, W + pslld $1, W_TMP1 + por W, W_TMP1 + movdqa W_TMP2, W + psrld $30, W_TMP2 + pslld $2, W + .elseif ((i & 3) == 3) + pxor W, W_TMP1 + pxor W_TMP2, W_TMP1 + movdqa W_TMP1, W + paddd K_XMM(K_BASE), W_TMP1 + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +/* message scheduling pre-compute for rounds 32-79 + * + * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken + */ +.macro W_PRECALC_32_79_SSSE3 + .if ((i & 3) == 0) + movdqa W_minus_04, W_TMP1 + pxor W_minus_28, W # W is W_minus_32 before xor + palignr $8, W_minus_08, W_TMP1 + .elseif ((i & 3) == 1) + pxor W_minus_16, W + pxor W_TMP1, W + movdqa W, W_TMP1 + .elseif ((i & 3) == 2) + psrld $30, W + pslld $2, W_TMP1 + por W, W_TMP1 + .elseif ((i & 3) == 3) + movdqa W_TMP1, W + paddd K_XMM(K_BASE), W_TMP1 + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.endm // W_PRECALC_SSSE3 + + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +.section .rodata +.align 16 + +K_XMM_AR: + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + + +.section .text + +W_PRECALC_SSSE3 +.macro xmm_mov a, b + movdqu \a,\b +.endm + +/* + * SSSE3 optimized implementation: + * + * void sha1_transform_ssse3(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +SHA1_VECTOR_ASM sha1_transform_ssse3 + +.macro W_PRECALC_AVX + +.purgem W_PRECALC_00_15 +.macro W_PRECALC_00_15 + W_PRECALC_00_15_AVX +.endm +.purgem W_PRECALC_16_31 +.macro W_PRECALC_16_31 + W_PRECALC_16_31_AVX +.endm +.purgem W_PRECALC_32_79 +.macro W_PRECALC_32_79 + W_PRECALC_32_79_AVX +.endm + +.macro W_PRECALC_00_15_AVX + .if ((i & 3) == 0) + vmovdqu (i*4)(BUFFER_PTR), W_TMP1 + .elseif ((i & 3) == 1) + vpshufb XMM_SHUFB_BSWAP, W_TMP1, W + .elseif ((i & 3) == 2) + vpaddd (K_BASE), W, W_TMP1 + .elseif ((i & 3) == 3) + vmovdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.macro W_PRECALC_16_31_AVX + .if ((i & 3) == 0) + vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] + vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] + vpxor W_minus_08, W, W + vpxor W_minus_16, W_TMP1, W_TMP1 + .elseif ((i & 3) == 1) + vpxor W_TMP1, W, W + vpslldq $12, W, W_TMP2 + vpslld $1, W, W_TMP1 + .elseif ((i & 3) == 2) + vpsrld $31, W, W + vpor W, W_TMP1, W_TMP1 + vpslld $2, W_TMP2, W + vpsrld $30, W_TMP2, W_TMP2 + .elseif ((i & 3) == 3) + vpxor W, W_TMP1, W_TMP1 + vpxor W_TMP2, W_TMP1, W + vpaddd K_XMM(K_BASE), W, W_TMP1 + vmovdqu W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.macro W_PRECALC_32_79_AVX + .if ((i & 3) == 0) + vpalignr $8, W_minus_08, W_minus_04, W_TMP1 + vpxor W_minus_28, W, W # W is W_minus_32 before xor + .elseif ((i & 3) == 1) + vpxor W_minus_16, W_TMP1, W_TMP1 + vpxor W_TMP1, W, W + .elseif ((i & 3) == 2) + vpslld $2, W, W_TMP1 + vpsrld $30, W, W + vpor W, W_TMP1, W + .elseif ((i & 3) == 3) + vpaddd K_XMM(K_BASE), W, W_TMP1 + vmovdqu W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.endm // W_PRECALC_AVX + +W_PRECALC_AVX 
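+/*
+ * A minimal C sketch of the schedule rewrite used for rounds 32-79 above
+ * (rol32() is an assumed helper rotating a 32-bit word left by n bits):
+ *
+ *	w[t] = rol32(w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16], 1);	// FIPS 180-4, t >= 16
+ *	w[t] = rol32(w[t-6] ^ w[t-16] ^ w[t-28] ^ w[t-32], 2);	// equivalent form, t >= 32
+ *
+ * The second form follows from substituting the first recurrence into itself:
+ * rotation distributes over XOR and the duplicated terms cancel. Pushing the
+ * nearest dependency from w[t-3] out to w[t-6] is what lets one 4-wide vector
+ * iteration compute four consecutive w[t] values.
+ */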
+.purgem xmm_mov +.macro xmm_mov a, b + vmovdqu \a,\b +.endm + + +/* AVX optimized implementation: + * void sha1_transform_avx(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +SHA1_VECTOR_ASM sha1_transform_avx diff --git a/lib/crypto/x86/sha1.h b/lib/crypto/x86/sha1.h new file mode 100644 index 000000000000..e308379d89bc --- /dev/null +++ b/lib/crypto/x86/sha1.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for x86_64 + * + * Copyright 2025 Google LLC + */ +#include <asm/fpu/api.h> +#include <linux/static_call.h> + +DEFINE_STATIC_CALL(sha1_blocks_x86, sha1_blocks_generic); + +#define DEFINE_X86_SHA1_FN(c_fn, asm_fn) \ + asmlinkage void asm_fn(struct sha1_block_state *state, \ + const u8 *data, size_t nblocks); \ + static void c_fn(struct sha1_block_state *state, \ + const u8 *data, size_t nblocks) \ + { \ + if (likely(irq_fpu_usable())) { \ + kernel_fpu_begin(); \ + asm_fn(state, data, nblocks); \ + kernel_fpu_end(); \ + } else { \ + sha1_blocks_generic(state, data, nblocks); \ + } \ + } + +DEFINE_X86_SHA1_FN(sha1_blocks_ssse3, sha1_transform_ssse3); +DEFINE_X86_SHA1_FN(sha1_blocks_avx, sha1_transform_avx); +DEFINE_X86_SHA1_FN(sha1_blocks_ni, sha1_ni_transform); + +#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ + +asmlinkage void sha1_transform_avx2(struct sha1_block_state *state, + const u8 *data, size_t nblocks); +static void sha1_blocks_avx2(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (likely(irq_fpu_usable())) { + kernel_fpu_begin(); + /* Select the optimal transform based on the number of blocks */ + if (nblocks >= SHA1_AVX2_BLOCK_OPTSIZE) + sha1_transform_avx2(state, data, nblocks); + else + sha1_transform_avx(state, data, nblocks); + kernel_fpu_end(); + } else { + sha1_blocks_generic(state, data, nblocks); + } +} + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + static_call(sha1_blocks_x86)(state, data, nblocks); +} + +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) { + static_call_update(sha1_blocks_x86, sha1_blocks_ni); + } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + NULL) && + boot_cpu_has(X86_FEATURE_AVX)) { + if (boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI1) && + boot_cpu_has(X86_FEATURE_BMI2)) + static_call_update(sha1_blocks_x86, sha1_blocks_avx2); + else + static_call_update(sha1_blocks_x86, sha1_blocks_avx); + } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { + static_call_update(sha1_blocks_x86, sha1_blocks_ssse3); + } +} diff --git a/lib/crypto/x86/sha256-avx-asm.S b/lib/crypto/x86/sha256-avx-asm.S new file mode 100644 index 000000000000..c1aceb3ba3a3 --- /dev/null +++ b/lib/crypto/x86/sha256-avx-asm.S @@ -0,0 +1,493 @@ +######################################################################## +# Implement fast SHA-256 with AVX1 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-256 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## +# This code schedules 1 block at a time, with 4 lanes per block +######################################################################## + +#include <linux/linkage.h> + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +################################ Define Macros + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + + +.macro MY_ROR p1 p2 + shld $(32-(\p1)), \p2, \p2 +.endm + +################################ + +# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +# Load xmm with mem and byte swap each dword +.macro COPY_XMM_AND_BSWAP p1 p2 p3 + VMOVDQ \p2, \p1 + vpshufb \p3, \p1, \p1 +.endm + +################################ + +X0 = %xmm4 +X1 = %xmm5 +X2 = %xmm6 +X3 = %xmm7 + +XTMP0 = %xmm0 +XTMP1 = %xmm1 +XTMP2 = %xmm2 +XTMP3 = %xmm3 +XTMP4 = %xmm8 +XFER = %xmm9 +XTMP5 = %xmm11 + +SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA +SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 +BYTE_FLIP_MASK = %xmm13 + +NUM_BLKS = %rdx # 3rd arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg + +SRND = %rsi # clobbers INP +c = %ecx +d = %r8d +e = %edx +TBL = %r12 +a = %eax +b = %ebx + +f = %r9d +g = %r10d +h = %r11d + +y0 = %r13d +y1 = %r14d +y2 = %r15d + + +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_XFER_SIZE = 16 +_XMM_SAVE_SIZE = 0 + +_INP_END = 0 +_INP = _INP_END + _INP_END_SIZE +_XFER = _INP + _INP_SIZE +_XMM_SAVE = _XFER + _XFER_SIZE +STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE + +# rotate_Xs +# Rotate values of symbols X0...X3 +.macro rotate_Xs +X_ = X0 +X0 = X1 +X1 = X2 +X2 = X3 +X3 = X_ +.endm + +# ROTATE_ARGS +# Rotate values of symbols a...h +.macro ROTATE_ARGS +TMP_ = h +h = g +g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED + ## compute s0 four at a time and s1 two at a time + ## compute W[-16] + W[-7] 4 at a time + + mov e, y0 # y0 = e + MY_ROR (25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] + 
MY_ROR (22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor a, y1 # y1 = a ^ (a >> (22-13) + xor g, y2 # y2 = f^g + vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16] + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + ## compute s0 + vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y0, y2 # y2 = S1 + CH + add _XFER(%rsp), y2 # y2 = k + w + S1 + CH + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpsrld $7, XTMP1, XTMP2 + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + vpslld $(32-7), XTMP1, XTMP3 + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + mov e, y0 # y0 = e + mov a, y1 # y1 = a + MY_ROR (25-11), y0 # y0 = e >> (25-11) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + MY_ROR (22-13), y1 # y1 = a >> (22-13) + vpsrld $18, XTMP1, XTMP2 # + xor a, y1 # y1 = a ^ (a >> (22-13) + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + vpslld $(32-18), XTMP1, XTMP1 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + vpxor XTMP1, XTMP3, XTMP3 # + add y0, y2 # y2 = S1 + CH + add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + ## compute low s1 + vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + mov e, y0 # y0 = e + mov a, y1 # y1 = a + MY_ROR (25-11), y0 # y0 = e >> (25-11) + xor e, y0 # y0 = e ^ (e >> (25-11)) + MY_ROR (22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} + xor g, y2 # y2 = f^g + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA} + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + vpxor XTMP3, XTMP2, XTMP2 # + add y0, y2 # y2 = S1 + CH + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + vpxor 
XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + ## compute high s1 + vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + mov e, y0 # y0 = e + MY_ROR (25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + MY_ROR (22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} + xor a, y1 # y1 = a ^ (a >> (22-13) + xor g, y2 # y2 = f^g + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC} + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + vpxor XTMP3, XTMP2, XTMP2 + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y0, y2 # y2 = S1 + CH + add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + rotate_Xs +.endm + +## input is [rsp + _XFER + %1 * 4] +.macro DO_ROUND round + mov e, y0 # y0 = e + MY_ROR (25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + xor e, y0 # y0 = e ^ (e >> (25-11)) + MY_ROR (22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + and e, y2 # y2 = (f^g)&e + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + add y0, y2 # y2 = S1 + CH + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + offset = \round * 4 + _XFER # + add offset(%rsp), y2 # y2 = k + w + S1 + CH + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +.endm + +######################################################################## +## void sha256_transform_avx(struct sha256_block_state *state, +## const u8 *data, size_t nblocks); +######################################################################## +.text +SYM_FUNC_START(sha256_transform_avx) + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbp + movq %rsp, %rbp + + 
subq $STACK_SIZE, %rsp # allocate stack space + and $~15, %rsp # align stack pointer + + shl $6, NUM_BLKS # convert to bytes + add INP, NUM_BLKS # pointer to end of data + mov NUM_BLKS, _INP_END(%rsp) + + ## load initial digest + mov 4*0(CTX), a + mov 4*1(CTX), b + mov 4*2(CTX), c + mov 4*3(CTX), d + mov 4*4(CTX), e + mov 4*5(CTX), f + mov 4*6(CTX), g + mov 4*7(CTX), h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 +.Lloop0: + lea K256(%rip), TBL + + ## byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK + + mov INP, _INP(%rsp) + + ## schedule 48 input dwords, by doing 3 rounds of 16 each + mov $3, SRND +.align 16 +.Lloop1: + vpaddd (TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddd 1*16(TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddd 2*16(TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddd 3*16(TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + add $4*16, TBL + FOUR_ROUNDS_AND_SCHED + + sub $1, SRND + jne .Lloop1 + + mov $2, SRND +.Lloop2: + vpaddd (TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vpaddd 1*16(TBL), X1, XFER + vmovdqa XFER, _XFER(%rsp) + add $2*16, TBL + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vmovdqa X2, X0 + vmovdqa X3, X1 + + sub $1, SRND + jne .Lloop2 + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + mov _INP(%rsp), INP + add $64, INP + cmp _INP_END(%rsp), INP + jne .Lloop0 + + mov %rbp, %rsp + popq %rbp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + RET +SYM_FUNC_END(sha256_transform_avx) + +.section .rodata.cst256.K256, "aM", @progbits, 256 +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 +.align 16 +# shuffle xBxA -> 00BA +_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 + +.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 +.align 16 +# shuffle xDxC -> DC00 +_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/lib/crypto/x86/sha256-avx2-asm.S b/lib/crypto/x86/sha256-avx2-asm.S new file mode 100644 index 000000000000..eb8836fb9695 --- /dev/null +++ b/lib/crypto/x86/sha256-avx2-asm.S @@ -0,0 +1,770 @@ 
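The round macros in these SHA-256 files annotate each instruction with the scalar quantity it builds (S0, S1, CH, MAJ, and the schedule terms s0/s1). As a plain-C reference for those quantities, here is a minimal sketch of one round and of the message-schedule recurrence W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16], following FIPS 180-4; the helper names are illustrative only and are not part of these sources.

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int n)
{
        return (x >> n) | (x << (32 - n));
}

/* Message schedule: W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */
static inline uint32_t sha256_sched(const uint32_t W[], unsigned int t)
{
        uint32_t w15 = W[t - 15], w2 = W[t - 2];
        uint32_t s0 = ror32(w15, 7) ^ ror32(w15, 18) ^ (w15 >> 3);
        uint32_t s1 = ror32(w2, 17) ^ ror32(w2, 19) ^ (w2 >> 10);

        return s1 + W[t - 7] + s0 + W[t - 16];
}

/* One round; 'wk' is K[t] + W[t], the value the macros read from _XFER. */
static inline void sha256_round(uint32_t s[8], uint32_t wk)
{
        uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
        uint32_t S1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
        uint32_t ch = ((f ^ g) & e) ^ g;          /* CH(e,f,g) */
        uint32_t S0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
        uint32_t maj = ((a | c) & b) | (a & c);   /* MAJ(a,b,c) */
        uint32_t t1 = h + S1 + ch + wk;
        uint32_t t2 = S0 + maj;

        /* Same rotation of working variables as ROTATE_ARGS. */
        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}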
+######################################################################## +# Implement fast SHA-256 with AVX2 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-256 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. 
+# +######################################################################## +# This code schedules 2 blocks at a time, with 4 lanes per block +######################################################################## + +#include <linux/linkage.h> + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +################################ Define Macros + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + +################################ + +X0 = %ymm4 +X1 = %ymm5 +X2 = %ymm6 +X3 = %ymm7 + +# XMM versions of above +XWORD0 = %xmm4 +XWORD1 = %xmm5 +XWORD2 = %xmm6 +XWORD3 = %xmm7 + +XTMP0 = %ymm0 +XTMP1 = %ymm1 +XTMP2 = %ymm2 +XTMP3 = %ymm3 +XTMP4 = %ymm8 +XFER = %ymm9 +XTMP5 = %ymm11 + +SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA +SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00 +BYTE_FLIP_MASK = %ymm13 + +X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK + +NUM_BLKS = %rdx # 3rd arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg +c = %ecx +d = %r8d +e = %edx # clobbers NUM_BLKS +y3 = %esi # clobbers INP + +SRND = CTX # SRND is same register as CTX + +a = %eax +b = %ebx +f = %r9d +g = %r10d +h = %r11d +old_h = %r11d + +T1 = %r12d +y0 = %r13d +y1 = %r14d +y2 = %r15d + + +_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round +_XMM_SAVE_SIZE = 0 +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_CTX_SIZE = 8 + +_XFER = 0 +_XMM_SAVE = _XFER + _XFER_SIZE +_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE +_INP = _INP_END + _INP_END_SIZE +_CTX = _INP + _INP_SIZE +STACK_SIZE = _CTX + _CTX_SIZE + +# rotate_Xs +# Rotate values of symbols X0...X3 +.macro rotate_Xs + X_ = X0 + X0 = X1 + X1 = X2 + X2 = X3 + X3 = X_ +.endm + +# ROTATE_ARGS +# Rotate values of symbols a...h +.macro ROTATE_ARGS + old_h = h + TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED disp +################################### RND N + 0 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + + addl \disp(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + + and e, y2 # y2 = (f^g)&e # CH + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + vpsrld $7, XTMP1, XTMP2 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + + add y0, y2 # y2 = S1 + CH # -- + vpslld $(32-7), XTMP1, XTMP3 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 + + vpsrld $18, XTMP1, XTMP2 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + + ROTATE_ARGS + +################################### RND N + 1 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + offset = \disp + 1*4 + 
addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $6, e, y1 # y1 = (e >> 6) # S1 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + + vpslld $(32-18), XTMP1, XTMP1 + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + + vpxor XTMP1, XTMP3, XTMP3 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 + vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} + + + ROTATE_ARGS + +################################### RND N + 2 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + offset = \disp + 2*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} + rorx $11, e, y1 # y1 = e >> 11 # S1B + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + xor g, y2 # y2 = f^g # CH + + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} + and e, y2 # y2 = (f^g)&e # CH + + rorx $6, e, y1 # y1 = (e >> 6) # S1 + vpxor XTMP3, XTMP2, XTMP2 + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a ,T1 # T1 = (a >> 2) # S0 + vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1,h # h = k + w + h + S0 # -- + add y2,d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3,h # h = t1 + S0 + MAJ # -- + + + ROTATE_ARGS + +################################### RND N + 3 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + offset = \disp + 3*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + + + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w 
+ h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpxor XTMP3, XTMP2, XTMP2 + rorx $22, a, y1 # y1 = a >> 22 # S0A + add y0, y2 # y2 = S1 + CH # -- + + vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + rorx $2, a, T1 # T1 = (a >> 2) # S0 + vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} + + vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + + add y1, h # h = k + w + h + S0 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + ROTATE_ARGS + rotate_Xs +.endm + +.macro DO_4ROUNDS disp +################################### RND N + 0 ########################### + + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + addl \disp(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 1 ########################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*1 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 2 ############################## + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e 
>> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*2 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 3 ########################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*3 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + ROTATE_ARGS + +.endm + +######################################################################## +## void sha256_transform_rorx(struct sha256_block_state *state, +## const u8 *data, size_t nblocks); +######################################################################## +.text +SYM_FUNC_START(sha256_transform_rorx) + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + push %rbp + mov %rsp, %rbp + + subq $STACK_SIZE, %rsp + and $-32, %rsp # align rsp to 32 byte boundary + + shl $6, NUM_BLKS # convert to bytes + lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block + mov NUM_BLKS, _INP_END(%rsp) + + cmp NUM_BLKS, INP + je .Lonly_one_block + + ## load initial digest + mov (CTX), a + mov 4*1(CTX), b + mov 4*2(CTX), c + mov 4*3(CTX), d + mov 4*4(CTX), e + mov 4*5(CTX), f + mov 4*6(CTX), g + mov 4*7(CTX), h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 + + mov CTX, _CTX(%rsp) + +.Lloop0: + ## Load first 16 dwords from two blocks + VMOVDQ 0*32(INP),XTMP0 + VMOVDQ 1*32(INP),XTMP1 + VMOVDQ 2*32(INP),XTMP2 + VMOVDQ 3*32(INP),XTMP3 + + ## byte swap data + vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0 + vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1 + vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2 + vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3 + + ## transpose data into high/low halves + vperm2i128 $0x20, XTMP2, XTMP0, X0 + vperm2i128 $0x31, 
XTMP2, XTMP0, X1 + vperm2i128 $0x20, XTMP3, XTMP1, X2 + vperm2i128 $0x31, XTMP3, XTMP1, X3 + +.Llast_block_enter: + add $64, INP + mov INP, _INP(%rsp) + + ## schedule 48 input dwords, by doing 3 rounds of 12 each + xor SRND, SRND + +.align 16 +.Lloop1: + leaq K256+0*32(%rip), INP ## reuse INP as scratch reg + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 0*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED (_XFER + 0*32) + + leaq K256+1*32(%rip), INP + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 1*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED (_XFER + 1*32) + + leaq K256+2*32(%rip), INP + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 2*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED (_XFER + 2*32) + + leaq K256+3*32(%rip), INP + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 3*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED (_XFER + 3*32) + + add $4*32, SRND + cmp $3*4*32, SRND + jb .Lloop1 + +.Lloop2: + ## Do last 16 rounds with no scheduling + leaq K256+0*32(%rip), INP + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 0*32+_XFER(%rsp, SRND) + DO_4ROUNDS (_XFER + 0*32) + + leaq K256+1*32(%rip), INP + vpaddd (INP, SRND), X1, XFER + vmovdqa XFER, 1*32+_XFER(%rsp, SRND) + DO_4ROUNDS (_XFER + 1*32) + add $2*32, SRND + + vmovdqa X2, X0 + vmovdqa X3, X1 + + cmp $4*4*32, SRND + jb .Lloop2 + + mov _CTX(%rsp), CTX + mov _INP(%rsp), INP + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + cmp _INP_END(%rsp), INP + ja .Ldone_hash + + #### Do second block using previously scheduled results + xor SRND, SRND +.align 16 +.Lloop3: + DO_4ROUNDS (_XFER + 0*32 + 16) + DO_4ROUNDS (_XFER + 1*32 + 16) + add $2*32, SRND + cmp $4*4*32, SRND + jb .Lloop3 + + mov _CTX(%rsp), CTX + mov _INP(%rsp), INP + add $64, INP + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + cmp _INP_END(%rsp), INP + jb .Lloop0 + ja .Ldone_hash + +.Ldo_last_block: + VMOVDQ 0*16(INP),XWORD0 + VMOVDQ 1*16(INP),XWORD1 + VMOVDQ 2*16(INP),XWORD2 + VMOVDQ 3*16(INP),XWORD3 + + vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0 + vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1 + vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 + vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 + + jmp .Llast_block_enter + +.Lonly_one_block: + + ## load initial digest + mov (4*0)(CTX),a + mov (4*1)(CTX),b + mov (4*2)(CTX),c + mov (4*3)(CTX),d + mov (4*4)(CTX),e + mov (4*5)(CTX),f + mov (4*6)(CTX),g + mov (4*7)(CTX),h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 + + mov CTX, _CTX(%rsp) + jmp .Ldo_last_block + +.Ldone_hash: + + mov %rbp, %rsp + pop %rbp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + vzeroupper + RET +SYM_FUNC_END(sha256_transform_rorx) + +.section .rodata.cst512.K256, "aM", @progbits, 512 +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 
0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 + +# shuffle xBxA -> 00BA +.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32 +.align 32 +_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 + +# shuffle xDxC -> DC00 +.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32 +.align 32 +_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S new file mode 100644 index 000000000000..4bd9490ffc66 --- /dev/null +++ b/lib/crypto/x86/sha256-ni-asm.S @@ -0,0 +1,191 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley <sean.m.gulley@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/linkage.h> + +#define STATE_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 /* sha256rnds2 implicit operand */ +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 +#define TMP %xmm7 + +#define SHUF_MASK %xmm8 + +#define ABEF_SAVE %xmm9 +#define CDGH_SAVE %xmm10 + +.macro do_4rounds i, m0, m1, m2, m3 +.if \i < 16 + movdqu \i*4(DATA_PTR), \m0 + pshufb SHUF_MASK, \m0 +.endif + movdqa (\i-32)*4(SHA256CONSTANTS), MSG + paddd \m0, MSG + sha256rnds2 STATE0, STATE1 +.if \i >= 12 && \i < 60 + movdqa \m0, TMP + palignr $4, \m3, TMP + paddd TMP, \m1 + sha256msg2 \m0, \m1 +.endif + punpckhqdq MSG, MSG + sha256rnds2 STATE1, STATE0 +.if \i >= 4 && \i < 52 + sha256msg1 \m0, \m3 +.endif +.endm + +/* + * Intel SHA Extensions optimized implementation of a SHA-256 block function + * + * This function takes a pointer to the current SHA-256 state, a pointer to the + * input data, and the number of 64-byte blocks to process. Once all blocks + * have been processed, the state is updated with the new state. This function + * only processes complete blocks. State initialization, buffering of partial + * blocks, and digest finalization is expected to be handled elsewhere. 
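Since buffering and finalization are left to the caller, a hedged, self-contained sketch follows of how a caller might feed only whole 64-byte blocks to a block function of this shape. It uses simplified types and a stand-in prototype rather than the kernel's struct sha256_block_state and generic library code; names are illustrative only.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct sha256_sketch_ctx {
        uint32_t state[8];      /* H0..H7 */
        uint8_t buf[64];        /* partial block */
        size_t buflen;
};

/* Stand-in, with simplified types, for a block function like the one below. */
void sha256_sketch_blocks(uint32_t state[8], const uint8_t *data, size_t nblocks);

static void sha256_sketch_update(struct sha256_sketch_ctx *ctx,
                                 const uint8_t *data, size_t len)
{
        /* Top up and flush a previously buffered partial block first. */
        if (ctx->buflen) {
                size_t fill = 64 - ctx->buflen;

                if (fill > len)
                        fill = len;
                memcpy(ctx->buf + ctx->buflen, data, fill);
                ctx->buflen += fill;
                data += fill;
                len -= fill;
                if (ctx->buflen == 64) {
                        sha256_sketch_blocks(ctx->state, ctx->buf, 1);
                        ctx->buflen = 0;
                }
        }
        /* Hand all remaining whole blocks to the block function at once. */
        if (len >= 64) {
                sha256_sketch_blocks(ctx->state, data, len / 64);
                data += len & ~(size_t)63;
                len &= 63;
        }
        /* Keep any tail bytes for the next update or for finalization. */
        memcpy(ctx->buf + ctx->buflen, data, len);
        ctx->buflen += len;
}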
+ * + * void sha256_ni_transform(struct sha256_block_state *state, + * const u8 *data, size_t nblocks); + */ +.text +SYM_FUNC_START(sha256_ni_transform) + + shl $6, NUM_BLKS /* convert to bytes */ + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* + * load initial hash values + * Need to reorder these appropriately + * DCBA, HGFE -> ABEF, CDGH + */ + movdqu 0*16(STATE_PTR), STATE0 /* DCBA */ + movdqu 1*16(STATE_PTR), STATE1 /* HGFE */ + + movdqa STATE0, TMP + punpcklqdq STATE1, STATE0 /* FEBA */ + punpckhqdq TMP, STATE1 /* DCHG */ + pshufd $0x1B, STATE0, STATE0 /* ABEF */ + pshufd $0xB1, STATE1, STATE1 /* CDGH */ + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + lea K256+32*4(%rip), SHA256CONSTANTS + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa STATE0, ABEF_SAVE + movdqa STATE1, CDGH_SAVE + +.irp i, 0, 16, 32, 48 + do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3 + do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0 + do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1 + do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2 +.endr + + /* Add current hash values with previously saved */ + paddd ABEF_SAVE, STATE0 + paddd CDGH_SAVE, STATE1 + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + movdqa STATE0, TMP + punpcklqdq STATE1, STATE0 /* GHEF */ + punpckhqdq TMP, STATE1 /* ABCD */ + pshufd $0xB1, STATE0, STATE0 /* HGFE */ + pshufd $0x1B, STATE1, STATE1 /* DCBA */ + + movdqu STATE1, 0*16(STATE_PTR) + movdqu STATE0, 1*16(STATE_PTR) + + RET +SYM_FUNC_END(sha256_ni_transform) + +.section .rodata.cst256.K256, "aM", @progbits, 256 +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/lib/crypto/x86/sha256-ssse3-asm.S b/lib/crypto/x86/sha256-ssse3-asm.S new file mode 100644 index 000000000000..383b8eec7ebe --- /dev/null +++ b/lib/crypto/x86/sha256-ssse3-asm.S @@ -0,0 +1,505 @@ +######################################################################## +# Implement fast SHA-256 with SSSE3 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-256 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## + +#include <linux/linkage.h> + +## assume buffers not aligned +#define MOVDQ movdqu + +################################ Define Macros + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + +################################ + +# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +# Load xmm with mem and byte swap each dword +.macro COPY_XMM_AND_BSWAP p1 p2 p3 + MOVDQ \p2, \p1 + pshufb \p3, \p1 +.endm + +################################ + +X0 = %xmm4 +X1 = %xmm5 +X2 = %xmm6 +X3 = %xmm7 + +XTMP0 = %xmm0 +XTMP1 = %xmm1 +XTMP2 = %xmm2 +XTMP3 = %xmm3 +XTMP4 = %xmm8 +XFER = %xmm9 + +SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA +SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 +BYTE_FLIP_MASK = %xmm12 + +NUM_BLKS = %rdx # 3rd arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg + +SRND = %rsi # clobbers INP +c = %ecx +d = %r8d +e = %edx +TBL = %r12 +a = %eax +b = %ebx + +f = %r9d +g = %r10d +h = %r11d + +y0 = %r13d +y1 = %r14d +y2 = %r15d + + + +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_XFER_SIZE = 16 +_XMM_SAVE_SIZE = 0 + +_INP_END = 0 +_INP = _INP_END + _INP_END_SIZE +_XFER = _INP + _INP_SIZE +_XMM_SAVE = _XFER + _XFER_SIZE +STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE + +# rotate_Xs +# Rotate values of symbols X0...X3 +.macro rotate_Xs +X_ = X0 +X0 = X1 +X1 = X2 +X2 = X3 +X3 = X_ +.endm + +# ROTATE_ARGS +# Rotate values of symbols a...h +.macro ROTATE_ARGS +TMP_ = h +h = g +g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED + ## compute s0 four at a time and s1 two at a time + ## compute W[-16] + W[-7] 4 at a time + movdqa X3, XTMP0 + mov e, y0 # y0 = e + ror $(25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + palignr $4, X2, XTMP0 # XTMP0 = W[-7] + ror $(22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + movdqa X1, XTMP1 + xor a, y1 # y1 = a ^ (a >> (22-13) + 
xor g, y2 # y2 = f^g + paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16] + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + ## compute s0 + palignr $4, X0, XTMP1 # XTMP1 = W[-15] + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + movdqa XTMP1, XTMP2 # XTMP2 = W[-15] + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y0, y2 # y2 = S1 + CH + add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH + movdqa XTMP1, XTMP3 # XTMP3 = W[-15] + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + pslld $(32-7), XTMP1 # + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + psrld $7, XTMP2 # + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + # + ROTATE_ARGS # + movdqa XTMP3, XTMP2 # XTMP2 = W[-15] + mov e, y0 # y0 = e + mov a, y1 # y1 = a + movdqa XTMP3, XTMP4 # XTMP4 = W[-15] + ror $(25-11), y0 # y0 = e >> (25-11) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + ror $(22-13), y1 # y1 = a >> (22-13) + pslld $(32-18), XTMP3 # + xor a, y1 # y1 = a ^ (a >> (22-13) + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + psrld $18, XTMP2 # + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP3, XTMP1 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + psrld $3, XTMP4 # XTMP4 = W[-15] >> 3 + add y0, y2 # y2 = S1 + CH + add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + pxor XTMP4, XTMP1 # XTMP1 = s0 + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + ## compute low s1 + pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + + ROTATE_ARGS + movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA} + mov e, y0 # y0 = e + mov a, y1 # y1 = a + ror $(25-11), y0 # y0 = e >> (25-11) + movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA} + xor e, y0 # y0 = e ^ (e >> (25-11)) + ror $(22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} + xor g, y2 # y2 = f^g + psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP3, XTMP2 + add y0, y2 # y2 = S1 + CH + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 
= a + pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + ## compute high s1 + pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA} + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + # + ROTATE_ARGS # + movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC} + mov e, y0 # y0 = e + ror $(25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + movdqa XTMP2, X0 # X0 = W[-2] {DDCC} + ror $(22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} + xor a, y1 # y1 = a ^ (a >> (22-13) + xor g, y2 # y2 = f^g + psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25 + and e, y2 # y2 = (f^g)&e + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + psrld $10, X0 # X0 = W[-2] >> 10 {DDCC} + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22 + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2 + xor g, y2 # y2 = CH = ((f^g)&e)^g + pxor XTMP3, XTMP2 # + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2 + add y0, y2 # y2 = S1 + CH + add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + pxor XTMP2, X0 # X0 = s1 {xDxC} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + pshufb SHUF_DC00, X0 # X0 = s1 {DC00} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + + ROTATE_ARGS + rotate_Xs +.endm + +## input is [rsp + _XFER + %1 * 4] +.macro DO_ROUND round + mov e, y0 # y0 = e + ror $(25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + xor e, y0 # y0 = e ^ (e >> (25-11)) + ror $(22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + and e, y2 # y2 = (f^g)&e + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + add y0, y2 # y2 = S1 + CH + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + offset = \round * 4 + _XFER + add offset(%rsp), y2 # y2 = k + w + S1 + CH + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +.endm + +######################################################################## +## void sha256_transform_ssse3(struct sha256_block_state *state, +## const u8 *data, size_t nblocks); +######################################################################## +.text +SYM_FUNC_START(sha256_transform_ssse3) + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbp + mov %rsp, %rbp + + subq $STACK_SIZE, %rsp + and $~15, %rsp + + shl $6, NUM_BLKS # convert to bytes + add INP, NUM_BLKS + mov NUM_BLKS, _INP_END(%rsp) # pointer to end of 
data + + ## load initial digest + mov 4*0(CTX), a + mov 4*1(CTX), b + mov 4*2(CTX), c + mov 4*3(CTX), d + mov 4*4(CTX), e + mov 4*5(CTX), f + mov 4*6(CTX), g + mov 4*7(CTX), h + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + movdqa _SHUF_00BA(%rip), SHUF_00BA + movdqa _SHUF_DC00(%rip), SHUF_DC00 + +.Lloop0: + lea K256(%rip), TBL + + ## byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK + + mov INP, _INP(%rsp) + + ## schedule 48 input dwords, by doing 3 rounds of 16 each + mov $3, SRND +.align 16 +.Lloop1: + movdqa (TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + movdqa 1*16(TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + movdqa 2*16(TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + movdqa 3*16(TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + add $4*16, TBL + FOUR_ROUNDS_AND_SCHED + + sub $1, SRND + jne .Lloop1 + + mov $2, SRND +.Lloop2: + paddd (TBL), X0 + movdqa X0, _XFER(%rsp) + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + paddd 1*16(TBL), X1 + movdqa X1, _XFER(%rsp) + add $2*16, TBL + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + movdqa X2, X0 + movdqa X3, X1 + + sub $1, SRND + jne .Lloop2 + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + mov _INP(%rsp), INP + add $64, INP + cmp _INP_END(%rsp), INP + jne .Lloop0 + + mov %rbp, %rsp + popq %rbp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + + RET +SYM_FUNC_END(sha256_transform_ssse3) + +.section .rodata.cst256.K256, "aM", @progbits, 256 +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 +.align 16 +# shuffle xBxA -> 00BA +_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 + +.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 +.align 16 +# shuffle xDxC -> DC00 +_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h new file mode 100644 index 000000000000..669bc06538b6 --- /dev/null +++ b/lib/crypto/x86/sha256.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 optimized for x86_64 + * + * Copyright 2025 Google LLC + */ +#include <asm/fpu/api.h> +#include <crypto/internal/simd.h> +#include <linux/static_call.h> + 
+DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); + +#define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \ + asmlinkage void asm_fn(struct sha256_block_state *state, \ + const u8 *data, size_t nblocks); \ + static void c_fn(struct sha256_block_state *state, const u8 *data, \ + size_t nblocks) \ + { \ + if (likely(crypto_simd_usable())) { \ + kernel_fpu_begin(); \ + asm_fn(state, data, nblocks); \ + kernel_fpu_end(); \ + } else { \ + sha256_blocks_generic(state, data, nblocks); \ + } \ + } + +DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3); +DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx); +DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx); +DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + static_call(sha256_blocks_x86)(state, data, nblocks); +} + +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) { + static_call_update(sha256_blocks_x86, sha256_blocks_ni); + } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + NULL) && + boot_cpu_has(X86_FEATURE_AVX)) { + if (boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + static_call_update(sha256_blocks_x86, + sha256_blocks_avx2); + else + static_call_update(sha256_blocks_x86, + sha256_blocks_avx); + } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { + static_call_update(sha256_blocks_x86, sha256_blocks_ssse3); + } +} diff --git a/lib/crypto/x86/sha512-avx-asm.S b/lib/crypto/x86/sha512-avx-asm.S new file mode 100644 index 000000000000..7732aa8fd850 --- /dev/null +++ b/lib/crypto/x86/sha512-avx-asm.S @@ -0,0 +1,420 @@ +######################################################################## +# Implement fast SHA-512 with AVX instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# David Cote <david.m.cote@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## + +#include <linux/linkage.h> + +.text + +# Virtual Registers +# ARG1 +digest = %rdi +# ARG2 +msg = %rsi +# ARG3 +msglen = %rdx +T1 = %rcx +T2 = %r8 +a_64 = %r9 +b_64 = %r10 +c_64 = %r11 +d_64 = %r12 +e_64 = %r13 +f_64 = %r14 +g_64 = %r15 +h_64 = %rbx +tmp0 = %rax + +# Local variables (stack frame) + +# Message Schedule +W_SIZE = 80*8 +# W[t] + K[t] | W[t+1] + K[t+1] +WK_SIZE = 2*8 + +frame_W = 0 +frame_WK = frame_W + W_SIZE +frame_size = frame_WK + WK_SIZE + +# Useful QWORD "arrays" for simpler memory references +# MSG, DIGEST, K_t, W_t are arrays +# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even + +# Input message (arg1) +#define MSG(i) 8*i(msg) + +# Output Digest (arg2) +#define DIGEST(i) 8*i(digest) + +# SHA Constants (static mem) +#define K_t(i) 8*i+K512(%rip) + +# Message Schedule (stack frame) +#define W_t(i) 8*i+frame_W(%rsp) + +# W[t]+K[t] (stack frame) +#define WK_2(i) 8*((i%2))+frame_WK(%rsp) + +.macro RotateState + # Rotate symbols a..h right + TMP = h_64 + h_64 = g_64 + g_64 = f_64 + f_64 = e_64 + e_64 = d_64 + d_64 = c_64 + c_64 = b_64 + b_64 = a_64 + a_64 = TMP +.endm + +.macro RORQ p1 p2 + # shld is faster than ror on Sandybridge + shld $(64-\p2), \p1, \p1 +.endm + +.macro SHA512_Round rnd + # Compute Round %%t + mov f_64, T1 # T1 = f + mov e_64, tmp0 # tmp = e + xor g_64, T1 # T1 = f ^ g + RORQ tmp0, 23 # 41 # tmp = e ror 23 + and e_64, T1 # T1 = (f ^ g) & e + xor e_64, tmp0 # tmp = (e ror 23) ^ e + xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + idx = \rnd + add WK_2(idx), T1 # W[t] + K[t] from message scheduler + RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4 + xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e + mov a_64, T2 # T2 = a + add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h + RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov a_64, tmp0 # tmp = a + xor c_64, T2 # T2 = a ^ c + and c_64, tmp0 # tmp = a & c + and b_64, T2 # T2 = (a ^ c) & b + xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov a_64, tmp0 # tmp = a + RORQ tmp0, 5 # 39 # tmp = a ror 5 + xor a_64, tmp0 # tmp = (a ror 5) ^ a + add T1, d_64 # e(next_state) = d + T1 + RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6 + xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a + lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) + RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +.endm + +.macro SHA512_2Sched_2Round_avx rnd + # Compute rounds t-2 and t-1 + # Compute message schedule QWORDS t and t+1 + + # Two rounds are computed based on the values for K[t-2]+W[t-2] and + # K[t-1]+W[t-1] which were previously stored at WK_2 by the message + # scheduler. + # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)]. + # They are then added to their respective SHA512 constants at + # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)] + # For brievity, the comments following vectored instructions only refer to + # the first of a pair of QWORDS. + # Eg. 
XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} + # The computation of the message schedule and the rounds are tightly + # stitched to take advantage of instruction-level parallelism. + + idx = \rnd - 2 + vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2] + idx = \rnd - 15 + vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15] + mov f_64, T1 + vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61 + mov e_64, tmp0 + vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1 + xor g_64, T1 + RORQ tmp0, 23 # 41 + vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19 + and e_64, T1 + xor e_64, tmp0 + vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 + xor g_64, T1 + idx = \rnd + add WK_2(idx), T1# + vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8 + RORQ tmp0, 4 # 18 + vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6 + xor e_64, tmp0 + mov a_64, T2 + add h_64, T1 + vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 + RORQ tmp0, 14 # 14 + add tmp0, T1 + vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7 + mov a_64, tmp0 + xor c_64, T2 + vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3 + and c_64, tmp0 + and b_64, T2 + vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3 + xor tmp0, T2 + mov a_64, tmp0 + vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63 + RORQ tmp0, 5 # 39 + vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63 + xor a_64, tmp0 + add T1, d_64 + RORQ tmp0, 6 # 34 + xor a_64, tmp0 + vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ + # W[t-15]>>7 ^ W[t-15]<<63 + lea (T1, T2), h_64 + RORQ tmp0, 28 # 28 + vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25 + add tmp0, h_64 + RotateState + vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ + # W[t-2]<<25 + mov f_64, T1 + vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + mov e_64, tmp0 + xor g_64, T1 + idx = \rnd - 16 + vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + idx = \rnd - 7 + vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7] + RORQ tmp0, 23 # 41 + and e_64, T1 + xor e_64, tmp0 + xor g_64, T1 + vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56 + idx = \rnd + 1 + add WK_2(idx), T1 + vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15]) + RORQ tmp0, 4 # 18 + vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) + xor e_64, tmp0 + vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] + + # s0(W[t-15]) + W[t-16] + mov a_64, T2 + add h_64, T1 + RORQ tmp0, 14 # 14 + add tmp0, T1 + idx = \rnd + vmovdqa %xmm0, W_t(idx) # Store W[t] + vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t] + vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds + mov a_64, tmp0 + xor c_64, T2 + and c_64, tmp0 + and b_64, T2 + xor tmp0, T2 + mov a_64, tmp0 + RORQ tmp0, 5 # 39 + xor a_64, tmp0 + add T1, d_64 + RORQ tmp0, 6 # 34 + xor a_64, tmp0 + lea (T1, T2), h_64 + RORQ tmp0, 28 # 28 + add tmp0, h_64 + RotateState +.endm + +######################################################################## +# void sha512_transform_avx(struct sha512_block_state *state, +# const u8 *data, size_t nblocks); +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "nblocks" is the message length in SHA512 blocks. Must be >= 1. 
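The chained RORQ/xor steps in SHA512_Round compose small rotates into the full Sigma rotations: rotating e by 23, then 4, then 14 yields the rotates by 41, 18 and 14, and rotating a by 5, then 6, then 28 yields the rotates by 39, 34 and 28, as the "# 41/18/14" and "# 39/34/28" annotations indicate. A plain-C reference sketch of the round in the same T1/T2 form (FIPS 180-4; helper names are illustrative only, not kernel code):

#include <stdint.h>

static inline uint64_t ror64(uint64_t x, unsigned int n)
{
        return (x >> n) | (x << (64 - n));
}

static inline uint64_t big_sigma1(uint64_t e)   /* S1(e) in the comments */
{
        return ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);
}

static inline uint64_t big_sigma0(uint64_t a)   /* S0(a) in the comments */
{
        return ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);
}

/* One SHA-512 round; 'wk' is W[t] + K[t] as stored at WK_2(t). */
static inline void sha512_round_sketch(uint64_t s[8], uint64_t wk)
{
        uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
        uint64_t t1 = h + big_sigma1(e) + (((f ^ g) & e) ^ g) + wk;
        uint64_t t2 = big_sigma0(a) + (((a ^ c) & b) ^ (a & c));

        /* Same rotation of working variables as RotateState. */
        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}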
+######################################################################## +SYM_FUNC_START(sha512_transform_avx) + + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # Allocate Stack Space + push %rbp + mov %rsp, %rbp + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + +.Lupdateblock: + + # Load state variables + mov DIGEST(0), a_64 + mov DIGEST(1), b_64 + mov DIGEST(2), c_64 + mov DIGEST(3), d_64 + mov DIGEST(4), e_64 + mov DIGEST(5), f_64 + mov DIGEST(6), g_64 + mov DIGEST(7), h_64 + + t = 0 + .rept 80/2 + 1 + # (80 rounds) / (2 rounds/iteration) + (1 iteration) + # +1 iteration because the scheduler leads hashing by 1 iteration + .if t < 2 + # BSWAP 2 QWORDS + vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1 + vmovdqu MSG(t), %xmm0 + vpshufb %xmm1, %xmm0, %xmm0 # BSWAP + vmovdqa %xmm0, W_t(t) # Store Scheduled Pair + vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] + vmovdqa %xmm0, WK_2(t) # Store into WK for rounds + .elseif t < 16 + # BSWAP 2 QWORDS# Compute 2 Rounds + vmovdqu MSG(t), %xmm0 + vpshufb %xmm1, %xmm0, %xmm0 # BSWAP + SHA512_Round t-2 # Round t-2 + vmovdqa %xmm0, W_t(t) # Store Scheduled Pair + vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] + SHA512_Round t-1 # Round t-1 + vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK + .elseif t < 79 + # Schedule 2 QWORDS# Compute 2 Rounds + SHA512_2Sched_2Round_avx t + .else + # Compute 2 Rounds + SHA512_Round t-2 + SHA512_Round t-1 + .endif + t = t+2 + .endr + + # Update digest + add a_64, DIGEST(0) + add b_64, DIGEST(1) + add c_64, DIGEST(2) + add d_64, DIGEST(3) + add e_64, DIGEST(4) + add f_64, DIGEST(5) + add g_64, DIGEST(6) + add h_64, DIGEST(7) + + # Advance to next message block + add $16*8, msg + dec msglen + jnz .Lupdateblock + + # Restore Stack Pointer + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + RET +SYM_FUNC_END(sha512_transform_avx) + +######################################################################## +### Binary Data + +.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 +.align 16 +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +XMM_QWORD_BSWAP: + .octa 0x08090a0b0c0d0e0f0001020304050607 + +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). 
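+#
+# Note: these are the 80 standard SHA-512 round constants from FIPS 180-4
+# (the first 64 bits of the fractional parts of the cube roots of the first
+# eighty primes). The SSSE3 and AVX2 variants emit an identical table, so
+# the mergeable .rodata.cst640 section lets the linker keep a single copy.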
+.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 diff --git a/lib/crypto/x86/sha512-avx2-asm.S b/lib/crypto/x86/sha512-avx2-asm.S new file mode 100644 index 000000000000..22bdbfd899d0 --- /dev/null +++ b/lib/crypto/x86/sha512-avx2-asm.S @@ -0,0 +1,748 @@ +######################################################################## +# Implement fast SHA-512 with AVX2 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# David Cote <david.m.cote@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## +# This code schedules 1 blocks at a time, with 4 lanes per block +######################################################################## + +#include <linux/linkage.h> + +.text + +# Virtual Registers +Y_0 = %ymm4 +Y_1 = %ymm5 +Y_2 = %ymm6 +Y_3 = %ymm7 + +YTMP0 = %ymm0 +YTMP1 = %ymm1 +YTMP2 = %ymm2 +YTMP3 = %ymm3 +YTMP4 = %ymm8 +XFER = YTMP0 + +BYTE_FLIP_MASK = %ymm9 + +# 1st arg is %rdi, which is saved to the stack and accessed later via %r12 +CTX1 = %rdi +CTX2 = %r12 +# 2nd arg +INP = %rsi +# 3rd arg +NUM_BLKS = %rdx + +c = %rcx +d = %r8 +e = %rdx +y3 = %rsi + +TBL = %rdi # clobbers CTX1 + +a = %rax +b = %rbx + +f = %r9 +g = %r10 +h = %r11 +old_h = %r11 + +T1 = %r12 # clobbers CTX2 +y0 = %r13 +y1 = %r14 +y2 = %r15 + +# Local variables (stack frame) +XFER_SIZE = 4*8 +SRND_SIZE = 1*8 +INP_SIZE = 1*8 +INPEND_SIZE = 1*8 +CTX_SIZE = 1*8 + +frame_XFER = 0 +frame_SRND = frame_XFER + XFER_SIZE +frame_INP = frame_SRND + SRND_SIZE +frame_INPEND = frame_INP + INP_SIZE +frame_CTX = frame_INPEND + INPEND_SIZE +frame_size = frame_CTX + CTX_SIZE + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + + +# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask +# Load ymm with mem and byte swap each dword +.macro COPY_YMM_AND_BSWAP p1 p2 p3 + VMOVDQ \p2, \p1 + vpshufb \p3, \p1, \p1 +.endm +# rotate_Ys +# Rotate values of symbols Y0...Y3 +.macro rotate_Ys + Y_ = Y_0 + Y_0 = Y_1 + Y_1 = Y_2 + Y_2 = Y_3 + Y_3 = Y_ +.endm + +# RotateState +.macro RotateState + # Rotate symbols a..h right + old_h = h + TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = TMP_ +.endm + +# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL +# YDST = {YSRC1, YSRC2} >> RVAL*8 +.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL + vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI} + vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8 +.endm + +.macro FOUR_ROUNDS_AND_SCHED +################################### RND N + 0 ######################################### + + # Extract w[t-7] + MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7] + # Calculate w[t-16] + w[t-7] + vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16] + # Extract w[t-15] + MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15] + + # Calculate sigma0 + + # Calculate w[t-15] ror 1 + vpsrlq $1, YTMP1, YTMP2 + vpsllq $(64-1), YTMP1, YTMP3 + vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 + # Calculate w[t-15] shr 7 + vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7 + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add frame_XFER(%rsp),h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + rorx $14, e, y1 # y1 = (e >> 
14) # S1 + + and e, y2 # y2 = (f^g)&e # CH + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + + add y0, y2 # y2 = S1 + CH # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +################################### RND N + 1 ######################################### + + # Calculate w[t-15] ror 8 + vpsrlq $8, YTMP1, YTMP2 + vpsllq $(64-8), YTMP1, YTMP1 + vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8 + # XOR the three components + vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 + vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0 + + + # Add three components, w[t-16], w[t-7] and sigma0 + vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 + # Move to appropriate lanes for calculating w[16] and w[17] + vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA} + # Move to appropriate lanes for calculating w[18] and w[19] + vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00} + + # Calculate w[16] and w[17] in both 128 bit lanes + + # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes + vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA} + vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA} + + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add 1*8+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + + rorx $28, a, T1 # T1 = (a >> 28) # S0 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + + +################################### RND N + 2 ######################################### + + vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA} + vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} + vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA} + vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ + # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} + + # Add sigma1 to the other compunents to get w[16] and w[17] + vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]} + + 
# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane + vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--} + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + add 2*8+frame_XFER(%rsp), h # h = k + w + h # -- + + rorx $18, e, y1 # y1 = e >> 18 # S1B + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + xor g, y2 # y2 = f^g # CH + + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + and e, y2 # y2 = (f^g)&e # CH + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +################################### RND N + 3 ######################################### + + vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--} + vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} + vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--} + vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ + # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} + + # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] + # to newly calculated sigma1 to get w[18] and w[19] + vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --} + + # Form w[19, w[18], w17], w[16] + vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]} + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add 3*8+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + rorx $39, a, y1 # y1 = a >> 39 # S0A + add y0, y2 # y2 = S1 + CH # -- + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + + add y1, h # h = k + w + h + S0 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + + rotate_Ys +.endm + +.macro DO_4ROUNDS + +################################### RND N + 0 ######################################### + + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = 
(f^g)&e # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 1 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*1+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 2 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*2+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 3 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 
+ S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*3+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +.endm + +######################################################################## +# void sha512_transform_rorx(struct sha512_block_state *state, +# const u8 *data, size_t nblocks); +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "nblocks" is the message length in SHA512 blocks. Must be >= 1. +######################################################################## +SYM_FUNC_START(sha512_transform_rorx) + + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # Allocate Stack Space + push %rbp + mov %rsp, %rbp + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + + shl $7, NUM_BLKS # convert to bytes + add INP, NUM_BLKS # pointer to end of data + mov NUM_BLKS, frame_INPEND(%rsp) + + ## load initial digest + mov 8*0(CTX1), a + mov 8*1(CTX1), b + mov 8*2(CTX1), c + mov 8*3(CTX1), d + mov 8*4(CTX1), e + mov 8*5(CTX1), f + mov 8*6(CTX1), g + mov 8*7(CTX1), h + + # save %rdi (CTX) before it gets clobbered + mov %rdi, frame_CTX(%rsp) + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + +.Lloop0: + lea K512(%rip), TBL + + ## byte swap first 16 dwords + COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK + + mov INP, frame_INP(%rsp) + + ## schedule 64 input dwords, by doing 12 rounds of 4 each + movq $4, frame_SRND(%rsp) + +.align 16 +.Lloop1: + vpaddq (TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 1*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 2*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 3*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + add $(4*32), TBL + FOUR_ROUNDS_AND_SCHED + + subq $1, frame_SRND(%rsp) + jne .Lloop1 + + movq $2, frame_SRND(%rsp) +.Lloop2: + vpaddq (TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + DO_4ROUNDS + vpaddq 1*32(TBL), Y_1, XFER + vmovdqa XFER, frame_XFER(%rsp) + add $(2*32), TBL + DO_4ROUNDS + + vmovdqa Y_2, Y_0 + vmovdqa Y_3, Y_1 + + subq $1, frame_SRND(%rsp) + jne .Lloop2 + + mov frame_CTX(%rsp), CTX2 + addm 8*0(CTX2), a + addm 8*1(CTX2), b + addm 8*2(CTX2), c + addm 8*3(CTX2), d + addm 8*4(CTX2), e + addm 8*5(CTX2), f + addm 8*6(CTX2), g + addm 8*7(CTX2), h + + mov frame_INP(%rsp), INP + add $128, INP + cmp frame_INPEND(%rsp), INP + jne .Lloop0 + + # Restore Stack Pointer + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + 
pop %rbx + + vzeroupper + RET +SYM_FUNC_END(sha512_transform_rorx) + +######################################################################## +### Binary Data + + +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). +.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x08090a0b0c0d0e0f0001020304050607 + .octa 0x18191a1b1c1d1e1f1011121314151617 + +.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32 +.align 32 +MASK_YMM_LO: + .octa 0x00000000000000000000000000000000 + .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF diff --git a/lib/crypto/x86/sha512-ssse3-asm.S b/lib/crypto/x86/sha512-ssse3-asm.S new file mode 100644 index 000000000000..4cae7445b2a8 --- /dev/null +++ b/lib/crypto/x86/sha512-ssse3-asm.S @@ -0,0 +1,419 @@ +######################################################################## +# Implement fast SHA-512 with SSSE3 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# David Cote <david.m.cote@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## + +#include <linux/linkage.h> + +.text + +# Virtual Registers +# ARG1 +digest = %rdi +# ARG2 +msg = %rsi +# ARG3 +msglen = %rdx +T1 = %rcx +T2 = %r8 +a_64 = %r9 +b_64 = %r10 +c_64 = %r11 +d_64 = %r12 +e_64 = %r13 +f_64 = %r14 +g_64 = %r15 +h_64 = %rbx +tmp0 = %rax + +# Local variables (stack frame) + +W_SIZE = 80*8 +WK_SIZE = 2*8 + +frame_W = 0 +frame_WK = frame_W + W_SIZE +frame_size = frame_WK + WK_SIZE + +# Useful QWORD "arrays" for simpler memory references +# MSG, DIGEST, K_t, W_t are arrays +# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even + +# Input message (arg1) +#define MSG(i) 8*i(msg) + +# Output Digest (arg2) +#define DIGEST(i) 8*i(digest) + +# SHA Constants (static mem) +#define K_t(i) 8*i+K512(%rip) + +# Message Schedule (stack frame) +#define W_t(i) 8*i+frame_W(%rsp) + +# W[t]+K[t] (stack frame) +#define WK_2(i) 8*((i%2))+frame_WK(%rsp) + +.macro RotateState + # Rotate symbols a..h right + TMP = h_64 + h_64 = g_64 + g_64 = f_64 + f_64 = e_64 + e_64 = d_64 + d_64 = c_64 + c_64 = b_64 + b_64 = a_64 + a_64 = TMP +.endm + +.macro SHA512_Round rnd + + # Compute Round %%t + mov f_64, T1 # T1 = f + mov e_64, tmp0 # tmp = e + xor g_64, T1 # T1 = f ^ g + ror $23, tmp0 # 41 # tmp = e ror 23 + and e_64, T1 # T1 = (f ^ g) & e + xor e_64, tmp0 # tmp = (e ror 23) ^ e + xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + idx = \rnd + add WK_2(idx), T1 # W[t] + K[t] from message scheduler + ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4 + xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e + mov a_64, T2 # T2 = a + add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h + ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov a_64, tmp0 # tmp = a + xor c_64, T2 # T2 = a ^ c + and c_64, tmp0 # tmp = a & c + and b_64, T2 # T2 = (a ^ c) & b + xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov a_64, tmp0 # tmp = a + ror $5, tmp0 # 39 # 
tmp = a ror 5 + xor a_64, tmp0 # tmp = (a ror 5) ^ a + add T1, d_64 # e(next_state) = d + T1 + ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6 + xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a + lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) + ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +.endm + +.macro SHA512_2Sched_2Round_sse rnd + + # Compute rounds t-2 and t-1 + # Compute message schedule QWORDS t and t+1 + + # Two rounds are computed based on the values for K[t-2]+W[t-2] and + # K[t-1]+W[t-1] which were previously stored at WK_2 by the message + # scheduler. + # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. + # They are then added to their respective SHA512 constants at + # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] + # For brievity, the comments following vectored instructions only refer to + # the first of a pair of QWORDS. + # Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} + # The computation of the message schedule and the rounds are tightly + # stitched to take advantage of instruction-level parallelism. + # For clarity, integer instructions (for the rounds calculation) are indented + # by one tab. Vectored instructions (for the message scheduler) are indented + # by two tabs. + + mov f_64, T1 + idx = \rnd -2 + movdqa W_t(idx), %xmm2 # XMM2 = W[t-2] + xor g_64, T1 + and e_64, T1 + movdqa %xmm2, %xmm0 # XMM0 = W[t-2] + xor g_64, T1 + idx = \rnd + add WK_2(idx), T1 + idx = \rnd - 15 + movdqu W_t(idx), %xmm5 # XMM5 = W[t-15] + mov e_64, tmp0 + ror $23, tmp0 # 41 + movdqa %xmm5, %xmm3 # XMM3 = W[t-15] + xor e_64, tmp0 + ror $4, tmp0 # 18 + psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42 + xor e_64, tmp0 + ror $14, tmp0 # 14 + psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1 + add tmp0, T1 + add h_64, T1 + pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2] + mov a_64, T2 + xor c_64, T2 + pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15] + and b_64, T2 + mov a_64, tmp0 + psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13 + and c_64, tmp0 + xor tmp0, T2 + psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6 + mov a_64, tmp0 + ror $5, tmp0 # 39 + pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] + xor a_64, tmp0 + ror $6, tmp0 # 34 + pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] + xor a_64, tmp0 + ror $28, tmp0 # 28 + psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 + add tmp0, T2 + add T1, d_64 + psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 + lea (T1, T2), h_64 + RotateState + movdqa %xmm2, %xmm1 # XMM1 = W[t-2] + mov f_64, T1 + xor g_64, T1 + movdqa %xmm5, %xmm4 # XMM4 = W[t-15] + and e_64, T1 + xor g_64, T1 + psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42 + idx = \rnd + 1 + add WK_2(idx), T1 + mov e_64, tmp0 + psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7 + ror $23, tmp0 # 41 + xor e_64, tmp0 + pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2] + ror $4, tmp0 # 18 + xor e_64, tmp0 + pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15] + ror $14, tmp0 # 14 + add tmp0, T1 + psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3 + add h_64, T1 + mov a_64, T2 + psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56 + xor c_64, T2 + and b_64, T2 + pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + mov a_64, tmp0 + and c_64, tmp0 + idx = \rnd - 7 + movdqu W_t(idx), %xmm1 # XMM1 = W[t-7] + xor tmp0, T2 + pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15]) + mov a_64, tmp0 + paddq %xmm3, %xmm0 # XMM0 = 
s1(W[t-2]) + s0(W[t-15]) + ror $5, tmp0 # 39 + idx =\rnd-16 + paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] + xor a_64, tmp0 + paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] + ror $6, tmp0 # 34 + movdqa %xmm0, W_t(\rnd) # Store scheduled qwords + xor a_64, tmp0 + paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t] + ror $28, tmp0 # 28 + idx = \rnd + movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds + add tmp0, T2 + add T1, d_64 + lea (T1, T2), h_64 + RotateState +.endm + +######################################################################## +# void sha512_transform_ssse3(struct sha512_block_state *state, +# const u8 *data, size_t nblocks); +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "nblocks" is the message length in SHA512 blocks. Must be >= 1. +######################################################################## +SYM_FUNC_START(sha512_transform_ssse3) + + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # Allocate Stack Space + push %rbp + mov %rsp, %rbp + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + +.Lupdateblock: + +# Load state variables + mov DIGEST(0), a_64 + mov DIGEST(1), b_64 + mov DIGEST(2), c_64 + mov DIGEST(3), d_64 + mov DIGEST(4), e_64 + mov DIGEST(5), f_64 + mov DIGEST(6), g_64 + mov DIGEST(7), h_64 + + t = 0 + .rept 80/2 + 1 + # (80 rounds) / (2 rounds/iteration) + (1 iteration) + # +1 iteration because the scheduler leads hashing by 1 iteration + .if t < 2 + # BSWAP 2 QWORDS + movdqa XMM_QWORD_BSWAP(%rip), %xmm1 + movdqu MSG(t), %xmm0 + pshufb %xmm1, %xmm0 # BSWAP + movdqa %xmm0, W_t(t) # Store Scheduled Pair + paddq K_t(t), %xmm0 # Compute W[t]+K[t] + movdqa %xmm0, WK_2(t) # Store into WK for rounds + .elseif t < 16 + # BSWAP 2 QWORDS# Compute 2 Rounds + movdqu MSG(t), %xmm0 + pshufb %xmm1, %xmm0 # BSWAP + SHA512_Round t-2 # Round t-2 + movdqa %xmm0, W_t(t) # Store Scheduled Pair + paddq K_t(t), %xmm0 # Compute W[t]+K[t] + SHA512_Round t-1 # Round t-1 + movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK + .elseif t < 79 + # Schedule 2 QWORDS# Compute 2 Rounds + SHA512_2Sched_2Round_sse t + .else + # Compute 2 Rounds + SHA512_Round t-2 + SHA512_Round t-1 + .endif + t = t+2 + .endr + + # Update digest + add a_64, DIGEST(0) + add b_64, DIGEST(1) + add c_64, DIGEST(2) + add d_64, DIGEST(3) + add e_64, DIGEST(4) + add f_64, DIGEST(5) + add g_64, DIGEST(6) + add h_64, DIGEST(7) + + # Advance to next message block + add $16*8, msg + dec msglen + jnz .Lupdateblock + + # Restore Stack Pointer + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + RET +SYM_FUNC_END(sha512_transform_ssse3) + +######################################################################## +### Binary Data + +.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 +.align 16 +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +XMM_QWORD_BSWAP: + .octa 0x08090a0b0c0d0e0f0001020304050607 + +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). 
+.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 diff --git a/lib/crypto/x86/sha512.h b/lib/crypto/x86/sha512.h new file mode 100644 index 000000000000..c13503d9d57d --- /dev/null +++ b/lib/crypto/x86/sha512.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * x86-optimized SHA-512 block function + * + * Copyright 2025 Google LLC + */ + +#include <asm/fpu/api.h> +#include <crypto/internal/simd.h> +#include <linux/static_call.h> + +DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic); + +#define DEFINE_X86_SHA512_FN(c_fn, asm_fn) \ + asmlinkage void asm_fn(struct sha512_block_state *state, \ + const u8 *data, size_t nblocks); \ + static void c_fn(struct sha512_block_state *state, const u8 *data, \ + size_t nblocks) \ + { \ + if (likely(crypto_simd_usable())) { \ + kernel_fpu_begin(); \ + asm_fn(state, data, nblocks); \ + kernel_fpu_end(); \ + } else { \ + sha512_blocks_generic(state, data, nblocks); \ + } \ + } + +DEFINE_X86_SHA512_FN(sha512_blocks_ssse3, sha512_transform_ssse3); +DEFINE_X86_SHA512_FN(sha512_blocks_avx, sha512_transform_avx); +DEFINE_X86_SHA512_FN(sha512_blocks_avx2, sha512_transform_rorx); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + static_call(sha512_blocks_x86)(state, data, nblocks); +} + +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && + boot_cpu_has(X86_FEATURE_AVX)) { + if (boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + static_call_update(sha512_blocks_x86, + sha512_blocks_avx2); + 
else + static_call_update(sha512_blocks_x86, + sha512_blocks_avx); + } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { + static_call_update(sha512_blocks_x86, sha512_blocks_ssse3); + } +} diff --git a/lib/devres.c b/lib/devres.c index 73901160197e..378b07730420 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -206,6 +206,7 @@ void __iomem *devm_ioremap_resource_wc(struct device *dev, { return __devm_ioremap_resource(dev, res, DEVM_IOREMAP_WC); } +EXPORT_SYMBOL(devm_ioremap_resource_wc); /* * devm_of_iomap - Requests a resource and maps the memory mapped IO diff --git a/lib/errseq.c b/lib/errseq.c index 93e9b94358dc..13a2581c5a87 100644 --- a/lib/errseq.c +++ b/lib/errseq.c @@ -34,11 +34,14 @@ */ /* The low bits are designated for error code (max of MAX_ERRNO) */ -#define ERRSEQ_SHIFT ilog2(MAX_ERRNO + 1) +#define ERRSEQ_SHIFT (ilog2(MAX_ERRNO) + 1) /* This bit is used as a flag to indicate whether the value has been seen */ #define ERRSEQ_SEEN (1 << ERRSEQ_SHIFT) +/* Leverage macro ERRSEQ_SEEN to define errno mask macro here */ +#define ERRNO_MASK (ERRSEQ_SEEN - 1) + /* The lowest bit of the counter */ #define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1)) @@ -60,8 +63,6 @@ errseq_t errseq_set(errseq_t *eseq, int err) { errseq_t cur, old; - /* MAX_ERRNO must be able to serve as a mask */ - BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1); /* * Ensure the error code actually fits where we want it to go. If it @@ -79,7 +80,7 @@ errseq_t errseq_set(errseq_t *eseq, int err) errseq_t new; /* Clear out error bits and set new error */ - new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err; + new = (old & ~(ERRNO_MASK | ERRSEQ_SEEN)) | -err; /* Only increment if someone has looked at it */ if (old & ERRSEQ_SEEN) @@ -148,7 +149,7 @@ int errseq_check(errseq_t *eseq, errseq_t since) if (likely(cur == since)) return 0; - return -(cur & MAX_ERRNO); + return -(cur & ERRNO_MASK); } EXPORT_SYMBOL(errseq_check); @@ -200,7 +201,7 @@ int errseq_check_and_advance(errseq_t *eseq, errseq_t *since) if (new != old) cmpxchg(eseq, old, new); *since = new; - err = -(new & MAX_ERRNO); + err = -(new & ERRNO_MASK); } return err; } diff --git a/lib/find_bit.c b/lib/find_bit.c index 0836bb3d76c5..d4b5a29e3e72 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -18,6 +18,7 @@ #include <linux/math.h> #include <linux/minmax.h> #include <linux/swab.h> +#include <linux/random.h> /* * Common helper for find_bit() function family @@ -117,6 +118,17 @@ EXPORT_SYMBOL(_find_first_and_bit); #endif /* + * Find the first bit set in 1st memory region and unset in 2nd. + */ +unsigned long _find_first_andnot_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + return FIND_FIRST_BIT(addr1[idx] & ~addr2[idx], /* nop */, size); +} +EXPORT_SYMBOL(_find_first_andnot_bit); + +/* * Find the first set bit in three memory regions. 
*/ unsigned long _find_first_and_and_bit(const unsigned long *addr1, @@ -280,3 +292,26 @@ EXPORT_SYMBOL(_find_next_bit_le); #endif #endif /* __BIG_ENDIAN */ + +/** + * find_random_bit - find a set bit at random position + * @addr: The address to base the search on + * @size: The bitmap size in bits + * + * Returns: a position of a random set bit; >= @size otherwise + */ +unsigned long find_random_bit(const unsigned long *addr, unsigned long size) +{ + int w = bitmap_weight(addr, size); + + switch (w) { + case 0: + return size; + case 1: + /* Performance trick for single-bit bitmaps */ + return find_first_bit(addr, size); + default: + return find_nth_bit(addr, size, get_random_u32_below(w)); + } +} +EXPORT_SYMBOL(find_random_bit); diff --git a/lib/group_cpus.c b/lib/group_cpus.c index ee272c4cefcc..6d08ac05f371 100644 --- a/lib/group_cpus.c +++ b/lib/group_cpus.c @@ -332,9 +332,11 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, /** * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality * @numgrps: number of groups + * @nummasks: number of initialized cpumasks * * Return: cpumask array if successful, NULL otherwise. And each element - * includes CPUs assigned to this group + * includes CPUs assigned to this group. nummasks contains the number + * of initialized masks which can be less than numgrps. * * Try to put close CPUs from viewpoint of CPU and NUMA locality into * same group, and run two-stage grouping: @@ -344,7 +346,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, * We guarantee in the resulted grouping that all CPUs are covered, and * no same CPU is assigned to multiple groups */ -struct cpumask *group_cpus_evenly(unsigned int numgrps) +struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks) { unsigned int curgrp = 0, nr_present = 0, nr_others = 0; cpumask_var_t *node_to_cpumask; @@ -352,6 +354,9 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) int ret = -ENOMEM; struct cpumask *masks = NULL; + if (numgrps == 0) + return NULL; + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; @@ -386,7 +391,7 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, npresmsk, nmsk, masks); if (ret < 0) - goto fail_build_affinity; + goto fail_node_to_cpumask; nr_present = ret; /* @@ -405,10 +410,6 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) if (ret >= 0) nr_others = ret; - fail_build_affinity: - if (ret >= 0) - WARN_ON(nr_present + nr_others < numgrps); - fail_node_to_cpumask: free_node_to_cpumask(node_to_cpumask); @@ -421,18 +422,24 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) kfree(masks); return NULL; } + *nummasks = min(nr_present + nr_others, numgrps); return masks; } #else /* CONFIG_SMP */ -struct cpumask *group_cpus_evenly(unsigned int numgrps) +struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks) { - struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); + struct cpumask *masks; + + if (numgrps == 0) + return NULL; + masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); if (!masks) return NULL; /* assign all CPUs(cpu 0) to the 1st group only */ cpumask_copy(&masks[0], cpu_possible_mask); + *nummasks = 1; return masks; } #endif /* CONFIG_SMP */ diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 8c7fdb7d8c8f..f9193f952f49 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -457,38 +457,35 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i) } 
EXPORT_SYMBOL(iov_iter_zero); -size_t copy_page_from_iter_atomic(struct page *page, size_t offset, +size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { size_t n, copied = 0; - bool uses_kmap = IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || - PageHighMem(page); - if (!page_copy_sane(page, offset, bytes)) + if (!page_copy_sane(&folio->page, offset, bytes)) return 0; if (WARN_ON_ONCE(!i->data_source)) return 0; do { - char *p; + char *to = kmap_local_folio(folio, offset); n = bytes - copied; - if (uses_kmap) { - page += offset / PAGE_SIZE; - offset %= PAGE_SIZE; - n = min_t(size_t, n, PAGE_SIZE - offset); - } - - p = kmap_atomic(page) + offset; - n = __copy_from_iter(p, n, i); - kunmap_atomic(p); + if (folio_test_partial_kmap(folio) && + n > PAGE_SIZE - offset_in_page(offset)) + n = PAGE_SIZE - offset_in_page(offset); + + pagefault_disable(); + n = __copy_from_iter(to, n, i); + pagefault_enable(); + kunmap_local(to); copied += n; offset += n; - } while (uses_kmap && copied != bytes && n > 0); + } while (copied != bytes && n > 0); return copied; } -EXPORT_SYMBOL(copy_page_from_iter_atomic); +EXPORT_SYMBOL(copy_folio_from_iter_atomic); static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) { @@ -820,7 +817,7 @@ static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, size_t size = i->count; do { - size_t len = bvec->bv_len; + size_t len = bvec->bv_len - skip; if (len > size) len = size; @@ -1059,22 +1056,22 @@ static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa pgoff_t index, unsigned int nr_pages) { XA_STATE(xas, xa, index); - struct page *page; + struct folio *folio; unsigned int ret = 0; rcu_read_lock(); - for (page = xas_load(&xas); page; page = xas_next(&xas)) { - if (xas_retry(&xas, page)) + for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + if (xas_retry(&xas, folio)) continue; - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) { + /* Has the folio moved or been split? */ + if (unlikely(folio != xas_reload(&xas))) { xas_reset(&xas); continue; } - pages[ret] = find_subpage(page, xas.xa_index); - get_page(pages[ret]); + pages[ret] = folio_file_page(folio, xas.xa_index); + folio_get(folio); if (++ret == nr_pages) break; } @@ -1191,7 +1188,7 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, return -ENOMEM; p = *pages; for (int k = 0; k < n; k++) { - struct folio *folio = page_folio(page); + struct folio *folio = page_folio(page + k); p[k] = page + k; if (!folio_test_slab(folio)) folio_get(folio); @@ -1650,11 +1647,11 @@ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, iov_iter_extraction_t extraction_flags, size_t *offset0) { - struct page *page, **p; + struct page **p; + struct folio *folio; unsigned int nr = 0, offset; loff_t pos = i->xarray_start + i->iov_offset; - pgoff_t index = pos >> PAGE_SHIFT; - XA_STATE(xas, i->xarray, index); + XA_STATE(xas, i->xarray, pos >> PAGE_SHIFT); offset = pos & ~PAGE_MASK; *offset0 = offset; @@ -1665,17 +1662,17 @@ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, p = *pages; rcu_read_lock(); - for (page = xas_load(&xas); page; page = xas_next(&xas)) { - if (xas_retry(&xas, page)) + for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + if (xas_retry(&xas, folio)) continue; - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) { + /* Has the folio moved or been split? 
*/ + if (unlikely(folio != xas_reload(&xas))) { xas_reset(&xas); continue; } - p[nr++] = find_subpage(page, xas.xa_index); + p[nr++] = folio_file_page(folio, xas.xa_index); if (nr == maxpages) break; } diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index b7f2fa08d9c8..78e16b95d210 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -826,3 +826,23 @@ static int __init kobject_uevent_init(void) postcore_initcall(kobject_uevent_init); #endif + +#ifdef CONFIG_UEVENT_HELPER +static const struct ctl_table uevent_helper_sysctl_table[] = { + { + .procname = "hotplug", + .data = &uevent_helper, + .maxlen = UEVENT_HELPER_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, +}; + +static int __init init_uevent_helper_sysctl(void) +{ + register_sysctl_init("kernel", uevent_helper_sysctl_table); + return 0; +} + +postcore_initcall(init_uevent_helper_sysctl); +#endif diff --git a/lib/kstrtox.c b/lib/kstrtox.c index d586e6af5e5a..bdde40cd69d7 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -351,6 +351,8 @@ int kstrtobool(const char *s, bool *res) return -EINVAL; switch (s[0]) { + case 'e': + case 'E': case 'y': case 'Y': case 't': @@ -358,6 +360,8 @@ int kstrtobool(const char *s, bool *res) case '1': *res = true; return 0; + case 'd': + case 'D': case 'n': case 'N': case 'f': diff --git a/lib/kunit/Kconfig b/lib/kunit/Kconfig index a97897edd964..c10ede4b1d22 100644 --- a/lib/kunit/Kconfig +++ b/lib/kunit/Kconfig @@ -93,4 +93,17 @@ config KUNIT_AUTORUN_ENABLED In most cases this should be left as Y. Only if additional opt-in behavior is needed should this be set to N. +config KUNIT_DEFAULT_TIMEOUT + int "Default value of the timeout module parameter" + default 300 + help + Sets the default timeout, in seconds, for Kunit test cases. This value + is further multiplied by a factor determined by the assigned speed + setting: 1x for `DEFAULT`, 3x for `KUNIT_SPEED_SLOW`, and 12x for + `KUNIT_SPEED_VERY_SLOW`. This allows slower tests on slower machines + sufficient time to complete. + + If unsure, the default timeout of 300 seconds is suitable for most + cases. 
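+
+	  For example, with the default of 300 seconds, a test case marked
+	  KUNIT_SPEED_VERY_SLOW is allowed 12 * 300 = 3600 seconds to
+	  complete before it is reported as timed out.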
+ endif # KUNIT diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c index 3f39955cb0f1..0061d4c7e351 100644 --- a/lib/kunit/executor.c +++ b/lib/kunit/executor.c @@ -177,7 +177,7 @@ kunit_filter_suites(const struct kunit_suite_set *suite_set, const size_t max = suite_set->end - suite_set->start; - copy = kcalloc(max, sizeof(*filtered.start), GFP_KERNEL); + copy = kcalloc(max, sizeof(*copy), GFP_KERNEL); if (!copy) { /* won't be able to run anything, return an empty set */ return filtered; } diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c index d9c781c859fd..8c01eabd4eaf 100644 --- a/lib/kunit/kunit-test.c +++ b/lib/kunit/kunit-test.c @@ -8,6 +8,7 @@ #include "linux/gfp_types.h" #include <kunit/test.h> #include <kunit/test-bug.h> +#include <kunit/static_stub.h> #include <linux/device.h> #include <kunit/device.h> @@ -43,7 +44,8 @@ static void kunit_test_try_catch_successful_try_no_catch(struct kunit *test) kunit_try_catch_init(try_catch, test, kunit_test_successful_try, - kunit_test_no_catch); + kunit_test_no_catch, + 300 * msecs_to_jiffies(MSEC_PER_SEC)); kunit_try_catch_run(try_catch, test); KUNIT_EXPECT_TRUE(test, ctx->function_called); @@ -75,7 +77,8 @@ static void kunit_test_try_catch_unsuccessful_try_does_catch(struct kunit *test) kunit_try_catch_init(try_catch, test, kunit_test_unsuccessful_try, - kunit_test_catch); + kunit_test_catch, + 300 * msecs_to_jiffies(MSEC_PER_SEC)); kunit_try_catch_run(try_catch, test); KUNIT_EXPECT_TRUE(test, ctx->function_called); @@ -129,7 +132,8 @@ static void kunit_test_fault_null_dereference(struct kunit *test) kunit_try_catch_init(try_catch, test, kunit_test_null_dereference, - kunit_test_catch); + kunit_test_catch, + 300 * msecs_to_jiffies(MSEC_PER_SEC)); kunit_try_catch_run(try_catch, test); KUNIT_EXPECT_EQ(test, try_catch->try_result, -EINTR); @@ -868,10 +872,53 @@ static struct kunit_suite kunit_current_test_suite = { .test_cases = kunit_current_test_cases, }; +static void kunit_stub_test(struct kunit *test) +{ + struct kunit fake_test; + const unsigned long fake_real_fn_addr = 0x1234; + const unsigned long fake_replacement_addr = 0x5678; + struct kunit_resource *res; + struct { + void *real_fn_addr; + void *replacement_addr; + } *stub_ctx; + + kunit_init_test(&fake_test, "kunit_stub_fake_test", NULL); + KUNIT_ASSERT_EQ(test, fake_test.status, KUNIT_SUCCESS); + KUNIT_ASSERT_EQ(test, list_count_nodes(&fake_test.resources), 0); + + __kunit_activate_static_stub(&fake_test, (void *)fake_real_fn_addr, + (void *)fake_replacement_addr); + KUNIT_ASSERT_EQ(test, fake_test.status, KUNIT_SUCCESS); + KUNIT_ASSERT_EQ(test, list_count_nodes(&fake_test.resources), 1); + + res = list_first_entry(&fake_test.resources, struct kunit_resource, node); + KUNIT_EXPECT_NOT_NULL(test, res); + + stub_ctx = res->data; + KUNIT_EXPECT_NOT_NULL(test, stub_ctx); + KUNIT_EXPECT_EQ(test, (unsigned long)stub_ctx->real_fn_addr, fake_real_fn_addr); + KUNIT_EXPECT_EQ(test, (unsigned long)stub_ctx->replacement_addr, fake_replacement_addr); + + __kunit_activate_static_stub(&fake_test, (void *)fake_real_fn_addr, NULL); + KUNIT_ASSERT_EQ(test, fake_test.status, KUNIT_SUCCESS); + KUNIT_ASSERT_EQ(test, list_count_nodes(&fake_test.resources), 0); +} + +static struct kunit_case kunit_stub_test_cases[] = { + KUNIT_CASE(kunit_stub_test), + {} +}; + +static struct kunit_suite kunit_stub_test_suite = { + .name = "kunit_stub", + .test_cases = kunit_stub_test_cases, +}; + kunit_test_suites(&kunit_try_catch_test_suite, &kunit_resource_test_suite, &kunit_log_test_suite, 
&kunit_status_test_suite, &kunit_current_test_suite, &kunit_device_test_suite, - &kunit_fault_test_suite); + &kunit_fault_test_suite, &kunit_stub_test_suite); MODULE_DESCRIPTION("KUnit test for core test infrastructure"); MODULE_LICENSE("GPL v2"); diff --git a/lib/kunit/static_stub.c b/lib/kunit/static_stub.c index 92b2cccd5e76..484fd85251b4 100644 --- a/lib/kunit/static_stub.c +++ b/lib/kunit/static_stub.c @@ -96,7 +96,7 @@ void __kunit_activate_static_stub(struct kunit *test, /* If the replacement address is NULL, deactivate the stub. */ if (!replacement_addr) { - kunit_deactivate_static_stub(test, replacement_addr); + kunit_deactivate_static_stub(test, real_fn_addr); return; } diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 146d1b48a096..d2bfa331a2b1 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -70,6 +70,13 @@ module_param_named(enable, enable_param, bool, 0); MODULE_PARM_DESC(enable, "Enable KUnit tests"); /* + * Configure the base timeout. + */ +static unsigned long kunit_base_timeout = CONFIG_KUNIT_DEFAULT_TIMEOUT; +module_param_named(timeout, kunit_base_timeout, ulong, 0644); +MODULE_PARM_DESC(timeout, "Set the base timeout for Kunit test cases"); + +/* * KUnit statistic mode: * 0 - disabled * 1 - only when there is more than one subtest @@ -373,6 +380,40 @@ static void kunit_run_case_check_speed(struct kunit *test, duration.tv_sec, duration.tv_nsec); } +/* Returns timeout multiplier based on speed. + * DEFAULT: 1 + * KUNIT_SPEED_SLOW: 3 + * KUNIT_SPEED_VERY_SLOW: 12 + */ +static int kunit_timeout_mult(enum kunit_speed speed) +{ + switch (speed) { + case KUNIT_SPEED_SLOW: + return 3; + case KUNIT_SPEED_VERY_SLOW: + return 12; + default: + return 1; + } +} + +static unsigned long kunit_test_timeout(struct kunit_suite *suite, struct kunit_case *test_case) +{ + int mult = 1; + + /* + * The default test timeout is 300 seconds and will be adjusted by mult + * based on the test speed. The test speed will be overridden by the + * innermost test component. + */ + if (suite->attr.speed != KUNIT_SPEED_UNSET) + mult = kunit_timeout_mult(suite->attr.speed); + if (test_case->attr.speed != KUNIT_SPEED_UNSET) + mult = kunit_timeout_mult(test_case->attr.speed); + return mult * kunit_base_timeout * msecs_to_jiffies(MSEC_PER_SEC); +} + + /* * Initializes and runs test case. Does not clean up or do post validations. */ @@ -527,7 +568,8 @@ static void kunit_run_case_catch_errors(struct kunit_suite *suite, kunit_try_catch_init(try_catch, test, kunit_try_run_case, - kunit_catch_run_case); + kunit_catch_run_case, + kunit_test_timeout(suite, test_case)); context.test = test; context.suite = suite; context.test_case = test_case; @@ -537,7 +579,8 @@ static void kunit_run_case_catch_errors(struct kunit_suite *suite, kunit_try_catch_init(try_catch, test, kunit_try_run_case_cleanup, - kunit_catch_run_case_cleanup); + kunit_catch_run_case_cleanup, + kunit_test_timeout(suite, test_case)); kunit_try_catch_run(try_catch, &context); /* Propagate the parameter result to the test case. 
*/ @@ -759,7 +802,6 @@ void __kunit_test_suites_exit(struct kunit_suite **suites, int num_suites) } EXPORT_SYMBOL_GPL(__kunit_test_suites_exit); -#ifdef CONFIG_MODULES static void kunit_module_init(struct module *mod) { struct kunit_suite_set suite_set, filtered_set; @@ -847,7 +889,6 @@ static struct notifier_block kunit_mod_nb = { .notifier_call = kunit_module_notify, .priority = 0, }; -#endif KUNIT_DEFINE_ACTION_WRAPPER(kfree_action_wrapper, kfree, const void *) @@ -938,20 +979,14 @@ static int __init kunit_init(void) kunit_debugfs_init(); kunit_bus_init(); -#ifdef CONFIG_MODULES return register_module_notifier(&kunit_mod_nb); -#else - return 0; -#endif } late_initcall(kunit_init); static void __exit kunit_exit(void) { memset(&kunit_hooks, 0, sizeof(kunit_hooks)); -#ifdef CONFIG_MODULES unregister_module_notifier(&kunit_mod_nb); -#endif kunit_bus_shutdown(); diff --git a/lib/kunit/try-catch-impl.h b/lib/kunit/try-catch-impl.h index 203ba6a5e740..6f401b97cd0b 100644 --- a/lib/kunit/try-catch-impl.h +++ b/lib/kunit/try-catch-impl.h @@ -17,11 +17,13 @@ struct kunit; static inline void kunit_try_catch_init(struct kunit_try_catch *try_catch, struct kunit *test, kunit_try_catch_func_t try, - kunit_try_catch_func_t catch) + kunit_try_catch_func_t catch, + unsigned long timeout) { try_catch->test = test; try_catch->try = try; try_catch->catch = catch; + try_catch->timeout = timeout; } #endif /* _KUNIT_TRY_CATCH_IMPL_H */ diff --git a/lib/kunit/try-catch.c b/lib/kunit/try-catch.c index 6bbe0025b079..d84a879f0a78 100644 --- a/lib/kunit/try-catch.c +++ b/lib/kunit/try-catch.c @@ -34,31 +34,6 @@ static int kunit_generic_run_threadfn_adapter(void *data) return 0; } -static unsigned long kunit_test_timeout(void) -{ - /* - * TODO(brendanhiggins@google.com): We should probably have some type of - * variable timeout here. The only question is what that timeout value - * should be. - * - * The intention has always been, at some point, to be able to label - * tests with some type of size bucket (unit/small, integration/medium, - * large/system/end-to-end, etc), where each size bucket would get a - * default timeout value kind of like what Bazel does: - * https://docs.bazel.build/versions/master/be/common-definitions.html#test.size - * There is still some debate to be had on exactly how we do this. (For - * one, we probably want to have some sort of test runner level - * timeout.) - * - * For more background on this topic, see: - * https://mike-bland.com/2011/11/01/small-medium-large.html - * - * If tests timeout due to exceeding sysctl_hung_task_timeout_secs, - * the task will be killed and an oops generated. - */ - return 300 * msecs_to_jiffies(MSEC_PER_SEC); /* 5 min */ -} - void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context) { struct kunit *test = try_catch->test; @@ -85,8 +60,8 @@ void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context) task_done = task_struct->vfork_done; wake_up_process(task_struct); - time_remaining = wait_for_completion_timeout(task_done, - kunit_test_timeout()); + time_remaining = wait_for_completion_timeout( + task_done, try_catch->timeout); if (time_remaining == 0) { try_catch->try_result = -ETIMEDOUT; kthread_stop(task_struct); diff --git a/lib/kunit/user_alloc.c b/lib/kunit/user_alloc.c index 46951be018be..b8cac765e620 100644 --- a/lib/kunit/user_alloc.c +++ b/lib/kunit/user_alloc.c @@ -22,8 +22,7 @@ struct kunit_vm_mmap_params { unsigned long offset; }; -/* Create and attach a new mm if it doesn't already exist. 
*/ -static int kunit_attach_mm(void) +int kunit_attach_mm(void) { struct mm_struct *mm; @@ -49,6 +48,7 @@ static int kunit_attach_mm(void) return 0; } +EXPORT_SYMBOL_GPL(kunit_attach_mm); static int kunit_vm_mmap_init(struct kunit_resource *res, void *context) { diff --git a/lib/llist.c b/lib/llist.c index f21d0cfbbaaa..f574c17a238e 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -14,28 +14,6 @@ #include <linux/export.h> #include <linux/llist.h> - -/** - * llist_add_batch - add several linked entries in batch - * @new_first: first entry in batch to be added - * @new_last: last entry in batch to be added - * @head: the head for your lock-less list - * - * Return whether list is empty before adding. - */ -bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, - struct llist_head *head) -{ - struct llist_node *first = READ_ONCE(head->first); - - do { - new_last->next = first; - } while (!try_cmpxchg(&head->first, &first, new_first)); - - return !first; -} -EXPORT_SYMBOL_GPL(llist_add_batch); - /** * llist_del_first - delete the first entry of lock-less list * @head: the head for your lock-less list diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d0bea23fa4bc..b4ee2d29d7a9 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -211,14 +211,14 @@ static void ma_free_rcu(struct maple_node *node) call_rcu(&node->rcu, mt_free_rcu); } -static void mas_set_height(struct ma_state *mas) +static void mt_set_height(struct maple_tree *mt, unsigned char height) { - unsigned int new_flags = mas->tree->ma_flags; + unsigned int new_flags = mt->ma_flags; new_flags &= ~MT_FLAGS_HEIGHT_MASK; - MAS_BUG_ON(mas, mas->depth > MAPLE_HEIGHT_MAX); - new_flags |= mas->depth << MT_FLAGS_HEIGHT_OFFSET; - mas->tree->ma_flags = new_flags; + MT_BUG_ON(mt, height > MAPLE_HEIGHT_MAX); + new_flags |= height << MT_FLAGS_HEIGHT_OFFSET; + mt->ma_flags = new_flags; } static unsigned int mas_mt_height(struct ma_state *mas) @@ -1053,7 +1053,7 @@ static inline void mte_set_gap(const struct maple_enode *mn, * mas_ascend() - Walk up a level of the tree. * @mas: The maple state * - * Sets the @mas->max and @mas->min to the correct values when walking up. This + * Sets the @mas->max and @mas->min for the parent node of mas->node. This * may cause several levels of walking up to find the correct min and max. * May find a dead node which will cause a premature return. * Return: 1 on dead node, 0 otherwise @@ -1098,6 +1098,12 @@ static int mas_ascend(struct ma_state *mas) min = 0; max = ULONG_MAX; + + /* + * !mas->offset implies that parent node min == mas->min. + * mas->offset > 0 implies that we need to walk up to find the + * implied pivot min. + */ if (!mas->offset) { min = mas->min; set_min = true; @@ -1371,7 +1377,7 @@ retry: root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { - mas->depth = 1; + mas->depth = 0; mas->status = ma_active; mas->node = mte_safe_root(root); mas->offset = 0; @@ -1712,9 +1718,10 @@ static inline void mas_adopt_children(struct ma_state *mas, * node as dead. * @mas: the maple state with the new node * @old_enode: The old maple encoded node to replace. 
+ * @new_height: if we are inserting a root node, update the height of the tree */ static inline void mas_put_in_tree(struct ma_state *mas, - struct maple_enode *old_enode) + struct maple_enode *old_enode, char new_height) __must_hold(mas->tree->ma_lock) { unsigned char offset; @@ -1723,7 +1730,7 @@ static inline void mas_put_in_tree(struct ma_state *mas, if (mte_is_root(mas->node)) { mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas)); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); - mas_set_height(mas); + mt_set_height(mas->tree, new_height); } else { offset = mte_parent_slot(mas->node); @@ -1741,12 +1748,13 @@ static inline void mas_put_in_tree(struct ma_state *mas, * the parent encoding to locate the maple node in the tree. * @mas: the ma_state with @mas->node pointing to the new node. * @old_enode: The old maple encoded node. + * @new_height: The new height of the tree as a result of the operation */ static inline void mas_replace_node(struct ma_state *mas, - struct maple_enode *old_enode) + struct maple_enode *old_enode, unsigned char new_height) __must_hold(mas->tree->ma_lock) { - mas_put_in_tree(mas, old_enode); + mas_put_in_tree(mas, old_enode, new_height); mas_free(mas, old_enode); } @@ -2536,10 +2544,11 @@ static inline void mas_topiary_node(struct ma_state *mas, * * @mas: The maple state pointing at the new data * @old_enode: The maple encoded node being replaced + * @new_height: The new height of the tree as a result of the operation * */ static inline void mas_topiary_replace(struct ma_state *mas, - struct maple_enode *old_enode) + struct maple_enode *old_enode, unsigned char new_height) { struct ma_state tmp[3], tmp_next[3]; MA_TOPIARY(subtrees, mas->tree); @@ -2547,7 +2556,7 @@ static inline void mas_topiary_replace(struct ma_state *mas, int i, n; /* Place data in tree & then mark node as old */ - mas_put_in_tree(mas, old_enode); + mas_put_in_tree(mas, old_enode, new_height); /* Update the parent pointers in the tree */ tmp[0] = *mas; @@ -2631,14 +2640,15 @@ static inline void mas_topiary_replace(struct ma_state *mas, * mas_wmb_replace() - Write memory barrier and replace * @mas: The maple state * @old_enode: The old maple encoded node that is being replaced. + * @new_height: The new height of the tree as a result of the operation * * Updates gap as necessary. 
*/ static inline void mas_wmb_replace(struct ma_state *mas, - struct maple_enode *old_enode) + struct maple_enode *old_enode, unsigned char new_height) { /* Insert the new data in the tree */ - mas_topiary_replace(mas, old_enode); + mas_topiary_replace(mas, old_enode, new_height); if (mte_is_leaf(mas->node)) return; @@ -2737,7 +2747,7 @@ static inline bool mast_sufficient(struct maple_subtree_state *mast) */ static inline bool mast_overflow(struct maple_subtree_state *mast) { - if (mast->bn->b_end >= mt_slot_count(mast->orig_l->node)) + if (mast->bn->b_end > mt_slot_count(mast->orig_l->node)) return true; return false; @@ -2824,6 +2834,7 @@ static void mas_spanning_rebalance(struct ma_state *mas, { unsigned char split, mid_split; unsigned char slot = 0; + unsigned char new_height = 0; /* used if node is a new root */ struct maple_enode *left = NULL, *middle = NULL, *right = NULL; struct maple_enode *old_enode; @@ -2845,8 +2856,6 @@ static void mas_spanning_rebalance(struct ma_state *mas, unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) mast_spanning_rebalance(mast); - l_mas.depth = 0; - /* * Each level of the tree is examined and balanced, pushing data to the left or * right, or rebalancing against left or right nodes is employed to avoid @@ -2866,6 +2875,7 @@ static void mas_spanning_rebalance(struct ma_state *mas, mast_set_split_parents(mast, left, middle, right, split, mid_split); mast_cp_to_nodes(mast, left, middle, right, split, mid_split); + new_height++; /* * Copy data from next level in the tree to mast->bn from next @@ -2873,7 +2883,6 @@ static void mas_spanning_rebalance(struct ma_state *mas, */ memset(mast->bn, 0, sizeof(struct maple_big_node)); mast->bn->type = mte_node_type(left); - l_mas.depth++; /* Root already stored in l->node. 
*/ if (mas_is_root_limits(mast->l)) @@ -2890,11 +2899,21 @@ static void mas_spanning_rebalance(struct ma_state *mas, mast_combine_cp_right(mast); mast->orig_l->last = mast->orig_l->max; - if (mast_sufficient(mast)) - continue; + if (mast_sufficient(mast)) { + if (mast_overflow(mast)) + continue; + + if (mast->orig_l->node == mast->orig_r->node) { + /* + * The data in b_node should be stored in one + * node and in the tree + */ + slot = mast->l->offset; + break; + } - if (mast_overflow(mast)) continue; + } /* May be a new root stored in mast->bn */ if (mas_is_root_limits(mast->orig_l)) @@ -2909,8 +2928,9 @@ static void mas_spanning_rebalance(struct ma_state *mas, l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), mte_node_type(mast->orig_l->node)); - l_mas.depth++; + mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true); + new_height++; mas_set_parent(mas, left, l_mas.node, slot); if (middle) mas_set_parent(mas, middle, l_mas.node, ++slot); @@ -2933,7 +2953,7 @@ new_root: mas->min = l_mas.min; mas->max = l_mas.max; mas->offset = l_mas.offset; - mas_wmb_replace(mas, old_enode); + mas_wmb_replace(mas, old_enode, new_height); mtree_range_walk(mas); return; } @@ -3009,6 +3029,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end void __rcu **l_slots, **slots; unsigned long *l_pivs, *pivs, gap; bool in_rcu = mt_in_rcu(mas->tree); + unsigned char new_height = mas_mt_height(mas); MA_STATE(l_mas, mas->tree, mas->index, mas->last); @@ -3103,7 +3124,7 @@ done: mas_ascend(mas); if (in_rcu) { - mas_replace_node(mas, old_eparent); + mas_replace_node(mas, old_eparent, new_height); mas_adopt_children(mas, mas->node); } @@ -3114,10 +3135,9 @@ done: * mas_split_final_node() - Split the final node in a subtree operation. * @mast: the maple subtree state * @mas: The maple state - * @height: The height of the tree in case it's a new root. */ static inline void mas_split_final_node(struct maple_subtree_state *mast, - struct ma_state *mas, int height) + struct ma_state *mas) { struct maple_enode *ancestor; @@ -3126,7 +3146,6 @@ static inline void mas_split_final_node(struct maple_subtree_state *mast, mast->bn->type = maple_arange_64; else mast->bn->type = maple_range_64; - mas->depth = height; } /* * Only a single node is used here, could be root. @@ -3214,7 +3233,6 @@ static inline void mast_split_data(struct maple_subtree_state *mast, * mas_push_data() - Instead of splitting a node, it is beneficial to push the * data to the right or left node if there is room. * @mas: The maple state - * @height: The current height of the maple state * @mast: The maple subtree state * @left: Push left or not. * @@ -3222,8 +3240,8 @@ static inline void mast_split_data(struct maple_subtree_state *mast, * * Return: True if pushed, false otherwise. 
*/ -static inline bool mas_push_data(struct ma_state *mas, int height, - struct maple_subtree_state *mast, bool left) +static inline bool mas_push_data(struct ma_state *mas, + struct maple_subtree_state *mast, bool left) { unsigned char slot_total = mast->bn->b_end; unsigned char end, space, split; @@ -3280,7 +3298,7 @@ static inline bool mas_push_data(struct ma_state *mas, int height, mast_split_data(mast, mas, split); mast_fill_bnode(mast, mas, 2); - mas_split_final_node(mast, mas, height + 1); + mas_split_final_node(mast, mas); return true; } @@ -3293,6 +3311,7 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) { struct maple_subtree_state mast; int height = 0; + unsigned int orig_height = mas_mt_height(mas); unsigned char mid_split, split = 0; struct maple_enode *old; @@ -3319,7 +3338,6 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); trace_ma_op(__func__, mas); - mas->depth = mas_mt_height(mas); mast.l = &l_mas; mast.r = &r_mas; @@ -3327,9 +3345,9 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) mast.orig_r = &prev_r_mas; mast.bn = b_node; - while (height++ <= mas->depth) { + while (height++ <= orig_height) { if (mt_slots[b_node->type] > b_node->b_end) { - mas_split_final_node(&mast, mas, height); + mas_split_final_node(&mast, mas); break; } @@ -3344,11 +3362,15 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) * is a significant savings. */ /* Try to push left. */ - if (mas_push_data(mas, height, &mast, true)) + if (mas_push_data(mas, &mast, true)) { + height++; break; + } /* Try to push right. */ - if (mas_push_data(mas, height, &mast, false)) + if (mas_push_data(mas, &mast, false)) { + height++; break; + } split = mab_calc_split(mas, b_node, &mid_split); mast_split_data(&mast, mas, split); @@ -3365,7 +3387,7 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) /* Set the original node as dead */ old = mas->node; mas->node = l_mas.node; - mas_wmb_replace(mas, old); + mas_wmb_replace(mas, old, height); mtree_range_walk(mas); return; } @@ -3424,8 +3446,7 @@ static inline void mas_root_expand(struct ma_state *mas, void *entry) if (mas->last != ULONG_MAX) pivots[++slot] = ULONG_MAX; - mas->depth = 1; - mas_set_height(mas); + mt_set_height(mas->tree, 1); ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); @@ -3532,6 +3553,16 @@ static bool mas_wr_walk(struct ma_wr_state *wr_mas) if (ma_is_leaf(wr_mas->type)) return true; + if (mas->end < mt_slots[wr_mas->type] - 1) + wr_mas->vacant_height = mas->depth + 1; + + if (ma_is_root(mas_mn(mas))) { + /* root needs more than 2 entries to be sufficient + 1 */ + if (mas->end > 2) + wr_mas->sufficient_height = 1; + } else if (mas->end > mt_min_slots[wr_mas->type] + 1) + wr_mas->sufficient_height = mas->depth + 1; + mas_wr_walk_traverse(wr_mas); } @@ -3669,8 +3700,7 @@ static inline void mas_new_root(struct ma_state *mas, void *entry) WARN_ON_ONCE(mas->index || mas->last != ULONG_MAX); if (!entry) { - mas->depth = 0; - mas_set_height(mas); + mt_set_height(mas->tree, 0); rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; goto done; @@ -3684,8 +3714,7 @@ static inline void mas_new_root(struct ma_state *mas, void *entry) mas->status = ma_active; rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; - mas->depth = 1; - mas_set_height(mas); + 
mt_set_height(mas->tree, 1); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); done: @@ -3804,6 +3833,7 @@ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, struct maple_node reuse, *newnode; unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type]; bool in_rcu = mt_in_rcu(mas->tree); + unsigned char height = mas_mt_height(mas); if (mas->last == wr_mas->end_piv) offset_end++; /* don't copy this offset */ @@ -3860,7 +3890,7 @@ done: struct maple_enode *old_enode = mas->node; mas->node = mt_mk_node(newnode, wr_mas->type); - mas_replace_node(mas, old_enode); + mas_replace_node(mas, old_enode, height); } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } @@ -4059,15 +4089,6 @@ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) unsigned char new_end = mas_wr_new_end(wr_mas); switch (mas->store_type) { - case wr_invalid: - MT_BUG_ON(mas->tree, 1); - return; - case wr_new_root: - mas_new_root(mas, wr_mas->entry); - break; - case wr_store_root: - mas_store_root(mas, wr_mas->entry); - break; case wr_exact_fit: rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); if (!!wr_mas->entry ^ !!wr_mas->content) @@ -4089,6 +4110,14 @@ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) case wr_rebalance: mas_wr_bnode(wr_mas); break; + case wr_new_root: + mas_new_root(mas, wr_mas->entry); + break; + case wr_store_root: + mas_store_root(mas, wr_mas->entry); + break; + case wr_invalid: + MT_BUG_ON(mas->tree, 1); } return; @@ -4140,18 +4169,41 @@ set_content: /** * mas_prealloc_calc() - Calculate number of nodes needed for a * given store oepration - * @mas: The maple state + * @wr_mas: The maple write state * @entry: The entry to store into the tree * * Return: Number of nodes required for preallocation. */ -static inline int mas_prealloc_calc(struct ma_state *mas, void *entry) +static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) { - int ret = mas_mt_height(mas) * 3 + 1; + struct ma_state *mas = wr_mas->mas; + unsigned char height = mas_mt_height(mas); + int ret = height * 3 + 1; + unsigned char delta = height - wr_mas->vacant_height; switch (mas->store_type) { - case wr_invalid: - WARN_ON_ONCE(1); + case wr_exact_fit: + case wr_append: + case wr_slot_store: + ret = 0; + break; + case wr_spanning_store: + if (wr_mas->sufficient_height < wr_mas->vacant_height) + ret = (height - wr_mas->sufficient_height) * 3 + 1; + else + ret = delta * 3 + 1; + break; + case wr_split_store: + ret = delta * 2 + 1; + break; + case wr_rebalance: + if (wr_mas->sufficient_height < wr_mas->vacant_height) + ret = (height - wr_mas->sufficient_height) * 2 + 1; + else + ret = delta * 2 + 1; + break; + case wr_node_store: + ret = mt_in_rcu(mas->tree) ? 1 : 0; break; case wr_new_root: ret = 1; @@ -4164,22 +4216,8 @@ static inline int mas_prealloc_calc(struct ma_state *mas, void *entry) else ret = 0; break; - case wr_spanning_store: - ret = mas_mt_height(mas) * 3 + 1; - break; - case wr_split_store: - ret = mas_mt_height(mas) * 2 + 1; - break; - case wr_rebalance: - ret = mas_mt_height(mas) * 2 - 1; - break; - case wr_node_store: - ret = mt_in_rcu(mas->tree) ? 
1 : 0; - break; - case wr_append: - case wr_exact_fit: - case wr_slot_store: - ret = 0; + case wr_invalid: + WARN_ON_ONCE(1); } return ret; @@ -4243,16 +4281,15 @@ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) */ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) { - struct ma_state *mas = wr_mas->mas; int request; mas_wr_prealloc_setup(wr_mas); - mas->store_type = mas_wr_store_type(wr_mas); - request = mas_prealloc_calc(mas, entry); + wr_mas->mas->store_type = mas_wr_store_type(wr_mas); + request = mas_prealloc_calc(wr_mas, entry); if (!request) return; - mas_node_count(mas, request); + mas_node_count(wr_mas->mas, request); } /** @@ -4529,15 +4566,12 @@ again: if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; - if (likely(entry)) return entry; if (!empty) { - if (mas->index <= min) { - mas->status = ma_underflow; - return NULL; - } + if (mas->index <= min) + goto underflow; goto again; } @@ -4899,7 +4933,7 @@ void *mas_walk(struct ma_state *mas) { void *entry; - if (!mas_is_active(mas) || !mas_is_start(mas)) + if (!mas_is_active(mas) && !mas_is_start(mas)) mas->status = ma_start; retry: entry = mas_state_walk(mas); @@ -5288,6 +5322,7 @@ static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, struct maple_enode *start; if (mte_is_leaf(enode)) { + mte_set_node_dead(enode); node->type = mte_node_type(enode); goto free_leaf; } @@ -5397,7 +5432,7 @@ void *mas_store(struct ma_state *mas, void *entry) return wr_mas.content; } - request = mas_prealloc_calc(mas, entry); + request = mas_prealloc_calc(&wr_mas, entry); if (!request) goto store; @@ -5494,10 +5529,11 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); - request = mas_prealloc_calc(mas, entry); + request = mas_prealloc_calc(&wr_mas, entry); if (!request) - return ret; + goto set_flag; + mas->mas_flags &= ~MA_STATE_PREALLOC; mas_node_count_gfp(mas, request, gfp); if (mas_is_err(mas)) { mas_set_alloc_req(mas, 0); @@ -5507,6 +5543,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) return ret; } +set_flag: mas->mas_flags |= MA_STATE_PREALLOC; return ret; } @@ -5625,6 +5662,17 @@ int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) } EXPORT_SYMBOL_GPL(mas_expected_entries); +static void mas_may_activate(struct ma_state *mas) +{ + if (!mas->node) { + mas->status = ma_start; + } else if (mas->index > mas->max || mas->index < mas->min) { + mas->status = ma_start; + } else { + mas->status = ma_active; + } +} + static bool mas_next_setup(struct ma_state *mas, unsigned long max, void **entry) { @@ -5648,11 +5696,11 @@ static bool mas_next_setup(struct ma_state *mas, unsigned long max, break; case ma_overflow: /* Overflowed before, but the max changed */ - mas->status = ma_active; + mas_may_activate(mas); break; case ma_underflow: /* The user expects the mas to be one before where it is */ - mas->status = ma_active; + mas_may_activate(mas); *entry = mas_walk(mas); if (*entry) return true; @@ -5773,11 +5821,11 @@ static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry break; case ma_underflow: /* underflowed before but the min changed */ - mas->status = ma_active; + mas_may_activate(mas); break; case ma_overflow: /* User expects mas to be one after where it is */ - mas->status = ma_active; + mas_may_activate(mas); *entry = mas_walk(mas); if (*entry) return true; @@ -5942,7 +5990,7 @@ static 
__always_inline bool mas_find_setup(struct ma_state *mas, unsigned long m return true; } - mas->status = ma_active; + mas_may_activate(mas); *entry = mas_walk(mas); if (*entry) return true; @@ -5951,7 +5999,7 @@ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long m if (unlikely(mas->last >= max)) return true; - mas->status = ma_active; + mas_may_activate(mas); *entry = mas_walk(mas); if (*entry) return true; diff --git a/lib/math/div64.c b/lib/math/div64.c index 5faa29208bdb..bf77b9843175 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -212,12 +212,13 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c) #endif - /* make sure c is not zero, trigger exception otherwise */ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdiv-by-zero" - if (unlikely(c == 0)) - return 1/0; -#pragma GCC diagnostic pop + /* make sure c is not zero, trigger runtime exception otherwise */ + if (unlikely(c == 0)) { + unsigned long zero = 0; + + OPTIMIZER_HIDE_VAR(zero); + return ~0UL/zero; + } int shift = __builtin_ctzll(c); diff --git a/lib/math/gcd.c b/lib/math/gcd.c index e3b042214d1b..62efca6787ae 100644 --- a/lib/math/gcd.c +++ b/lib/math/gcd.c @@ -11,22 +11,16 @@ * has decent hardware division. */ +DEFINE_STATIC_KEY_TRUE(efficient_ffs_key); + #if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) /* If __ffs is available, the even/odd algorithm benchmarks slower. */ -/** - * gcd - calculate and return the greatest common divisor of 2 unsigned longs - * @a: first value - * @b: second value - */ -unsigned long gcd(unsigned long a, unsigned long b) +static unsigned long binary_gcd(unsigned long a, unsigned long b) { unsigned long r = a | b; - if (!a || !b) - return r; - b >>= __ffs(b); if (b == 1) return r & -r; @@ -44,9 +38,15 @@ unsigned long gcd(unsigned long a, unsigned long b) } } -#else +#endif /* If normalization is done by loops, the even/odd algorithm is a win. */ + +/** + * gcd - calculate and return the greatest common divisor of 2 unsigned longs + * @a: first value + * @b: second value + */ unsigned long gcd(unsigned long a, unsigned long b) { unsigned long r = a | b; @@ -54,6 +54,11 @@ unsigned long gcd(unsigned long a, unsigned long b) if (!a || !b) return r; +#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) + if (static_branch_likely(&efficient_ffs_key)) + return binary_gcd(a, b); +#endif + /* Isolate lsbit of r */ r &= -r; @@ -80,6 +85,4 @@ unsigned long gcd(unsigned long a, unsigned long b) } } -#endif - EXPORT_SYMBOL_GPL(gcd); diff --git a/lib/oid_registry.c b/lib/oid_registry.c index fe6705cfd780..9b757a117f09 100644 --- a/lib/oid_registry.c +++ b/lib/oid_registry.c @@ -117,7 +117,7 @@ int parse_OID(const void *data, size_t datasize, enum OID *oid) EXPORT_SYMBOL_GPL(parse_OID); /* - * sprint_OID - Print an Object Identifier into a buffer + * sprint_oid - Print an Object Identifier into a buffer * @data: The encoded OID to print * @datasize: The size of the encoded OID * @buffer: The buffer to render into @@ -173,26 +173,3 @@ bad: return -EBADMSG; } EXPORT_SYMBOL_GPL(sprint_oid); - -/** - * sprint_OID - Print an Object Identifier into a buffer - * @oid: The OID to print - * @buffer: The buffer to render into - * @bufsize: The size of the buffer - * - * The OID is rendered into the buffer in "a.b.c.d" format and the number of - * bytes is returned. 
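The gcd() rework above replaces the build-time CONFIG_CPU_NO_EFFICIENT_FFS split with the efficient_ffs_key static branch, so the even/odd fallback can also be chosen at run time. A minimal sketch of how a caller might wire that up, assuming a hypothetical cpu_has_fast_ffs() capability check (the initcall is illustrative and not part of this series):

static int __init example_gcd_setup(void)
{
	/* Fall back to the even/odd loop when __ffs() is slow on this CPU. */
	if (!cpu_has_fast_ffs())
		static_branch_disable(&efficient_ffs_key);
	return 0;
}
early_initcall(example_gcd_setup);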
- */ -int sprint_OID(enum OID oid, char *buffer, size_t bufsize) -{ - int ret; - - BUG_ON(oid >= OID__NR); - - ret = sprint_oid(oid_data + oid_index[oid], - oid_index[oid + 1] - oid_index[oid], - buffer, bufsize); - BUG_ON(ret == -EBADMSG); - return ret; -} -EXPORT_SYMBOL_GPL(sprint_OID); diff --git a/lib/pldmfw/pldmfw.c b/lib/pldmfw/pldmfw.c index 6264e2013f25..b45ceb725780 100644 --- a/lib/pldmfw/pldmfw.c +++ b/lib/pldmfw/pldmfw.c @@ -728,6 +728,9 @@ pldm_send_package_data(struct pldmfw_priv *data) struct pldmfw_record *record = data->matching_record; const struct pldmfw_ops *ops = data->context->ops; + if (!ops->send_package_data) + return 0; + return ops->send_package_data(data->context, record->package_data, record->package_data_len); } @@ -755,6 +758,9 @@ pldm_send_component_tables(struct pldmfw_priv *data) if (!test_bit(index, bitmap)) continue; + if (!data->context->ops->send_component_table) + continue; + /* determine whether this is the start, middle, end, or both * the start and end of the component tables */ diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 29127dd05d63..5be0a4e60ab1 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -10,6 +10,7 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \ raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o +raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o hostprogs += mktables diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index cd2e88ee1f14..799e0e5eac26 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -18,9 +18,6 @@ #else #include <linux/module.h> #include <linux/gfp.h> -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -EXPORT_SYMBOL(raid6_empty_zero_page); #endif struct raid6_calls raid6_call; @@ -28,10 +25,8 @@ EXPORT_SYMBOL_GPL(raid6_call); const struct raid6_calls * const raid6_algos[] = { #if defined(__i386__) && !defined(__arch_um__) -#ifdef CONFIG_AS_AVX512 &raid6_avx512x2, &raid6_avx512x1, -#endif &raid6_avx2x2, &raid6_avx2x1, &raid6_sse2x2, @@ -42,11 +37,9 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_mmxx1, #endif #if defined(__x86_64__) && !defined(__arch_um__) -#ifdef CONFIG_AS_AVX512 &raid6_avx512x4, &raid6_avx512x2, &raid6_avx512x1, -#endif &raid6_avx2x4, &raid6_avx2x2, &raid6_avx2x1, @@ -81,6 +74,12 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_lsx, #endif #endif +#ifdef CONFIG_RISCV_ISA_V + &raid6_rvvx1, + &raid6_rvvx2, + &raid6_rvvx4, + &raid6_rvvx8, +#endif &raid6_intx8, &raid6_intx4, &raid6_intx2, @@ -96,9 +95,7 @@ EXPORT_SYMBOL_GPL(raid6_datap_recov); const struct raid6_recov_calls *const raid6_recov_algos[] = { #ifdef CONFIG_X86 -#ifdef CONFIG_AS_AVX512 &raid6_recov_avx512, -#endif &raid6_recov_avx2, &raid6_recov_ssse3, #endif @@ -116,6 +113,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = { &raid6_recov_lsx, #endif #endif +#ifdef CONFIG_RISCV_ISA_V + &raid6_recov_rvv, +#endif &raid6_recov_intx1, NULL }; diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c index 9c3e822e1adf..009bd0adeebf 100644 --- a/lib/raid6/avx512.c +++ b/lib/raid6/avx512.c @@ -17,8 +17,6 @@ * */ -#ifdef CONFIG_AS_AVX512 - #include <linux/raid/pq.h> #include "x86.h" @@ -560,5 +558,3 @@ const struct raid6_calls raid6_avx512x4 = { .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ }; #endif 
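The RVV entries added to raid6_algos[] above only become candidates when their ->valid() hook reports vector support; the existing boot-time benchmark in algos.c then times each candidate's gen_syndrome() and keeps the fastest. A minimal sketch of the table-entry shape, with the wrapper around the _real function assumed here to mirror the recovery code below:

static void raid6_rvv1_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	kernel_vector_begin();	/* enable kernel-mode vector state */
	raid6_rvv1_gen_syndrome_real(disks, bytes, ptrs);
	kernel_vector_end();
}

const struct raid6_calls raid6_rvvx1 = {
	.gen_syndrome	= raid6_rvv1_gen_syndrome,
	.valid		= rvv_has_vector,	/* only benchmark when the V extension is usable */
	.name		= "rvvx1",
	.priority	= 1,
};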
- -#endif /* CONFIG_AS_AVX512 */ diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index a7c1b2bbe40d..b5e47c008b41 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c @@ -31,10 +31,10 @@ static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -72,7 +72,7 @@ static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c index 4e8095403ee2..97d598d2535c 100644 --- a/lib/raid6/recov_avx2.c +++ b/lib/raid6/recov_avx2.c @@ -28,10 +28,10 @@ static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -196,7 +196,7 @@ static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c index fd9e15bf3f30..7986120ca444 100644 --- a/lib/raid6/recov_avx512.c +++ b/lib/raid6/recov_avx512.c @@ -6,8 +6,6 @@ * Author: Megha Dey <megha.dey@linux.intel.com> */ -#ifdef CONFIG_AS_AVX512 - #include <linux/raid/pq.h> #include "x86.h" @@ -39,10 +37,10 @@ static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila, */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -240,7 +238,7 @@ static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila, */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -377,7 +375,3 @@ const struct raid6_recov_calls raid6_recov_avx512 = { #endif .priority = 3, }; - -#else -#warning "your version of binutils lacks AVX512 support" -#endif diff --git a/lib/raid6/recov_loongarch_simd.c b/lib/raid6/recov_loongarch_simd.c index 94aeac85e6f7..93dc515997a1 100644 --- a/lib/raid6/recov_loongarch_simd.c +++ b/lib/raid6/recov_loongarch_simd.c @@ -42,10 +42,10 @@ static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void 
*)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -197,7 +197,7 @@ static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila, * Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -316,10 +316,10 @@ static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -436,7 +436,7 @@ static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila, * Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c index 1bfc14174d4d..70e1404c1512 100644 --- a/lib/raid6/recov_neon.c +++ b/lib/raid6/recov_neon.c @@ -36,10 +36,10 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -74,7 +74,7 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, * Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c new file mode 100644 index 000000000000..5d54c4b437df --- /dev/null +++ b/lib/raid6/recov_rvv.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2024 Institute of Software, CAS. 
+ * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn> + */ + +#include <asm/simd.h> +#include <asm/vector.h> +#include <crypto/internal/simd.h> +#include <linux/raid/pq.h> + +static int rvv_has_vector(void) +{ + return has_vector(); +} + +static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp, + u8 *dq, const u8 *pbmul, + const u8 *qmul) +{ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli x0, %[avl], e8, m1, ta, ma\n" + ".option pop\n" + : : + [avl]"r"(16) + ); + + /* + * while ( bytes-- ) { + * uint8_t px, qx, db; + * + * px = *p ^ *dp; + * qx = qmul[*q ^ *dq]; + * *dq++ = db = pbmul[px] ^ qx; + * *dp++ = db ^ px; + * p++; q++; + * } + */ + while (bytes) { + /* + * v0:px, v1:dp, + * v2:qx, v3:dq, + * v4:vx, v5:vy, + * v6:qm0, v7:qm1, + * v8:pm0, v9:pm1, + * v14:p/qm[vx], v15:p/qm[vy] + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[px])\n" + "vle8.v v1, (%[dp])\n" + "vxor.vv v0, v0, v1\n" + "vle8.v v2, (%[qx])\n" + "vle8.v v3, (%[dq])\n" + "vxor.vv v4, v2, v3\n" + "vsrl.vi v5, v4, 4\n" + "vand.vi v4, v4, 0xf\n" + "vle8.v v6, (%[qm0])\n" + "vle8.v v7, (%[qm1])\n" + "vrgather.vv v14, v6, v4\n" /* v14 = qm[vx] */ + "vrgather.vv v15, v7, v5\n" /* v15 = qm[vy] */ + "vxor.vv v2, v14, v15\n" /* v2 = qmul[*q ^ *dq] */ + + "vsrl.vi v5, v0, 4\n" + "vand.vi v4, v0, 0xf\n" + "vle8.v v8, (%[pm0])\n" + "vle8.v v9, (%[pm1])\n" + "vrgather.vv v14, v8, v4\n" /* v14 = pm[vx] */ + "vrgather.vv v15, v9, v5\n" /* v15 = pm[vy] */ + "vxor.vv v4, v14, v15\n" /* v4 = pbmul[px] */ + "vxor.vv v3, v4, v2\n" /* v3 = db = pbmul[px] ^ qx */ + "vxor.vv v1, v3, v0\n" /* v1 = db ^ px; */ + "vse8.v v3, (%[dq])\n" + "vse8.v v1, (%[dp])\n" + ".option pop\n" + : : + [px]"r"(p), + [dp]"r"(dp), + [qx]"r"(q), + [dq]"r"(dq), + [qm0]"r"(qmul), + [qm1]"r"(qmul + 16), + [pm0]"r"(pbmul), + [pm1]"r"(pbmul + 16) + :); + + bytes -= 16; + p += 16; + q += 16; + dp += 16; + dq += 16; + } +} + +static void __raid6_datap_recov_rvv(int bytes, u8 *p, u8 *q, + u8 *dq, const u8 *qmul) +{ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli x0, %[avl], e8, m1, ta, ma\n" + ".option pop\n" + : : + [avl]"r"(16) + ); + + /* + * while (bytes--) { + * *p++ ^= *dq = qmul[*q ^ *dq]; + * q++; dq++; + * } + */ + while (bytes) { + /* + * v0:vx, v1:vy, + * v2:dq, v3:p, + * v4:qm0, v5:qm1, + * v10:m[vx], v11:m[vy] + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[vx])\n" + "vle8.v v2, (%[dq])\n" + "vxor.vv v0, v0, v2\n" + "vsrl.vi v1, v0, 4\n" + "vand.vi v0, v0, 0xf\n" + "vle8.v v4, (%[qm0])\n" + "vle8.v v5, (%[qm1])\n" + "vrgather.vv v10, v4, v0\n" + "vrgather.vv v11, v5, v1\n" + "vxor.vv v0, v10, v11\n" + "vle8.v v1, (%[vy])\n" + "vxor.vv v1, v0, v1\n" + "vse8.v v0, (%[dq])\n" + "vse8.v v1, (%[vy])\n" + ".option pop\n" + : : + [vx]"r"(q), + [vy]"r"(p), + [dq]"r"(dq), + [qm0]"r"(qmul), + [qm1]"r"(qmul + 16) + :); + + bytes -= 16; + p += 16; + q += 16; + dq += 16; + } +} + +static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + 
ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_vector_begin(); + __raid6_2data_recov_rvv(bytes, p, q, dp, dq, pbmul, qmul); + kernel_vector_end(); +} + +static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_vector_begin(); + __raid6_datap_recov_rvv(bytes, p, q, dq, qmul); + kernel_vector_end(); +} + +const struct raid6_recov_calls raid6_recov_rvv = { + .data2 = raid6_2data_recov_rvv, + .datap = raid6_datap_recov_rvv, + .valid = rvv_has_vector, + .name = "rvv", + .priority = 1, +}; diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c index 179eec900cea..487018f81192 100644 --- a/lib/raid6/recov_s390xc.c +++ b/lib/raid6/recov_s390xc.c @@ -6,7 +6,6 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/export.h> #include <linux/raid/pq.h> static inline void xor_block(u8 *p1, u8 *p2) @@ -35,10 +34,10 @@ static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -82,7 +81,7 @@ static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c index 4bfa3c6b60de..2e849185c32b 100644 --- a/lib/raid6/recov_ssse3.c +++ b/lib/raid6/recov_ssse3.c @@ -30,10 +30,10 @@ static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -203,7 +203,7 @@ static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = 
raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c new file mode 100644 index 000000000000..7d82efa5b14f --- /dev/null +++ b/lib/raid6/rvv.c @@ -0,0 +1,1220 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RAID-6 syndrome calculation using RISC-V vector instructions + * + * Copyright 2024 Institute of Software, CAS. + * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn> + * + * Based on neon.uc: + * Copyright 2002-2004 H. Peter Anvin + */ + +#include <asm/simd.h> +#include <asm/vector.h> +#include <crypto/internal/simd.h> +#include <linux/raid/pq.h> +#include <linux/types.h> +#include "rvv.h" + +#define NSIZE (riscv_v_vsize / 32) /* NSIZE = vlenb */ + +static int rvv_has_vector(void) +{ + return has_vector(); +} + +static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */ + for (d = 0; d < bytes; d += NSIZE * 1) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vle8.v v1, (%[wp0])\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * NSIZE]) + ); + + for (z = z0 - 1 ; z >= 0 ; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * NSIZE]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + NSIZE * 0]), + [wq0]"r"(&q[d + NSIZE * 0]) + ); + } +} + +static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */ + for (d = 0 ; d < bytes ; d += NSIZE * 1) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vle8.v v1, (%[wp0])\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * NSIZE]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, 
%[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * NSIZE]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + NSIZE * 0]), + [wq0]"r"(&q[d + NSIZE * 0]) + ); + } +} + +static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + */ + for (d = 0; d < bytes; d += NSIZE * 2) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vle8.v v1, (%[wp0])\n" + "vle8.v v4, (%[wp1])\n" + "vle8.v v5, (%[wp1])\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), + [wp1]"r"(&dptr[z0][d + 1 * NSIZE]) + ); + + for (z = z0 - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * NSIZE]), + [wd1]"r"(&dptr[z][d + 1 * NSIZE]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + "vse8.v v4, (%[wp1])\n" + "vse8.v v5, (%[wq1])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + NSIZE * 0]), + [wq0]"r"(&q[d + NSIZE * 0]), + [wp1]"r"(&p[d + NSIZE * 1]), + [wq1]"r"(&q[d + NSIZE * 1]) + ); + } +} + +static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + /* + * v0:wp0, v1:wq0, 
v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + */ + for (d = 0; d < bytes; d += NSIZE * 2) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vle8.v v1, (%[wp0])\n" + "vle8.v v4, (%[wp1])\n" + "vle8.v v5, (%[wp1])\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), + [wp1]"r"(&dptr[z0][d + 1 * NSIZE]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * NSIZE]), + [wd1]"r"(&dptr[z][d + 1 * NSIZE]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v5, v7, v6\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + * v4:wp1, v5:wq1, v6:p1, v7:q1 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + + "vle8.v v6, (%[wp1])\n" + "vle8.v v7, (%[wq1])\n" + "vxor.vv v6, v6, v4\n" + "vxor.vv v7, v7, v5\n" + "vse8.v v6, (%[wp1])\n" + "vse8.v v7, (%[wq1])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + NSIZE * 0]), + [wq0]"r"(&q[d + NSIZE * 0]), + [wp1]"r"(&p[d + NSIZE * 1]), + [wq1]"r"(&q[d + NSIZE * 1]) + ); + } +} + +static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + */ + for (d = 0; d < bytes; d += NSIZE * 4) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vle8.v v1, (%[wp0])\n" + "vle8.v v4, (%[wp1])\n" + "vle8.v v5, (%[wp1])\n" + "vle8.v v8, (%[wp2])\n" + "vle8.v v9, (%[wp2])\n" + "vle8.v v12, (%[wp3])\n" + "vle8.v v13, (%[wp3])\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), + [wp1]"r"(&dptr[z0][d + 1 * NSIZE]), + [wp2]"r"(&dptr[z0][d + 2 * NSIZE]), + [wp3]"r"(&dptr[z0][d + 3 * NSIZE]) + ); + + for (z = 
z0 - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * NSIZE]), + [wd1]"r"(&dptr[z][d + 1 * NSIZE]), + [wd2]"r"(&dptr[z][d + 2 * NSIZE]), + [wd3]"r"(&dptr[z][d + 3 * NSIZE]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + "vse8.v v4, (%[wp1])\n" + "vse8.v v5, (%[wq1])\n" + "vse8.v v8, (%[wp2])\n" + "vse8.v v9, (%[wq2])\n" + "vse8.v v12, (%[wp3])\n" + "vse8.v v13, (%[wq3])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + NSIZE * 0]), + [wq0]"r"(&q[d + NSIZE * 0]), + [wp1]"r"(&p[d + NSIZE * 1]), + [wq1]"r"(&q[d + NSIZE * 1]), + [wp2]"r"(&p[d + NSIZE * 2]), + [wq2]"r"(&q[d + NSIZE * 2]), + [wp3]"r"(&p[d + NSIZE * 3]), + [wq3]"r"(&q[d + NSIZE * 3]) + ); + } +} + +static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + */ + for (d = 0; d < bytes; d += NSIZE * 4) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vle8.v v1, (%[wp0])\n" + "vle8.v v4, (%[wp1])\n" + "vle8.v v5, (%[wp1])\n" + "vle8.v v8, (%[wp2])\n" + "vle8.v v9, (%[wp2])\n" + "vle8.v v12, (%[wp3])\n" + "vle8.v v13, (%[wp3])\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), + [wp1]"r"(&dptr[z0][d + 1 * NSIZE]), + [wp2]"r"(&dptr[z0][d + 2 * NSIZE]), + [wp3]"r"(&dptr[z0][d + 3 * NSIZE]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi 
v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * NSIZE]), + [wd1]"r"(&dptr[z][d + 1 * NSIZE]), + [wd2]"r"(&dptr[z][d + 2 * NSIZE]), + [wd3]"r"(&dptr[z][d + 3 * NSIZE]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v5, v7, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v9, v11, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v13, v15, v14\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + * v4:wp1, v5:wq1, v6:p1, v7:q1 + * v8:wp2, v9:wq2, v10:p2, v11:q2 + * v12:wp3, v13:wq3, v14:p3, v15:q3 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + + "vle8.v v6, (%[wp1])\n" + "vle8.v v7, (%[wq1])\n" + "vxor.vv v6, v6, v4\n" + "vxor.vv v7, v7, v5\n" + "vse8.v v6, (%[wp1])\n" + "vse8.v v7, (%[wq1])\n" + + "vle8.v v10, (%[wp2])\n" + "vle8.v v11, (%[wq2])\n" + "vxor.vv v10, v10, v8\n" + "vxor.vv v11, v11, v9\n" + "vse8.v v10, (%[wp2])\n" + "vse8.v v11, (%[wq2])\n" + + "vle8.v v14, (%[wp3])\n" + "vle8.v v15, (%[wq3])\n" + "vxor.vv v14, v14, v12\n" + "vxor.vv v15, v15, v13\n" + "vse8.v v14, (%[wp3])\n" + "vse8.v v15, (%[wq3])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + NSIZE * 0]), + [wq0]"r"(&q[d + NSIZE * 0]), + [wp1]"r"(&p[d + NSIZE * 1]), + [wq1]"r"(&q[d + NSIZE * 1]), + [wp2]"r"(&p[d + NSIZE * 2]), + [wq2]"r"(&q[d + NSIZE * 2]), + [wp3]"r"(&p[d + NSIZE * 3]), + [wq3]"r"(&q[d + NSIZE * 3]) + ); + } +} + +static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14 + * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15 + * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16 + * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17 + */ + for (d = 0; d < bytes; d += NSIZE * 8) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option 
push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vle8.v v1, (%[wp0])\n" + "vle8.v v4, (%[wp1])\n" + "vle8.v v5, (%[wp1])\n" + "vle8.v v8, (%[wp2])\n" + "vle8.v v9, (%[wp2])\n" + "vle8.v v12, (%[wp3])\n" + "vle8.v v13, (%[wp3])\n" + "vle8.v v16, (%[wp4])\n" + "vle8.v v17, (%[wp4])\n" + "vle8.v v20, (%[wp5])\n" + "vle8.v v21, (%[wp5])\n" + "vle8.v v24, (%[wp6])\n" + "vle8.v v25, (%[wp6])\n" + "vle8.v v28, (%[wp7])\n" + "vle8.v v29, (%[wp7])\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), + [wp1]"r"(&dptr[z0][d + 1 * NSIZE]), + [wp2]"r"(&dptr[z0][d + 2 * NSIZE]), + [wp3]"r"(&dptr[z0][d + 3 * NSIZE]), + [wp4]"r"(&dptr[z0][d + 4 * NSIZE]), + [wp5]"r"(&dptr[z0][d + 5 * NSIZE]), + [wp6]"r"(&dptr[z0][d + 6 * NSIZE]), + [wp7]"r"(&dptr[z0][d + 7 * NSIZE]) + ); + + for (z = z0 - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + + "vsra.vi v18, v17, 7\n" + "vsll.vi v19, v17, 1\n" + "vand.vx v18, v18, %[x1d]\n" + "vxor.vv v19, v19, v18\n" + "vle8.v v18, (%[wd4])\n" + "vxor.vv v17, v19, v18\n" + "vxor.vv v16, v16, v18\n" + + "vsra.vi v22, v21, 7\n" + "vsll.vi v23, v21, 1\n" + "vand.vx v22, v22, %[x1d]\n" + "vxor.vv v23, v23, v22\n" + "vle8.v v22, (%[wd5])\n" + "vxor.vv v21, v23, v22\n" + "vxor.vv v20, v20, v22\n" + + "vsra.vi v26, v25, 7\n" + "vsll.vi v27, v25, 1\n" + "vand.vx v26, v26, %[x1d]\n" + "vxor.vv v27, v27, v26\n" + "vle8.v v26, (%[wd6])\n" + "vxor.vv v25, v27, v26\n" + "vxor.vv v24, v24, v26\n" + + "vsra.vi v30, v29, 7\n" + "vsll.vi v31, v29, 1\n" + "vand.vx v30, v30, %[x1d]\n" + "vxor.vv v31, v31, v30\n" + "vle8.v v30, (%[wd7])\n" + "vxor.vv v29, v31, v30\n" + "vxor.vv v28, v28, v30\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * NSIZE]), + [wd1]"r"(&dptr[z][d + 1 * NSIZE]), + [wd2]"r"(&dptr[z][d + 2 * NSIZE]), + [wd3]"r"(&dptr[z][d + 3 * NSIZE]), + [wd4]"r"(&dptr[z][d + 4 * NSIZE]), + [wd5]"r"(&dptr[z][d + 5 * NSIZE]), + [wd6]"r"(&dptr[z][d + 6 * NSIZE]), + [wd7]"r"(&dptr[z][d + 7 * NSIZE]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + "vse8.v v4, (%[wp1])\n" + "vse8.v v5, (%[wq1])\n" + "vse8.v v8, (%[wp2])\n" + "vse8.v v9, (%[wq2])\n" + "vse8.v v12, (%[wp3])\n" + "vse8.v v13, (%[wq3])\n" + "vse8.v v16, (%[wp4])\n" + "vse8.v v17, (%[wq4])\n" + "vse8.v v20, (%[wp5])\n" + "vse8.v v21, (%[wq5])\n" + "vse8.v v24, (%[wp6])\n" + "vse8.v v25, (%[wq6])\n" + "vse8.v v28, (%[wp7])\n" + "vse8.v v29, (%[wq7])\n" + ".option pop\n" + : : + 
[wp0]"r"(&p[d + NSIZE * 0]), + [wq0]"r"(&q[d + NSIZE * 0]), + [wp1]"r"(&p[d + NSIZE * 1]), + [wq1]"r"(&q[d + NSIZE * 1]), + [wp2]"r"(&p[d + NSIZE * 2]), + [wq2]"r"(&q[d + NSIZE * 2]), + [wp3]"r"(&p[d + NSIZE * 3]), + [wq3]"r"(&q[d + NSIZE * 3]), + [wp4]"r"(&p[d + NSIZE * 4]), + [wq4]"r"(&q[d + NSIZE * 4]), + [wp5]"r"(&p[d + NSIZE * 5]), + [wq5]"r"(&q[d + NSIZE * 5]), + [wp6]"r"(&p[d + NSIZE * 6]), + [wq6]"r"(&q[d + NSIZE * 6]), + [wp7]"r"(&p[d + NSIZE * 7]), + [wq7]"r"(&q[d + NSIZE * 7]) + ); + } +} + +static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14 + * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15 + * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16 + * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17 + */ + for (d = 0; d < bytes; d += NSIZE * 8) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vle8.v v1, (%[wp0])\n" + "vle8.v v4, (%[wp1])\n" + "vle8.v v5, (%[wp1])\n" + "vle8.v v8, (%[wp2])\n" + "vle8.v v9, (%[wp2])\n" + "vle8.v v12, (%[wp3])\n" + "vle8.v v13, (%[wp3])\n" + "vle8.v v16, (%[wp4])\n" + "vle8.v v17, (%[wp4])\n" + "vle8.v v20, (%[wp5])\n" + "vle8.v v21, (%[wp5])\n" + "vle8.v v24, (%[wp6])\n" + "vle8.v v25, (%[wp6])\n" + "vle8.v v28, (%[wp7])\n" + "vle8.v v29, (%[wp7])\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), + [wp1]"r"(&dptr[z0][d + 1 * NSIZE]), + [wp2]"r"(&dptr[z0][d + 2 * NSIZE]), + [wp3]"r"(&dptr[z0][d + 3 * NSIZE]), + [wp4]"r"(&dptr[z0][d + 4 * NSIZE]), + [wp5]"r"(&dptr[z0][d + 5 * NSIZE]), + [wp6]"r"(&dptr[z0][d + 6 * NSIZE]), + [wp7]"r"(&dptr[z0][d + 7 * NSIZE]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + + "vsra.vi v18, v17, 7\n" + "vsll.vi v19, v17, 1\n" + "vand.vx v18, v18, %[x1d]\n" + "vxor.vv v19, v19, v18\n" + "vle8.v v18, (%[wd4])\n" + "vxor.vv v17, v19, v18\n" + "vxor.vv v16, v16, v18\n" + + "vsra.vi v22, v21, 7\n" + "vsll.vi v23, v21, 1\n" + "vand.vx v22, v22, %[x1d]\n" 
+ "vxor.vv v23, v23, v22\n" + "vle8.v v22, (%[wd5])\n" + "vxor.vv v21, v23, v22\n" + "vxor.vv v20, v20, v22\n" + + "vsra.vi v26, v25, 7\n" + "vsll.vi v27, v25, 1\n" + "vand.vx v26, v26, %[x1d]\n" + "vxor.vv v27, v27, v26\n" + "vle8.v v26, (%[wd6])\n" + "vxor.vv v25, v27, v26\n" + "vxor.vv v24, v24, v26\n" + + "vsra.vi v30, v29, 7\n" + "vsll.vi v31, v29, 1\n" + "vand.vx v30, v30, %[x1d]\n" + "vxor.vv v31, v31, v30\n" + "vle8.v v30, (%[wd7])\n" + "vxor.vv v29, v31, v30\n" + "vxor.vv v28, v28, v30\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * NSIZE]), + [wd1]"r"(&dptr[z][d + 1 * NSIZE]), + [wd2]"r"(&dptr[z][d + 2 * NSIZE]), + [wd3]"r"(&dptr[z][d + 3 * NSIZE]), + [wd4]"r"(&dptr[z][d + 4 * NSIZE]), + [wd5]"r"(&dptr[z][d + 5 * NSIZE]), + [wd6]"r"(&dptr[z][d + 6 * NSIZE]), + [wd7]"r"(&dptr[z][d + 7 * NSIZE]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v5, v7, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v9, v11, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v13, v15, v14\n" + + "vsra.vi v18, v17, 7\n" + "vsll.vi v19, v17, 1\n" + "vand.vx v18, v18, %[x1d]\n" + "vxor.vv v17, v19, v18\n" + + "vsra.vi v22, v21, 7\n" + "vsll.vi v23, v21, 1\n" + "vand.vx v22, v22, %[x1d]\n" + "vxor.vv v21, v23, v22\n" + + "vsra.vi v26, v25, 7\n" + "vsll.vi v27, v25, 1\n" + "vand.vx v26, v26, %[x1d]\n" + "vxor.vv v25, v27, v26\n" + + "vsra.vi v30, v29, 7\n" + "vsll.vi v31, v29, 1\n" + "vand.vx v30, v30, %[x1d]\n" + "vxor.vv v29, v31, v30\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + * v4:wp1, v5:wq1, v6:p1, v7:q1 + * v8:wp2, v9:wq2, v10:p2, v11:q2 + * v12:wp3, v13:wq3, v14:p3, v15:q3 + * v16:wp4, v17:wq4, v18:p4, v19:q4 + * v20:wp5, v21:wq5, v22:p5, v23:q5 + * v24:wp6, v25:wq6, v26:p6, v27:q6 + * v28:wp7, v29:wq7, v30:p7, v31:q7 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + + "vle8.v v6, (%[wp1])\n" + "vle8.v v7, (%[wq1])\n" + "vxor.vv v6, v6, v4\n" + "vxor.vv v7, v7, v5\n" + "vse8.v v6, (%[wp1])\n" + "vse8.v v7, (%[wq1])\n" + + "vle8.v v10, (%[wp2])\n" + "vle8.v v11, (%[wq2])\n" + "vxor.vv v10, v10, v8\n" + "vxor.vv v11, v11, v9\n" + "vse8.v v10, (%[wp2])\n" + "vse8.v v11, (%[wq2])\n" + + "vle8.v v14, (%[wp3])\n" + "vle8.v v15, (%[wq3])\n" + "vxor.vv v14, v14, v12\n" + "vxor.vv v15, v15, v13\n" + "vse8.v v14, (%[wp3])\n" + "vse8.v v15, (%[wq3])\n" + + "vle8.v v18, (%[wp4])\n" + "vle8.v v19, (%[wq4])\n" + "vxor.vv v18, v18, v16\n" + "vxor.vv v19, v19, v17\n" + "vse8.v v18, (%[wp4])\n" + "vse8.v v19, (%[wq4])\n" + + "vle8.v v22, (%[wp5])\n" + "vle8.v v23, (%[wq5])\n" + "vxor.vv v22, v22, v20\n" + "vxor.vv v23, v23, v21\n" + "vse8.v v22, (%[wp5])\n" + "vse8.v v23, (%[wq5])\n" + + "vle8.v v26, (%[wp6])\n" + "vle8.v v27, (%[wq6])\n" + "vxor.vv v26, v26, v24\n" + "vxor.vv v27, v27, v25\n" + "vse8.v v26, 
(%[wp6])\n" + "vse8.v v27, (%[wq6])\n" + + "vle8.v v30, (%[wp7])\n" + "vle8.v v31, (%[wq7])\n" + "vxor.vv v30, v30, v28\n" + "vxor.vv v31, v31, v29\n" + "vse8.v v30, (%[wp7])\n" + "vse8.v v31, (%[wq7])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + NSIZE * 0]), + [wq0]"r"(&q[d + NSIZE * 0]), + [wp1]"r"(&p[d + NSIZE * 1]), + [wq1]"r"(&q[d + NSIZE * 1]), + [wp2]"r"(&p[d + NSIZE * 2]), + [wq2]"r"(&q[d + NSIZE * 2]), + [wp3]"r"(&p[d + NSIZE * 3]), + [wq3]"r"(&q[d + NSIZE * 3]), + [wp4]"r"(&p[d + NSIZE * 4]), + [wq4]"r"(&q[d + NSIZE * 4]), + [wp5]"r"(&p[d + NSIZE * 5]), + [wq5]"r"(&q[d + NSIZE * 5]), + [wp6]"r"(&p[d + NSIZE * 6]), + [wq6]"r"(&q[d + NSIZE * 6]), + [wp7]"r"(&p[d + NSIZE * 7]), + [wq7]"r"(&q[d + NSIZE * 7]) + ); + } +} + +RAID6_RVV_WRAPPER(1); +RAID6_RVV_WRAPPER(2); +RAID6_RVV_WRAPPER(4); +RAID6_RVV_WRAPPER(8); diff --git a/lib/raid6/rvv.h b/lib/raid6/rvv.h new file mode 100644 index 000000000000..94044a1b707b --- /dev/null +++ b/lib/raid6/rvv.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2024 Institute of Software, CAS. + * + * raid6/rvv.h + * + * Definitions for RISC-V RAID-6 code + */ + +#define RAID6_RVV_WRAPPER(_n) \ + static void raid6_rvv ## _n ## _gen_syndrome(int disks, \ + size_t bytes, void **ptrs) \ + { \ + void raid6_rvv ## _n ## _gen_syndrome_real(int d, \ + unsigned long b, void **p); \ + kernel_vector_begin(); \ + raid6_rvv ## _n ## _gen_syndrome_real(disks, \ + (unsigned long)bytes, ptrs); \ + kernel_vector_end(); \ + } \ + static void raid6_rvv ## _n ## _xor_syndrome(int disks, \ + int start, int stop, \ + size_t bytes, void **ptrs) \ + { \ + void raid6_rvv ## _n ## _xor_syndrome_real(int d, \ + int s1, int s2, \ + unsigned long b, void **p); \ + kernel_vector_begin(); \ + raid6_rvv ## _n ## _xor_syndrome_real(disks, \ + start, stop, (unsigned long)bytes, ptrs); \ + kernel_vector_end(); \ + } \ + struct raid6_calls const raid6_rvvx ## _n = { \ + raid6_rvv ## _n ## _gen_syndrome, \ + raid6_rvv ## _n ## _xor_syndrome, \ + rvv_has_vector, \ + "rvvx" #_n, \ + 0 \ + } diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 2abe0076a636..8f2dd2210ba8 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -54,9 +54,6 @@ endif ifeq ($(IS_X86),yes) OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o CFLAGS += -DCONFIG_X86 - CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \ - gcc -c -x assembler - >/dev/null 2>&1 && \ - rm ./-.o && echo -DCONFIG_AS_AVX512=1) else ifeq ($(HAS_NEON),yes) OBJS += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 diff --git a/lib/ratelimit.c b/lib/ratelimit.c index ce945c17980b..859c251b23ce 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -33,44 +33,73 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) int interval = READ_ONCE(rs->interval); int burst = READ_ONCE(rs->burst); unsigned long flags; - int ret; + int ret = 0; - if (!interval) - return 1; + /* + * Zero interval says never limit, otherwise, non-positive burst + * says always limit. + */ + if (interval <= 0 || burst <= 0) { + WARN_ONCE(interval < 0 || burst < 0, "Negative interval (%d) or burst (%d): Uninitialized ratelimit_state structure?\n", interval, burst); + ret = interval == 0 || burst > 0; + if (!(READ_ONCE(rs->flags) & RATELIMIT_INITIALIZED) || (!interval && !burst) || + !raw_spin_trylock_irqsave(&rs->lock, flags)) + goto nolock_ret; + + /* Force re-initialization once re-enabled. 
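+	 * (Clearing RATELIMIT_INITIALIZED re-arms rs->begin and rs->rs_n_left
+	 * the next time limiting is turned back on.)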
*/ + rs->flags &= ~RATELIMIT_INITIALIZED; + goto unlock_ret; + } /* - * If we contend on this state's lock then almost - * by definition we are too busy to print a message, - * in addition to the one that will be printed by - * the entity that is holding the lock already: + * If we contend on this state's lock then just check if + * the current burst is used or not. It might cause + * false positive when we are past the interval and + * the current lock owner is just about to reset it. */ - if (!raw_spin_trylock_irqsave(&rs->lock, flags)) - return 0; + if (!raw_spin_trylock_irqsave(&rs->lock, flags)) { + if (READ_ONCE(rs->flags) & RATELIMIT_INITIALIZED && + atomic_read(&rs->rs_n_left) > 0 && atomic_dec_return(&rs->rs_n_left) >= 0) + ret = 1; + goto nolock_ret; + } - if (!rs->begin) + if (!(rs->flags & RATELIMIT_INITIALIZED)) { rs->begin = jiffies; + rs->flags |= RATELIMIT_INITIALIZED; + atomic_set(&rs->rs_n_left, rs->burst); + } if (time_is_before_jiffies(rs->begin + interval)) { - if (rs->missed) { - if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { + int m; + + /* + * Reset rs_n_left ASAP to reduce false positives + * in parallel calls, see above. + */ + atomic_set(&rs->rs_n_left, rs->burst); + rs->begin = jiffies; + + if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { + m = ratelimit_state_reset_miss(rs); + if (m) { printk_deferred(KERN_WARNING - "%s: %d callbacks suppressed\n", - func, rs->missed); - rs->missed = 0; + "%s: %d callbacks suppressed\n", func, m); } } - rs->begin = jiffies; - rs->printed = 0; } - if (burst && burst > rs->printed) { - rs->printed++; + + /* Note that the burst might be taken by a parallel call. */ + if (atomic_read(&rs->rs_n_left) > 0 && atomic_dec_return(&rs->rs_n_left) >= 0) ret = 1; - } else { - rs->missed++; - ret = 0; - } + +unlock_ret: raw_spin_unlock_irqrestore(&rs->lock, flags); +nolock_ret: + if (!ret) + ratelimit_state_inc_miss(rs); + return ret; } EXPORT_SYMBOL(___ratelimit); diff --git a/lib/rbtree.c b/lib/rbtree.c index 989c2d615f92..5114eda6309c 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -297,9 +297,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * / \ / \ * N S --> N sl * / \ \ - * sl sr S + * sl Sr S * \ - * sr + * Sr * * Note: p might be red, and then both * p and sl are red after rotation(which @@ -312,9 +312,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * / \ / \ * N sl --> P S * \ / \ - * S N sr + * S N Sr * \ - * sr + * Sr */ tmp1 = tmp2->rb_right; WRITE_ONCE(sibling->rb_left, tmp1); diff --git a/lib/ref_tracker.c b/lib/ref_tracker.c index cf5609b1ca79..a9e6ffcff04b 100644 --- a/lib/ref_tracker.c +++ b/lib/ref_tracker.c @@ -8,6 +8,7 @@ #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/stackdepot.h> +#include <linux/seq_file.h> #define REF_TRACKER_STACK_ENTRIES 16 #define STACK_BUF_SIZE 1024 @@ -28,6 +29,45 @@ struct ref_tracker_dir_stats { } stacks[]; }; +#ifdef CONFIG_DEBUG_FS +#include <linux/xarray.h> + +/* + * ref_tracker_dir_init() is usually called in allocation-safe contexts, but + * the same is not true of ref_tracker_dir_exit() which can be called from + * anywhere an object is freed. Removing debugfs dentries is a blocking + * operation, so we defer that work to the debugfs_reap_worker. + * + * Each dentry is tracked in the appropriate xarray. When + * ref_tracker_dir_exit() is called, its entries in the xarrays are marked and + * the workqueue job is scheduled. The worker then runs and deletes any marked + * dentries asynchronously. 
+ */ +static struct xarray debugfs_dentries; +static struct xarray debugfs_symlinks; +static struct work_struct debugfs_reap_worker; + +#define REF_TRACKER_DIR_DEAD XA_MARK_0 +static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir) +{ + unsigned long flags; + + xa_lock_irqsave(&debugfs_dentries, flags); + __xa_set_mark(&debugfs_dentries, (unsigned long)dir, REF_TRACKER_DIR_DEAD); + xa_unlock_irqrestore(&debugfs_dentries, flags); + + xa_lock_irqsave(&debugfs_symlinks, flags); + __xa_set_mark(&debugfs_symlinks, (unsigned long)dir, REF_TRACKER_DIR_DEAD); + xa_unlock_irqrestore(&debugfs_symlinks, flags); + + schedule_work(&debugfs_reap_worker); +} +#else +static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir) +{ +} +#endif + static struct ref_tracker_dir_stats * ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit) { @@ -63,21 +103,39 @@ ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit) } struct ostream { + void __ostream_printf (*func)(struct ostream *stream, char *fmt, ...); + char *prefix; char *buf; + struct seq_file *seq; int size, used; }; +static void __ostream_printf pr_ostream_log(struct ostream *stream, char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); +} + +static void __ostream_printf pr_ostream_buf(struct ostream *stream, char *fmt, ...) +{ + int ret, len = stream->size - stream->used; + va_list args; + + va_start(args, fmt); + ret = vsnprintf(stream->buf + stream->used, len, fmt, args); + va_end(args); + if (ret > 0) + stream->used += min(ret, len); +} + #define pr_ostream(stream, fmt, args...) \ ({ \ struct ostream *_s = (stream); \ \ - if (!_s->buf) { \ - pr_err(fmt, ##args); \ - } else { \ - int ret, len = _s->size - _s->used; \ - ret = snprintf(_s->buf + _s->used, len, pr_fmt(fmt), ##args); \ - _s->used += min(ret, len); \ - } \ + _s->func(_s, fmt, ##args); \ }) static void @@ -96,8 +154,8 @@ __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, stats = ref_tracker_get_stats(dir, display_limit); if (IS_ERR(stats)) { - pr_ostream(s, "%s@%pK: couldn't get stats, error %pe\n", - dir->name, dir, stats); + pr_ostream(s, "%s%s@%p: couldn't get stats, error %pe\n", + s->prefix, dir->class, dir, stats); return; } @@ -107,14 +165,15 @@ __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, stack = stats->stacks[i].stack_handle; if (sbuf && !stack_depot_snprint(stack, sbuf, STACK_BUF_SIZE, 4)) sbuf[0] = 0; - pr_ostream(s, "%s@%pK has %d/%d users at\n%s\n", dir->name, dir, - stats->stacks[i].count, stats->total, sbuf); + pr_ostream(s, "%s%s@%p has %d/%d users at\n%s\n", s->prefix, + dir->class, dir, stats->stacks[i].count, + stats->total, sbuf); skipped -= stats->stacks[i].count; } if (skipped) - pr_ostream(s, "%s@%pK skipped reports about %d/%d users.\n", - dir->name, dir, skipped, stats->total); + pr_ostream(s, "%s%s@%p skipped reports about %d/%d users.\n", + s->prefix, dir->class, dir, skipped, stats->total); kfree(sbuf); @@ -124,7 +183,8 @@ __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir, unsigned int display_limit) { - struct ostream os = {}; + struct ostream os = { .func = pr_ostream_log, + .prefix = "ref_tracker: " }; __ref_tracker_dir_pr_ostream(dir, display_limit, &os); } @@ -143,7 +203,10 @@ EXPORT_SYMBOL(ref_tracker_dir_print); int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size) { - struct ostream os = { .buf = buf, .size = size }; + struct ostream 
os = { .func = pr_ostream_buf, + .prefix = "ref_tracker: ", + .buf = buf, + .size = size }; unsigned long flags; spin_lock_irqsave(&dir->lock, flags); @@ -161,6 +224,11 @@ void ref_tracker_dir_exit(struct ref_tracker_dir *dir) bool leak = false; dir->dead = true; + /* + * The xarray entries must be marked before the dir->lock is taken to + * protect simultaneous debugfs readers. + */ + ref_tracker_debugfs_mark(dir); spin_lock_irqsave(&dir->lock, flags); list_for_each_entry_safe(tracker, n, &dir->quarantine, head) { list_del(&tracker->head); @@ -273,3 +341,194 @@ int ref_tracker_free(struct ref_tracker_dir *dir, return 0; } EXPORT_SYMBOL_GPL(ref_tracker_free); + +#ifdef CONFIG_DEBUG_FS +#include <linux/debugfs.h> + +static struct dentry *ref_tracker_debug_dir = (struct dentry *)-ENOENT; + +static void __ostream_printf pr_ostream_seq(struct ostream *stream, char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + seq_vprintf(stream->seq, fmt, args); + va_end(args); +} + +static int ref_tracker_dir_seq_print(struct ref_tracker_dir *dir, struct seq_file *seq) +{ + struct ostream os = { .func = pr_ostream_seq, + .prefix = "", + .seq = seq }; + + __ref_tracker_dir_pr_ostream(dir, 16, &os); + + return os.used; +} + +static int ref_tracker_debugfs_show(struct seq_file *f, void *v) +{ + struct ref_tracker_dir *dir = f->private; + unsigned long index = (unsigned long)dir; + unsigned long flags; + int ret; + + /* + * "dir" may not exist at this point if ref_tracker_dir_exit() has + * already been called. Take care not to dereference it until its + * legitimacy is established. + * + * The xa_lock is necessary to ensure that "dir" doesn't disappear + * before its lock can be taken. If it's in the hash and not marked + * dead, then it's safe to take dir->lock which prevents + * ref_tracker_dir_exit() from completing. Once the dir->lock is + * acquired, the xa_lock can be released. All of this must be IRQ-safe. + */ + xa_lock_irqsave(&debugfs_dentries, flags); + if (!xa_load(&debugfs_dentries, index) || + xa_get_mark(&debugfs_dentries, index, REF_TRACKER_DIR_DEAD)) { + xa_unlock_irqrestore(&debugfs_dentries, flags); + return -ENODATA; + } + + spin_lock(&dir->lock); + xa_unlock(&debugfs_dentries); + ret = ref_tracker_dir_seq_print(dir, f); + spin_unlock_irqrestore(&dir->lock, flags); + return ret; +} + +static int ref_tracker_debugfs_open(struct inode *inode, struct file *filp) +{ + struct ref_tracker_dir *dir = inode->i_private; + + return single_open(filp, ref_tracker_debugfs_show, dir); +} + +static const struct file_operations ref_tracker_debugfs_fops = { + .owner = THIS_MODULE, + .open = ref_tracker_debugfs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/** + * ref_tracker_dir_debugfs - create debugfs file for ref_tracker_dir + * @dir: ref_tracker_dir to be associated with debugfs file + * + * In most cases, a debugfs file will be created automatically for every + * ref_tracker_dir. If the object was created before debugfs is brought up + * then that may fail. In those cases, it is safe to call this at a later + * time to create the file. 
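+ *
+ * Calling this again for a @dir that already has a debugfs file is a no-op,
+ * so it is safe to call it unconditionally.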
+ */ +void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir) +{ + char name[NAME_MAX + 1]; + struct dentry *dentry; + int ret; + + /* No-op if already created */ + dentry = xa_load(&debugfs_dentries, (unsigned long)dir); + if (dentry && !xa_is_err(dentry)) + return; + + ret = snprintf(name, sizeof(name), "%s@%px", dir->class, dir); + name[sizeof(name) - 1] = '\0'; + + if (ret < sizeof(name)) { + dentry = debugfs_create_file(name, S_IFREG | 0400, + ref_tracker_debug_dir, dir, + &ref_tracker_debugfs_fops); + if (!IS_ERR(dentry)) { + void *old; + + old = xa_store_irq(&debugfs_dentries, (unsigned long)dir, + dentry, GFP_KERNEL); + + if (xa_is_err(old)) + debugfs_remove(dentry); + else + WARN_ON_ONCE(old); + } + } +} +EXPORT_SYMBOL(ref_tracker_dir_debugfs); + +void __ostream_printf ref_tracker_dir_symlink(struct ref_tracker_dir *dir, const char *fmt, ...) +{ + char name[NAME_MAX + 1]; + struct dentry *symlink, *dentry; + va_list args; + int ret; + + symlink = xa_load(&debugfs_symlinks, (unsigned long)dir); + dentry = xa_load(&debugfs_dentries, (unsigned long)dir); + + /* Already created?*/ + if (symlink && !xa_is_err(symlink)) + return; + + if (!dentry || xa_is_err(dentry)) + return; + + va_start(args, fmt); + ret = vsnprintf(name, sizeof(name), fmt, args); + va_end(args); + name[sizeof(name) - 1] = '\0'; + + if (ret < sizeof(name)) { + symlink = debugfs_create_symlink(name, ref_tracker_debug_dir, + dentry->d_name.name); + if (!IS_ERR(symlink)) { + void *old; + + old = xa_store_irq(&debugfs_symlinks, (unsigned long)dir, + symlink, GFP_KERNEL); + if (xa_is_err(old)) + debugfs_remove(symlink); + else + WARN_ON_ONCE(old); + } + } +} +EXPORT_SYMBOL(ref_tracker_dir_symlink); + +static void debugfs_reap_work(struct work_struct *work) +{ + struct dentry *dentry; + unsigned long index; + bool reaped; + + do { + reaped = false; + xa_for_each_marked(&debugfs_symlinks, index, dentry, REF_TRACKER_DIR_DEAD) { + xa_erase_irq(&debugfs_symlinks, index); + debugfs_remove(dentry); + reaped = true; + } + xa_for_each_marked(&debugfs_dentries, index, dentry, REF_TRACKER_DIR_DEAD) { + xa_erase_irq(&debugfs_dentries, index); + debugfs_remove(dentry); + reaped = true; + } + } while (reaped); +} + +static int __init ref_tracker_debugfs_postcore_init(void) +{ + INIT_WORK(&debugfs_reap_worker, debugfs_reap_work); + xa_init_flags(&debugfs_dentries, XA_FLAGS_LOCK_IRQ); + xa_init_flags(&debugfs_symlinks, XA_FLAGS_LOCK_IRQ); + return 0; +} +postcore_initcall(ref_tracker_debugfs_postcore_init); + +static int __init ref_tracker_debugfs_late_init(void) +{ + ref_tracker_debug_dir = debugfs_create_dir("ref_tracker", NULL); + return 0; +} +late_initcall(ref_tracker_debugfs_late_init); +#endif /* CONFIG_DEBUG_FS */ diff --git a/lib/scatterlist.c b/lib/scatterlist.c index b58d5ef1a34b..4af1c8b0775a 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -14,29 +14,6 @@ #include <linux/folio_queue.h> /** - * sg_next - return the next scatterlist entry in a list - * @sg: The current sg entry - * - * Description: - * Usually the next entry will be @sg@ + 1, but if this sg element is part - * of a chained scatterlist, it could jump to the start of a new - * scatterlist array. 
- * - **/ -struct scatterlist *sg_next(struct scatterlist *sg) -{ - if (sg_is_last(sg)) - return NULL; - - sg++; - if (unlikely(sg_is_chain(sg))) - sg = sg_chain_ptr(sg); - - return sg; -} -EXPORT_SYMBOL(sg_next); - -/** * sg_nents - return total count of entries in scatterlist * @sg: The scatterlist * @@ -96,9 +73,9 @@ EXPORT_SYMBOL(sg_nents_for_len); * Should only be used casually, it (currently) scans the entire list * to get the last entry. * - * Note that the @sgl@ pointer passed in need not be the first one, - * the important bit is that @nents@ denotes the number of entries that - * exist from @sgl@. + * Note that the @sgl pointer passed in need not be the first one, + * the important bit is that @nents denotes the number of entries that + * exist from @sgl. * **/ struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents) @@ -368,7 +345,7 @@ EXPORT_SYMBOL(__sg_alloc_table); * @gfp_mask: GFP allocation mask * * Description: - * Allocate and initialize an sg table. If @nents@ is larger than + * Allocate and initialize an sg table. If @nents is larger than * SG_MAX_SINGLE_ALLOC a chained sg table will be setup. * **/ diff --git a/lib/sg_split.c b/lib/sg_split.c index 60a0babebf2e..0f89aab5c671 100644 --- a/lib/sg_split.c +++ b/lib/sg_split.c @@ -88,8 +88,6 @@ static void sg_split_phys(struct sg_splitter *splitters, const int nb_splits) if (!j) { out_sg->offset += split->skip_sg0; out_sg->length -= split->skip_sg0; - } else { - out_sg->offset = 0; } sg_dma_address(out_sg) = 0; sg_dma_len(out_sg) = 0; diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index a2bb7738c373..94b3f6b19538 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -22,10 +22,8 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) if (is_percpu_thread()) goto out; -#ifdef CONFIG_SMP if (current->migration_disabled) goto out; -#endif /* * It is valid to assume CPU-locality during early bootup: diff --git a/lib/sort.c b/lib/sort.c index 8e73dc55476b..52363995ccc5 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -186,36 +186,13 @@ static size_t parent(size_t i, unsigned int lsbit, size_t size) return i / 2; } -/** - * sort_r - sort an array of elements - * @base: pointer to data to sort - * @num: number of elements - * @size: size of each element - * @cmp_func: pointer to comparison function - * @swap_func: pointer to swap function or NULL - * @priv: third argument passed to comparison function - * - * This function does a heapsort on the given array. You may provide - * a swap_func function if you need to do something more than a memory - * copy (e.g. fix up pointers or auxiliary data), but the built-in swap - * avoids a slow retpoline and so is significantly faster. - * - * The comparison function must adhere to specific mathematical - * properties to ensure correct and stable sorting: - * - Antisymmetry: cmp_func(a, b) must return the opposite sign of - * cmp_func(b, a). - * - Transitivity: if cmp_func(a, b) <= 0 and cmp_func(b, c) <= 0, then - * cmp_func(a, c) <= 0. - * - * Sorting time is O(n log n) both on average and worst-case. While - * quicksort is slightly faster on average, it suffers from exploitable - * O(n*n) worst-case behavior and extra memory requirements that make - * it less suitable for kernel use. 
- */ -void sort_r(void *base, size_t num, size_t size, - cmp_r_func_t cmp_func, - swap_r_func_t swap_func, - const void *priv) +#include <linux/sched.h> + +static void __sort_r(void *base, size_t num, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv, + bool may_schedule) { /* pre-scale counters for performance */ size_t n = num * size, a = (num/2) * size; @@ -286,6 +263,9 @@ void sort_r(void *base, size_t num, size_t size, b = parent(b, lsbit, size); do_swap(base + b, base + c, size, swap_func, priv); } + + if (may_schedule) + cond_resched(); } n -= size; @@ -293,8 +273,63 @@ void sort_r(void *base, size_t num, size_t size, if (n == size * 2 && do_cmp(base, base + size, cmp_func, priv) > 0) do_swap(base, base + size, size, swap_func, priv); } + +/** + * sort_r - sort an array of elements + * @base: pointer to data to sort + * @num: number of elements + * @size: size of each element + * @cmp_func: pointer to comparison function + * @swap_func: pointer to swap function or NULL + * @priv: third argument passed to comparison function + * + * This function does a heapsort on the given array. You may provide + * a swap_func function if you need to do something more than a memory + * copy (e.g. fix up pointers or auxiliary data), but the built-in swap + * avoids a slow retpoline and so is significantly faster. + * + * The comparison function must adhere to specific mathematical + * properties to ensure correct and stable sorting: + * - Antisymmetry: cmp_func(a, b) must return the opposite sign of + * cmp_func(b, a). + * - Transitivity: if cmp_func(a, b) <= 0 and cmp_func(b, c) <= 0, then + * cmp_func(a, c) <= 0. + * + * Sorting time is O(n log n) both on average and worst-case. While + * quicksort is slightly faster on average, it suffers from exploitable + * O(n*n) worst-case behavior and extra memory requirements that make + * it less suitable for kernel use. + */ +void sort_r(void *base, size_t num, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + __sort_r(base, num, size, cmp_func, swap_func, priv, false); +} EXPORT_SYMBOL(sort_r); +/** + * sort_r_nonatomic - sort an array of elements, with cond_resched + * @base: pointer to data to sort + * @num: number of elements + * @size: size of each element + * @cmp_func: pointer to comparison function + * @swap_func: pointer to swap function or NULL + * @priv: third argument passed to comparison function + * + * Same as sort_r, but preferred for larger arrays as it does a periodic + * cond_resched(). 
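+ *
+ * Because of the periodic cond_resched() it may sleep, so it must not be
+ * called from atomic context.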
+ */ +void sort_r_nonatomic(void *base, size_t num, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + __sort_r(base, num, size, cmp_func, swap_func, priv, true); +} +EXPORT_SYMBOL(sort_r_nonatomic); + void sort(void *base, size_t num, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) @@ -304,6 +339,19 @@ void sort(void *base, size_t num, size_t size, .swap = swap_func, }; - return sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); + return __sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w, false); } EXPORT_SYMBOL(sort); + +void sort_nonatomic(void *base, size_t num, size_t size, + cmp_func_t cmp_func, + swap_func_t swap_func) +{ + struct wrapper w = { + .cmp = cmp_func, + .swap = swap_func, + }; + + return __sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w, true); +} +EXPORT_SYMBOL(sort_nonatomic); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 73d7b50924ef..de0b0025af2b 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -36,11 +36,11 @@ #include <linux/memblock.h> #include <linux/kasan-enabled.h> -#define DEPOT_POOLS_CAP 8192 -/* The pool_index is offset by 1 so the first record does not have a 0 handle. */ -#define DEPOT_MAX_POOLS \ - (((1LL << (DEPOT_POOL_INDEX_BITS)) - 1 < DEPOT_POOLS_CAP) ? \ - (1LL << (DEPOT_POOL_INDEX_BITS)) - 1 : DEPOT_POOLS_CAP) +/* + * The pool_index is offset by 1 so the first record does not have a 0 handle. + */ +static unsigned int stack_max_pools __read_mostly = + MIN((1LL << DEPOT_POOL_INDEX_BITS) - 1, 8192); static bool stack_depot_disabled; static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); @@ -62,7 +62,7 @@ static unsigned int stack_bucket_number_order; static unsigned int stack_hash_mask; /* Array of memory regions that store stack records. */ -static void *stack_pools[DEPOT_MAX_POOLS]; +static void **stack_pools; /* Newly allocated pool that is not yet added to stack_pools. */ static void *new_pool; /* Number of pools in stack_pools. */ @@ -101,6 +101,34 @@ static int __init disable_stack_depot(char *str) } early_param("stack_depot_disable", disable_stack_depot); +static int __init parse_max_pools(char *str) +{ + const long long limit = (1LL << (DEPOT_POOL_INDEX_BITS)) - 1; + unsigned int max_pools; + int rv; + + rv = kstrtouint(str, 0, &max_pools); + if (rv) + return rv; + + if (max_pools < 1024) { + pr_err("stack_depot_max_pools below 1024, using default of %u\n", + stack_max_pools); + goto out; + } + + if (max_pools > limit) { + pr_err("stack_depot_max_pools exceeds %lld, using default of %u\n", + limit, stack_max_pools); + goto out; + } + + stack_max_pools = max_pools; +out: + return 0; +} +early_param("stack_depot_max_pools", parse_max_pools); + void __init stack_depot_request_early_init(void) { /* Too late to request early init now. 
*/ @@ -182,6 +210,17 @@ int __init stack_depot_early_init(void) } init_stack_table(entries); + pr_info("allocating space for %u stack pools via memblock\n", + stack_max_pools); + stack_pools = + memblock_alloc(stack_max_pools * sizeof(void *), PAGE_SIZE); + if (!stack_pools) { + pr_err("stack pools allocation failed, disabling\n"); + memblock_free(stack_table, entries * sizeof(struct list_head)); + stack_depot_disabled = true; + return -ENOMEM; + } + return 0; } @@ -231,6 +270,16 @@ int stack_depot_init(void) stack_hash_mask = entries - 1; init_stack_table(entries); + pr_info("allocating space for %u stack pools via kvcalloc\n", + stack_max_pools); + stack_pools = kvcalloc(stack_max_pools, sizeof(void *), GFP_KERNEL); + if (!stack_pools) { + pr_err("stack pools allocation failed, disabling\n"); + kvfree(stack_table); + stack_depot_disabled = true; + ret = -ENOMEM; + } + out_unlock: mutex_unlock(&stack_depot_init_mutex); @@ -245,9 +294,9 @@ static bool depot_init_pool(void **prealloc) { lockdep_assert_held(&pool_lock); - if (unlikely(pools_num >= DEPOT_MAX_POOLS)) { + if (unlikely(pools_num >= stack_max_pools)) { /* Bail out if we reached the pool limit. */ - WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */ + WARN_ON_ONCE(pools_num > stack_max_pools); /* should never happen */ WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */ WARN_ONCE(1, "Stack depot reached limit capacity"); return false; @@ -273,7 +322,7 @@ static bool depot_init_pool(void **prealloc) * NULL; do not reset to NULL if we have reached the maximum number of * pools. */ - if (pools_num < DEPOT_MAX_POOLS) + if (pools_num < stack_max_pools) WRITE_ONCE(new_pool, NULL); else WRITE_ONCE(new_pool, STACK_DEPOT_POISON); diff --git a/lib/string.c b/lib/string.c index eb4486ed40d2..b632c71df1a5 100644 --- a/lib/string.c +++ b/lib/string.c @@ -119,6 +119,7 @@ ssize_t sized_strscpy(char *dest, const char *src, size_t count) if (count == 0 || WARN_ON_ONCE(count > INT_MAX)) return -E2BIG; +#ifndef CONFIG_DCACHE_WORD_ACCESS #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS /* * If src is unaligned, don't cross a page boundary, @@ -134,11 +135,13 @@ ssize_t sized_strscpy(char *dest, const char *src, size_t count) if (((long) dest | (long) src) & (sizeof(long) - 1)) max = 0; #endif +#endif /* - * read_word_at_a_time() below may read uninitialized bytes after the - * trailing zero and use them in comparisons. Disable this optimization - * under KMSAN to prevent false positive reports. + * load_unaligned_zeropad() or read_word_at_a_time() below may read + * uninitialized bytes after the trailing zero and use them in + * comparisons. Disable this optimization under KMSAN to prevent + * false positive reports. 
*/ if (IS_ENABLED(CONFIG_KMSAN)) max = 0; @@ -146,7 +149,11 @@ ssize_t sized_strscpy(char *dest, const char *src, size_t count) while (max >= sizeof(unsigned long)) { unsigned long c, data; +#ifdef CONFIG_DCACHE_WORD_ACCESS + c = load_unaligned_zeropad(src+res); +#else c = read_word_at_a_time(src+res); +#endif if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); data = create_zero_mask(data); diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 91fa37b5c510..ffb8ead6d4cd 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -138,6 +138,25 @@ int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, } EXPORT_SYMBOL(string_get_size); +int parse_int_array(const char *buf, size_t count, int **array) +{ + int *ints, nints; + + get_options(buf, 0, &nints); + if (!nints) + return -ENOENT; + + ints = kcalloc(nints + 1, sizeof(*ints), GFP_KERNEL); + if (!ints) + return -ENOMEM; + + get_options(buf, nints + 1, ints); + *array = ints; + + return 0; +} +EXPORT_SYMBOL(parse_int_array); + /** * parse_int_array_user - Split string into a sequence of integers * @from: The user space buffer to read from @@ -153,30 +172,14 @@ EXPORT_SYMBOL(string_get_size); */ int parse_int_array_user(const char __user *from, size_t count, int **array) { - int *ints, nints; char *buf; - int ret = 0; + int ret; buf = memdup_user_nul(from, count); if (IS_ERR(buf)) return PTR_ERR(buf); - get_options(buf, 0, &nints); - if (!nints) { - ret = -ENOENT; - goto free_buf; - } - - ints = kcalloc(nints + 1, sizeof(*ints), GFP_KERNEL); - if (!ints) { - ret = -ENOMEM; - goto free_buf; - } - - get_options(buf, nints + 1, ints); - *array = ints; - -free_buf: + ret = parse_int_array(buf, count, array); kfree(buf); return ret; } diff --git a/lib/sys_info.c b/lib/sys_info.c new file mode 100644 index 000000000000..5bf503fd7ec1 --- /dev/null +++ b/lib/sys_info.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/sched/debug.h> +#include <linux/console.h> +#include <linux/kernel.h> +#include <linux/ftrace.h> +#include <linux/sysctl.h> +#include <linux/nmi.h> + +#include <linux/sys_info.h> + +struct sys_info_name { + unsigned long bit; + const char *name; +}; + +/* + * When 'si_names' gets updated, please make sure the 'sys_info_avail' + * below is updated accordingly. + */ +static const struct sys_info_name si_names[] = { + { SYS_INFO_TASKS, "tasks" }, + { SYS_INFO_MEM, "mem" }, + { SYS_INFO_TIMERS, "timers" }, + { SYS_INFO_LOCKS, "locks" }, + { SYS_INFO_FTRACE, "ftrace" }, + { SYS_INFO_ALL_CPU_BT, "all_bt" }, + { SYS_INFO_BLOCKED_TASKS, "blocked_tasks" }, +}; + +/* Expecting string like "xxx_sys_info=tasks,mem,timers,locks,ftrace,..." 
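+ * Names that do not match an entry in si_names[] are silently ignored; the
+ * return value is the OR of the SYS_INFO_* bits of the names that do match.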
*/ +unsigned long sys_info_parse_param(char *str) +{ + unsigned long si_bits = 0; + char *s, *name; + int i; + + s = str; + while ((name = strsep(&s, ",")) && *name) { + for (i = 0; i < ARRAY_SIZE(si_names); i++) { + if (!strcmp(name, si_names[i].name)) { + si_bits |= si_names[i].bit; + break; + } + } + } + + return si_bits; +} + +#ifdef CONFIG_SYSCTL + +static const char sys_info_avail[] __maybe_unused = "tasks,mem,timers,locks,ftrace,all_bt,blocked_tasks"; + +int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + char names[sizeof(sys_info_avail) + 1]; + struct ctl_table table; + unsigned long *si_bits_global; + + si_bits_global = ro_table->data; + + if (write) { + unsigned long si_bits; + int ret; + + table = *ro_table; + table.data = names; + table.maxlen = sizeof(names); + ret = proc_dostring(&table, write, buffer, lenp, ppos); + if (ret) + return ret; + + si_bits = sys_info_parse_param(names); + /* The access to the global value is not synchronized. */ + WRITE_ONCE(*si_bits_global, si_bits); + return 0; + } else { + /* for 'read' operation */ + char *delim = ""; + int i, len = 0; + + for (i = 0; i < ARRAY_SIZE(si_names); i++) { + if (*si_bits_global & si_names[i].bit) { + len += scnprintf(names + len, sizeof(names) - len, + "%s%s", delim, si_names[i].name); + delim = ","; + } + } + + table = *ro_table; + table.data = names; + table.maxlen = sizeof(names); + return proc_dostring(&table, write, buffer, lenp, ppos); + } +} +#endif + +void sys_info(unsigned long si_mask) +{ + if (si_mask & SYS_INFO_TASKS) + show_state(); + + if (si_mask & SYS_INFO_MEM) + show_mem(); + + if (si_mask & SYS_INFO_TIMERS) + sysrq_timer_list_show(); + + if (si_mask & SYS_INFO_LOCKS) + debug_show_all_locks(); + + if (si_mask & SYS_INFO_FTRACE) + ftrace_dump(DUMP_ALL); + + if (si_mask & SYS_INFO_ALL_CPU_BT) + trigger_all_cpu_backtrace(); + + if (si_mask & SYS_INFO_BLOCKED_TASKS) + show_state_filter(TASK_UNINTERRUPTIBLE); +} diff --git a/lib/test_fortify/Makefile b/lib/test_fortify/Makefile index 1c3f82ad8bb2..399cae880e1d 100644 --- a/lib/test_fortify/Makefile +++ b/lib/test_fortify/Makefile @@ -18,10 +18,7 @@ quiet_cmd_gen_fortify_log = CAT $@ $(obj)/test_fortify.log: $(addprefix $(obj)/, $(logs)) FORCE $(call if_changed,gen_fortify_log) -# GCC<=7 does not always produce *.d files. -# Run the tests only for GCC>=8 or Clang. -always-$(call gcc-min-version, 80000) += test_fortify.log -always-$(CONFIG_CC_IS_CLANG) += test_fortify.log +always-y += test_fortify.log # Some architectures define __NO_FORTIFY if __SANITIZE_ADDRESS__ is undefined. # Pass CFLAGS_KASAN to avoid warnings. 
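For reference, a minimal usage sketch of the helpers added in lib/sys_info.c above (illustrative only, not part of the patch; the caller and the parameter string are hypothetical):

	#include <linux/sys_info.h>

	/*
	 * sys_info_parse_param() maps a comma-separated list of names onto a
	 * SYS_INFO_* bitmask; sys_info() then dumps each selected class of
	 * system state. The buffer must be writable since strsep() modifies it.
	 */
	static void example_dump(void)
	{
		char arg[] = "tasks,mem,locks";	/* hypothetical parameter value */
		unsigned long si_mask = sys_info_parse_param(arg);

		/* si_mask == SYS_INFO_TASKS | SYS_INFO_MEM | SYS_INFO_LOCKS */
		sys_info(si_mask);
	}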
diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 5b144bc5c4ec..761725bc713c 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -330,7 +330,7 @@ static int dmirror_fault(struct dmirror *dmirror, unsigned long start, { struct mm_struct *mm = dmirror->notifier.mm; unsigned long addr; - unsigned long pfns[64]; + unsigned long pfns[32]; struct hmm_range range = { .notifier = &dmirror->notifier, .hmm_pfns = pfns, @@ -879,8 +879,8 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, unsigned long size = cmd->npages << PAGE_SHIFT; struct mm_struct *mm = dmirror->notifier.mm; struct vm_area_struct *vma; - unsigned long src_pfns[64] = { 0 }; - unsigned long dst_pfns[64] = { 0 }; + unsigned long src_pfns[32] = { 0 }; + unsigned long dst_pfns[32] = { 0 }; struct migrate_vma args = { 0 }; unsigned long next; int ret; @@ -939,8 +939,8 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, unsigned long size = cmd->npages << PAGE_SHIFT; struct mm_struct *mm = dmirror->notifier.mm; struct vm_area_struct *vma; - unsigned long src_pfns[64] = { 0 }; - unsigned long dst_pfns[64] = { 0 }; + unsigned long src_pfns[32] = { 0 }; + unsigned long dst_pfns[32] = { 0 }; struct dmirror_bounce bounce; struct migrate_vma args = { 0 }; unsigned long next; @@ -1144,8 +1144,8 @@ static int dmirror_snapshot(struct dmirror *dmirror, unsigned long size = cmd->npages << PAGE_SHIFT; unsigned long addr; unsigned long next; - unsigned long pfns[64]; - unsigned char perm[64]; + unsigned long pfns[32]; + unsigned char perm[32]; char __user *uptr; struct hmm_range range = { .hmm_pfns = pfns, diff --git a/lib/test_kho.c b/lib/test_kho.c new file mode 100644 index 000000000000..c2eb899c3b45 --- /dev/null +++ b/lib/test_kho.c @@ -0,0 +1,305 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test module for KHO + * Copyright (c) 2025 Microsoft Corporation. + * + * Authors: + * Saurabh Sengar <ssengar@microsoft.com> + * Mike Rapoport <rppt@kernel.org> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/mm.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/kexec.h> +#include <linux/libfdt.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/vmalloc.h> +#include <linux/kexec_handover.h> + +#include <net/checksum.h> + +#define KHO_TEST_MAGIC 0x4b484f21 /* KHO! */ +#define KHO_TEST_FDT "kho_test" +#define KHO_TEST_COMPAT "kho-test-v1" + +static long max_mem = (PAGE_SIZE << MAX_PAGE_ORDER) * 2; +module_param(max_mem, long, 0644); + +struct kho_test_state { + unsigned int nr_folios; + struct folio **folios; + struct folio *fdt; + __wsum csum; +}; + +static struct kho_test_state kho_test_state; + +static int kho_test_notifier(struct notifier_block *self, unsigned long cmd, + void *v) +{ + struct kho_test_state *state = &kho_test_state; + struct kho_serialization *ser = v; + int err = 0; + + switch (cmd) { + case KEXEC_KHO_ABORT: + return NOTIFY_DONE; + case KEXEC_KHO_FINALIZE: + /* Handled below */ + break; + default: + return NOTIFY_BAD; + } + + err |= kho_preserve_folio(state->fdt); + err |= kho_add_subtree(ser, KHO_TEST_FDT, folio_address(state->fdt)); + + return err ? 
NOTIFY_BAD : NOTIFY_DONE; +} + +static struct notifier_block kho_test_nb = { + .notifier_call = kho_test_notifier, +}; + +static int kho_test_save_data(struct kho_test_state *state, void *fdt) +{ + phys_addr_t *folios_info __free(kvfree) = NULL; + int err = 0; + + folios_info = kvmalloc_array(state->nr_folios, sizeof(*folios_info), + GFP_KERNEL); + if (!folios_info) + return -ENOMEM; + + for (int i = 0; i < state->nr_folios; i++) { + struct folio *folio = state->folios[i]; + unsigned int order = folio_order(folio); + + folios_info[i] = virt_to_phys(folio_address(folio)) | order; + + err = kho_preserve_folio(folio); + if (err) + return err; + } + + err |= fdt_begin_node(fdt, "data"); + err |= fdt_property(fdt, "nr_folios", &state->nr_folios, + sizeof(state->nr_folios)); + err |= fdt_property(fdt, "folios_info", folios_info, + state->nr_folios * sizeof(*folios_info)); + err |= fdt_property(fdt, "csum", &state->csum, sizeof(state->csum)); + err |= fdt_end_node(fdt); + + return err; +} + +static int kho_test_prepare_fdt(struct kho_test_state *state) +{ + const char compatible[] = KHO_TEST_COMPAT; + unsigned int magic = KHO_TEST_MAGIC; + ssize_t fdt_size; + int err = 0; + void *fdt; + + fdt_size = state->nr_folios * sizeof(phys_addr_t) + PAGE_SIZE; + state->fdt = folio_alloc(GFP_KERNEL, get_order(fdt_size)); + if (!state->fdt) + return -ENOMEM; + + fdt = folio_address(state->fdt); + + err |= fdt_create(fdt, fdt_size); + err |= fdt_finish_reservemap(fdt); + + err |= fdt_begin_node(fdt, ""); + err |= fdt_property(fdt, "compatible", compatible, sizeof(compatible)); + err |= fdt_property(fdt, "magic", &magic, sizeof(magic)); + err |= kho_test_save_data(state, fdt); + err |= fdt_end_node(fdt); + + err |= fdt_finish(fdt); + + if (err) + folio_put(state->fdt); + + return err; +} + +static int kho_test_generate_data(struct kho_test_state *state) +{ + size_t alloc_size = 0; + __wsum csum = 0; + + while (alloc_size < max_mem) { + int order = get_random_u32() % NR_PAGE_ORDERS; + struct folio *folio; + unsigned int size; + void *addr; + + /* cap allocation so that we won't exceed max_mem */ + if (alloc_size + (PAGE_SIZE << order) > max_mem) { + order = get_order(max_mem - alloc_size); + if (order) + order--; + } + size = PAGE_SIZE << order; + + folio = folio_alloc(GFP_KERNEL | __GFP_NORETRY, order); + if (!folio) + goto err_free_folios; + + state->folios[state->nr_folios++] = folio; + addr = folio_address(folio); + get_random_bytes(addr, size); + csum = csum_partial(addr, size, csum); + alloc_size += size; + } + + state->csum = csum; + return 0; + +err_free_folios: + for (int i = 0; i < state->nr_folios; i++) + folio_put(state->folios[i]); + return -ENOMEM; +} + +static int kho_test_save(void) +{ + struct kho_test_state *state = &kho_test_state; + struct folio **folios __free(kvfree) = NULL; + unsigned long max_nr; + int err; + + max_mem = PAGE_ALIGN(max_mem); + max_nr = max_mem >> PAGE_SHIFT; + + folios = kvmalloc_array(max_nr, sizeof(*state->folios), GFP_KERNEL); + if (!folios) + return -ENOMEM; + state->folios = folios; + + err = kho_test_generate_data(state); + if (err) + return err; + + err = kho_test_prepare_fdt(state); + if (err) + return err; + + return register_kho_notifier(&kho_test_nb); +} + +static int kho_test_restore_data(const void *fdt, int node) +{ + const unsigned int *nr_folios; + const phys_addr_t *folios_info; + const __wsum *old_csum; + __wsum csum = 0; + int len; + + node = fdt_path_offset(fdt, "/data"); + + nr_folios = fdt_getprop(fdt, node, "nr_folios", &len); + if (!nr_folios || 
len != sizeof(*nr_folios)) + return -EINVAL; + + old_csum = fdt_getprop(fdt, node, "csum", &len); + if (!old_csum || len != sizeof(*old_csum)) + return -EINVAL; + + folios_info = fdt_getprop(fdt, node, "folios_info", &len); + if (!folios_info || len != sizeof(*folios_info) * *nr_folios) + return -EINVAL; + + for (int i = 0; i < *nr_folios; i++) { + unsigned int order = folios_info[i] & ~PAGE_MASK; + phys_addr_t phys = folios_info[i] & PAGE_MASK; + unsigned int size = PAGE_SIZE << order; + struct folio *folio; + + folio = kho_restore_folio(phys); + if (!folio) + break; + + if (folio_order(folio) != order) + break; + + csum = csum_partial(folio_address(folio), size, csum); + folio_put(folio); + } + + if (csum != *old_csum) + return -EINVAL; + + return 0; +} + +static int kho_test_restore(phys_addr_t fdt_phys) +{ + void *fdt = phys_to_virt(fdt_phys); + const unsigned int *magic; + int node, len, err; + + node = fdt_path_offset(fdt, "/"); + if (node < 0) + return -EINVAL; + + if (fdt_node_check_compatible(fdt, node, KHO_TEST_COMPAT)) + return -EINVAL; + + magic = fdt_getprop(fdt, node, "magic", &len); + if (!magic || len != sizeof(*magic)) + return -EINVAL; + + if (*magic != KHO_TEST_MAGIC) + return -EINVAL; + + err = kho_test_restore_data(fdt, node); + if (err) + return err; + + pr_info("KHO restore succeeded\n"); + return 0; +} + +static int __init kho_test_init(void) +{ + phys_addr_t fdt_phys; + int err; + + err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys); + if (!err) + return kho_test_restore(fdt_phys); + + if (err != -ENOENT) { + pr_warn("failed to retrieve %s FDT: %d\n", KHO_TEST_FDT, err); + return err; + } + + return kho_test_save(); +} +module_init(kho_test_init); + +static void kho_test_cleanup(void) +{ + for (int i = 0; i < kho_test_state.nr_folios; i++) + folio_put(kho_test_state.folios[i]); + + kvfree(kho_test_state.folios); +} + +static void __exit kho_test_exit(void) +{ + unregister_kho_notifier(&kho_test_nb); + kho_test_cleanup(); +} +module_exit(kho_test_exit); + +MODULE_AUTHOR("Mike Rapoport <rppt@kernel.org>"); +MODULE_DESCRIPTION("KHO test module"); +MODULE_LICENSE("GPL"); diff --git a/lib/test_kmod.c b/lib/test_kmod.c index 064ed0fce75a..f0dd092860ea 100644 --- a/lib/test_kmod.c +++ b/lib/test_kmod.c @@ -28,14 +28,20 @@ #define TEST_START_NUM_THREADS 50 #define TEST_START_DRIVER "test_module" -#define TEST_START_TEST_FS "xfs" #define TEST_START_TEST_CASE TEST_KMOD_DRIVER - static bool force_init_test = false; -module_param(force_init_test, bool_enable_only, 0644); +module_param(force_init_test, bool_enable_only, 0444); MODULE_PARM_DESC(force_init_test, "Force kicking a test immediately after driver loads"); +static char *start_driver; +module_param(start_driver, charp, 0444); +MODULE_PARM_DESC(start_driver, + "Module/driver to use for the testing after driver loads"); +static char *start_test_fs; +module_param(start_test_fs, charp, 0444); +MODULE_PARM_DESC(start_test_fs, + "File system to use for the testing after driver loads"); /* * For device allocation / registration @@ -508,6 +514,11 @@ static int __trigger_config_run(struct kmod_test_device *test_dev) case TEST_KMOD_DRIVER: return run_test_driver(test_dev); case TEST_KMOD_FS_TYPE: + if (!config->test_fs) { + dev_warn(test_dev->dev, + "No fs type specified, can't run the test\n"); + return -EINVAL; + } return run_test_fs_type(test_dev); default: dev_warn(test_dev->dev, @@ -721,26 +732,20 @@ static ssize_t config_test_fs_show(struct device *dev, static DEVICE_ATTR_RW(config_test_fs); static int 
trigger_config_run_type(struct kmod_test_device *test_dev, - enum kmod_test_case test_case, - const char *test_str) + enum kmod_test_case test_case) { - int copied = 0; struct test_config *config = &test_dev->config; mutex_lock(&test_dev->config_mutex); switch (test_case) { case TEST_KMOD_DRIVER: - kfree_const(config->test_driver); - config->test_driver = NULL; - copied = config_copy_test_driver_name(config, test_str, - strlen(test_str)); break; case TEST_KMOD_FS_TYPE: - kfree_const(config->test_fs); - config->test_fs = NULL; - copied = config_copy_test_fs(config, test_str, - strlen(test_str)); + if (!config->test_fs) { + mutex_unlock(&test_dev->config_mutex); + return 0; + } break; default: mutex_unlock(&test_dev->config_mutex); @@ -751,11 +756,6 @@ static int trigger_config_run_type(struct kmod_test_device *test_dev, mutex_unlock(&test_dev->config_mutex); - if (copied <= 0 || copied != strlen(test_str)) { - test_dev->test_is_oom = true; - return -ENOMEM; - } - test_dev->test_is_oom = false; return trigger_config_run(test_dev); @@ -800,19 +800,24 @@ static unsigned int kmod_init_test_thread_limit(void) static int __kmod_config_init(struct kmod_test_device *test_dev) { struct test_config *config = &test_dev->config; + const char *test_start_driver = start_driver ? start_driver : + TEST_START_DRIVER; int ret = -ENOMEM, copied; __kmod_config_free(config); - copied = config_copy_test_driver_name(config, TEST_START_DRIVER, - strlen(TEST_START_DRIVER)); - if (copied != strlen(TEST_START_DRIVER)) + copied = config_copy_test_driver_name(config, test_start_driver, + strlen(test_start_driver)); + if (copied != strlen(test_start_driver)) goto err_out; - copied = config_copy_test_fs(config, TEST_START_TEST_FS, - strlen(TEST_START_TEST_FS)); - if (copied != strlen(TEST_START_TEST_FS)) - goto err_out; + + if (start_test_fs) { + copied = config_copy_test_fs(config, start_test_fs, + strlen(start_test_fs)); + if (copied != strlen(start_test_fs)) + goto err_out; + } config->num_threads = kmod_init_test_thread_limit(); config->test_result = 0; @@ -1178,12 +1183,11 @@ static int __init test_kmod_init(void) * lowering the init level for more fun. 
*/ if (force_init_test) { - ret = trigger_config_run_type(test_dev, - TEST_KMOD_DRIVER, "tun"); + ret = trigger_config_run_type(test_dev, TEST_KMOD_DRIVER); if (WARN_ON(ret)) return ret; - ret = trigger_config_run_type(test_dev, - TEST_KMOD_FS_TYPE, "btrfs"); + + ret = trigger_config_run_type(test_dev, TEST_KMOD_FS_TYPE); if (WARN_ON(ret)) return ret; } diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 13e2a10d7554..cb3936595b0d 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -3177,6 +3177,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) void *entry, *ptr = (void *) 0x1234500; void *ptr2 = &ptr; void *ptr3 = &ptr2; + unsigned long index; /* Check MAS_ROOT First */ mtree_store_range(mt, 0, 0, ptr, GFP_KERNEL); @@ -3707,6 +3708,37 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, !mas_is_active(&mas)); mas_unlock(&mas); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_lock(&mas); + for (int count = 0; count < 30; count++) { + mas_set(&mas, count); + mas_store_gfp(&mas, xa_mk_value(count), GFP_KERNEL); + } + + /* Ensure mas_find works with MA_UNDERFLOW */ + mas_set(&mas, 0); + entry = mas_walk(&mas); + mas_set(&mas, 0); + mas_prev(&mas, 0); + MT_BUG_ON(mt, mas.status != ma_underflow); + MT_BUG_ON(mt, mas_find(&mas, ULONG_MAX) != entry); + + /* Restore active on mas_next */ + entry = mas_next(&mas, ULONG_MAX); + index = mas.index; + mas_prev(&mas, index); + MT_BUG_ON(mt, mas.status != ma_underflow); + MT_BUG_ON(mt, mas_next(&mas, ULONG_MAX) != entry); + + /* Ensure overflow -> active works */ + mas_prev(&mas, 0); + mas_next(&mas, index - 1); + MT_BUG_ON(mt, mas.status != ma_overflow); + MT_BUG_ON(mt, mas_next(&mas, ULONG_MAX) != entry); + + mas_unlock(&mas); } static noinline void __init alloc_cyclic_testing(struct maple_tree *mt) diff --git a/lib/test_objagg.c b/lib/test_objagg.c index d34df4306b87..ce5c4c36a084 100644 --- a/lib/test_objagg.c +++ b/lib/test_objagg.c @@ -899,57 +899,31 @@ static int check_expect_hints_stats(struct objagg_hints *objagg_hints, int err; stats = objagg_hints_stats_get(objagg_hints); - if (IS_ERR(stats)) + if (IS_ERR(stats)) { + *errmsg = "objagg_hints_stats_get() failed."; return PTR_ERR(stats); + } err = __check_expect_stats(stats, expect_stats, errmsg); objagg_stats_put(stats); return err; } -static int test_hints_case(const struct hints_case *hints_case) +static int test_hints_case2(const struct hints_case *hints_case, + struct objagg_hints *hints, struct objagg *objagg) { struct objagg_obj *objagg_obj; - struct objagg_hints *hints; struct world world2 = {}; - struct world world = {}; struct objagg *objagg2; - struct objagg *objagg; const char *errmsg; int i; int err; - objagg = objagg_create(&delta_ops, NULL, &world); - if (IS_ERR(objagg)) - return PTR_ERR(objagg); - - for (i = 0; i < hints_case->key_ids_count; i++) { - objagg_obj = world_obj_get(&world, objagg, - hints_case->key_ids[i]); - if (IS_ERR(objagg_obj)) { - err = PTR_ERR(objagg_obj); - goto err_world_obj_get; - } - } - - pr_debug_stats(objagg); - err = check_expect_stats(objagg, &hints_case->expect_stats, &errmsg); - if (err) { - pr_err("Stats: %s\n", errmsg); - goto err_check_expect_stats; - } - - hints = objagg_hints_get(objagg, OBJAGG_OPT_ALGO_SIMPLE_GREEDY); - if (IS_ERR(hints)) { - err = PTR_ERR(hints); - goto err_hints_get; - } - pr_debug_hints_stats(hints); err = check_expect_hints_stats(hints, &hints_case->expect_stats_hints, &errmsg); if (err) { pr_err("Hints stats: 
%s\n", errmsg); - goto err_check_expect_hints_stats; + return err; } objagg2 = objagg_create(&delta_ops, hints, &world2); @@ -981,7 +955,48 @@ err_world2_obj_get: world_obj_put(&world2, objagg, hints_case->key_ids[i]); i = hints_case->key_ids_count; objagg_destroy(objagg2); -err_check_expect_hints_stats: + + return err; +} + +static int test_hints_case(const struct hints_case *hints_case) +{ + struct objagg_obj *objagg_obj; + struct objagg_hints *hints; + struct world world = {}; + struct objagg *objagg; + const char *errmsg; + int i; + int err; + + objagg = objagg_create(&delta_ops, NULL, &world); + if (IS_ERR(objagg)) + return PTR_ERR(objagg); + + for (i = 0; i < hints_case->key_ids_count; i++) { + objagg_obj = world_obj_get(&world, objagg, + hints_case->key_ids[i]); + if (IS_ERR(objagg_obj)) { + err = PTR_ERR(objagg_obj); + goto err_world_obj_get; + } + } + + pr_debug_stats(objagg); + err = check_expect_stats(objagg, &hints_case->expect_stats, &errmsg); + if (err) { + pr_err("Stats: %s\n", errmsg); + goto err_check_expect_stats; + } + + hints = objagg_hints_get(objagg, OBJAGG_OPT_ALGO_SIMPLE_GREEDY); + if (IS_ERR(hints)) { + err = PTR_ERR(hints); + goto err_hints_get; + } + + err = test_hints_case2(hints_case, hints, objagg); + objagg_hints_put(hints); err_hints_get: err_check_expect_stats: diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index 4249e0cc8aaf..c02aa9c868f2 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -30,15 +30,17 @@ static int i_zero; static int i_one_hundred = 100; static int match_int_ok = 1; +enum { + TEST_H_SETUP_NODE, + TEST_H_MNT, + TEST_H_MNTERROR, + TEST_H_EMPTY_ADD, + TEST_H_EMPTY, + TEST_H_U8, + TEST_H_SIZE /* Always at the end */ +}; -static struct { - struct ctl_table_header *test_h_setup_node; - struct ctl_table_header *test_h_mnt; - struct ctl_table_header *test_h_mnterror; - struct ctl_table_header *empty_add; - struct ctl_table_header *empty; -} sysctl_test_headers; - +static struct ctl_table_header *ctl_headers[TEST_H_SIZE] = {}; struct test_sysctl_data { int int_0001; int int_0002; @@ -167,8 +169,8 @@ static int test_sysctl_setup_node_tests(void) test_data.bitmap_0001 = kzalloc(SYSCTL_TEST_BITMAP_SIZE/8, GFP_KERNEL); if (!test_data.bitmap_0001) return -ENOMEM; - sysctl_test_headers.test_h_setup_node = register_sysctl("debug/test_sysctl", test_table); - if (!sysctl_test_headers.test_h_setup_node) { + ctl_headers[TEST_H_SETUP_NODE] = register_sysctl("debug/test_sysctl", test_table); + if (!ctl_headers[TEST_H_SETUP_NODE]) { kfree(test_data.bitmap_0001); return -ENOMEM; } @@ -202,12 +204,12 @@ static int test_sysctl_run_unregister_nested(void) static int test_sysctl_run_register_mount_point(void) { - sysctl_test_headers.test_h_mnt + ctl_headers[TEST_H_MNT] = register_sysctl_mount_point("debug/test_sysctl/mnt"); - if (!sysctl_test_headers.test_h_mnt) + if (!ctl_headers[TEST_H_MNT]) return -ENOMEM; - sysctl_test_headers.test_h_mnterror + ctl_headers[TEST_H_MNTERROR] = register_sysctl("debug/test_sysctl/mnt/mnt_error", test_table_unregister); /* @@ -225,39 +227,94 @@ static const struct ctl_table test_table_empty[] = { }; static int test_sysctl_run_register_empty(void) { /* Tets that an empty dir can be created */ - sysctl_test_headers.empty_add + ctl_headers[TEST_H_EMPTY_ADD] = register_sysctl("debug/test_sysctl/empty_add", test_table_empty); - if (!sysctl_test_headers.empty_add) + if (!ctl_headers[TEST_H_EMPTY_ADD]) return -ENOMEM; /* Test that register on top of an empty dir works */ - sysctl_test_headers.empty + ctl_headers[TEST_H_EMPTY] = 
register_sysctl("debug/test_sysctl/empty_add/empty", test_table_empty); - if (!sysctl_test_headers.empty) + if (!ctl_headers[TEST_H_EMPTY]) return -ENOMEM; return 0; } -static int __init test_sysctl_init(void) +static const struct ctl_table table_u8_over[] = { + { + .procname = "u8_over", + .data = &test_data.uint_0001, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_FOUR, + .extra2 = SYSCTL_ONE_THOUSAND, + }, +}; + +static const struct ctl_table table_u8_under[] = { + { + .procname = "u8_under", + .data = &test_data.uint_0001, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_NEG_ONE, + .extra2 = SYSCTL_ONE_HUNDRED, + }, +}; + +static const struct ctl_table table_u8_valid[] = { + { + .procname = "u8_valid", + .data = &test_data.uint_0001, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO_HUNDRED, + }, +}; + +static int test_sysctl_register_u8_extra(void) { - int err; + /* should fail because it's over */ + ctl_headers[TEST_H_U8] + = register_sysctl("debug/test_sysctl", table_u8_over); + if (ctl_headers[TEST_H_U8]) + return -ENOMEM; + + /* should fail because it's under */ + ctl_headers[TEST_H_U8] + = register_sysctl("debug/test_sysctl", table_u8_under); + if (ctl_headers[TEST_H_U8]) + return -ENOMEM; - err = test_sysctl_setup_node_tests(); - if (err) - goto out; + /* should not fail because it's valid */ + ctl_headers[TEST_H_U8] + = register_sysctl("debug/test_sysctl", table_u8_valid); + if (!ctl_headers[TEST_H_U8]) + return -ENOMEM; - err = test_sysctl_run_unregister_nested(); - if (err) - goto out; + return 0; +} - err = test_sysctl_run_register_mount_point(); - if (err) - goto out; +static int __init test_sysctl_init(void) +{ + int err = 0; + + int (*func_array[])(void) = { + test_sysctl_setup_node_tests, + test_sysctl_run_unregister_nested, + test_sysctl_run_register_mount_point, + test_sysctl_run_register_empty, + test_sysctl_register_u8_extra + }; - err = test_sysctl_run_register_empty(); + for (int i = 0; !err && i < ARRAY_SIZE(func_array); i++) + err = func_array[i](); -out: return err; } module_init(test_sysctl_init); @@ -265,16 +322,10 @@ module_init(test_sysctl_init); static void __exit test_sysctl_exit(void) { kfree(test_data.bitmap_0001); - if (sysctl_test_headers.test_h_setup_node) - unregister_sysctl_table(sysctl_test_headers.test_h_setup_node); - if (sysctl_test_headers.test_h_mnt) - unregister_sysctl_table(sysctl_test_headers.test_h_mnt); - if (sysctl_test_headers.test_h_mnterror) - unregister_sysctl_table(sysctl_test_headers.test_h_mnterror); - if (sysctl_test_headers.empty) - unregister_sysctl_table(sysctl_test_headers.empty); - if (sysctl_test_headers.empty_add) - unregister_sysctl_table(sysctl_test_headers.empty_add); + for (int i = 0; i < TEST_H_SIZE; i++) { + if (ctl_headers[i]) + unregister_sysctl_table(ctl_headers[i]); + } } module_exit(test_sysctl_exit); diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c index 8772e5edaa4f..a4b6f52b9c57 100644 --- a/lib/test_ubsan.c +++ b/lib/test_ubsan.c @@ -77,18 +77,22 @@ static void test_ubsan_shift_out_of_bounds(void) static void test_ubsan_out_of_bounds(void) { - volatile int i = 4, j = 5, k = -1; - volatile char above[4] = { }; /* Protect surrounding memory. */ - volatile int arr[4]; - volatile char below[4] = { }; /* Protect surrounding memory. */ + int i = 4, j = 4, k = -1; + volatile struct { + char above[4]; /* Protect surrounding memory. 
*/ + int arr[4]; + char below[4]; /* Protect surrounding memory. */ + } data; - above[0] = below[0]; + OPTIMIZER_HIDE_VAR(i); + OPTIMIZER_HIDE_VAR(j); + OPTIMIZER_HIDE_VAR(k); UBSAN_TEST(CONFIG_UBSAN_BOUNDS, "above"); - arr[j] = i; + data.arr[j] = i; UBSAN_TEST(CONFIG_UBSAN_BOUNDS, "below"); - arr[k] = i; + data.arr[k] = i; } enum ubsan_test_enum { diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index f585949ff696..2815658ccc37 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -13,9 +13,9 @@ #include <linux/moduleparam.h> #include <linux/completion.h> #include <linux/delay.h> -#include <linux/rwsem.h> #include <linux/mm.h> #include <linux/rcupdate.h> +#include <linux/srcu.h> #include <linux/slab.h> #define __param(type, name, init, msg) \ @@ -41,7 +41,7 @@ __param(int, nr_pages, 0, __param(bool, use_huge, false, "Use vmalloc_huge in fix_size_alloc_test"); -__param(int, run_test_mask, INT_MAX, +__param(int, run_test_mask, 7, "Set tests specified in the mask.\n\n" "\t\tid: 1, name: fix_size_alloc_test\n" "\t\tid: 2, name: full_fit_alloc_test\n" @@ -58,10 +58,9 @@ __param(int, run_test_mask, INT_MAX, ); /* - * Read write semaphore for synchronization of setup - * phase that is done in main thread and workers. + * This is for synchronization of setup phase. */ -static DECLARE_RWSEM(prepare_for_test_rwsem); +DEFINE_STATIC_SRCU(prepare_for_test_srcu); /* * Completion tracking for worker threads. @@ -397,25 +396,27 @@ cleanup: struct test_case_desc { const char *test_name; int (*test_func)(void); + bool xfail; }; static struct test_case_desc test_case_array[] = { - { "fix_size_alloc_test", fix_size_alloc_test }, - { "full_fit_alloc_test", full_fit_alloc_test }, - { "long_busy_list_alloc_test", long_busy_list_alloc_test }, - { "random_size_alloc_test", random_size_alloc_test }, - { "fix_align_alloc_test", fix_align_alloc_test }, - { "random_size_align_alloc_test", random_size_align_alloc_test }, - { "align_shift_alloc_test", align_shift_alloc_test }, - { "pcpu_alloc_test", pcpu_alloc_test }, - { "kvfree_rcu_1_arg_vmalloc_test", kvfree_rcu_1_arg_vmalloc_test }, - { "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test }, - { "vm_map_ram_test", vm_map_ram_test }, + { "fix_size_alloc_test", fix_size_alloc_test, }, + { "full_fit_alloc_test", full_fit_alloc_test, }, + { "long_busy_list_alloc_test", long_busy_list_alloc_test, }, + { "random_size_alloc_test", random_size_alloc_test, }, + { "fix_align_alloc_test", fix_align_alloc_test, }, + { "random_size_align_alloc_test", random_size_align_alloc_test, }, + { "align_shift_alloc_test", align_shift_alloc_test, true }, + { "pcpu_alloc_test", pcpu_alloc_test, }, + { "kvfree_rcu_1_arg_vmalloc_test", kvfree_rcu_1_arg_vmalloc_test, }, + { "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test, }, + { "vm_map_ram_test", vm_map_ram_test, }, /* Add a new test case here. */ }; struct test_case_data { int test_failed; + int test_xfailed; int test_passed; u64 time; }; @@ -445,7 +446,7 @@ static int test_func(void *private) { struct test_driver *t = private; int random_array[ARRAY_SIZE(test_case_array)]; - int index, i, j; + int index, i, j, ret; ktime_t kt; u64 delta; @@ -458,7 +459,7 @@ static int test_func(void *private) /* * Block until initialization is done. 
*/ - down_read(&prepare_for_test_rwsem); + synchronize_srcu(&prepare_for_test_srcu); t->start = get_cycles(); for (i = 0; i < ARRAY_SIZE(test_case_array); i++) { @@ -469,11 +470,14 @@ static int test_func(void *private) */ if (!((run_test_mask & (1 << index)) >> index)) continue; - kt = ktime_get(); for (j = 0; j < test_repeat_count; j++) { - if (!test_case_array[index].test_func()) + ret = test_case_array[index].test_func(); + + if (!ret && !test_case_array[index].xfail) t->data[index].test_passed++; + else if (ret && test_case_array[index].xfail) + t->data[index].test_xfailed++; else t->data[index].test_failed++; } @@ -487,8 +491,6 @@ static int test_func(void *private) t->data[index].time = delta; } t->stop = get_cycles(); - - up_read(&prepare_for_test_rwsem); test_report_one_done(); /* @@ -526,7 +528,7 @@ init_test_configuration(void) static void do_concurrent_test(void) { - int i, ret; + int i, ret, idx; /* * Set some basic configurations plus sanity check. @@ -538,7 +540,7 @@ static void do_concurrent_test(void) /* * Put on hold all workers. */ - down_write(&prepare_for_test_rwsem); + idx = srcu_read_lock(&prepare_for_test_srcu); for (i = 0; i < nr_threads; i++) { struct test_driver *t = &tdriver[i]; @@ -555,7 +557,7 @@ static void do_concurrent_test(void) /* * Now let the workers do their job. */ - up_write(&prepare_for_test_rwsem); + srcu_read_unlock(&prepare_for_test_srcu, idx); /* * Sleep quiet until all workers are done with 1 second @@ -579,10 +581,11 @@ static void do_concurrent_test(void) continue; pr_info( - "Summary: %s passed: %d failed: %d repeat: %d loops: %d avg: %llu usec\n", + "Summary: %s passed: %d failed: %d xfailed: %d repeat: %d loops: %d avg: %llu usec\n", test_case_array[j].test_name, t->data[j].test_passed, t->data[j].test_failed, + t->data[j].test_xfailed, test_repeat_count, test_loop_count, t->data[j].time); } @@ -594,13 +597,18 @@ static void do_concurrent_test(void) kvfree(tdriver); } -static int vmalloc_test_init(void) +static int __init vmalloc_test_init(void) { do_concurrent_test(); - return -EAGAIN; /* Fail will directly unload the module */ + /* Fail will directly unload the module */ + return IS_BUILTIN(CONFIG_TEST_VMALLOC) ? 
0:-EAGAIN; } +#ifdef MODULE module_init(vmalloc_test_init) +#else +late_initcall(vmalloc_test_init); +#endif MODULE_LICENSE("GPL"); MODULE_AUTHOR("Uladzislau Rezki"); diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 080a39d22e73..5ca0aefee9aa 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -1040,6 +1040,7 @@ static noinline void check_xa_alloc_3(struct xarray *xa, unsigned int base) unsigned int i, id; unsigned long index; void *entry; + int ret; XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(1), limit, &next, GFP_KERNEL) != 0); @@ -1059,7 +1060,7 @@ static noinline void check_xa_alloc_3(struct xarray *xa, unsigned int base) else entry = xa_mk_index(i - 0x3fff); XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, entry, limit, - &next, GFP_KERNEL) != (id == 1)); + &next, GFP_KERNEL) != 0); XA_BUG_ON(xa, xa_mk_index(id) != entry); } @@ -1072,7 +1073,7 @@ static noinline void check_xa_alloc_3(struct xarray *xa, unsigned int base) xa_limit_32b, &next, GFP_KERNEL) != 0); XA_BUG_ON(xa, id != UINT_MAX); XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(base), - xa_limit_32b, &next, GFP_KERNEL) != 1); + xa_limit_32b, &next, GFP_KERNEL) != 0); XA_BUG_ON(xa, id != base); XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(base + 1), xa_limit_32b, &next, GFP_KERNEL) != 0); @@ -1080,7 +1081,19 @@ static noinline void check_xa_alloc_3(struct xarray *xa, unsigned int base) xa_for_each(xa, index, entry) xa_erase_index(xa, index); + XA_BUG_ON(xa, !xa_empty(xa)); + /* check wrap-around return of __xa_alloc_cyclic() */ + next = UINT_MAX; + XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(UINT_MAX), + xa_limit_32b, &next, GFP_KERNEL) != 0); + xa_lock(xa); + ret = __xa_alloc_cyclic(xa, &id, xa_mk_index(base), xa_limit_32b, + &next, GFP_KERNEL); + xa_unlock(xa); + XA_BUG_ON(xa, ret != 1); + xa_for_each(xa, index, entry) + xa_erase_index(xa, index); XA_BUG_ON(xa, !xa_empty(xa)); } diff --git a/lib/tests/Makefile b/lib/tests/Makefile index 5a4794c1826e..fa6d728a8b5b 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -10,7 +10,6 @@ obj-$(CONFIG_BLACKHOLE_DEV_KUNIT_TEST) += blackhole_dev_kunit.o obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o obj-$(CONFIG_CPUMASK_KUNIT_TEST) += cpumask_kunit.o -obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced) CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-overread) CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-truncation) @@ -35,7 +34,9 @@ obj-$(CONFIG_MEMCPY_KUNIT_TEST) += memcpy_kunit.o CFLAGS_overflow_kunit.o = $(call cc-disable-warning, tautological-constant-out-of-range-compare) obj-$(CONFIG_OVERFLOW_KUNIT_TEST) += overflow_kunit.o obj-$(CONFIG_PRINTF_KUNIT_TEST) += printf_kunit.o +obj-$(CONFIG_RANDSTRUCT_KUNIT_TEST) += randstruct_kunit.o obj-$(CONFIG_SCANF_KUNIT_TEST) += scanf_kunit.o +obj-$(CONFIG_SEQ_BUF_KUNIT_TEST) += seq_buf_kunit.o obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o obj-$(CONFIG_SLUB_KUNIT_TEST) += slub_kunit.o obj-$(CONFIG_TEST_SORT) += test_sort.o @@ -45,5 +46,6 @@ obj-$(CONFIG_STRING_KUNIT_TEST) += string_kunit.o obj-$(CONFIG_STRING_HELPERS_KUNIT_TEST) += string_helpers_kunit.o obj-$(CONFIG_USERCOPY_KUNIT_TEST) += usercopy_kunit.o obj-$(CONFIG_UTIL_MACROS_KUNIT) += util_macros_kunit.o +obj-$(CONFIG_RATELIMIT_KUNIT_TEST) += test_ratelimit.o obj-$(CONFIG_TEST_RUNTIME_MODULE) += module/ diff --git a/lib/tests/fortify_kunit.c b/lib/tests/fortify_kunit.c index 29ffc62a71e3..fc9c76f026d6 
100644 --- a/lib/tests/fortify_kunit.c +++ b/lib/tests/fortify_kunit.c @@ -1003,8 +1003,8 @@ static void fortify_test_memcmp(struct kunit *test) { char one[] = "My mind is going ..."; char two[] = "My mind is going ... I can feel it."; - size_t one_len = sizeof(one) - 1; - size_t two_len = sizeof(two) - 1; + volatile size_t one_len = sizeof(one) - 1; + volatile size_t two_len = sizeof(two) - 1; OPTIMIZER_HIDE_VAR(one_len); OPTIMIZER_HIDE_VAR(two_len); diff --git a/lib/tests/longest_symbol_kunit.c b/lib/tests/longest_symbol_kunit.c index e3c28ff1807f..9b4de3050ba7 100644 --- a/lib/tests/longest_symbol_kunit.c +++ b/lib/tests/longest_symbol_kunit.c @@ -3,8 +3,7 @@ * Test the longest symbol length. Execute with: * ./tools/testing/kunit/kunit.py run longest-symbol * --arch=x86_64 --kconfig_add CONFIG_KPROBES=y --kconfig_add CONFIG_MODULES=y - * --kconfig_add CONFIG_RETPOLINE=n --kconfig_add CONFIG_CFI_CLANG=n - * --kconfig_add CONFIG_MITIGATION_RETPOLINE=n + * --kconfig_add CONFIG_CPU_MITIGATIONS=n --kconfig_add CONFIG_GCOV_KERNEL=n */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/lib/tests/overflow_kunit.c b/lib/tests/overflow_kunit.c index 894691b4411a..19cb03b25dc5 100644 --- a/lib/tests/overflow_kunit.c +++ b/lib/tests/overflow_kunit.c @@ -1210,6 +1210,10 @@ static void DEFINE_FLEX_test(struct kunit *test) KUNIT_EXPECT_EQ(test, __struct_size(empty->array), 0); KUNIT_EXPECT_EQ(test, __member_size(empty->array), 0); + KUNIT_EXPECT_EQ(test, STACK_FLEX_ARRAY_SIZE(two, array), 2); + KUNIT_EXPECT_EQ(test, STACK_FLEX_ARRAY_SIZE(eight, array), 8); + KUNIT_EXPECT_EQ(test, STACK_FLEX_ARRAY_SIZE(empty, array), 0); + /* If __counted_by is not being used, array size will have the on-stack size. */ if (!IS_ENABLED(CONFIG_CC_HAS_COUNTED_BY)) array_size_override = 2 * sizeof(s16); diff --git a/lib/tests/printf_kunit.c b/lib/tests/printf_kunit.c index 2c9f6170bacd..bc54cca2d7a6 100644 --- a/lib/tests/printf_kunit.c +++ b/lib/tests/printf_kunit.c @@ -701,21 +701,46 @@ static void fwnode_pointer(struct kunit *kunittest) software_node_unregister_node_group(group); } +struct fourcc_struct { + u32 code; + const char *str; +}; + +static void fourcc_pointer_test(struct kunit *kunittest, const struct fourcc_struct *fc, + size_t n, const char *fmt) +{ + size_t i; + + for (i = 0; i < n; i++) + test(fc[i].str, fmt, &fc[i].code); +} + static void fourcc_pointer(struct kunit *kunittest) { - struct { - u32 code; - char *str; - } const try[] = { + static const struct fourcc_struct try_cc[] = { { 0x3231564e, "NV12 little-endian (0x3231564e)", }, { 0xb231564e, "NV12 big-endian (0xb231564e)", }, { 0x10111213, ".... 
little-endian (0x10111213)", }, { 0x20303159, "Y10 little-endian (0x20303159)", }, }; - unsigned int i; + static const struct fourcc_struct try_ch[] = { + { 0x41424344, "ABCD (0x41424344)", }, + }; + static const struct fourcc_struct try_chR[] = { + { 0x41424344, "DCBA (0x44434241)", }, + }; + static const struct fourcc_struct try_cl[] = { + { (__force u32)cpu_to_le32(0x41424344), "ABCD (0x41424344)", }, + }; + static const struct fourcc_struct try_cb[] = { + { (__force u32)cpu_to_be32(0x41424344), "ABCD (0x41424344)", }, + }; - for (i = 0; i < ARRAY_SIZE(try); i++) - test(try[i].str, "%p4cc", &try[i].code); + fourcc_pointer_test(kunittest, try_cc, ARRAY_SIZE(try_cc), "%p4cc"); + fourcc_pointer_test(kunittest, try_ch, ARRAY_SIZE(try_ch), "%p4ch"); + fourcc_pointer_test(kunittest, try_chR, ARRAY_SIZE(try_chR), "%p4chR"); + fourcc_pointer_test(kunittest, try_cl, ARRAY_SIZE(try_cl), "%p4cl"); + fourcc_pointer_test(kunittest, try_cb, ARRAY_SIZE(try_cb), "%p4cb"); } static void diff --git a/lib/tests/randstruct_kunit.c b/lib/tests/randstruct_kunit.c new file mode 100644 index 000000000000..f3a2d63c4cfb --- /dev/null +++ b/lib/tests/randstruct_kunit.c @@ -0,0 +1,334 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test cases for struct randomization, i.e. CONFIG_RANDSTRUCT=y. + * + * For example, see: + * "Running tests with kunit_tool" at Documentation/dev-tools/kunit/start.rst + * ./tools/testing/kunit/kunit.py run randstruct [--raw_output] \ + * [--make_option LLVM=1] \ + * --kconfig_add CONFIG_RANDSTRUCT_FULL=y + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <kunit/test.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> + +#define DO_MANY_MEMBERS(macro, args...) \ + macro(a, args) \ + macro(b, args) \ + macro(c, args) \ + macro(d, args) \ + macro(e, args) \ + macro(f, args) \ + macro(g, args) \ + macro(h, args) + +#define do_enum(x, ignored) MEMBER_NAME_ ## x, +enum randstruct_member_names { + DO_MANY_MEMBERS(do_enum) + MEMBER_NAME_MAX, +}; +/* Make sure the macros are working: want 8 test members. */ +_Static_assert(MEMBER_NAME_MAX == 8, "Number of test members changed?!"); + +/* This is an unsigned long member to match the function pointer size */ +#define unsigned_long_member(x, ignored) unsigned long x; +struct randstruct_untouched { + DO_MANY_MEMBERS(unsigned_long_member) +}; + +/* Struct explicitly marked with __randomize_layout. */ +struct randstruct_shuffled { + DO_MANY_MEMBERS(unsigned_long_member) +} __randomize_layout; +#undef unsigned_long_member + +/* Struct implicitly randomized from being all func ptrs. */ +#define func_member(x, ignored) size_t (*x)(int); +struct randstruct_funcs_untouched { + DO_MANY_MEMBERS(func_member) +} __no_randomize_layout; + +struct randstruct_funcs_shuffled { + DO_MANY_MEMBERS(func_member) +}; + +#define func_body(x, ignored) \ +static noinline size_t func_##x(int arg) \ +{ \ + return offsetof(struct randstruct_funcs_untouched, x); \ +} +DO_MANY_MEMBERS(func_body) + +/* Various mixed types. 
*/ +#define mixed_members \ + bool a; \ + short b; \ + unsigned int c __aligned(16); \ + size_t d; \ + char e; \ + u64 f; \ + union { \ + struct randstruct_shuffled shuffled; \ + uintptr_t g; \ + }; \ + union { \ + void *ptr; \ + char h; \ + }; + +struct randstruct_mixed_untouched { + mixed_members +}; + +struct randstruct_mixed_shuffled { + mixed_members +} __randomize_layout; +#undef mixed_members + +struct contains_randstruct_untouched { + int before; + struct randstruct_untouched untouched; + int after; +}; + +struct contains_randstruct_shuffled { + int before; + struct randstruct_shuffled shuffled; + int after; +}; + +struct contains_func_untouched { + struct randstruct_funcs_shuffled inner; + DO_MANY_MEMBERS(func_member) +} __no_randomize_layout; + +struct contains_func_shuffled { + struct randstruct_funcs_shuffled inner; + DO_MANY_MEMBERS(func_member) +}; +#undef func_member + +#define check_mismatch(x, untouched, shuffled) \ + if (offsetof(untouched, x) != offsetof(shuffled, x)) \ + mismatches++; \ + kunit_info(test, #shuffled "::" #x " @ %zu (vs %zu)\n", \ + offsetof(shuffled, x), \ + offsetof(untouched, x)); \ + +#define check_pair(outcome, untouched, shuffled, checker...) \ + mismatches = 0; \ + DO_MANY_MEMBERS(checker, untouched, shuffled) \ + kunit_info(test, "Differing " #untouched " vs " #shuffled " member positions: %d\n", \ + mismatches); \ + KUNIT_##outcome##_MSG(test, mismatches, 0, \ + #untouched " vs " #shuffled " layouts: unlucky or broken?\n"); + +static void randstruct_layout_same(struct kunit *test) +{ + int mismatches; + + check_pair(EXPECT_EQ, struct randstruct_untouched, struct randstruct_untouched, + check_mismatch) + check_pair(EXPECT_GT, struct randstruct_untouched, struct randstruct_shuffled, + check_mismatch) +} + +static void randstruct_layout_mixed(struct kunit *test) +{ + int mismatches; + + check_pair(EXPECT_EQ, struct randstruct_mixed_untouched, struct randstruct_mixed_untouched, + check_mismatch) + check_pair(EXPECT_GT, struct randstruct_mixed_untouched, struct randstruct_mixed_shuffled, + check_mismatch) +} + +static void randstruct_layout_fptr(struct kunit *test) +{ + int mismatches; + + check_pair(EXPECT_EQ, struct randstruct_untouched, struct randstruct_untouched, + check_mismatch) + check_pair(EXPECT_GT, struct randstruct_untouched, struct randstruct_funcs_shuffled, + check_mismatch) + check_pair(EXPECT_GT, struct randstruct_funcs_untouched, struct randstruct_funcs_shuffled, + check_mismatch) +} + +#define check_mismatch_prefixed(x, prefix, untouched, shuffled) \ + check_mismatch(prefix.x, untouched, shuffled) + +static void randstruct_layout_fptr_deep(struct kunit *test) +{ + int mismatches; + + if (IS_ENABLED(CONFIG_CC_IS_CLANG)) + kunit_skip(test, "Clang randstruct misses inner functions: https://github.com/llvm/llvm-project/issues/138355"); + + check_pair(EXPECT_EQ, struct contains_func_untouched, struct contains_func_untouched, + check_mismatch_prefixed, inner) + + check_pair(EXPECT_GT, struct contains_func_untouched, struct contains_func_shuffled, + check_mismatch_prefixed, inner) +} + +#undef check_pair +#undef check_mismatch + +#define check_mismatch(x, ignore) \ + KUNIT_EXPECT_EQ_MSG(test, untouched->x, shuffled->x, \ + "Mismatched member value in %s initializer\n", \ + name); + +static void test_check_init(struct kunit *test, const char *name, + struct randstruct_untouched *untouched, + struct randstruct_shuffled *shuffled) +{ + DO_MANY_MEMBERS(check_mismatch) +} + +static void test_check_mixed_init(struct kunit *test, const char *name, 
+ struct randstruct_mixed_untouched *untouched, + struct randstruct_mixed_shuffled *shuffled) +{ + DO_MANY_MEMBERS(check_mismatch) +} +#undef check_mismatch + +#define check_mismatch(x, ignore) \ + KUNIT_EXPECT_EQ_MSG(test, untouched->untouched.x, \ + shuffled->shuffled.x, \ + "Mismatched member value in %s initializer\n", \ + name); +static void test_check_contained_init(struct kunit *test, const char *name, + struct contains_randstruct_untouched *untouched, + struct contains_randstruct_shuffled *shuffled) +{ + DO_MANY_MEMBERS(check_mismatch) +} +#undef check_mismatch + +#define check_mismatch(x, ignore) \ + KUNIT_EXPECT_PTR_EQ_MSG(test, untouched->x, shuffled->x, \ + "Mismatched member value in %s initializer\n", \ + name); + +static void test_check_funcs_init(struct kunit *test, const char *name, + struct randstruct_funcs_untouched *untouched, + struct randstruct_funcs_shuffled *shuffled) +{ + DO_MANY_MEMBERS(check_mismatch) +} +#undef check_mismatch + +static void randstruct_initializers(struct kunit *test) +{ +#define init_members \ + .a = 1, \ + .b = 3, \ + .c = 5, \ + .d = 7, \ + .e = 11, \ + .f = 13, \ + .g = 17, \ + .h = 19, + struct randstruct_untouched untouched = { + init_members + }; + struct randstruct_shuffled shuffled = { + init_members + }; + struct randstruct_mixed_untouched mixed_untouched = { + init_members + }; + struct randstruct_mixed_shuffled mixed_shuffled = { + init_members + }; + struct contains_randstruct_untouched contains_untouched = { + .untouched = { + init_members + }, + }; + struct contains_randstruct_shuffled contains_shuffled = { + .shuffled = { + init_members + }, + }; +#define func_member(x, ignored) \ + .x = func_##x, + struct randstruct_funcs_untouched funcs_untouched = { + DO_MANY_MEMBERS(func_member) + }; + struct randstruct_funcs_shuffled funcs_shuffled = { + DO_MANY_MEMBERS(func_member) + }; + + test_check_init(test, "named", &untouched, &shuffled); + test_check_init(test, "unnamed", &untouched, + &(struct randstruct_shuffled){ + init_members + }); + + test_check_contained_init(test, "named", &contains_untouched, &contains_shuffled); + test_check_contained_init(test, "unnamed", &contains_untouched, + &(struct contains_randstruct_shuffled){ + .shuffled = (struct randstruct_shuffled){ + init_members + }, + }); + + test_check_contained_init(test, "named", &contains_untouched, &contains_shuffled); + test_check_contained_init(test, "unnamed copy", &contains_untouched, + &(struct contains_randstruct_shuffled){ + /* full struct copy initializer */ + .shuffled = shuffled, + }); + + test_check_mixed_init(test, "named", &mixed_untouched, &mixed_shuffled); + test_check_mixed_init(test, "unnamed", &mixed_untouched, + &(struct randstruct_mixed_shuffled){ + init_members + }); + + test_check_funcs_init(test, "named", &funcs_untouched, &funcs_shuffled); + test_check_funcs_init(test, "unnamed", &funcs_untouched, + &(struct randstruct_funcs_shuffled){ + DO_MANY_MEMBERS(func_member) + }); + +#undef func_member +#undef init_members +} + +static int randstruct_test_init(struct kunit *test) +{ + if (!IS_ENABLED(CONFIG_RANDSTRUCT)) + kunit_skip(test, "Not built with CONFIG_RANDSTRUCT=y"); + + return 0; +} + +static struct kunit_case randstruct_test_cases[] = { + KUNIT_CASE(randstruct_layout_same), + KUNIT_CASE(randstruct_layout_mixed), + KUNIT_CASE(randstruct_layout_fptr), + KUNIT_CASE(randstruct_layout_fptr_deep), + KUNIT_CASE(randstruct_initializers), + {} +}; + +static struct kunit_suite randstruct_test_suite = { + .name = "randstruct", + .init = 
randstruct_test_init, + .test_cases = randstruct_test_cases, +}; + +kunit_test_suites(&randstruct_test_suite); + +MODULE_DESCRIPTION("Test cases for struct randomization"); +MODULE_LICENSE("GPL"); diff --git a/lib/tests/seq_buf_kunit.c b/lib/tests/seq_buf_kunit.c new file mode 100644 index 000000000000..8a01579a978e --- /dev/null +++ b/lib/tests/seq_buf_kunit.c @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit tests for the seq_buf API + * + * Copyright (C) 2025, Google LLC. + */ + +#include <kunit/test.h> +#include <linux/seq_buf.h> + +static void seq_buf_init_test(struct kunit *test) +{ + char buf[32]; + struct seq_buf s; + + seq_buf_init(&s, buf, sizeof(buf)); + + KUNIT_EXPECT_EQ(test, s.size, 32); + KUNIT_EXPECT_EQ(test, s.len, 0); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_EQ(test, seq_buf_buffer_left(&s), 32); + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 0); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), ""); +} + +static void seq_buf_declare_test(struct kunit *test) +{ + DECLARE_SEQ_BUF(s, 24); + + KUNIT_EXPECT_EQ(test, s.size, 24); + KUNIT_EXPECT_EQ(test, s.len, 0); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_EQ(test, seq_buf_buffer_left(&s), 24); + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 0); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), ""); +} + +static void seq_buf_clear_test(struct kunit *test) +{ + DECLARE_SEQ_BUF(s, 128); + + seq_buf_puts(&s, "hello"); + KUNIT_EXPECT_EQ(test, s.len, 5); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "hello"); + + seq_buf_clear(&s); + + KUNIT_EXPECT_EQ(test, s.len, 0); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), ""); +} + +static void seq_buf_puts_test(struct kunit *test) +{ + DECLARE_SEQ_BUF(s, 16); + + seq_buf_puts(&s, "hello"); + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 5); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "hello"); + + seq_buf_puts(&s, " world"); + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 11); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "hello world"); +} + +static void seq_buf_puts_overflow_test(struct kunit *test) +{ + DECLARE_SEQ_BUF(s, 10); + + seq_buf_puts(&s, "123456789"); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 9); + + seq_buf_puts(&s, "0"); + KUNIT_EXPECT_TRUE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 10); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "123456789"); + + seq_buf_clear(&s); + KUNIT_EXPECT_EQ(test, s.len, 0); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), ""); +} + +static void seq_buf_putc_test(struct kunit *test) +{ + DECLARE_SEQ_BUF(s, 4); + + seq_buf_putc(&s, 'a'); + seq_buf_putc(&s, 'b'); + seq_buf_putc(&s, 'c'); + + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 3); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "abc"); + + seq_buf_putc(&s, 'd'); + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 4); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "abc"); + + seq_buf_putc(&s, 'e'); + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 4); + KUNIT_EXPECT_TRUE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "abc"); + + seq_buf_clear(&s); + KUNIT_EXPECT_EQ(test, s.len, 0); + 
KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), ""); +} + +static void seq_buf_printf_test(struct kunit *test) +{ + DECLARE_SEQ_BUF(s, 32); + + seq_buf_printf(&s, "hello %s", "world"); + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 11); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "hello world"); + + seq_buf_printf(&s, " %d", 123); + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 15); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "hello world 123"); +} + +static void seq_buf_printf_overflow_test(struct kunit *test) +{ + DECLARE_SEQ_BUF(s, 16); + + seq_buf_printf(&s, "%lu", 1234567890UL); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 10); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "1234567890"); + + seq_buf_printf(&s, "%s", "abcdefghij"); + KUNIT_EXPECT_TRUE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 16); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "1234567890abcde"); + + seq_buf_clear(&s); + KUNIT_EXPECT_EQ(test, s.len, 0); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), ""); +} + +static void seq_buf_get_buf_commit_test(struct kunit *test) +{ + DECLARE_SEQ_BUF(s, 16); + char *buf; + size_t len; + + len = seq_buf_get_buf(&s, &buf); + KUNIT_EXPECT_EQ(test, len, 16); + KUNIT_EXPECT_PTR_NE(test, buf, NULL); + + memcpy(buf, "hello", 5); + seq_buf_commit(&s, 5); + + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 5); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "hello"); + + len = seq_buf_get_buf(&s, &buf); + KUNIT_EXPECT_EQ(test, len, 11); + KUNIT_EXPECT_PTR_NE(test, buf, NULL); + + memcpy(buf, " worlds!", 8); + seq_buf_commit(&s, 6); + + KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 11); + KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s)); + KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "hello world"); + + len = seq_buf_get_buf(&s, &buf); + KUNIT_EXPECT_EQ(test, len, 5); + KUNIT_EXPECT_PTR_NE(test, buf, NULL); + + seq_buf_commit(&s, -1); + KUNIT_EXPECT_TRUE(test, seq_buf_has_overflowed(&s)); +} + +static struct kunit_case seq_buf_test_cases[] = { + KUNIT_CASE(seq_buf_init_test), + KUNIT_CASE(seq_buf_declare_test), + KUNIT_CASE(seq_buf_clear_test), + KUNIT_CASE(seq_buf_puts_test), + KUNIT_CASE(seq_buf_puts_overflow_test), + KUNIT_CASE(seq_buf_putc_test), + KUNIT_CASE(seq_buf_printf_test), + KUNIT_CASE(seq_buf_printf_overflow_test), + KUNIT_CASE(seq_buf_get_buf_commit_test), + {} +}; + +static struct kunit_suite seq_buf_test_suite = { + .name = "seq_buf", + .test_cases = seq_buf_test_cases, +}; + +kunit_test_suite(seq_buf_test_suite); + +MODULE_DESCRIPTION("Runtime test cases for seq_buf string API"); +MODULE_LICENSE("GPL"); diff --git a/lib/tests/slub_kunit.c b/lib/tests/slub_kunit.c index d47c472b0520..848b682a2d70 100644 --- a/lib/tests/slub_kunit.c +++ b/lib/tests/slub_kunit.c @@ -325,4 +325,5 @@ static struct kunit_suite test_suite = { }; kunit_test_suite(test_suite); +MODULE_DESCRIPTION("Kunit tests for slub allocator"); MODULE_LICENSE("GPL"); diff --git a/lib/tests/stackinit_kunit.c b/lib/tests/stackinit_kunit.c index 63aa78e6f5c1..ff2784769772 100644 --- a/lib/tests/stackinit_kunit.c +++ b/lib/tests/stackinit_kunit.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * Test cases for compiler-based stack variable zeroing via - * 
-ftrivial-auto-var-init={zero,pattern} or CONFIG_GCC_PLUGIN_STRUCTLEAK*. + * -ftrivial-auto-var-init={zero,pattern}. * For example, see: * "Running tests with kunit_tool" at Documentation/dev-tools/kunit/start.rst * ./tools/testing/kunit/kunit.py run stackinit [--raw_output] \ @@ -376,14 +376,6 @@ union test_small_end { # define USER_PASS XFAIL # define BYREF_PASS XFAIL # define STRONG_PASS XFAIL -#elif defined(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER) -# define USER_PASS WANT_SUCCESS -# define BYREF_PASS XFAIL -# define STRONG_PASS XFAIL -#elif defined(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF) -# define USER_PASS WANT_SUCCESS -# define BYREF_PASS WANT_SUCCESS -# define STRONG_PASS XFAIL #else # define USER_PASS WANT_SUCCESS # define BYREF_PASS WANT_SUCCESS diff --git a/lib/tests/test_bits.c b/lib/tests/test_bits.c index c7b38d91e1f1..ab88e50d2edf 100644 --- a/lib/tests/test_bits.c +++ b/lib/tests/test_bits.c @@ -5,7 +5,44 @@ #include <kunit/test.h> #include <linux/bits.h> +#include <linux/types.h> +#define assert_type(t, x) _Generic(x, t: x, default: 0) + +static_assert(assert_type(u8, BIT_U8(0)) == 1u); +static_assert(assert_type(u16, BIT_U16(0)) == 1u); +static_assert(assert_type(u32, BIT_U32(0)) == 1u); +static_assert(assert_type(u64, BIT_U64(0)) == 1ull); + +static_assert(assert_type(u8, BIT_U8(7)) == 0x80u); +static_assert(assert_type(u16, BIT_U16(15)) == 0x8000u); +static_assert(assert_type(u32, BIT_U32(31)) == 0x80000000u); +static_assert(assert_type(u64, BIT_U64(63)) == 0x8000000000000000ull); + +static_assert(assert_type(unsigned long, GENMASK(31, 0)) == U32_MAX); +static_assert(assert_type(unsigned long long, GENMASK_ULL(63, 0)) == U64_MAX); +static_assert(assert_type(u8, GENMASK_U8(7, 0)) == U8_MAX); +static_assert(assert_type(u16, GENMASK_U16(15, 0)) == U16_MAX); +static_assert(assert_type(u32, GENMASK_U32(31, 0)) == U32_MAX); +static_assert(assert_type(u64, GENMASK_U64(63, 0)) == U64_MAX); + +/* FIXME: add a test case written in asm for GENMASK() and GENMASK_ULL() */ + +static void __genmask_test(struct kunit *test) +{ + KUNIT_EXPECT_EQ(test, 1ul, __GENMASK(0, 0)); + KUNIT_EXPECT_EQ(test, 3ul, __GENMASK(1, 0)); + KUNIT_EXPECT_EQ(test, 6ul, __GENMASK(2, 1)); + KUNIT_EXPECT_EQ(test, 0xFFFFFFFFul, __GENMASK(31, 0)); +} + +static void __genmask_ull_test(struct kunit *test) +{ + KUNIT_EXPECT_EQ(test, 1ull, __GENMASK_ULL(0, 0)); + KUNIT_EXPECT_EQ(test, 3ull, __GENMASK_ULL(1, 0)); + KUNIT_EXPECT_EQ(test, 0x000000ffffe00000ull, __GENMASK_ULL(39, 21)); + KUNIT_EXPECT_EQ(test, 0xffffffffffffffffull, __GENMASK_ULL(63, 0)); +} static void genmask_test(struct kunit *test) { @@ -14,11 +51,21 @@ static void genmask_test(struct kunit *test) KUNIT_EXPECT_EQ(test, 6ul, GENMASK(2, 1)); KUNIT_EXPECT_EQ(test, 0xFFFFFFFFul, GENMASK(31, 0)); + KUNIT_EXPECT_EQ(test, 1u, GENMASK_U8(0, 0)); + KUNIT_EXPECT_EQ(test, 3u, GENMASK_U16(1, 0)); + KUNIT_EXPECT_EQ(test, 0x10000, GENMASK_U32(16, 16)); + #ifdef TEST_GENMASK_FAILURES /* these should fail compilation */ GENMASK(0, 1); GENMASK(0, 10); GENMASK(9, 10); + + GENMASK_U32(0, 31); + GENMASK_U64(64, 0); + GENMASK_U32(32, 0); + GENMASK_U16(16, 0); + GENMASK_U8(8, 0); #endif @@ -93,6 +140,8 @@ static void genmask_input_check_test(struct kunit *test) static struct kunit_case bits_test_cases[] = { + KUNIT_CASE(__genmask_test), + KUNIT_CASE(__genmask_ull_test), KUNIT_CASE(genmask_test), KUNIT_CASE(genmask_ull_test), KUNIT_CASE(genmask_u128_test), diff --git a/lib/tests/test_ratelimit.c b/lib/tests/test_ratelimit.c new file mode 100644 index 000000000000..bfaeca49304a --- 
/dev/null +++ b/lib/tests/test_ratelimit.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <kunit/test.h> + +#include <linux/ratelimit.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/cpumask.h> + +/* a simple boot-time regression test */ + +#define TESTRL_INTERVAL (5 * HZ) +static DEFINE_RATELIMIT_STATE(testrl, TESTRL_INTERVAL, 3); + +#define test_ratelimited(test, expected) \ + KUNIT_ASSERT_EQ(test, ___ratelimit(&testrl, "test_ratelimit_smoke"), (expected)) + +static void test_ratelimit_smoke(struct kunit *test) +{ + // Check settings. + KUNIT_ASSERT_GE(test, TESTRL_INTERVAL, 100); + + // Test normal operation. + test_ratelimited(test, true); + test_ratelimited(test, true); + test_ratelimited(test, true); + test_ratelimited(test, false); + + schedule_timeout_idle(TESTRL_INTERVAL / 2); + test_ratelimited(test, false); + + schedule_timeout_idle(TESTRL_INTERVAL * 3 / 4); + test_ratelimited(test, true); + + schedule_timeout_idle(2 * TESTRL_INTERVAL); + test_ratelimited(test, true); + test_ratelimited(test, true); + + schedule_timeout_idle(TESTRL_INTERVAL / 2 ); + test_ratelimited(test, true); + schedule_timeout_idle(TESTRL_INTERVAL * 3 / 4); + test_ratelimited(test, true); + test_ratelimited(test, true); + test_ratelimited(test, true); + test_ratelimited(test, false); + + // Test disabling. + testrl.burst = 0; + test_ratelimited(test, false); + testrl.burst = 2; + testrl.interval = 0; + test_ratelimited(test, true); + test_ratelimited(test, true); + test_ratelimited(test, true); + test_ratelimited(test, true); + test_ratelimited(test, true); + test_ratelimited(test, true); + test_ratelimited(test, true); + + // Testing re-enabling. + testrl.interval = TESTRL_INTERVAL; + test_ratelimited(test, true); + test_ratelimited(test, true); + test_ratelimited(test, false); + test_ratelimited(test, false); +} + +static struct ratelimit_state stressrl = RATELIMIT_STATE_INIT_FLAGS("stressrl", HZ / 10, 3, + RATELIMIT_MSG_ON_RELEASE); + +static int doneflag; +static const int stress_duration = 2 * HZ; + +struct stress_kthread { + unsigned long nattempts; + unsigned long nunlimited; + unsigned long nlimited; + unsigned long nmissed; + struct task_struct *tp; +}; + +static int test_ratelimit_stress_child(void *arg) +{ + struct stress_kthread *sktp = arg; + + set_user_nice(current, MAX_NICE); + WARN_ON_ONCE(!sktp->tp); + + while (!READ_ONCE(doneflag)) { + sktp->nattempts++; + if (___ratelimit(&stressrl, __func__)) + sktp->nunlimited++; + else + sktp->nlimited++; + cond_resched(); + } + + sktp->nmissed = ratelimit_state_reset_miss(&stressrl); + return 0; +} + +static void test_ratelimit_stress(struct kunit *test) +{ + int i; + const int n_stress_kthread = cpumask_weight(cpu_online_mask); + struct stress_kthread skt = { 0 }; + struct stress_kthread *sktp = kcalloc(n_stress_kthread, sizeof(*sktp), GFP_KERNEL); + + KUNIT_EXPECT_NOT_NULL_MSG(test, sktp, "Memory allocation failure"); + for (i = 0; i < n_stress_kthread; i++) { + sktp[i].tp = kthread_run(test_ratelimit_stress_child, &sktp[i], "%s/%i", + "test_ratelimit_stress_child", i); + KUNIT_EXPECT_NOT_NULL_MSG(test, sktp, "kthread creation failure"); + pr_alert("Spawned test_ratelimit_stress_child %d\n", i); + } + schedule_timeout_idle(stress_duration); + WRITE_ONCE(doneflag, 1); + for (i = 0; i < n_stress_kthread; i++) { + kthread_stop(sktp[i].tp); + skt.nattempts += sktp[i].nattempts; + skt.nunlimited += sktp[i].nunlimited; + skt.nlimited += sktp[i].nlimited; + skt.nmissed += sktp[i].nmissed; + } + 
KUNIT_ASSERT_EQ_MSG(test, skt.nunlimited + skt.nlimited, skt.nattempts, + "Outcomes not equal to attempts"); + KUNIT_ASSERT_EQ_MSG(test, skt.nlimited, skt.nmissed, "Misses not equal to limits"); +} + +static struct kunit_case ratelimit_test_cases[] = { + KUNIT_CASE_SLOW(test_ratelimit_smoke), + KUNIT_CASE_SLOW(test_ratelimit_stress), + {} +}; + +static struct kunit_suite ratelimit_test_suite = { + .name = "lib_ratelimit", + .test_cases = ratelimit_test_cases, +}; + +kunit_test_suites(&ratelimit_test_suite); + +MODULE_DESCRIPTION("___ratelimit() KUnit test suite"); +MODULE_LICENSE("GPL"); diff --git a/lib/tests/usercopy_kunit.c b/lib/tests/usercopy_kunit.c index 77fa00a13df7..80f8abe10968 100644 --- a/lib/tests/usercopy_kunit.c +++ b/lib/tests/usercopy_kunit.c @@ -27,6 +27,7 @@ !defined(CONFIG_MICROBLAZE) && \ !defined(CONFIG_NIOS2) && \ !defined(CONFIG_PPC32) && \ + !defined(CONFIG_SPARC32) && \ !defined(CONFIG_SUPERH)) # define TEST_U64 #endif diff --git a/lib/ubsan.c b/lib/ubsan.c index cdc1d31c3821..a6ca235dd714 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -19,13 +19,13 @@ #include "ubsan.h" -#ifdef CONFIG_UBSAN_TRAP +#if defined(CONFIG_UBSAN_TRAP) || defined(CONFIG_UBSAN_KVM_EL2) /* * Only include matches for UBSAN checks that are actually compiled in. * The mappings of struct SanitizerKind (the -fsanitize=xxx args) to * enum SanitizerHandler (the traps) in Clang is in clang/lib/CodeGen/. */ -const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type) +const char *report_ubsan_failure(u32 check_type) { switch (check_type) { #ifdef CONFIG_UBSAN_BOUNDS @@ -97,7 +97,9 @@ const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type) } } -#else +#endif + +#ifndef CONFIG_UBSAN_TRAP static const char * const type_check_kinds[] = { "load of", "store to", diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c index 9308bcfb2ad5..dfb4f2358cab 100644 --- a/lib/ucs2_string.c +++ b/lib/ucs2_string.c @@ -165,4 +165,5 @@ ucs2_as_utf8(u8 *dest, const ucs2_char_t *src, unsigned long maxlength) } EXPORT_SYMBOL(ucs2_as_utf8); +MODULE_DESCRIPTION("UCS2 string handling"); MODULE_LICENSE("GPL v2"); diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c index c715e217ec65..3693c6caf2c4 100644 --- a/lib/vdso/datastore.c +++ b/lib/vdso/datastore.c @@ -99,7 +99,8 @@ const struct vm_special_mapping vdso_vvar_mapping = { struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr) { return _install_special_mapping(mm, addr, VDSO_NR_PAGES * PAGE_SIZE, - VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP | VM_PFNMAP, + VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP | + VM_PFNMAP | VM_SEALED_SYSMAP, &vdso_vvar_mapping); } diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 93ef801a97ef..02ea19f67164 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -2,6 +2,7 @@ /* * Generic userspace implementations of gettimeofday() and similar. */ +#include <vdso/auxclock.h> #include <vdso/datapage.h> #include <vdso/helpers.h> @@ -71,6 +72,42 @@ static inline bool vdso_cycles_ok(u64 cycles) } #endif +static __always_inline bool vdso_clockid_valid(clockid_t clock) +{ + /* Check for negative values or invalid clocks */ + return likely((u32) clock <= CLOCK_AUX_LAST); +} + +/* + * Must not be invoked within the sequence read section as a race inside + * that loop could result in __iter_div_u64_rem() being extremely slow. 
+ */ +static __always_inline void vdso_set_timespec(struct __kernel_timespec *ts, u64 sec, u64 ns) +{ + ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; +} + +static __always_inline +bool vdso_get_timestamp(const struct vdso_time_data *vd, const struct vdso_clock *vc, + unsigned int clkidx, u64 *sec, u64 *ns) +{ + const struct vdso_timestamp *vdso_ts = &vc->basetime[clkidx]; + u64 cycles; + + if (unlikely(!vdso_clocksource_ok(vc))) + return false; + + cycles = __arch_get_hw_counter(vc->clock_mode, vd); + if (unlikely(!vdso_cycles_ok(cycles))) + return false; + + *ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec); + *sec = vdso_ts->sec; + + return true; +} + #ifdef CONFIG_TIME_NS #ifdef CONFIG_GENERIC_VDSO_DATA_STORE @@ -82,48 +119,35 @@ const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_tim #endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ static __always_inline -int do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, - clockid_t clk, struct __kernel_timespec *ts) +bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { const struct vdso_time_data *vd = __arch_get_vdso_u_timens_data(vdns); const struct timens_offset *offs = &vcns->offset[clk]; const struct vdso_clock *vc = vd->clock_data; - const struct vdso_timestamp *vdso_ts; - u64 cycles, ns; u32 seq; s64 sec; + u64 ns; if (clk != CLOCK_MONOTONIC_RAW) vc = &vc[CS_HRES_COARSE]; else vc = &vc[CS_RAW]; - vdso_ts = &vc->basetime[clk]; do { seq = vdso_read_begin(vc); - if (unlikely(!vdso_clocksource_ok(vc))) - return -1; - - cycles = __arch_get_hw_counter(vc->clock_mode, vd); - if (unlikely(!vdso_cycles_ok(cycles))) - return -1; - ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec); - sec = vdso_ts->sec; + if (!vdso_get_timestamp(vd, vc, clk, &sec, &ns)) + return false; } while (unlikely(vdso_read_retry(vc, seq))); /* Add the namespace offset */ sec += offs->sec; ns += offs->nsec; - /* - * Do this outside the loop: a race inside the loop could result - * in __iter_div_u64_rem() being extremely slow. 
- */ - ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; + vdso_set_timespec(ts, sec, ns); - return 0; + return true; } #else static __always_inline @@ -133,24 +157,23 @@ const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_tim } static __always_inline -int do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, - clockid_t clk, struct __kernel_timespec *ts) +bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { - return -EINVAL; + return false; } #endif static __always_inline -int do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, - clockid_t clk, struct __kernel_timespec *ts) +bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, + clockid_t clk, struct __kernel_timespec *ts) { - const struct vdso_timestamp *vdso_ts = &vc->basetime[clk]; - u64 cycles, sec, ns; + u64 sec, ns; u32 seq; /* Allows to compile the high resolution parts out */ if (!__arch_vdso_hres_capable()) - return -1; + return false; do { /* @@ -172,30 +195,19 @@ int do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, } smp_rmb(); - if (unlikely(!vdso_clocksource_ok(vc))) - return -1; - - cycles = __arch_get_hw_counter(vc->clock_mode, vd); - if (unlikely(!vdso_cycles_ok(cycles))) - return -1; - ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec); - sec = vdso_ts->sec; + if (!vdso_get_timestamp(vd, vc, clk, &sec, &ns)) + return false; } while (unlikely(vdso_read_retry(vc, seq))); - /* - * Do this outside the loop: a race inside the loop could result - * in __iter_div_u64_rem() being extremely slow. - */ - ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; + vdso_set_timespec(ts, sec, ns); - return 0; + return true; } #ifdef CONFIG_TIME_NS static __always_inline -int do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, - clockid_t clk, struct __kernel_timespec *ts) +bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { const struct vdso_time_data *vd = __arch_get_vdso_u_timens_data(vdns); const struct timens_offset *offs = &vcns->offset[clk]; @@ -217,26 +229,22 @@ int do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock sec += offs->sec; nsec += offs->nsec; - /* - * Do this outside the loop: a race inside the loop could result - * in __iter_div_u64_rem() being extremely slow. 
- */ - ts->tv_sec = sec + __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); - ts->tv_nsec = nsec; - return 0; + vdso_set_timespec(ts, sec, nsec); + + return true; } #else static __always_inline -int do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, - clockid_t clk, struct __kernel_timespec *ts) +bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { - return -1; + return false; } #endif static __always_inline -int do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, - clockid_t clk, struct __kernel_timespec *ts) +bool do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, + clockid_t clk, struct __kernel_timespec *ts) { const struct vdso_timestamp *vdso_ts = &vc->basetime[clk]; u32 seq; @@ -258,19 +266,60 @@ int do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, ts->tv_nsec = vdso_ts->nsec; } while (unlikely(vdso_read_retry(vc, seq))); - return 0; + return true; +} + +static __always_inline +bool do_aux(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *ts) +{ + const struct vdso_clock *vc; + u32 seq, idx; + u64 sec, ns; + + if (!IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) + return false; + + idx = clock - CLOCK_AUX; + vc = &vd->aux_clock_data[idx]; + + do { + /* + * Open coded function vdso_read_begin() to handle + * VDSO_CLOCK_TIMENS. See comment in do_hres(). + */ + while ((seq = READ_ONCE(vc->seq)) & 1) { + if (IS_ENABLED(CONFIG_TIME_NS) && vc->clock_mode == VDSO_CLOCKMODE_TIMENS) { + vd = __arch_get_vdso_u_timens_data(vd); + vc = &vd->aux_clock_data[idx]; + /* Re-read from the real time data page */ + continue; + } + cpu_relax(); + } + smp_rmb(); + + /* Auxclock disabled? 
*/ + if (vc->clock_mode == VDSO_CLOCKMODE_NONE) + return false; + + if (!vdso_get_timestamp(vd, vc, VDSO_BASE_AUX, &sec, &ns)) + return false; + } while (unlikely(vdso_read_retry(vc, seq))); + + vdso_set_timespec(ts, sec, ns); + + return true; } -static __always_inline int +static __always_inline bool __cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *ts) { const struct vdso_clock *vc = vd->clock_data; u32 msk; - /* Check for negative values or invalid clocks */ - if (unlikely((u32) clock >= MAX_CLOCKS)) - return -1; + if (!vdso_clockid_valid(clock)) + return false; /* * Convert the clockid to a bitmask and use it to check which @@ -283,8 +332,10 @@ __cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock, return do_coarse(vd, &vc[CS_HRES_COARSE], clock, ts); else if (msk & VDSO_RAW) vc = &vc[CS_RAW]; + else if (msk & VDSO_AUX) + return do_aux(vd, clock, ts); else - return -1; + return false; return do_hres(vd, vc, clock, ts); } @@ -293,9 +344,11 @@ static __maybe_unused int __cvdso_clock_gettime_data(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *ts) { - int ret = __cvdso_clock_gettime_common(vd, clock, ts); + bool ok; + + ok = __cvdso_clock_gettime_common(vd, clock, ts); - if (unlikely(ret)) + if (unlikely(!ok)) return clock_gettime_fallback(clock, ts); return 0; } @@ -312,18 +365,18 @@ __cvdso_clock_gettime32_data(const struct vdso_time_data *vd, clockid_t clock, struct old_timespec32 *res) { struct __kernel_timespec ts; - int ret; + bool ok; - ret = __cvdso_clock_gettime_common(vd, clock, &ts); + ok = __cvdso_clock_gettime_common(vd, clock, &ts); - if (unlikely(ret)) + if (unlikely(!ok)) return clock_gettime32_fallback(clock, res); - /* For ret == 0 */ + /* For ok == true */ res->tv_sec = ts.tv_sec; res->tv_nsec = ts.tv_nsec; - return ret; + return 0; } static __maybe_unused int @@ -342,7 +395,7 @@ __cvdso_gettimeofday_data(const struct vdso_time_data *vd, if (likely(tv != NULL)) { struct __kernel_timespec ts; - if (do_hres(vd, &vc[CS_HRES_COARSE], CLOCK_REALTIME, &ts)) + if (!do_hres(vd, &vc[CS_HRES_COARSE], CLOCK_REALTIME, &ts)) return gettimeofday_fallback(tv, tz); tv->tv_sec = ts.tv_sec; @@ -396,16 +449,15 @@ static __maybe_unused __kernel_old_time_t __cvdso_time(__kernel_old_time_t *time #ifdef VDSO_HAS_CLOCK_GETRES static __maybe_unused -int __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t clock, - struct __kernel_timespec *res) +bool __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t clock, + struct __kernel_timespec *res) { const struct vdso_clock *vc = vd->clock_data; u32 msk; u64 ns; - /* Check for negative values or invalid clocks */ - if (unlikely((u32) clock >= MAX_CLOCKS)) - return -1; + if (!vdso_clockid_valid(clock)) + return false; if (IS_ENABLED(CONFIG_TIME_NS) && vc->clock_mode == VDSO_CLOCKMODE_TIMENS) @@ -426,24 +478,28 @@ int __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t clock * Preserves the behaviour of posix_get_coarse_res(). 
*/ ns = LOW_RES_NSEC; + } else if (msk & VDSO_AUX) { + ns = aux_clock_resolution_ns(); } else { - return -1; + return false; } if (likely(res)) { res->tv_sec = 0; res->tv_nsec = ns; } - return 0; + return true; } static __maybe_unused int __cvdso_clock_getres_data(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *res) { - int ret = __cvdso_clock_getres_common(vd, clock, res); + bool ok; - if (unlikely(ret)) + ok = __cvdso_clock_getres_common(vd, clock, res); + + if (unlikely(!ok)) return clock_getres_fallback(clock, res); return 0; } @@ -460,18 +516,18 @@ __cvdso_clock_getres_time32_data(const struct vdso_time_data *vd, clockid_t cloc struct old_timespec32 *res) { struct __kernel_timespec ts; - int ret; + bool ok; - ret = __cvdso_clock_getres_common(vd, clock, &ts); + ok = __cvdso_clock_getres_common(vd, clock, &ts); - if (unlikely(ret)) + if (unlikely(!ok)) return clock_getres32_fallback(clock, res); if (likely(res)) { res->tv_sec = ts.tv_sec; res->tv_nsec = ts.tv_nsec; } - return ret; + return 0; } static __maybe_unused int diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 02a811f6f661..eb0cb11d0d12 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1806,27 +1806,49 @@ char *fourcc_string(char *buf, char *end, const u32 *fourcc, char output[sizeof("0123 little-endian (0x01234567)")]; char *p = output; unsigned int i; + bool pixel_fmt = false; u32 orig, val; - if (fmt[1] != 'c' || fmt[2] != 'c') + if (fmt[1] != 'c') return error_string(buf, end, "(%p4?)", spec); if (check_pointer(&buf, end, fourcc, spec)) return buf; orig = get_unaligned(fourcc); - val = orig & ~BIT(31); + switch (fmt[2]) { + case 'h': + if (fmt[3] == 'R') + orig = swab32(orig); + break; + case 'l': + orig = (__force u32)cpu_to_le32(orig); + break; + case 'b': + orig = (__force u32)cpu_to_be32(orig); + break; + case 'c': + /* Pixel formats are printed LSB-first */ + pixel_fmt = true; + break; + default: + return error_string(buf, end, "(%p4?)", spec); + } + + val = pixel_fmt ? swab32(orig & ~BIT(31)) : orig; for (i = 0; i < sizeof(u32); i++) { - unsigned char c = val >> (i * 8); + unsigned char c = val >> ((3 - i) * 8); /* Print non-control ASCII characters as-is, dot otherwise */ *p++ = isascii(c) && isprint(c) ? c : '.'; } - *p++ = ' '; - strcpy(p, orig & BIT(31) ? "big-endian" : "little-endian"); - p += strlen(p); + if (pixel_fmt) { + *p++ = ' '; + strcpy(p, orig & BIT(31) ? "big-endian" : "little-endian"); + p += strlen(p); + } *p++ = ' '; *p++ = '('; @@ -1994,15 +2016,11 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, if (check_pointer(&buf, end, clk, spec)) return buf; - switch (fmt[1]) { - case 'n': - default: #ifdef CONFIG_COMMON_CLK - return string(buf, end, __clk_get_name(clk), spec); + return string(buf, end, __clk_get_name(clk), spec); #else - return ptr_to_id(buf, end, clk, spec); + return ptr_to_id(buf, end, clk, spec); #endif - } } static @@ -2426,6 +2444,12 @@ early_param("no_hash_pointers", no_hash_pointers_enable); * read the documentation (path below) first. * - 'NF' For a netdev_features_t * - '4cc' V4L2 or DRM FourCC code, with endianness and raw numerical value. + * - '4c[h[R]lb]' For generic FourCC code with raw numerical value. Both are + * displayed in the big-endian format. This is the opposite of V4L2 or + * DRM FourCCs. + * The additional specifiers define what endianness is used to load + * the stored bytes. The data might be interpreted using the host, + * reversed host byte order, little-endian, or big-endian. 
* - 'h[CDN]' For a variable-length buffer, it prints it as a hex string with * a certain separator (' ' by default): * C colon @@ -2443,8 +2467,6 @@ early_param("no_hash_pointers", no_hash_pointers_enable); * T time64_t * - 'C' For a clock, it prints the name (Common Clock Framework) or address * (legacy clock framework) of the clock - * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address - * (legacy clock framework) of the clock * - 'G' For flags to be printed as a collection of symbolic strings that would * construct the specific value. Supported flags given by option: * p page flags (see struct page) given as pointer to unsigned long diff --git a/lib/xarray.c b/lib/xarray.c index 9644b18af18d..ae3d80f4b4ee 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -1742,20 +1742,23 @@ static inline void *__xa_cmpxchg_raw(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp); /** - * __xa_cmpxchg() - Store this entry in the XArray. + * __xa_cmpxchg() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. - * @entry: New entry. + * @entry: New value to place in array. * @gfp: Memory allocation flags. * * You must already be holding the xa_lock when calling this function. * It will drop the lock if needed to allocate memory, and then reacquire * it afterwards. * + * If the entry at @index is the same as @old, replace it with @entry. + * If the return value is equal to @old, then the exchange was successful. + * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. - * Return: The old entry at this index or xa_err() if an error happened. + * Return: The old value at this index or xa_err() if an error happened. */ void *__xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) @@ -1907,6 +1910,7 @@ EXPORT_SYMBOL(xa_store_range); * @xas: XArray operation state. * * Called after xas_load, the xas should not be in an error state. + * The xas should not be pointing to a sibling entry. * * Return: A number between 0 and 63 indicating the order of the entry. 
*/ @@ -1917,6 +1921,8 @@ int xas_get_order(struct xa_state *xas) if (!xas->xa_node) return 0; + XA_NODE_BUG_ON(xas->xa_node, xa_is_sibling(xa_entry(xas->xa, + xas->xa_node, xas->xa_offset))); for (;;) { unsigned int slot = xas->xa_offset + (1 << order); diff --git a/lib/xxhash.c b/lib/xxhash.c index b5bd567aa6b3..cf629766f376 100644 --- a/lib/xxhash.c +++ b/lib/xxhash.c @@ -267,113 +267,6 @@ void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) } EXPORT_SYMBOL(xxh64_reset); -int xxh32_update(struct xxh32_state *state, const void *input, const size_t len) -{ - const uint8_t *p = (const uint8_t *)input; - const uint8_t *const b_end = p + len; - - if (input == NULL) - return -EINVAL; - - state->total_len_32 += (uint32_t)len; - state->large_len |= (len >= 16) | (state->total_len_32 >= 16); - - if (state->memsize + len < 16) { /* fill in tmp buffer */ - memcpy((uint8_t *)(state->mem32) + state->memsize, input, len); - state->memsize += (uint32_t)len; - return 0; - } - - if (state->memsize) { /* some data left from previous update */ - const uint32_t *p32 = state->mem32; - - memcpy((uint8_t *)(state->mem32) + state->memsize, input, - 16 - state->memsize); - - state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32)); - p32++; - state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32)); - p32++; - state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32)); - p32++; - state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32)); - p32++; - - p += 16-state->memsize; - state->memsize = 0; - } - - if (p <= b_end - 16) { - const uint8_t *const limit = b_end - 16; - uint32_t v1 = state->v1; - uint32_t v2 = state->v2; - uint32_t v3 = state->v3; - uint32_t v4 = state->v4; - - do { - v1 = xxh32_round(v1, get_unaligned_le32(p)); - p += 4; - v2 = xxh32_round(v2, get_unaligned_le32(p)); - p += 4; - v3 = xxh32_round(v3, get_unaligned_le32(p)); - p += 4; - v4 = xxh32_round(v4, get_unaligned_le32(p)); - p += 4; - } while (p <= limit); - - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; - } - - if (p < b_end) { - memcpy(state->mem32, p, (size_t)(b_end-p)); - state->memsize = (uint32_t)(b_end-p); - } - - return 0; -} -EXPORT_SYMBOL(xxh32_update); - -uint32_t xxh32_digest(const struct xxh32_state *state) -{ - const uint8_t *p = (const uint8_t *)state->mem32; - const uint8_t *const b_end = (const uint8_t *)(state->mem32) + - state->memsize; - uint32_t h32; - - if (state->large_len) { - h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) + - xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18); - } else { - h32 = state->v3 /* == seed */ + PRIME32_5; - } - - h32 += state->total_len_32; - - while (p + 4 <= b_end) { - h32 += get_unaligned_le32(p) * PRIME32_3; - h32 = xxh_rotl32(h32, 17) * PRIME32_4; - p += 4; - } - - while (p < b_end) { - h32 += (*p) * PRIME32_5; - h32 = xxh_rotl32(h32, 11) * PRIME32_1; - p++; - } - - h32 ^= h32 >> 15; - h32 *= PRIME32_2; - h32 ^= h32 >> 13; - h32 *= PRIME32_3; - h32 ^= h32 >> 16; - - return h32; -} -EXPORT_SYMBOL(xxh32_digest); - int xxh64_update(struct xxh64_state *state, const void *input, const size_t len) { const uint8_t *p = (const uint8_t *)input; diff --git a/lib/zlib_inflate/inflate_syms.c b/lib/zlib_inflate/inflate_syms.c index 9720114c0672..b8996d90e8bc 100644 --- a/lib/zlib_inflate/inflate_syms.c +++ b/lib/zlib_inflate/inflate_syms.c @@ -18,4 +18,5 @@ EXPORT_SYMBOL(zlib_inflateEnd); EXPORT_SYMBOL(zlib_inflateReset); EXPORT_SYMBOL(zlib_inflateIncomp); EXPORT_SYMBOL(zlib_inflate_blob); +MODULE_DESCRIPTION("Data decompression 
using the deflation algorithm"); MODULE_LICENSE("GPL");
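
The vdso/gettimeofday.c hunks above factor the repeated "read counter, scale to nanoseconds, split into sec/nsec" sequence into vdso_get_timestamp() and vdso_set_timespec(), and keep the division outside the seqcount retry loop so __iter_div_u64_rem() is never raced against a concurrent timekeeper update. Below is a minimal user-space sketch of that normalisation step; iter_div_u64_rem() is emulated by repeated subtraction and all names are illustrative stand-ins, not the kernel helpers.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

struct kts { int64_t tv_sec; int64_t tv_nsec; };

/* Emulates __iter_div_u64_rem(): repeated subtraction, cheap when the
 * quotient is tiny, which it is once the call sits outside the retry loop. */
static uint64_t iter_div_u64_rem(uint64_t dividend, uint32_t divisor, uint64_t *rem)
{
	uint64_t q = 0;

	while (dividend >= divisor) {
		dividend -= divisor;
		q++;
	}
	*rem = dividend;
	return q;
}

static void set_timespec(struct kts *ts, uint64_t sec, uint64_t ns)
{
	/* Fold whole seconds out of the nanosecond count, as vdso_set_timespec() does. */
	ts->tv_sec = sec + iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
	ts->tv_nsec = ns;
}

int main(void)
{
	struct kts ts;

	/* 2.5 seconds worth of nanoseconds folds into tv_sec/tv_nsec. */
	set_timespec(&ts, 100, 2500000000ULL);
	printf("%lld.%09lld\n", (long long)ts.tv_sec, (long long)ts.tv_nsec); /* 102.500000000 */
	return 0;
}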
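
The vsprintf.c hunk extends %p4cc with generic %p4ch, %p4chR, %p4cl and %p4cb variants, which differ only in how the stored 32-bit value is loaded before the characters are printed MSB-first. The following stand-alone sketch mirrors that byte-order handling, assuming a little-endian host and using plain C replacements for swab32()/cpu_to_le32(); it illustrates the switch in fourcc_string() but is not the kernel code.

#include <stdint.h>
#include <stdio.h>

static uint32_t swab32(uint32_t v)
{
	return (v >> 24) | ((v >> 8) & 0xff00) | ((v << 8) & 0xff0000) | (v << 24);
}

/* variant: 'h' host order, 'R' reversed host order, 'l' little, 'b' big */
static void fourcc_to_str(uint32_t val, char variant, char out[5])
{
	int i;

	/* Assuming a little-endian host: 'l' is a no-op, 'b' and 'R' byte-swap. */
	if (variant == 'R' || variant == 'b')
		val = swab32(val);

	/* Bytes are emitted MSB first, matching the (3 - i) shift in the hunk. */
	for (i = 0; i < 4; i++) {
		unsigned char c = val >> ((3 - i) * 8);

		out[i] = (c >= 0x20 && c < 0x7f) ? c : '.';
	}
	out[4] = '\0';
}

int main(void)
{
	char s[5];

	fourcc_to_str(0x41424344, 'h', s); /* host order    -> "ABCD" */
	printf("%%p4ch:  %s\n", s);
	fourcc_to_str(0x41424344, 'R', s); /* reversed host -> "DCBA" */
	printf("%%p4chR: %s\n", s);
	return 0;
}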
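
The xarray.c hunk only clarifies the __xa_cmpxchg() kernel-doc: the entry at @index is replaced when it matches @old, and success is detected by comparing the return value with @old. A hedged usage sketch of that pattern follows; the wrapper function and its -EBUSY policy are hypothetical, only the lock and compare-exchange calls follow the documented API.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/xarray.h>

static int replace_if_unchanged(struct xarray *xa, unsigned long index,
				void *expected, void *replacement)
{
	void *curr;

	xa_lock(xa);
	/* May drop and re-take xa_lock internally if it has to allocate. */
	curr = __xa_cmpxchg(xa, index, expected, replacement, GFP_KERNEL);
	xa_unlock(xa);

	if (xa_err(curr))
		return xa_err(curr);

	/* The exchange happened only if the old value matched @expected. */
	return curr == expected ? 0 : -EBUSY;
}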