summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/Kconfig4
-rw-r--r--arch/alpha/include/asm/param.h12
-rw-r--r--arch/alpha/include/uapi/asm/param.h9
-rw-r--r--arch/alpha/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/arc/kernel/ptrace.c4
-rw-r--r--arch/arm/Kconfig6
-rw-r--r--arch/arm/Makefile2
-rw-r--r--arch/arm/boot/compressed/Makefile2
-rw-r--r--arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi2
-rw-r--r--arch/arm/configs/exynos_defconfig2
-rw-r--r--arch/arm/configs/milbeaut_m10v_defconfig3
-rw-r--r--arch/arm/configs/multi_v7_defconfig4
-rw-r--r--arch/arm/configs/omap2plus_defconfig2
-rw-r--r--arch/arm/configs/pxa_defconfig2
-rw-r--r--arch/arm/crypto/Kconfig41
-rw-r--r--arch/arm/crypto/Makefile21
-rw-r--r--arch/arm/crypto/sha1-armv4-large.S507
-rw-r--r--arch/arm/crypto/sha1-armv7-neon.S634
-rw-r--r--arch/arm/crypto/sha1-ce-core.S123
-rw-r--r--arch/arm/crypto/sha1-ce-glue.c72
-rw-r--r--arch/arm/crypto/sha1_glue.c75
-rw-r--r--arch/arm/crypto/sha1_neon_glue.c83
-rw-r--r--arch/arm/crypto/sha512-armv4.pl657
-rw-r--r--arch/arm/crypto/sha512-glue.c110
-rw-r--r--arch/arm/crypto/sha512-neon-glue.c75
-rw-r--r--arch/arm/crypto/sha512.h3
-rw-r--r--arch/arm/kernel/entry-common.S2
-rw-r--r--arch/arm/kernel/ptrace.c6
-rw-r--r--arch/arm/lib/.gitignore4
-rw-r--r--arch/arm/lib/Makefile8
-rw-r--r--arch/arm/lib/crc-t10dif-core.S468
-rw-r--r--arch/arm/lib/crc-t10dif.c72
-rw-r--r--arch/arm/lib/crc32-core.S306
-rw-r--r--arch/arm/lib/crc32.c123
-rw-r--r--arch/arm/lib/crypto/.gitignore3
-rw-r--r--arch/arm/lib/crypto/Kconfig31
-rw-r--r--arch/arm/lib/crypto/Makefile32
-rw-r--r--arch/arm/lib/crypto/blake2s-core.S306
-rw-r--r--arch/arm/lib/crypto/blake2s-glue.c7
-rw-r--r--arch/arm/lib/crypto/chacha-glue.c138
-rw-r--r--arch/arm/lib/crypto/chacha-neon-core.S643
-rw-r--r--arch/arm/lib/crypto/chacha-scalar-core.S444
-rw-r--r--arch/arm/lib/crypto/poly1305-armv4.pl1236
-rw-r--r--arch/arm/lib/crypto/poly1305-glue.c80
-rw-r--r--arch/arm/lib/crypto/sha256-armv4.pl724
-rw-r--r--arch/arm/lib/crypto/sha256-ce.S123
-rw-r--r--arch/arm/lib/crypto/sha256.c64
-rw-r--r--arch/arm/mm/cache-feroceon-l2.c2
-rw-r--r--arch/arm/mm/cache-tauros2.c2
-rw-r--r--arch/arm/tools/syscall.tbl2
-rw-r--r--arch/arm/vdso/Makefile2
-rw-r--r--arch/arm64/Kconfig4
-rw-r--r--arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi6
-rw-r--r--arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts4
-rw-r--r--arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts4
-rw-r--r--arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi3
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi1
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi2
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi2
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi2
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts2
-rw-r--r--arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts20
-rw-r--r--arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts12
-rw-r--r--arch/arm64/boot/dts/freescale/imx95.dtsi2
-rw-r--r--arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts2
-rw-r--r--arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi1
-rw-r--r--arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi23
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts1
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts3
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts28
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3576.dtsi2
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi20
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi1
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi5
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts1
-rw-r--r--arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi35
-rw-r--r--arch/arm64/configs/defconfig3
-rw-r--r--arch/arm64/crypto/Kconfig30
-rw-r--r--arch/arm64/crypto/Makefile17
-rw-r--r--arch/arm64/crypto/sha1-ce-core.S150
-rw-r--r--arch/arm64/crypto/sha1-ce-glue.c118
-rw-r--r--arch/arm64/crypto/sha512-ce-core.S206
-rw-r--r--arch/arm64/crypto/sha512-ce-glue.c96
-rw-r--r--arch/arm64/crypto/sha512-glue.c83
-rw-r--r--arch/arm64/include/asm/acpi.h2
-rw-r--r--arch/arm64/include/asm/assembler.h5
-rw-r--r--arch/arm64/kernel/entry.S8
-rw-r--r--arch/arm64/kernel/pi/Makefile2
-rw-r--r--arch/arm64/kernel/ptrace.c52
-rw-r--r--arch/arm64/kernel/vdso/Makefile3
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile2
-rw-r--r--arch/arm64/lib/.gitignore4
-rw-r--r--arch/arm64/lib/Makefile9
-rw-r--r--arch/arm64/lib/crc-t10dif-core.S469
-rw-r--r--arch/arm64/lib/crc-t10dif.c73
-rw-r--r--arch/arm64/lib/crc32-core.S362
-rw-r--r--arch/arm64/lib/crc32.c99
-rw-r--r--arch/arm64/lib/crypto/.gitignore3
-rw-r--r--arch/arm64/lib/crypto/Kconfig20
-rw-r--r--arch/arm64/lib/crypto/Makefile24
-rw-r--r--arch/arm64/lib/crypto/chacha-neon-core.S805
-rw-r--r--arch/arm64/lib/crypto/chacha-neon-glue.c119
-rw-r--r--arch/arm64/lib/crypto/poly1305-armv8.pl917
-rw-r--r--arch/arm64/lib/crypto/poly1305-glue.c73
-rw-r--r--arch/arm64/lib/crypto/sha2-armv8.pl786
-rw-r--r--arch/arm64/lib/crypto/sha256-ce.S136
-rw-r--r--arch/arm64/lib/crypto/sha256.c75
-rw-r--r--arch/arm64/tools/syscall_32.tbl2
-rw-r--r--arch/csky/kernel/ptrace.c4
-rw-r--r--arch/hexagon/kernel/ptrace.c2
-rw-r--r--arch/loongarch/Kconfig3
-rw-r--r--arch/loongarch/include/asm/Kbuild1
-rw-r--r--arch/loongarch/kernel/ptrace.c16
-rw-r--r--arch/loongarch/lib/Makefile2
-rw-r--r--arch/loongarch/lib/crc32-loongarch.c136
-rw-r--r--arch/m68k/kernel/ptrace.c4
-rw-r--r--arch/m68k/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/microblaze/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/mips/Kconfig1
-rw-r--r--arch/mips/cavium-octeon/Kconfig6
-rw-r--r--arch/mips/cavium-octeon/crypto/Makefile3
-rw-r--r--arch/mips/cavium-octeon/crypto/octeon-crypto.c3
-rw-r--r--arch/mips/cavium-octeon/crypto/octeon-md5.c3
-rw-r--r--arch/mips/cavium-octeon/crypto/octeon-sha1.c147
-rw-r--r--arch/mips/cavium-octeon/crypto/octeon-sha256.c73
-rw-r--r--arch/mips/cavium-octeon/crypto/octeon-sha512.c167
-rw-r--r--arch/mips/configs/cavium_octeon_defconfig2
-rw-r--r--arch/mips/crypto/Kconfig20
-rw-r--r--arch/mips/include/asm/octeon/crypto.h (renamed from arch/mips/cavium-octeon/crypto/octeon-crypto.h)0
-rw-r--r--arch/mips/include/asm/time.h2
-rw-r--r--arch/mips/kernel/ptrace.c20
-rw-r--r--arch/mips/kernel/syscalls/syscall_n32.tbl2
-rw-r--r--arch/mips/kernel/syscalls/syscall_n64.tbl2
-rw-r--r--arch/mips/kernel/syscalls/syscall_o32.tbl2
-rw-r--r--arch/mips/lib/.gitignore4
-rw-r--r--arch/mips/lib/Makefile4
-rw-r--r--arch/mips/lib/crc32-mips.c183
-rw-r--r--arch/mips/lib/crypto/.gitignore2
-rw-r--r--arch/mips/lib/crypto/Kconfig12
-rw-r--r--arch/mips/lib/crypto/Makefile19
-rw-r--r--arch/mips/lib/crypto/chacha-core.S497
-rw-r--r--arch/mips/lib/crypto/chacha-glue.c29
-rw-r--r--arch/mips/lib/crypto/poly1305-glue.c33
-rw-r--r--arch/mips/lib/crypto/poly1305-mips.pl1273
-rw-r--r--arch/nios2/kernel/ptrace.c2
-rw-r--r--arch/openrisc/kernel/ptrace.c4
-rw-r--r--arch/parisc/kernel/ptrace.c8
-rw-r--r--arch/parisc/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/powerpc/Kconfig2
-rw-r--r--arch/powerpc/configs/44x/akebono_defconfig1
-rw-r--r--arch/powerpc/configs/powernv_defconfig1
-rw-r--r--arch/powerpc/configs/ppc64_defconfig1
-rw-r--r--arch/powerpc/crypto/Kconfig16
-rw-r--r--arch/powerpc/crypto/Makefile4
-rw-r--r--arch/powerpc/crypto/sha1-powerpc-asm.S188
-rw-r--r--arch/powerpc/crypto/sha1-spe-asm.S294
-rw-r--r--arch/powerpc/crypto/sha1-spe-glue.c107
-rw-r--r--arch/powerpc/crypto/sha1.c78
-rw-r--r--arch/powerpc/kernel/ptrace/ptrace-view.c74
-rw-r--r--arch/powerpc/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/powerpc/lib/Makefile8
-rw-r--r--arch/powerpc/lib/crc-t10dif.c83
-rw-r--r--arch/powerpc/lib/crc-vpmsum-template.S746
-rw-r--r--arch/powerpc/lib/crc32.c93
-rw-r--r--arch/powerpc/lib/crc32c-vpmsum_asm.S842
-rw-r--r--arch/powerpc/lib/crct10dif-vpmsum_asm.S845
-rw-r--r--arch/powerpc/lib/crypto/Kconfig22
-rw-r--r--arch/powerpc/lib/crypto/Makefile10
-rw-r--r--arch/powerpc/lib/crypto/chacha-p10-glue.c100
-rw-r--r--arch/powerpc/lib/crypto/chacha-p10le-8x.S840
-rw-r--r--arch/powerpc/lib/crypto/poly1305-p10-glue.c96
-rw-r--r--arch/powerpc/lib/crypto/poly1305-p10le_64.S1075
-rw-r--r--arch/powerpc/lib/crypto/sha256-spe-asm.S318
-rw-r--r--arch/powerpc/lib/crypto/sha256.c70
-rw-r--r--arch/powerpc/mm/book3s64/hash_utils.c6
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c4
-rw-r--r--arch/powerpc/platforms/cell/spufs/inode.c49
-rw-r--r--arch/riscv/Kconfig8
-rw-r--r--arch/riscv/crypto/Kconfig11
-rw-r--r--arch/riscv/crypto/Makefile3
-rw-r--r--arch/riscv/crypto/sha512-riscv64-glue.c124
-rw-r--r--arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S203
-rw-r--r--arch/riscv/include/asm/uaccess.h4
-rw-r--r--arch/riscv/kernel/entry.S2
-rw-r--r--arch/riscv/kernel/ftrace.c18
-rw-r--r--arch/riscv/kernel/pi/Makefile2
-rw-r--r--arch/riscv/kernel/ptrace.c12
-rw-r--r--arch/riscv/kernel/traps.c10
-rw-r--r--arch/riscv/kernel/traps_misaligned.c2
-rw-r--r--arch/riscv/lib/Makefile7
-rw-r--r--arch/riscv/lib/crc-clmul-consts.h122
-rw-r--r--arch/riscv/lib/crc-clmul-template.h265
-rw-r--r--arch/riscv/lib/crc-clmul.h23
-rw-r--r--arch/riscv/lib/crc-t10dif.c24
-rw-r--r--arch/riscv/lib/crc16_msb.c18
-rw-r--r--arch/riscv/lib/crc32.c53
-rw-r--r--arch/riscv/lib/crc32_lsb.c18
-rw-r--r--arch/riscv/lib/crc32_msb.c18
-rw-r--r--arch/riscv/lib/crc64.c34
-rw-r--r--arch/riscv/lib/crc64_lsb.c18
-rw-r--r--arch/riscv/lib/crc64_msb.c18
-rw-r--r--arch/riscv/lib/crypto/Kconfig16
-rw-r--r--arch/riscv/lib/crypto/Makefile7
-rw-r--r--arch/riscv/lib/crypto/chacha-riscv64-glue.c75
-rw-r--r--arch/riscv/lib/crypto/chacha-riscv64-zvkb.S297
-rw-r--r--arch/riscv/lib/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S225
-rw-r--r--arch/riscv/lib/crypto/sha256.c67
-rw-r--r--arch/riscv/purgatory/Makefile2
-rw-r--r--arch/riscv/purgatory/purgatory.c8
-rwxr-xr-xarch/riscv/tools/relocs_check.sh4
-rw-r--r--arch/s390/Kconfig3
-rw-r--r--arch/s390/configs/debug_defconfig2
-rw-r--r--arch/s390/configs/defconfig2
-rw-r--r--arch/s390/crypto/Kconfig20
-rw-r--r--arch/s390/crypto/Makefile2
-rw-r--r--arch/s390/crypto/sha1_s390.c105
-rw-r--r--arch/s390/crypto/sha512_s390.c154
-rw-r--r--arch/s390/hypfs/hypfs.h2
-rw-r--r--arch/s390/hypfs/hypfs_diag.h2
-rw-r--r--arch/s390/kernel/entry.S2
-rw-r--r--arch/s390/kernel/ptrace.c42
-rw-r--r--arch/s390/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/s390/lib/Makefile4
-rw-r--r--arch/s390/lib/crc32-vx.h12
-rw-r--r--arch/s390/lib/crc32.c77
-rw-r--r--arch/s390/lib/crc32be-vx.c174
-rw-r--r--arch/s390/lib/crc32le-vx.c240
-rw-r--r--arch/s390/lib/crypto/Kconfig13
-rw-r--r--arch/s390/lib/crypto/Makefile7
-rw-r--r--arch/s390/lib/crypto/chacha-glue.c56
-rw-r--r--arch/s390/lib/crypto/chacha-s390.S908
-rw-r--r--arch/s390/lib/crypto/chacha-s390.h14
-rw-r--r--arch/s390/lib/crypto/sha256.c47
-rw-r--r--arch/s390/mm/init.c2
-rw-r--r--arch/s390/net/bpf_jit_comp.c10
-rw-r--r--arch/s390/purgatory/purgatory.c2
-rw-r--r--arch/sh/kernel/ptrace_32.c4
-rw-r--r--arch/sh/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/sparc/Kconfig1
-rw-r--r--arch/sparc/crypto/Kconfig20
-rw-r--r--arch/sparc/crypto/Makefile4
-rw-r--r--arch/sparc/crypto/sha1_asm.S72
-rw-r--r--arch/sparc/crypto/sha1_glue.c94
-rw-r--r--arch/sparc/crypto/sha512_asm.S102
-rw-r--r--arch/sparc/crypto/sha512_glue.c122
-rw-r--r--arch/sparc/kernel/ptrace_32.c4
-rw-r--r--arch/sparc/kernel/ptrace_64.c8
-rw-r--r--arch/sparc/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/sparc/lib/Makefile3
-rw-r--r--arch/sparc/lib/crc32.c93
-rw-r--r--arch/sparc/lib/crc32c_asm.S20
-rw-r--r--arch/sparc/lib/crypto/Kconfig8
-rw-r--r--arch/sparc/lib/crypto/Makefile4
-rw-r--r--arch/sparc/lib/crypto/sha256.c64
-rw-r--r--arch/sparc/lib/crypto/sha256_asm.S78
-rw-r--r--arch/sparc/vdso/Makefile3
-rw-r--r--arch/um/include/asm/Kbuild1
-rw-r--r--arch/x86/Kconfig5
-rw-r--r--arch/x86/coco/sev/Makefile3
-rw-r--r--arch/x86/crypto/Kconfig27
-rw-r--r--arch/x86/crypto/Makefile6
-rw-r--r--arch/x86/crypto/sha1_avx2_x86_64_asm.S700
-rw-r--r--arch/x86/crypto/sha1_ni_asm.S304
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S554
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c324
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S423
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S750
-rw-r--r--arch/x86/crypto/sha512-ssse3-asm.S425
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c322
-rw-r--r--arch/x86/entry/calling.h4
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl2
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/entry/vdso/Makefile3
-rw-r--r--arch/x86/hyperv/hv_init.c1
-rw-r--r--arch/x86/hyperv/irqdomain.c69
-rw-r--r--arch/x86/hyperv/ivm.c1
-rw-r--r--arch/x86/hyperv/nested.c1
-rw-r--r--arch/x86/include/asm/acpi.h4
-rw-r--r--arch/x86/include/asm/init.h2
-rw-r--r--arch/x86/include/asm/mshyperv.h22
-rw-r--r--arch/x86/include/asm/realmode.h2
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/ptrace.c22
-rw-r--r--arch/x86/lib/.gitignore4
-rw-r--r--arch/x86/lib/Makefile12
-rw-r--r--arch/x86/lib/crc-pclmul-consts.h195
-rw-r--r--arch/x86/lib/crc-pclmul-template.S582
-rw-r--r--arch/x86/lib/crc-pclmul-template.h76
-rw-r--r--arch/x86/lib/crc-t10dif.c40
-rw-r--r--arch/x86/lib/crc16-msb-pclmul.S6
-rw-r--r--arch/x86/lib/crc32-pclmul.S6
-rw-r--r--arch/x86/lib/crc32.c111
-rw-r--r--arch/x86/lib/crc32c-3way.S360
-rw-r--r--arch/x86/lib/crc64-pclmul.S7
-rw-r--r--arch/x86/lib/crc64.c50
-rw-r--r--arch/x86/lib/crypto/.gitignore2
-rw-r--r--arch/x86/lib/crypto/Kconfig34
-rw-r--r--arch/x86/lib/crypto/Makefile20
-rw-r--r--arch/x86/lib/crypto/blake2s-core.S252
-rw-r--r--arch/x86/lib/crypto/blake2s-glue.c70
-rw-r--r--arch/x86/lib/crypto/chacha-avx2-x86_64.S1021
-rw-r--r--arch/x86/lib/crypto/chacha-avx512vl-x86_64.S836
-rw-r--r--arch/x86/lib/crypto/chacha-ssse3-x86_64.S791
-rw-r--r--arch/x86/lib/crypto/chacha_glue.c196
-rw-r--r--arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl4253
-rw-r--r--arch/x86/lib/crypto/poly1305_glue.c129
-rw-r--r--arch/x86/lib/crypto/sha256-avx-asm.S499
-rw-r--r--arch/x86/lib/crypto/sha256-avx2-asm.S774
-rw-r--r--arch/x86/lib/crypto/sha256-ni-asm.S196
-rw-r--r--arch/x86/lib/crypto/sha256-ssse3-asm.S511
-rw-r--r--arch/x86/lib/crypto/sha256.c80
-rw-r--r--arch/x86/mm/init_64.c2
-rw-r--r--arch/x86/purgatory/Makefile2
-rw-r--r--arch/x86/purgatory/purgatory.c2
-rw-r--r--arch/x86/um/ptrace.c10
-rw-r--r--arch/xtensa/include/asm/Kbuild1
-rw-r--r--arch/xtensa/include/uapi/asm/param.h31
-rw-r--r--arch/xtensa/kernel/ptrace.c4
-rw-r--r--arch/xtensa/kernel/syscalls/syscall.tbl2
319 files changed, 500 insertions, 39874 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index a3308a220f86..4d1908f6f084 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -630,11 +630,11 @@ config SECCOMP_CACHE_DEBUG
If unsure, say N.
-config HAVE_ARCH_STACKLEAK
+config HAVE_ARCH_KSTACK_ERASE
bool
help
An architecture should select this if it has the code which
- fills the used part of the kernel stack with the STACKLEAK_POISON
+ fills the used part of the kernel stack with the KSTACK_ERASE_POISON
value before returning from system calls.
config HAVE_STACKPROTECTOR
diff --git a/arch/alpha/include/asm/param.h b/arch/alpha/include/asm/param.h
deleted file mode 100644
index cfe947ce9461..000000000000
--- a/arch/alpha/include/asm/param.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_ALPHA_PARAM_H
-#define _ASM_ALPHA_PARAM_H
-
-#include <uapi/asm/param.h>
-
-# undef HZ
-# define HZ CONFIG_HZ
-# define USER_HZ 1024
-# define CLOCKS_PER_SEC USER_HZ /* frequency at which times() counts */
-
-#endif /* _ASM_ALPHA_PARAM_H */
diff --git a/arch/alpha/include/uapi/asm/param.h b/arch/alpha/include/uapi/asm/param.h
index 49c7119934e2..e4e410f9bf85 100644
--- a/arch/alpha/include/uapi/asm/param.h
+++ b/arch/alpha/include/uapi/asm/param.h
@@ -2,14 +2,9 @@
#ifndef _UAPI_ASM_ALPHA_PARAM_H
#define _UAPI_ASM_ALPHA_PARAM_H
-#define HZ 1024
-
+#define __USER_HZ 1024
#define EXEC_PAGESIZE 8192
-#ifndef NOGROUP
-#define NOGROUP (-1)
-#endif
-
-#define MAXHOSTNAMELEN 64 /* max length of hostname */
+#include <asm-generic/param.h>
#endif /* _UAPI_ASM_ALPHA_PARAM_H */
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 2dd6340de6b4..16dca28ebf17 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -507,3 +507,5 @@
575 common listxattrat sys_listxattrat
576 common removexattrat sys_removexattrat
577 common open_tree_attr sys_open_tree_attr
+578 common file_getattr sys_file_getattr
+579 common file_setattr sys_file_setattr
diff --git a/arch/arc/kernel/ptrace.c b/arch/arc/kernel/ptrace.c
index e0c233c178b1..cad5367b7c37 100644
--- a/arch/arc/kernel/ptrace.c
+++ b/arch/arc/kernel/ptrace.c
@@ -284,7 +284,7 @@ enum arc_getset {
static const struct user_regset arc_regsets[] = {
[REGSET_CMN] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = ELF_NGREG,
.size = sizeof(unsigned long),
.align = sizeof(unsigned long),
@@ -293,7 +293,7 @@ static const struct user_regset arc_regsets[] = {
},
#ifdef CONFIG_ISA_ARCV2
[REGSET_ARCV2] = {
- .core_note_type = NT_ARC_V2,
+ USER_REGSET_NOTE_TYPE(ARC_V2),
.n = ELF_ARCV2REG,
.size = sizeof(unsigned long),
.align = sizeof(unsigned long),
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 3072731fe09c..a6e80653abd1 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -8,8 +8,6 @@ config ARM
select ARCH_HAS_CACHE_LINE_SIZE if OF
select ARCH_HAS_CPU_CACHE_ALIASING
select ARCH_HAS_CPU_FINALIZE_INIT if MMU
- select ARCH_HAS_CRC32 if KERNEL_MODE_NEON
- select ARCH_HAS_CRC_T10DIF if KERNEL_MODE_NEON
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL if MMU
select ARCH_HAS_DMA_ALLOC if MMU
@@ -87,11 +85,11 @@ config ARM
select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL
select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN
+ select HAVE_ARCH_KSTACK_ERASE
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_PFN_VALID
select HAVE_ARCH_SECCOMP
select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT
- select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARM_LPAE
@@ -121,7 +119,7 @@ config ARM
select HAVE_KERNEL_XZ
select HAVE_KPROBES if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !CPU_V7M
select HAVE_KRETPROBES if HAVE_KPROBES
- select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if (LD_VERSION >= 23600 || LD_CAN_USE_KEEP_IN_OVERLAY)
+ select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if (LD_VERSION >= 23600 || LD_IS_LLD) && LD_CAN_USE_KEEP_IN_OVERLAY
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NMI
select HAVE_OPTPROBES if !THUMB2_KERNEL
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 4808d3ed98e4..e31e95ffd33f 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -149,7 +149,7 @@ endif
# Need -Uarm for gcc < 3.x
KBUILD_CPPFLAGS +=$(cpp-y)
KBUILD_CFLAGS +=$(CFLAGS_ABI) $(CFLAGS_ISA) $(arch-y) $(tune-y) $(call cc-option,-mshort-load-bytes,$(call cc-option,-malignment-traps,)) -msoft-float -Uarm
-KBUILD_AFLAGS +=$(CFLAGS_ABI) $(AFLAGS_ISA) -Wa,$(arch-y) $(tune-y) -include asm/unified.h -msoft-float
+KBUILD_AFLAGS +=$(CFLAGS_ABI) $(AFLAGS_ISA) -Wa,$(arch-y) $(tune-y) -include $(srctree)/arch/arm/include/asm/unified.h -msoft-float
KBUILD_RUSTFLAGS += --target=arm-unknown-linux-gnueabi
CHECKFLAGS += -D__arm__
diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index d61369b1eabe..a159120d1e42 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -9,7 +9,6 @@ OBJS =
HEAD = head.o
OBJS += misc.o decompress.o
-CFLAGS_decompress.o += $(DISABLE_STACKLEAK_PLUGIN)
ifeq ($(CONFIG_DEBUG_UNCOMPRESS),y)
OBJS += debug.o
AFLAGS_head.o += -DDEBUG
@@ -96,6 +95,7 @@ KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
ccflags-y := -fpic $(call cc-option,-mno-single-pic-base,) -fno-builtin \
-I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \
+ $(DISABLE_KSTACK_ERASE) \
-I$(obj)
ccflags-remove-$(CONFIG_FUNCTION_TRACER) += -pg
asflags-y := -DZIMAGE
diff --git a/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi b/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi
index f909b1d4dbca..e82cf312da25 100644
--- a/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi
+++ b/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi
@@ -652,7 +652,7 @@
reg = <0x01cb4000 0x3000>;
interrupts = <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&ccu CLK_BUS_CSI>,
- <&ccu CLK_CSI1_SCLK>,
+ <&ccu CLK_CSI_SCLK>,
<&ccu CLK_DRAM_CSI>;
clock-names = "bus", "mod", "ram";
resets = <&ccu RST_BUS_CSI>;
diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig
index f71af368674c..6915c766923a 100644
--- a/arch/arm/configs/exynos_defconfig
+++ b/arch/arm/configs/exynos_defconfig
@@ -363,8 +363,6 @@ CONFIG_CRYPTO_USER_API_HASH=m
CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_CHACHA20_NEON=m
CONFIG_CRYPTO_DEV_EXYNOS_RNG=y
diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig
index 242e7d5a3f68..a3be0b2ede09 100644
--- a/arch/arm/configs/milbeaut_m10v_defconfig
+++ b/arch/arm/configs/milbeaut_m10v_defconfig
@@ -98,9 +98,6 @@ CONFIG_CRYPTO_SELFTESTS=y
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_SEQIV=m
CONFIG_CRYPTO_GHASH_ARM_CE=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA1_ARM_CE=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_AES_ARM_CE=m
diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig
index 50c170b4619f..d6ecb22e5fd8 100644
--- a/arch/arm/configs/multi_v7_defconfig
+++ b/arch/arm/configs/multi_v7_defconfig
@@ -1280,9 +1280,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_CRYPTO_GHASH_ARM_CE=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA1_ARM_CE=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_AES_ARM_CE=m
@@ -1298,7 +1295,6 @@ CONFIG_CRYPTO_DEV_MARVELL_CESA=m
CONFIG_CRYPTO_DEV_QCE=m
CONFIG_CRYPTO_DEV_QCOM_RNG=m
CONFIG_CRYPTO_DEV_ROCKCHIP=m
-CONFIG_CRYPTO_DEV_STM32_CRC=m
CONFIG_CRYPTO_DEV_STM32_HASH=m
CONFIG_CRYPTO_DEV_STM32_CRYP=m
CONFIG_CMA_SIZE_MBYTES=64
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 9f9780c8e62a..046467637901 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -704,8 +704,6 @@ CONFIG_NLS_ISO8859_1=y
CONFIG_SECURITY=y
CONFIG_CRYPTO_MICHAEL_MIC=y
CONFIG_CRYPTO_GHASH_ARM_CE=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_CHACHA20_NEON=m
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index ff29c5b0e9c9..1a80602c1284 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -658,8 +658,6 @@ CONFIG_CRYPTO_ANUBIS=m
CONFIG_CRYPTO_XCBC=m
CONFIG_CRYPTO_DEFLATE=y
CONFIG_CRYPTO_LZO=y
-CONFIG_CRYPTO_SHA1_ARM=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_FONTS=y
CONFIG_FONT_8x8=y
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 7efb9a8596e4..1e5f3cdf691c 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -62,47 +62,6 @@ config CRYPTO_BLAKE2B_NEON
much faster than the SHA-2 family and slightly faster than
SHA-1.
-config CRYPTO_SHA1_ARM
- tristate "Hash functions: SHA-1"
- select CRYPTO_SHA1
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: arm
-
-config CRYPTO_SHA1_ARM_NEON
- tristate "Hash functions: SHA-1 (NEON)"
- depends on KERNEL_MODE_NEON
- select CRYPTO_SHA1_ARM
- select CRYPTO_SHA1
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: arm using
- - NEON (Advanced SIMD) extensions
-
-config CRYPTO_SHA1_ARM_CE
- tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
- select CRYPTO_SHA1_ARM
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: arm using ARMv8 Crypto Extensions
-
-config CRYPTO_SHA512_ARM
- tristate "Hash functions: SHA-384 and SHA-512 (NEON)"
- select CRYPTO_HASH
- depends on !CPU_V7M
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: arm using
- - NEON (Advanced SIMD) extensions
-
config CRYPTO_AES_ARM
tristate "Ciphers: AES"
select CRYPTO_ALGAPI
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 8479137c6e80..4f23999ae17d 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -5,38 +5,17 @@
obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
-obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
-obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
-obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
aes-arm-y := aes-cipher-core.o aes-cipher-glue.o
aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
-sha1-arm-y := sha1-armv4-large.o sha1_glue.o
-sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
-sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
-sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o
-sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
curve25519-neon-y := curve25519-core.o curve25519-glue.o
-
-quiet_cmd_perl = PERL $@
- cmd_perl = $(PERL) $(<) > $(@)
-
-$(obj)/%-core.S: $(src)/%-armv4.pl
- $(call cmd,perl)
-
-clean-files += sha512-core.S
-
-aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1
-
-AFLAGS_sha512-core.o += $(aflags-thumb2-y)
diff --git a/arch/arm/crypto/sha1-armv4-large.S b/arch/arm/crypto/sha1-armv4-large.S
deleted file mode 100644
index 1c8b685149f2..000000000000
--- a/arch/arm/crypto/sha1-armv4-large.S
+++ /dev/null
@@ -1,507 +0,0 @@
-#define __ARM_ARCH__ __LINUX_ARM_ARCH__
-@ SPDX-License-Identifier: GPL-2.0
-
-@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
-@ has relicensed it under the GPLv2. Therefore this program is free software;
-@ you can redistribute it and/or modify it under the terms of the GNU General
-@ Public License version 2 as published by the Free Software Foundation.
-@
-@ The original headers, including the original license headers, are
-@ included below for completeness.
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see https://www.openssl.org/~appro/cryptogams/.
-@ ====================================================================
-
-@ sha1_block procedure for ARMv4.
-@
-@ January 2007.
-
-@ Size/performance trade-off
-@ ====================================================================
-@ impl size in bytes comp cycles[*] measured performance
-@ ====================================================================
-@ thumb 304 3212 4420
-@ armv4-small 392/+29% 1958/+64% 2250/+96%
-@ armv4-compact 740/+89% 1552/+26% 1840/+22%
-@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
-@ full unroll ~5100/+260% ~1260/+4% ~1300/+5%
-@ ====================================================================
-@ thumb = same as 'small' but in Thumb instructions[**] and
-@ with recurring code in two private functions;
-@ small = detached Xload/update, loops are folded;
-@ compact = detached Xload/update, 5x unroll;
-@ large = interleaved Xload/update, 5x unroll;
-@ full unroll = interleaved Xload/update, full unroll, estimated[!];
-@
-@ [*] Manually counted instructions in "grand" loop body. Measured
-@ performance is affected by prologue and epilogue overhead,
-@ i-cache availability, branch penalties, etc.
-@ [**] While each Thumb instruction is twice smaller, they are not as
-@ diverse as ARM ones: e.g., there are only two arithmetic
-@ instructions with 3 arguments, no [fixed] rotate, addressing
-@ modes are limited. As result it takes more instructions to do
-@ the same job in Thumb, therefore the code is never twice as
-@ small and always slower.
-@ [***] which is also ~35% better than compiler generated code. Dual-
-@ issue Cortex A8 core was measured to process input block in
-@ ~990 cycles.
-
-@ August 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
-@ Cortex A8 core and in absolute terms ~870 cycles per input block
-@ [or 13.6 cycles per byte].
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 10%
-@ improvement on Cortex A8 core and 12.2 cycles per byte.
-
-#include <linux/linkage.h>
-
-.text
-
-.align 2
-ENTRY(sha1_block_data_order)
- stmdb sp!,{r4-r12,lr}
- add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
- ldmia r0,{r3,r4,r5,r6,r7}
-.Lloop:
- ldr r8,.LK_00_19
- mov r14,sp
- sub sp,sp,#15*4
- mov r5,r5,ror#30
- mov r6,r6,ror#30
- mov r7,r7,ror#30 @ [6]
-.L_00_15:
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r7,r8,r7,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r5,r6 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r7,r8,r7,ror#2 @ E+=K_00_19
- eor r10,r5,r6 @ F_xx_xx
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r4,r10,ror#2
- add r7,r7,r9 @ E+=X[i]
- eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r7,r7,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r6,r8,r6,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r4,r5 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r6,r8,r6,ror#2 @ E+=K_00_19
- eor r10,r4,r5 @ F_xx_xx
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r3,r10,ror#2
- add r6,r6,r9 @ E+=X[i]
- eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r6,r6,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r5,r8,r5,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r3,r4 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r5,r8,r5,ror#2 @ E+=K_00_19
- eor r10,r3,r4 @ F_xx_xx
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r7,r10,ror#2
- add r5,r5,r9 @ E+=X[i]
- eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r5,r5,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r4,r8,r4,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r7,r3 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r4,r8,r4,ror#2 @ E+=K_00_19
- eor r10,r7,r3 @ F_xx_xx
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r6,r10,ror#2
- add r4,r4,r9 @ E+=X[i]
- eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r4,r4,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r3,r8,r3,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r6,r7 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r3,r8,r3,ror#2 @ E+=K_00_19
- eor r10,r6,r7 @ F_xx_xx
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r5,r10,ror#2
- add r3,r3,r9 @ E+=X[i]
- eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r3,r3,r10 @ E+=F_00_19(B,C,D)
- cmp r14,sp
- bne .L_00_15 @ [((11+4)*5+2)*3]
- sub sp,sp,#25*4
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r7,r8,r7,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r5,r6 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r7,r8,r7,ror#2 @ E+=K_00_19
- eor r10,r5,r6 @ F_xx_xx
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r4,r10,ror#2
- add r7,r7,r9 @ E+=X[i]
- eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r7,r7,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r3,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
- add r6,r6,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r7,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
- add r5,r5,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r6,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
- add r4,r4,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r5,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
- add r3,r3,r10 @ E+=F_00_19(B,C,D)
-
- ldr r8,.LK_20_39 @ [+15+16*4]
- cmn sp,#0 @ [+3], clear carry to denote 20_39
-.L_20_39_or_60_79:
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r7,r8,r7,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r5,r6 @ F_xx_xx
- mov r9,r9,ror#31
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r4,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r7,r7,r9 @ E+=X[i]
- add r7,r7,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r3,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- add r6,r6,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r7,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- add r5,r5,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r6,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- add r4,r4,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r5,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- add r3,r3,r10 @ E+=F_20_39(B,C,D)
- ARM( teq r14,sp ) @ preserve carry
- THUMB( mov r11,sp )
- THUMB( teq r14,r11 ) @ preserve carry
- bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
- bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
-
- ldr r8,.LK_40_59
- sub sp,sp,#20*4 @ [+2]
-.L_40_59:
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r7,r8,r7,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r5,r6 @ F_xx_xx
- mov r9,r9,ror#31
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r4,r10,ror#2 @ F_xx_xx
- and r11,r5,r6 @ F_xx_xx
- add r7,r7,r9 @ E+=X[i]
- add r7,r7,r10 @ E+=F_40_59(B,C,D)
- add r7,r7,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r3,r10,ror#2 @ F_xx_xx
- and r11,r4,r5 @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- add r6,r6,r10 @ E+=F_40_59(B,C,D)
- add r6,r6,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r7,r10,ror#2 @ F_xx_xx
- and r11,r3,r4 @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- add r5,r5,r10 @ E+=F_40_59(B,C,D)
- add r5,r5,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r6,r10,ror#2 @ F_xx_xx
- and r11,r7,r3 @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- add r4,r4,r10 @ E+=F_40_59(B,C,D)
- add r4,r4,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r5,r10,ror#2 @ F_xx_xx
- and r11,r6,r7 @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- add r3,r3,r10 @ E+=F_40_59(B,C,D)
- add r3,r3,r11,ror#2
- cmp r14,sp
- bne .L_40_59 @ [+((12+5)*5+2)*4]
-
- ldr r8,.LK_60_79
- sub sp,sp,#20*4
- cmp sp,#0 @ set carry to denote 60_79
- b .L_20_39_or_60_79 @ [+4], spare 300 bytes
-.L_done:
- add sp,sp,#80*4 @ "deallocate" stack frame
- ldmia r0,{r8,r9,r10,r11,r12}
- add r3,r8,r3
- add r4,r9,r4
- add r5,r10,r5,ror#2
- add r6,r11,r6,ror#2
- add r7,r12,r7,ror#2
- stmia r0,{r3,r4,r5,r6,r7}
- teq r1,r2
- bne .Lloop @ [+18], total 1307
-
- ldmia sp!,{r4-r12,pc}
-.align 2
-.LK_00_19: .word 0x5a827999
-.LK_20_39: .word 0x6ed9eba1
-.LK_40_59: .word 0x8f1bbcdc
-.LK_60_79: .word 0xca62c1d6
-ENDPROC(sha1_block_data_order)
-.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
-.align 2
diff --git a/arch/arm/crypto/sha1-armv7-neon.S b/arch/arm/crypto/sha1-armv7-neon.S
deleted file mode 100644
index 28d816a6a530..000000000000
--- a/arch/arm/crypto/sha1-armv7-neon.S
+++ /dev/null
@@ -1,634 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
- *
- * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-.syntax unified
-.fpu neon
-
-.text
-
-
-/* Context structure */
-
-#define state_h0 0
-#define state_h1 4
-#define state_h2 8
-#define state_h3 12
-#define state_h4 16
-
-
-/* Constants */
-
-#define K1 0x5A827999
-#define K2 0x6ED9EBA1
-#define K3 0x8F1BBCDC
-#define K4 0xCA62C1D6
-.align 4
-.LK_VEC:
-.LK1: .long K1, K1, K1, K1
-.LK2: .long K2, K2, K2, K2
-.LK3: .long K3, K3, K3, K3
-.LK4: .long K4, K4, K4, K4
-
-
-/* Register macros */
-
-#define RSTATE r0
-#define RDATA r1
-#define RNBLKS r2
-#define ROLDSTACK r3
-#define RWK lr
-
-#define _a r4
-#define _b r5
-#define _c r6
-#define _d r7
-#define _e r8
-
-#define RT0 r9
-#define RT1 r10
-#define RT2 r11
-#define RT3 r12
-
-#define W0 q0
-#define W1 q7
-#define W2 q2
-#define W3 q3
-#define W4 q4
-#define W5 q6
-#define W6 q5
-#define W7 q1
-
-#define tmp0 q8
-#define tmp1 q9
-#define tmp2 q10
-#define tmp3 q11
-
-#define qK1 q12
-#define qK2 q13
-#define qK3 q14
-#define qK4 q15
-
-#ifdef CONFIG_CPU_BIG_ENDIAN
-#define ARM_LE(code...)
-#else
-#define ARM_LE(code...) code
-#endif
-
-/* Round function macros. */
-
-#define WK_offs(i) (((i) & 15) * 4)
-
-#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ldr RT3, [sp, WK_offs(i)]; \
- pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- bic RT0, d, b; \
- add e, e, a, ror #(32 - 5); \
- and RT1, c, b; \
- pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add RT0, RT0, RT3; \
- add e, e, RT1; \
- ror b, #(32 - 30); \
- pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add e, e, RT0;
-
-#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ldr RT3, [sp, WK_offs(i)]; \
- pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- eor RT0, d, b; \
- add e, e, a, ror #(32 - 5); \
- eor RT0, RT0, c; \
- pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add e, e, RT3; \
- ror b, #(32 - 30); \
- pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add e, e, RT0; \
-
-#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ldr RT3, [sp, WK_offs(i)]; \
- pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- eor RT0, b, c; \
- and RT1, b, c; \
- add e, e, a, ror #(32 - 5); \
- pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- and RT0, RT0, d; \
- add RT1, RT1, RT3; \
- add e, e, RT0; \
- ror b, #(32 - 30); \
- pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add e, e, RT1;
-
-#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
-
-#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
-
-#define R(a,b,c,d,e,f,i) \
- _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
-
-#define dummy(...)
-
-
-/* Input expansion macros. */
-
-/********* Precalc macros for rounds 0-15 *************************************/
-
-#define W_PRECALC_00_15() \
- add RWK, sp, #(WK_offs(0)); \
- \
- vld1.32 {W0, W7}, [RDATA]!; \
- ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
- vld1.32 {W6, W5}, [RDATA]!; \
- vadd.u32 tmp0, W0, curK; \
- ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
- ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
- vadd.u32 tmp1, W7, curK; \
- ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
- vadd.u32 tmp2, W6, curK; \
- vst1.32 {tmp0, tmp1}, [RWK]!; \
- vadd.u32 tmp3, W5, curK; \
- vst1.32 {tmp2, tmp3}, [RWK]; \
-
-#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vld1.32 {W0, W7}, [RDATA]!; \
-
-#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- add RWK, sp, #(WK_offs(0)); \
-
-#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
-
-#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vld1.32 {W6, W5}, [RDATA]!; \
-
-#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp0, W0, curK; \
-
-#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
-
-#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
-
-#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp1, W7, curK; \
-
-#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
-
-#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp2, W6, curK; \
-
-#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vst1.32 {tmp0, tmp1}, [RWK]!; \
-
-#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp3, W5, curK; \
-
-#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vst1.32 {tmp2, tmp3}, [RWK]; \
-
-
-/********* Precalc macros for rounds 16-31 ************************************/
-
-#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor tmp0, tmp0; \
- vext.8 W, W_m16, W_m12, #8; \
-
-#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- add RWK, sp, #(WK_offs(i)); \
- vext.8 tmp0, W_m04, tmp0, #4; \
-
-#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor tmp0, tmp0, W_m16; \
- veor.32 W, W, W_m08; \
-
-#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor tmp1, tmp1; \
- veor W, W, tmp0; \
-
-#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vshl.u32 tmp0, W, #1; \
-
-#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vext.8 tmp1, tmp1, W, #(16-12); \
- vshr.u32 W, W, #31; \
-
-#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vorr tmp0, tmp0, W; \
- vshr.u32 W, tmp1, #30; \
-
-#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vshl.u32 tmp1, tmp1, #2; \
-
-#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor tmp0, tmp0, W; \
-
-#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor W, tmp0, tmp1; \
-
-#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp0, W, curK; \
-
-#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vst1.32 {tmp0}, [RWK];
-
-
-/********* Precalc macros for rounds 32-79 ************************************/
-
-#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor W, W_m28; \
-
-#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vext.8 tmp0, W_m08, W_m04, #8; \
-
-#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor W, W_m16; \
-
-#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor W, tmp0; \
-
-#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- add RWK, sp, #(WK_offs(i&~3)); \
-
-#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vshl.u32 tmp1, W, #2; \
-
-#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vshr.u32 tmp0, W, #30; \
-
-#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vorr W, tmp0, tmp1; \
-
-#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp0, W, curK; \
-
-#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vst1.32 {tmp0}, [RWK];
-
-
-/*
- * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
- *
- * unsigned int
- * sha1_transform_neon (void *ctx, const unsigned char *data,
- * unsigned int nblks)
- */
-.align 3
-ENTRY(sha1_transform_neon)
- /* input:
- * r0: ctx, CTX
- * r1: data (64*nblks bytes)
- * r2: nblks
- */
-
- cmp RNBLKS, #0;
- beq .Ldo_nothing;
-
- push {r4-r12, lr};
- /*vpush {q4-q7};*/
-
- adr RT3, .LK_VEC;
-
- mov ROLDSTACK, sp;
-
- /* Align stack. */
- sub RT0, sp, #(16*4);
- and RT0, #(~(16-1));
- mov sp, RT0;
-
- vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
-
- /* Get the values of the chaining variables. */
- ldm RSTATE, {_a-_e};
-
- vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
-
-#undef curK
-#define curK qK1
- /* Precalc 0-15. */
- W_PRECALC_00_15();
-
-.Loop:
- /* Transform 0-15 + Precalc 16-31. */
- _R( _a, _b, _c, _d, _e, F1, 0,
- WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
- W4, W5, W6, W7, W0, _, _, _ );
- _R( _e, _a, _b, _c, _d, F1, 1,
- WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
- W4, W5, W6, W7, W0, _, _, _ );
- _R( _d, _e, _a, _b, _c, F1, 2,
- WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
- W4, W5, W6, W7, W0, _, _, _ );
- _R( _c, _d, _e, _a, _b, F1, 3,
- WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
- W4, W5, W6, W7, W0, _, _, _ );
-
-#undef curK
-#define curK qK2
- _R( _b, _c, _d, _e, _a, F1, 4,
- WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
- W3, W4, W5, W6, W7, _, _, _ );
- _R( _a, _b, _c, _d, _e, F1, 5,
- WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
- W3, W4, W5, W6, W7, _, _, _ );
- _R( _e, _a, _b, _c, _d, F1, 6,
- WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
- W3, W4, W5, W6, W7, _, _, _ );
- _R( _d, _e, _a, _b, _c, F1, 7,
- WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
- W3, W4, W5, W6, W7, _, _, _ );
-
- _R( _c, _d, _e, _a, _b, F1, 8,
- WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
- W2, W3, W4, W5, W6, _, _, _ );
- _R( _b, _c, _d, _e, _a, F1, 9,
- WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
- W2, W3, W4, W5, W6, _, _, _ );
- _R( _a, _b, _c, _d, _e, F1, 10,
- WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
- W2, W3, W4, W5, W6, _, _, _ );
- _R( _e, _a, _b, _c, _d, F1, 11,
- WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
- W2, W3, W4, W5, W6, _, _, _ );
-
- _R( _d, _e, _a, _b, _c, F1, 12,
- WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
- W1, W2, W3, W4, W5, _, _, _ );
- _R( _c, _d, _e, _a, _b, F1, 13,
- WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
- W1, W2, W3, W4, W5, _, _, _ );
- _R( _b, _c, _d, _e, _a, F1, 14,
- WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
- W1, W2, W3, W4, W5, _, _, _ );
- _R( _a, _b, _c, _d, _e, F1, 15,
- WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
- W1, W2, W3, W4, W5, _, _, _ );
-
- /* Transform 16-63 + Precalc 32-79. */
- _R( _e, _a, _b, _c, _d, F1, 16,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _d, _e, _a, _b, _c, F1, 17,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _c, _d, _e, _a, _b, F1, 18,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _b, _c, _d, _e, _a, F1, 19,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32,
- W0, W1, W2, W3, W4, W5, W6, W7);
-
- _R( _a, _b, _c, _d, _e, F2, 20,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _e, _a, _b, _c, _d, F2, 21,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _d, _e, _a, _b, _c, F2, 22,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _c, _d, _e, _a, _b, F2, 23,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36,
- W7, W0, W1, W2, W3, W4, W5, W6);
-
-#undef curK
-#define curK qK3
- _R( _b, _c, _d, _e, _a, F2, 24,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _a, _b, _c, _d, _e, F2, 25,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _e, _a, _b, _c, _d, F2, 26,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _d, _e, _a, _b, _c, F2, 27,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40,
- W6, W7, W0, W1, W2, W3, W4, W5);
-
- _R( _c, _d, _e, _a, _b, F2, 28,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _b, _c, _d, _e, _a, F2, 29,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _a, _b, _c, _d, _e, F2, 30,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _e, _a, _b, _c, _d, F2, 31,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44,
- W5, W6, W7, W0, W1, W2, W3, W4);
-
- _R( _d, _e, _a, _b, _c, F2, 32,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
- W4, W5, W6, W7, W0, W1, W2, W3);
- _R( _c, _d, _e, _a, _b, F2, 33,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
- W4, W5, W6, W7, W0, W1, W2, W3);
- _R( _b, _c, _d, _e, _a, F2, 34,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48,
- W4, W5, W6, W7, W0, W1, W2, W3);
- _R( _a, _b, _c, _d, _e, F2, 35,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48,
- W4, W5, W6, W7, W0, W1, W2, W3);
-
- _R( _e, _a, _b, _c, _d, F2, 36,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
- W3, W4, W5, W6, W7, W0, W1, W2);
- _R( _d, _e, _a, _b, _c, F2, 37,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
- W3, W4, W5, W6, W7, W0, W1, W2);
- _R( _c, _d, _e, _a, _b, F2, 38,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52,
- W3, W4, W5, W6, W7, W0, W1, W2);
- _R( _b, _c, _d, _e, _a, F2, 39,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52,
- W3, W4, W5, W6, W7, W0, W1, W2);
-
- _R( _a, _b, _c, _d, _e, F3, 40,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
- W2, W3, W4, W5, W6, W7, W0, W1);
- _R( _e, _a, _b, _c, _d, F3, 41,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
- W2, W3, W4, W5, W6, W7, W0, W1);
- _R( _d, _e, _a, _b, _c, F3, 42,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56,
- W2, W3, W4, W5, W6, W7, W0, W1);
- _R( _c, _d, _e, _a, _b, F3, 43,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56,
- W2, W3, W4, W5, W6, W7, W0, W1);
-
-#undef curK
-#define curK qK4
- _R( _b, _c, _d, _e, _a, F3, 44,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
- W1, W2, W3, W4, W5, W6, W7, W0);
- _R( _a, _b, _c, _d, _e, F3, 45,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
- W1, W2, W3, W4, W5, W6, W7, W0);
- _R( _e, _a, _b, _c, _d, F3, 46,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60,
- W1, W2, W3, W4, W5, W6, W7, W0);
- _R( _d, _e, _a, _b, _c, F3, 47,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60,
- W1, W2, W3, W4, W5, W6, W7, W0);
-
- _R( _c, _d, _e, _a, _b, F3, 48,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _b, _c, _d, _e, _a, F3, 49,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _a, _b, _c, _d, _e, F3, 50,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _e, _a, _b, _c, _d, F3, 51,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64,
- W0, W1, W2, W3, W4, W5, W6, W7);
-
- _R( _d, _e, _a, _b, _c, F3, 52,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _c, _d, _e, _a, _b, F3, 53,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _b, _c, _d, _e, _a, F3, 54,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _a, _b, _c, _d, _e, F3, 55,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68,
- W7, W0, W1, W2, W3, W4, W5, W6);
-
- _R( _e, _a, _b, _c, _d, F3, 56,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _d, _e, _a, _b, _c, F3, 57,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _c, _d, _e, _a, _b, F3, 58,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _b, _c, _d, _e, _a, F3, 59,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72,
- W6, W7, W0, W1, W2, W3, W4, W5);
-
- subs RNBLKS, #1;
-
- _R( _a, _b, _c, _d, _e, F4, 60,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _e, _a, _b, _c, _d, F4, 61,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _d, _e, _a, _b, _c, F4, 62,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _c, _d, _e, _a, _b, F4, 63,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76,
- W5, W6, W7, W0, W1, W2, W3, W4);
-
- beq .Lend;
-
- /* Transform 64-79 + Precalc 0-15 of next block. */
-#undef curK
-#define curK qK1
- _R( _b, _c, _d, _e, _a, F4, 64,
- WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _a, _b, _c, _d, _e, F4, 65,
- WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _e, _a, _b, _c, _d, F4, 66,
- WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _d, _e, _a, _b, _c, F4, 67,
- WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-
- _R( _c, _d, _e, _a, _b, F4, 68,
- dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _b, _c, _d, _e, _a, F4, 69,
- dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _a, _b, _c, _d, _e, F4, 70,
- WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _e, _a, _b, _c, _d, F4, 71,
- WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-
- _R( _d, _e, _a, _b, _c, F4, 72,
- dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _c, _d, _e, _a, _b, F4, 73,
- dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _b, _c, _d, _e, _a, F4, 74,
- WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _a, _b, _c, _d, _e, F4, 75,
- WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-
- _R( _e, _a, _b, _c, _d, F4, 76,
- WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _d, _e, _a, _b, _c, F4, 77,
- WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _c, _d, _e, _a, _b, F4, 78,
- WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _b, _c, _d, _e, _a, F4, 79,
- WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
-
- /* Update the chaining variables. */
- ldm RSTATE, {RT0-RT3};
- add _a, RT0;
- ldr RT0, [RSTATE, #state_h4];
- add _b, RT1;
- add _c, RT2;
- add _d, RT3;
- add _e, RT0;
- stm RSTATE, {_a-_e};
-
- b .Loop;
-
-.Lend:
- /* Transform 64-79 */
- R( _b, _c, _d, _e, _a, F4, 64 );
- R( _a, _b, _c, _d, _e, F4, 65 );
- R( _e, _a, _b, _c, _d, F4, 66 );
- R( _d, _e, _a, _b, _c, F4, 67 );
- R( _c, _d, _e, _a, _b, F4, 68 );
- R( _b, _c, _d, _e, _a, F4, 69 );
- R( _a, _b, _c, _d, _e, F4, 70 );
- R( _e, _a, _b, _c, _d, F4, 71 );
- R( _d, _e, _a, _b, _c, F4, 72 );
- R( _c, _d, _e, _a, _b, F4, 73 );
- R( _b, _c, _d, _e, _a, F4, 74 );
- R( _a, _b, _c, _d, _e, F4, 75 );
- R( _e, _a, _b, _c, _d, F4, 76 );
- R( _d, _e, _a, _b, _c, F4, 77 );
- R( _c, _d, _e, _a, _b, F4, 78 );
- R( _b, _c, _d, _e, _a, F4, 79 );
-
- mov sp, ROLDSTACK;
-
- /* Update the chaining variables. */
- ldm RSTATE, {RT0-RT3};
- add _a, RT0;
- ldr RT0, [RSTATE, #state_h4];
- add _b, RT1;
- add _c, RT2;
- add _d, RT3;
- /*vpop {q4-q7};*/
- add _e, RT0;
- stm RSTATE, {_a-_e};
-
- pop {r4-r12, pc};
-
-.Ldo_nothing:
- bx lr
-ENDPROC(sha1_transform_neon)
diff --git a/arch/arm/crypto/sha1-ce-core.S b/arch/arm/crypto/sha1-ce-core.S
deleted file mode 100644
index 8a702e051738..000000000000
--- a/arch/arm/crypto/sha1-ce-core.S
+++ /dev/null
@@ -1,123 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd.
- * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a
- .fpu crypto-neon-fp-armv8
-
- k0 .req q0
- k1 .req q1
- k2 .req q2
- k3 .req q3
-
- ta0 .req q4
- ta1 .req q5
- tb0 .req q5
- tb1 .req q4
-
- dga .req q6
- dgb .req q7
- dgbs .req s28
-
- dg0 .req q12
- dg1a0 .req q13
- dg1a1 .req q14
- dg1b0 .req q14
- dg1b1 .req q13
-
- .macro add_only, op, ev, rc, s0, dg1
- .ifnb \s0
- vadd.u32 tb\ev, q\s0, \rc
- .endif
- sha1h.32 dg1b\ev, dg0
- .ifb \dg1
- sha1\op\().32 dg0, dg1a\ev, ta\ev
- .else
- sha1\op\().32 dg0, \dg1, ta\ev
- .endif
- .endm
-
- .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
- sha1su0.32 q\s0, q\s1, q\s2
- add_only \op, \ev, \rc, \s1, \dg1
- sha1su1.32 q\s0, q\s3
- .endm
-
- .align 6
-.Lsha1_rcon:
- .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
- .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
- .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
- .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
-
- /*
- * void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
- * int blocks);
- */
-ENTRY(sha1_ce_transform)
- /* load round constants */
- adr ip, .Lsha1_rcon
- vld1.32 {k0-k1}, [ip, :128]!
- vld1.32 {k2-k3}, [ip, :128]
-
- /* load state */
- vld1.32 {dga}, [r0]
- vldr dgbs, [r0, #16]
-
- /* load input */
-0: vld1.32 {q8-q9}, [r1]!
- vld1.32 {q10-q11}, [r1]!
- subs r2, r2, #1
-
-#ifndef CONFIG_CPU_BIG_ENDIAN
- vrev32.8 q8, q8
- vrev32.8 q9, q9
- vrev32.8 q10, q10
- vrev32.8 q11, q11
-#endif
-
- vadd.u32 ta0, q8, k0
- vmov dg0, dga
-
- add_update c, 0, k0, 8, 9, 10, 11, dgb
- add_update c, 1, k0, 9, 10, 11, 8
- add_update c, 0, k0, 10, 11, 8, 9
- add_update c, 1, k0, 11, 8, 9, 10
- add_update c, 0, k1, 8, 9, 10, 11
-
- add_update p, 1, k1, 9, 10, 11, 8
- add_update p, 0, k1, 10, 11, 8, 9
- add_update p, 1, k1, 11, 8, 9, 10
- add_update p, 0, k1, 8, 9, 10, 11
- add_update p, 1, k2, 9, 10, 11, 8
-
- add_update m, 0, k2, 10, 11, 8, 9
- add_update m, 1, k2, 11, 8, 9, 10
- add_update m, 0, k2, 8, 9, 10, 11
- add_update m, 1, k2, 9, 10, 11, 8
- add_update m, 0, k3, 10, 11, 8, 9
-
- add_update p, 1, k3, 11, 8, 9, 10
- add_only p, 0, k3, 9
- add_only p, 1, k3, 10
- add_only p, 0, k3, 11
- add_only p, 1
-
- /* update state */
- vadd.u32 dga, dga, dg0
- vadd.u32 dgb, dgb, dg1a0
- bne 0b
-
- /* store new state */
- vst1.32 {dga}, [r0]
- vstr dgbs, [r0, #16]
- bx lr
-ENDPROC(sha1_ce_transform)
diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c
deleted file mode 100644
index fac07a4799de..000000000000
--- a/arch/arm/crypto/sha1-ce-glue.c
+++ /dev/null
@@ -1,72 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-
-asmlinkage void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
- int blocks);
-
-static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- int remain;
-
- kernel_neon_begin();
- remain = sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform);
- kernel_neon_end();
-
- return remain;
-}
-
-static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- kernel_neon_begin();
- sha1_base_do_finup(desc, data, len, sha1_ce_transform);
- kernel_neon_end();
-
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .init = sha1_base_init,
- .update = sha1_ce_update,
- .finup = sha1_ce_finup,
- .descsize = SHA1_STATE_SIZE,
- .digestsize = SHA1_DIGEST_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-ce",
- .cra_priority = 200,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init sha1_ce_mod_init(void)
-{
- return crypto_register_shash(&alg);
-}
-
-static void __exit sha1_ce_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_cpu_feature_match(SHA1, sha1_ce_mod_init);
-module_exit(sha1_ce_mod_fini);
diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c
deleted file mode 100644
index 255da00c7d98..000000000000
--- a/arch/arm/crypto/sha1_glue.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API.
- * Glue code for the SHA1 Secure Hash Algorithm assembler implementation
- *
- * This file is based on sha1_generic.c and sha1_ssse3_glue.c
- *
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- * Copyright (c) Mathias Krause <minipli@googlemail.com>
- */
-
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha1_block_data_order(struct sha1_state *digest,
- const u8 *data, int rounds);
-
-static int sha1_update_arm(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- /* make sure signature matches sha1_block_fn() */
- BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
-
- return sha1_base_do_update_blocks(desc, data, len,
- sha1_block_data_order);
-}
-
-static int sha1_finup_arm(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- sha1_base_do_finup(desc, data, len, sha1_block_data_order);
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_update_arm,
- .finup = sha1_finup_arm,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name= "sha1-asm",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-
-static int __init sha1_mod_init(void)
-{
- return crypto_register_shash(&alg);
-}
-
-
-static void __exit sha1_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-
-module_init(sha1_mod_init);
-module_exit(sha1_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm (ARM)");
-MODULE_ALIAS_CRYPTO("sha1");
-MODULE_AUTHOR("David McCullough <ucdevel@gmail.com>");
diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c
deleted file mode 100644
index d321850f22a6..000000000000
--- a/arch/arm/crypto/sha1_neon_glue.c
+++ /dev/null
@@ -1,83 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
- * ARM NEON instructions.
- *
- * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This file is based on sha1_generic.c and sha1_ssse3_glue.c:
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- * Copyright (c) Mathias Krause <minipli@googlemail.com>
- * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha1_transform_neon(struct sha1_state *state_h,
- const u8 *data, int rounds);
-
-static int sha1_neon_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- int remain;
-
- kernel_neon_begin();
- remain = sha1_base_do_update_blocks(desc, data, len,
- sha1_transform_neon);
- kernel_neon_end();
-
- return remain;
-}
-
-static int sha1_neon_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- kernel_neon_begin();
- sha1_base_do_finup(desc, data, len, sha1_transform_neon);
- kernel_neon_end();
-
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_neon_update,
- .finup = sha1_neon_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-neon",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init sha1_neon_mod_init(void)
-{
- if (!cpu_has_neon())
- return -ENODEV;
-
- return crypto_register_shash(&alg);
-}
-
-static void __exit sha1_neon_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_init(sha1_neon_mod_init);
-module_exit(sha1_neon_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, NEON accelerated");
-MODULE_ALIAS_CRYPTO("sha1");
diff --git a/arch/arm/crypto/sha512-armv4.pl b/arch/arm/crypto/sha512-armv4.pl
deleted file mode 100644
index 2fc3516912fa..000000000000
--- a/arch/arm/crypto/sha512-armv4.pl
+++ /dev/null
@@ -1,657 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from the OpenSSL project but the author (Andy Polyakov)
-# has relicensed it under the GPLv2. Therefore this program is free software;
-# you can redistribute it and/or modify it under the terms of the GNU General
-# Public License version 2 as published by the Free Software Foundation.
-#
-# The original headers, including the original license headers, are
-# included below for completeness.
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see https://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# SHA512 block procedure for ARMv4. September 2007.
-
-# This code is ~4.5 (four and a half) times faster than code generated
-# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
-# Xscale PXA250 core].
-#
-# July 2010.
-#
-# Rescheduling for dual-issue pipeline resulted in 6% improvement on
-# Cortex A8 core and ~40 cycles per processed byte.
-
-# February 2011.
-#
-# Profiler-assisted and platform-specific optimization resulted in 7%
-# improvement on Coxtex A8 core and ~38 cycles per byte.
-
-# March 2011.
-#
-# Add NEON implementation. On Cortex A8 it was measured to process
-# one byte in 23.3 cycles or ~60% faster than integer-only code.
-
-# August 2012.
-#
-# Improve NEON performance by 12% on Snapdragon S4. In absolute
-# terms it's 22.6 cycles per byte, which is disappointing result.
-# Technical writers asserted that 3-way S4 pipeline can sustain
-# multiple NEON instructions per cycle, but dual NEON issue could
-# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
-# for further details. On side note Cortex-A15 processes one byte in
-# 16 cycles.
-
-# Byte order [in]dependence. =========================================
-#
-# Originally caller was expected to maintain specific *dword* order in
-# h[0-7], namely with most significant dword at *lower* address, which
-# was reflected in below two parameters as 0 and 4. Now caller is
-# expected to maintain native byte order for whole 64-bit values.
-$hi="HI";
-$lo="LO";
-# ====================================================================
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-$ctx="r0"; # parameter block
-$inp="r1";
-$len="r2";
-
-$Tlo="r3";
-$Thi="r4";
-$Alo="r5";
-$Ahi="r6";
-$Elo="r7";
-$Ehi="r8";
-$t0="r9";
-$t1="r10";
-$t2="r11";
-$t3="r12";
-############ r13 is stack pointer
-$Ktbl="r14";
-############ r15 is program counter
-
-$Aoff=8*0;
-$Boff=8*1;
-$Coff=8*2;
-$Doff=8*3;
-$Eoff=8*4;
-$Foff=8*5;
-$Goff=8*6;
-$Hoff=8*7;
-$Xoff=8*8;
-
-sub BODY_00_15() {
-my $magic = shift;
-$code.=<<___;
- @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
- @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
- @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
- mov $t0,$Elo,lsr#14
- str $Tlo,[sp,#$Xoff+0]
- mov $t1,$Ehi,lsr#14
- str $Thi,[sp,#$Xoff+4]
- eor $t0,$t0,$Ehi,lsl#18
- ldr $t2,[sp,#$Hoff+0] @ h.lo
- eor $t1,$t1,$Elo,lsl#18
- ldr $t3,[sp,#$Hoff+4] @ h.hi
- eor $t0,$t0,$Elo,lsr#18
- eor $t1,$t1,$Ehi,lsr#18
- eor $t0,$t0,$Ehi,lsl#14
- eor $t1,$t1,$Elo,lsl#14
- eor $t0,$t0,$Ehi,lsr#9
- eor $t1,$t1,$Elo,lsr#9
- eor $t0,$t0,$Elo,lsl#23
- eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
- adds $Tlo,$Tlo,$t0
- ldr $t0,[sp,#$Foff+0] @ f.lo
- adc $Thi,$Thi,$t1 @ T += Sigma1(e)
- ldr $t1,[sp,#$Foff+4] @ f.hi
- adds $Tlo,$Tlo,$t2
- ldr $t2,[sp,#$Goff+0] @ g.lo
- adc $Thi,$Thi,$t3 @ T += h
- ldr $t3,[sp,#$Goff+4] @ g.hi
-
- eor $t0,$t0,$t2
- str $Elo,[sp,#$Eoff+0]
- eor $t1,$t1,$t3
- str $Ehi,[sp,#$Eoff+4]
- and $t0,$t0,$Elo
- str $Alo,[sp,#$Aoff+0]
- and $t1,$t1,$Ehi
- str $Ahi,[sp,#$Aoff+4]
- eor $t0,$t0,$t2
- ldr $t2,[$Ktbl,#$lo] @ K[i].lo
- eor $t1,$t1,$t3 @ Ch(e,f,g)
- ldr $t3,[$Ktbl,#$hi] @ K[i].hi
-
- adds $Tlo,$Tlo,$t0
- ldr $Elo,[sp,#$Doff+0] @ d.lo
- adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
- ldr $Ehi,[sp,#$Doff+4] @ d.hi
- adds $Tlo,$Tlo,$t2
- and $t0,$t2,#0xff
- adc $Thi,$Thi,$t3 @ T += K[i]
- adds $Elo,$Elo,$Tlo
- ldr $t2,[sp,#$Boff+0] @ b.lo
- adc $Ehi,$Ehi,$Thi @ d += T
- teq $t0,#$magic
-
- ldr $t3,[sp,#$Coff+0] @ c.lo
-#if __ARM_ARCH__>=7
- it eq @ Thumb2 thing, sanity check in ARM
-#endif
- orreq $Ktbl,$Ktbl,#1
- @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
- @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
- @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
- mov $t0,$Alo,lsr#28
- mov $t1,$Ahi,lsr#28
- eor $t0,$t0,$Ahi,lsl#4
- eor $t1,$t1,$Alo,lsl#4
- eor $t0,$t0,$Ahi,lsr#2
- eor $t1,$t1,$Alo,lsr#2
- eor $t0,$t0,$Alo,lsl#30
- eor $t1,$t1,$Ahi,lsl#30
- eor $t0,$t0,$Ahi,lsr#7
- eor $t1,$t1,$Alo,lsr#7
- eor $t0,$t0,$Alo,lsl#25
- eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
- adds $Tlo,$Tlo,$t0
- and $t0,$Alo,$t2
- adc $Thi,$Thi,$t1 @ T += Sigma0(a)
-
- ldr $t1,[sp,#$Boff+4] @ b.hi
- orr $Alo,$Alo,$t2
- ldr $t2,[sp,#$Coff+4] @ c.hi
- and $Alo,$Alo,$t3
- and $t3,$Ahi,$t1
- orr $Ahi,$Ahi,$t1
- orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
- and $Ahi,$Ahi,$t2
- adds $Alo,$Alo,$Tlo
- orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
- sub sp,sp,#8
- adc $Ahi,$Ahi,$Thi @ h += T
- tst $Ktbl,#1
- add $Ktbl,$Ktbl,#8
-___
-}
-$code=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
-# define VFP_ABI_POP vldmia sp!,{d8-d15}
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-# define VFP_ABI_PUSH
-# define VFP_ABI_POP
-#endif
-
-#ifdef __ARMEL__
-# define LO 0
-# define HI 4
-# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
-#else
-# define HI 0
-# define LO 4
-# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
-#endif
-
-.text
-#if __ARM_ARCH__<7
-.code 32
-#else
-.syntax unified
-# ifdef __thumb2__
-.thumb
-# else
-.code 32
-# endif
-#endif
-
-.type K512,%object
-.align 5
-K512:
-WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
-WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
-WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
-WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
-WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
-WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
-WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
-WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
-WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
-WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
-WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
-WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
-WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
-WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
-WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
-WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
-WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
-WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
-WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
-WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
-WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
-WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
-WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
-WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
-WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
-WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
-WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
-WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
-WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
-WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
-WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
-WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
-WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
-WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
-WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
-WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
-WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
-WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
-WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
-WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
-.size K512,.-K512
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha512_block_data_order
-.skip 32-4
-#else
-.skip 32
-#endif
-
-.global sha512_block_data_order
-.type sha512_block_data_order,%function
-sha512_block_data_order:
-.Lsha512_block_data_order:
-#if __ARM_ARCH__<7
- sub r3,pc,#8 @ sha512_block_data_order
-#else
- adr r3,.Lsha512_block_data_order
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
- tst r12,#1
- bne .LNEON
-#endif
- add $len,$inp,$len,lsl#7 @ len to point at the end of inp
- stmdb sp!,{r4-r12,lr}
- sub $Ktbl,r3,#672 @ K512
- sub sp,sp,#9*8
-
- ldr $Elo,[$ctx,#$Eoff+$lo]
- ldr $Ehi,[$ctx,#$Eoff+$hi]
- ldr $t0, [$ctx,#$Goff+$lo]
- ldr $t1, [$ctx,#$Goff+$hi]
- ldr $t2, [$ctx,#$Hoff+$lo]
- ldr $t3, [$ctx,#$Hoff+$hi]
-.Loop:
- str $t0, [sp,#$Goff+0]
- str $t1, [sp,#$Goff+4]
- str $t2, [sp,#$Hoff+0]
- str $t3, [sp,#$Hoff+4]
- ldr $Alo,[$ctx,#$Aoff+$lo]
- ldr $Ahi,[$ctx,#$Aoff+$hi]
- ldr $Tlo,[$ctx,#$Boff+$lo]
- ldr $Thi,[$ctx,#$Boff+$hi]
- ldr $t0, [$ctx,#$Coff+$lo]
- ldr $t1, [$ctx,#$Coff+$hi]
- ldr $t2, [$ctx,#$Doff+$lo]
- ldr $t3, [$ctx,#$Doff+$hi]
- str $Tlo,[sp,#$Boff+0]
- str $Thi,[sp,#$Boff+4]
- str $t0, [sp,#$Coff+0]
- str $t1, [sp,#$Coff+4]
- str $t2, [sp,#$Doff+0]
- str $t3, [sp,#$Doff+4]
- ldr $Tlo,[$ctx,#$Foff+$lo]
- ldr $Thi,[$ctx,#$Foff+$hi]
- str $Tlo,[sp,#$Foff+0]
- str $Thi,[sp,#$Foff+4]
-
-.L00_15:
-#if __ARM_ARCH__<7
- ldrb $Tlo,[$inp,#7]
- ldrb $t0, [$inp,#6]
- ldrb $t1, [$inp,#5]
- ldrb $t2, [$inp,#4]
- ldrb $Thi,[$inp,#3]
- ldrb $t3, [$inp,#2]
- orr $Tlo,$Tlo,$t0,lsl#8
- ldrb $t0, [$inp,#1]
- orr $Tlo,$Tlo,$t1,lsl#16
- ldrb $t1, [$inp],#8
- orr $Tlo,$Tlo,$t2,lsl#24
- orr $Thi,$Thi,$t3,lsl#8
- orr $Thi,$Thi,$t0,lsl#16
- orr $Thi,$Thi,$t1,lsl#24
-#else
- ldr $Tlo,[$inp,#4]
- ldr $Thi,[$inp],#8
-#ifdef __ARMEL__
- rev $Tlo,$Tlo
- rev $Thi,$Thi
-#endif
-#endif
-___
- &BODY_00_15(0x94);
-$code.=<<___;
- tst $Ktbl,#1
- beq .L00_15
- ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
- ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
- bic $Ktbl,$Ktbl,#1
-.L16_79:
- @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
- @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
- @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
- mov $Tlo,$t0,lsr#1
- ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
- mov $Thi,$t1,lsr#1
- ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
- eor $Tlo,$Tlo,$t1,lsl#31
- eor $Thi,$Thi,$t0,lsl#31
- eor $Tlo,$Tlo,$t0,lsr#8
- eor $Thi,$Thi,$t1,lsr#8
- eor $Tlo,$Tlo,$t1,lsl#24
- eor $Thi,$Thi,$t0,lsl#24
- eor $Tlo,$Tlo,$t0,lsr#7
- eor $Thi,$Thi,$t1,lsr#7
- eor $Tlo,$Tlo,$t1,lsl#25
-
- @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
- @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
- @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
- mov $t0,$t2,lsr#19
- mov $t1,$t3,lsr#19
- eor $t0,$t0,$t3,lsl#13
- eor $t1,$t1,$t2,lsl#13
- eor $t0,$t0,$t3,lsr#29
- eor $t1,$t1,$t2,lsr#29
- eor $t0,$t0,$t2,lsl#3
- eor $t1,$t1,$t3,lsl#3
- eor $t0,$t0,$t2,lsr#6
- eor $t1,$t1,$t3,lsr#6
- ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
- eor $t0,$t0,$t3,lsl#26
-
- ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
- adds $Tlo,$Tlo,$t0
- ldr $t0,[sp,#`$Xoff+8*16`+0]
- adc $Thi,$Thi,$t1
-
- ldr $t1,[sp,#`$Xoff+8*16`+4]
- adds $Tlo,$Tlo,$t2
- adc $Thi,$Thi,$t3
- adds $Tlo,$Tlo,$t0
- adc $Thi,$Thi,$t1
-___
- &BODY_00_15(0x17);
-$code.=<<___;
-#if __ARM_ARCH__>=7
- ittt eq @ Thumb2 thing, sanity check in ARM
-#endif
- ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
- ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
- beq .L16_79
- bic $Ktbl,$Ktbl,#1
-
- ldr $Tlo,[sp,#$Boff+0]
- ldr $Thi,[sp,#$Boff+4]
- ldr $t0, [$ctx,#$Aoff+$lo]
- ldr $t1, [$ctx,#$Aoff+$hi]
- ldr $t2, [$ctx,#$Boff+$lo]
- ldr $t3, [$ctx,#$Boff+$hi]
- adds $t0,$Alo,$t0
- str $t0, [$ctx,#$Aoff+$lo]
- adc $t1,$Ahi,$t1
- str $t1, [$ctx,#$Aoff+$hi]
- adds $t2,$Tlo,$t2
- str $t2, [$ctx,#$Boff+$lo]
- adc $t3,$Thi,$t3
- str $t3, [$ctx,#$Boff+$hi]
-
- ldr $Alo,[sp,#$Coff+0]
- ldr $Ahi,[sp,#$Coff+4]
- ldr $Tlo,[sp,#$Doff+0]
- ldr $Thi,[sp,#$Doff+4]
- ldr $t0, [$ctx,#$Coff+$lo]
- ldr $t1, [$ctx,#$Coff+$hi]
- ldr $t2, [$ctx,#$Doff+$lo]
- ldr $t3, [$ctx,#$Doff+$hi]
- adds $t0,$Alo,$t0
- str $t0, [$ctx,#$Coff+$lo]
- adc $t1,$Ahi,$t1
- str $t1, [$ctx,#$Coff+$hi]
- adds $t2,$Tlo,$t2
- str $t2, [$ctx,#$Doff+$lo]
- adc $t3,$Thi,$t3
- str $t3, [$ctx,#$Doff+$hi]
-
- ldr $Tlo,[sp,#$Foff+0]
- ldr $Thi,[sp,#$Foff+4]
- ldr $t0, [$ctx,#$Eoff+$lo]
- ldr $t1, [$ctx,#$Eoff+$hi]
- ldr $t2, [$ctx,#$Foff+$lo]
- ldr $t3, [$ctx,#$Foff+$hi]
- adds $Elo,$Elo,$t0
- str $Elo,[$ctx,#$Eoff+$lo]
- adc $Ehi,$Ehi,$t1
- str $Ehi,[$ctx,#$Eoff+$hi]
- adds $t2,$Tlo,$t2
- str $t2, [$ctx,#$Foff+$lo]
- adc $t3,$Thi,$t3
- str $t3, [$ctx,#$Foff+$hi]
-
- ldr $Alo,[sp,#$Goff+0]
- ldr $Ahi,[sp,#$Goff+4]
- ldr $Tlo,[sp,#$Hoff+0]
- ldr $Thi,[sp,#$Hoff+4]
- ldr $t0, [$ctx,#$Goff+$lo]
- ldr $t1, [$ctx,#$Goff+$hi]
- ldr $t2, [$ctx,#$Hoff+$lo]
- ldr $t3, [$ctx,#$Hoff+$hi]
- adds $t0,$Alo,$t0
- str $t0, [$ctx,#$Goff+$lo]
- adc $t1,$Ahi,$t1
- str $t1, [$ctx,#$Goff+$hi]
- adds $t2,$Tlo,$t2
- str $t2, [$ctx,#$Hoff+$lo]
- adc $t3,$Thi,$t3
- str $t3, [$ctx,#$Hoff+$hi]
-
- add sp,sp,#640
- sub $Ktbl,$Ktbl,#640
-
- teq $inp,$len
- bne .Loop
-
- add sp,sp,#8*9 @ destroy frame
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r12,pc}
-#else
- ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size sha512_block_data_order,.-sha512_block_data_order
-___
-
-{
-my @Sigma0=(28,34,39);
-my @Sigma1=(14,18,41);
-my @sigma0=(1, 8, 7);
-my @sigma1=(19,61,6);
-
-my $Ktbl="r3";
-my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
-
-my @X=map("d$_",(0..15));
-my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
-
-sub NEON_00_15() {
-my $i=shift;
-my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
-my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
-
-$code.=<<___ if ($i<16 || $i&1);
- vshr.u64 $t0,$e,#@Sigma1[0] @ $i
-#if $i<16
- vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
-#endif
- vshr.u64 $t1,$e,#@Sigma1[1]
-#if $i>0
- vadd.i64 $a,$Maj @ h+=Maj from the past
-#endif
- vshr.u64 $t2,$e,#@Sigma1[2]
-___
-$code.=<<___;
- vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
- vsli.64 $t0,$e,#`64-@Sigma1[0]`
- vsli.64 $t1,$e,#`64-@Sigma1[1]`
- vmov $Ch,$e
- vsli.64 $t2,$e,#`64-@Sigma1[2]`
-#if $i<16 && defined(__ARMEL__)
- vrev64.8 @X[$i],@X[$i]
-#endif
- veor $t1,$t0
- vbsl $Ch,$f,$g @ Ch(e,f,g)
- vshr.u64 $t0,$a,#@Sigma0[0]
- veor $t2,$t1 @ Sigma1(e)
- vadd.i64 $T1,$Ch,$h
- vshr.u64 $t1,$a,#@Sigma0[1]
- vsli.64 $t0,$a,#`64-@Sigma0[0]`
- vadd.i64 $T1,$t2
- vshr.u64 $t2,$a,#@Sigma0[2]
- vadd.i64 $K,@X[$i%16]
- vsli.64 $t1,$a,#`64-@Sigma0[1]`
- veor $Maj,$a,$b
- vsli.64 $t2,$a,#`64-@Sigma0[2]`
- veor $h,$t0,$t1
- vadd.i64 $T1,$K
- vbsl $Maj,$c,$b @ Maj(a,b,c)
- veor $h,$t2 @ Sigma0(a)
- vadd.i64 $d,$T1
- vadd.i64 $Maj,$T1
- @ vadd.i64 $h,$Maj
-___
-}
-
-sub NEON_16_79() {
-my $i=shift;
-
-if ($i&1) { &NEON_00_15($i,@_); return; }
-
-# 2x-vectorized, therefore runs every 2nd round
-my @X=map("q$_",(0..7)); # view @X as 128-bit vector
-my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
-my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
-my $e=@_[4]; # $e from NEON_00_15
-$i /= 2;
-$code.=<<___;
- vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
- vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
- vadd.i64 @_[0],d30 @ h+=Maj from the past
- vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
- vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
- vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
- vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
- veor $s1,$t0
- vshr.u64 $t0,$s0,#@sigma0[0]
- veor $s1,$t1 @ sigma1(X[i+14])
- vshr.u64 $t1,$s0,#@sigma0[1]
- vadd.i64 @X[$i%8],$s1
- vshr.u64 $s1,$s0,#@sigma0[2]
- vsli.64 $t0,$s0,#`64-@sigma0[0]`
- vsli.64 $t1,$s0,#`64-@sigma0[1]`
- vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
- veor $s1,$t0
- vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
- vadd.i64 @X[$i%8],$s0
- vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
- veor $s1,$t1 @ sigma0(X[i+1])
- vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
- vadd.i64 @X[$i%8],$s1
-___
- &NEON_00_15(2*$i,@_);
-}
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.arch armv7-a
-.fpu neon
-
-.global sha512_block_data_order_neon
-.type sha512_block_data_order_neon,%function
-.align 4
-sha512_block_data_order_neon:
-.LNEON:
- dmb @ errata #451034 on early Cortex A8
- add $len,$inp,$len,lsl#7 @ len to point at the end of inp
- VFP_ABI_PUSH
- adr $Ktbl,.Lsha512_block_data_order
- sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
- vldmia $ctx,{$A-$H} @ load context
-.Loop_neon:
-___
-for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
- mov $cnt,#4
-.L16_79_neon:
- subs $cnt,#1
-___
-for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
- bne .L16_79_neon
-
- vadd.i64 $A,d30 @ h+=Maj from the past
- vldmia $ctx,{d24-d31} @ load context to temp
- vadd.i64 q8,q12 @ vectorized accumulate
- vadd.i64 q9,q13
- vadd.i64 q10,q14
- vadd.i64 q11,q15
- vstmia $ctx,{$A-$H} @ save context
- teq $inp,$len
- sub $Ktbl,#640 @ rewind K512
- bne .Loop_neon
-
- VFP_ABI_POP
- ret @ bx lr
-.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
-#endif
-___
-}
-$code.=<<___;
-.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-#endif
-___
-
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx lr/gm;
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/@/ and !/^$/);
- print;
-}
-close SELF;
-
-print $code;
-close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/sha512-glue.c b/arch/arm/crypto/sha512-glue.c
deleted file mode 100644
index f8a6480889b1..000000000000
--- a/arch/arm/crypto/sha512-glue.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha512-glue.c - accelerated SHA-384/512 for ARM
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "sha512.h"
-
-MODULE_DESCRIPTION("Accelerated SHA-384/SHA-512 secure hash for ARM");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha512");
-MODULE_ALIAS_CRYPTO("sha384-arm");
-MODULE_ALIAS_CRYPTO("sha512-arm");
-
-asmlinkage void sha512_block_data_order(struct sha512_state *state,
- u8 const *src, int blocks);
-
-static int sha512_arm_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_base_do_update_blocks(desc, data, len,
- sha512_block_data_order);
-}
-
-static int sha512_arm_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- sha512_base_do_finup(desc, data, len, sha512_block_data_order);
- return sha512_base_finish(desc, out);
-}
-
-static struct shash_alg sha512_arm_algs[] = { {
- .init = sha384_base_init,
- .update = sha512_arm_update,
- .finup = sha512_arm_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA384_DIGEST_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-arm",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .init = sha512_base_init,
- .update = sha512_arm_update,
- .finup = sha512_arm_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA512_DIGEST_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-arm",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int __init sha512_arm_mod_init(void)
-{
- int err;
-
- err = crypto_register_shashes(sha512_arm_algs,
- ARRAY_SIZE(sha512_arm_algs));
- if (err)
- return err;
-
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
- err = crypto_register_shashes(sha512_neon_algs,
- ARRAY_SIZE(sha512_neon_algs));
- if (err)
- goto err_unregister;
- }
- return 0;
-
-err_unregister:
- crypto_unregister_shashes(sha512_arm_algs,
- ARRAY_SIZE(sha512_arm_algs));
-
- return err;
-}
-
-static void __exit sha512_arm_mod_fini(void)
-{
- crypto_unregister_shashes(sha512_arm_algs,
- ARRAY_SIZE(sha512_arm_algs));
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
- crypto_unregister_shashes(sha512_neon_algs,
- ARRAY_SIZE(sha512_neon_algs));
-}
-
-module_init(sha512_arm_mod_init);
-module_exit(sha512_arm_mod_fini);
diff --git a/arch/arm/crypto/sha512-neon-glue.c b/arch/arm/crypto/sha512-neon-glue.c
deleted file mode 100644
index bd528077fefb..000000000000
--- a/arch/arm/crypto/sha512-neon-glue.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha512-neon-glue.c - accelerated SHA-384/512 for ARM NEON
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "sha512.h"
-
-MODULE_ALIAS_CRYPTO("sha384-neon");
-MODULE_ALIAS_CRYPTO("sha512-neon");
-
-asmlinkage void sha512_block_data_order_neon(struct sha512_state *state,
- const u8 *src, int blocks);
-
-static int sha512_neon_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- int remain;
-
- kernel_neon_begin();
- remain = sha512_base_do_update_blocks(desc, data, len,
- sha512_block_data_order_neon);
- kernel_neon_end();
- return remain;
-}
-
-static int sha512_neon_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- kernel_neon_begin();
- sha512_base_do_finup(desc, data, len, sha512_block_data_order_neon);
- kernel_neon_end();
- return sha512_base_finish(desc, out);
-}
-
-struct shash_alg sha512_neon_algs[] = { {
- .init = sha384_base_init,
- .update = sha512_neon_update,
- .finup = sha512_neon_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA384_DIGEST_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-neon",
- .cra_priority = 300,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
-
- }
-}, {
- .init = sha512_base_init,
- .update = sha512_neon_update,
- .finup = sha512_neon_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA512_DIGEST_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-neon",
- .cra_priority = 300,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
diff --git a/arch/arm/crypto/sha512.h b/arch/arm/crypto/sha512.h
deleted file mode 100644
index eeaee52cda69..000000000000
--- a/arch/arm/crypto/sha512.h
+++ /dev/null
@@ -1,3 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-extern struct shash_alg sha512_neon_algs[2];
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index f379c852dcb7..88336a1292bb 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -119,7 +119,7 @@ no_work_pending:
ct_user_enter save = 0
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
bl stackleak_erase_on_task_stack
#endif
restore_user_regs fast = 0, offset = 0
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index c421a899fc84..7951b2c06fec 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -677,7 +677,7 @@ enum arm_regset {
static const struct user_regset arm_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = ELF_NGREG,
.size = sizeof(u32),
.align = sizeof(u32),
@@ -689,7 +689,7 @@ static const struct user_regset arm_regsets[] = {
* For the FPA regs in fpstate, the real fields are a mixture
* of sizes, so pretend that the registers are word-sized:
*/
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_fp) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -702,7 +702,7 @@ static const struct user_regset arm_regsets[] = {
* Pretend that the VFP regs are word-sized, since the FPSCR is
* a single word dangling at the end of struct user_vfp:
*/
- .core_note_type = NT_ARM_VFP,
+ USER_REGSET_NOTE_TYPE(ARM_VFP),
.n = ARM_VFPREGS_SIZE / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
diff --git a/arch/arm/lib/.gitignore b/arch/arm/lib/.gitignore
new file mode 100644
index 000000000000..647d7a922e68
--- /dev/null
+++ b/arch/arm/lib/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 91ea0e29107a..0ca5aae1bcc3 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -5,8 +5,6 @@
# Copyright (C) 1995-2000 Russell King
#
-obj-y += crypto/
-
lib-y := changebit.o csumipv6.o csumpartial.o \
csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
delay.o delay-loop.o findbit.o memchr.o memcpy.o \
@@ -47,9 +45,3 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
endif
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
-
-obj-$(CONFIG_CRC32_ARCH) += crc32-arm.o
-crc32-arm-y := crc32.o crc32-core.o
-
-obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm.o
-crc-t10dif-arm-y := crc-t10dif.o crc-t10dif-core.o
diff --git a/arch/arm/lib/crc-t10dif-core.S b/arch/arm/lib/crc-t10dif-core.S
deleted file mode 100644
index 2bbf2df9c1e2..000000000000
--- a/arch/arm/lib/crc-t10dif-core.S
+++ /dev/null
@@ -1,468 +0,0 @@
-//
-// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
-//
-// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
-// Copyright (C) 2019 Google LLC <ebiggers@google.com>
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License version 2 as
-// published by the Free Software Foundation.
-//
-
-// Derived from the x86 version:
-//
-// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
-//
-// Copyright (c) 2013, Intel Corporation
-//
-// Authors:
-// Erdinc Ozturk <erdinc.ozturk@intel.com>
-// Vinodh Gopal <vinodh.gopal@intel.com>
-// James Guilford <james.guilford@intel.com>
-// Tim Chen <tim.c.chen@linux.intel.com>
-//
-// This software is available to you under a choice of one of two
-// licenses. You may choose to be licensed under the terms of the GNU
-// General Public License (GPL) Version 2, available from the file
-// COPYING in the main directory of this source tree, or the
-// OpenIB.org BSD license below:
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the
-// distribution.
-//
-// * Neither the name of the Intel Corporation nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-//
-// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Reference paper titled "Fast CRC Computation for Generic
-// Polynomials Using PCLMULQDQ Instruction"
-// URL: http://www.intel.com/content/dam/www/public/us/en/documents
-// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-//
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-#ifdef CONFIG_CPU_ENDIAN_BE8
-#define CPU_LE(code...)
-#else
-#define CPU_LE(code...) code
-#endif
-
- .text
- .arch armv8-a
- .fpu crypto-neon-fp-armv8
-
- init_crc .req r0
- buf .req r1
- len .req r2
-
- fold_consts_ptr .req ip
-
- q0l .req d0
- q0h .req d1
- q1l .req d2
- q1h .req d3
- q2l .req d4
- q2h .req d5
- q3l .req d6
- q3h .req d7
- q4l .req d8
- q4h .req d9
- q5l .req d10
- q5h .req d11
- q6l .req d12
- q6h .req d13
- q7l .req d14
- q7h .req d15
- q8l .req d16
- q8h .req d17
- q9l .req d18
- q9h .req d19
- q10l .req d20
- q10h .req d21
- q11l .req d22
- q11h .req d23
- q12l .req d24
- q12h .req d25
-
- FOLD_CONSTS .req q10
- FOLD_CONST_L .req q10l
- FOLD_CONST_H .req q10h
-
- /*
- * Pairwise long polynomial multiplication of two 16-bit values
- *
- * { w0, w1 }, { y0, y1 }
- *
- * by two 64-bit values
- *
- * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
- *
- * where each vector element is a byte, ordered from least to most
- * significant. The resulting 80-bit vectors are XOR'ed together.
- *
- * This can be implemented using 8x8 long polynomial multiplication, by
- * reorganizing the input so that each pairwise 8x8 multiplication
- * produces one of the terms from the decomposition below, and
- * combining the results of each rank and shifting them into place.
- *
- * Rank
- * 0 w0*x0 ^ | y0*z0 ^
- * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
- * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
- * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
- * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
- * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
- * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
- * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
- * 8 w1*x7 << 64 | y1*z7 << 64
- *
- * The inputs can be reorganized into
- *
- * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
- * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
- *
- * and after performing 8x8->16 bit long polynomial multiplication of
- * each of the halves of the first vector with those of the second one,
- * we obtain the following four vectors of 16-bit elements:
- *
- * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
- * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
- * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
- * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
- *
- * Results b and c can be XORed together, as the vector elements have
- * matching ranks. Then, the final XOR can be pulled forward, and
- * applied between the halves of each of the remaining three vectors,
- * which are then shifted into place, and XORed together to produce the
- * final 80-bit result.
- */
- .macro pmull16x64_p8, v16, v64
- vext.8 q11, \v64, \v64, #1
- vld1.64 {q12}, [r4, :128]
- vuzp.8 q11, \v64
- vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24
- vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25
- bl __pmull16x64_p8
- veor \v64, q12, q14
- .endm
-
-__pmull16x64_p8:
- vmull.p8 q13, d23, d24
- vmull.p8 q14, d23, d25
- vmull.p8 q15, d22, d24
- vmull.p8 q12, d22, d25
-
- veor q14, q14, q15
- veor d24, d24, d25
- veor d26, d26, d27
- veor d28, d28, d29
- vmov.i32 d25, #0
- vmov.i32 d29, #0
- vext.8 q12, q12, q12, #14
- vext.8 q14, q14, q14, #15
- veor d24, d24, d26
- bx lr
-ENDPROC(__pmull16x64_p8)
-
- .macro pmull16x64_p64, v16, v64
- vmull.p64 q11, \v64\()l, \v16\()_L
- vmull.p64 \v64, \v64\()h, \v16\()_H
- veor \v64, \v64, q11
- .endm
-
- // Fold reg1, reg2 into the next 32 data bytes, storing the result back
- // into reg1, reg2.
- .macro fold_32_bytes, reg1, reg2, p
- vld1.64 {q8-q9}, [buf]!
-
- pmull16x64_\p FOLD_CONST, \reg1
- pmull16x64_\p FOLD_CONST, \reg2
-
-CPU_LE( vrev64.8 q8, q8 )
-CPU_LE( vrev64.8 q9, q9 )
- vswp q8l, q8h
- vswp q9l, q9h
-
- veor.8 \reg1, \reg1, q8
- veor.8 \reg2, \reg2, q9
- .endm
-
- // Fold src_reg into dst_reg, optionally loading the next fold constants
- .macro fold_16_bytes, src_reg, dst_reg, p, load_next_consts
- pmull16x64_\p FOLD_CONST, \src_reg
- .ifnb \load_next_consts
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
- .endif
- veor.8 \dst_reg, \dst_reg, \src_reg
- .endm
-
- .macro crct10dif, p
- // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
- cmp len, #256
- blt .Lless_than_256_bytes\@
-
- mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts
-
- // Load the first 128 data bytes. Byte swapping is necessary to make
- // the bit order match the polynomial coefficient order.
- vld1.64 {q0-q1}, [buf]!
- vld1.64 {q2-q3}, [buf]!
- vld1.64 {q4-q5}, [buf]!
- vld1.64 {q6-q7}, [buf]!
-CPU_LE( vrev64.8 q0, q0 )
-CPU_LE( vrev64.8 q1, q1 )
-CPU_LE( vrev64.8 q2, q2 )
-CPU_LE( vrev64.8 q3, q3 )
-CPU_LE( vrev64.8 q4, q4 )
-CPU_LE( vrev64.8 q5, q5 )
-CPU_LE( vrev64.8 q6, q6 )
-CPU_LE( vrev64.8 q7, q7 )
- vswp q0l, q0h
- vswp q1l, q1h
- vswp q2l, q2h
- vswp q3l, q3h
- vswp q4l, q4h
- vswp q5l, q5h
- vswp q6l, q6h
- vswp q7l, q7h
-
- // XOR the first 16 data *bits* with the initial CRC value.
- vmov.i8 q8h, #0
- vmov.u16 q8h[3], init_crc
- veor q0h, q0h, q8h
-
- // Load the constants for folding across 128 bytes.
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
- // Subtract 128 for the 128 data bytes just consumed. Subtract another
- // 128 to simplify the termination condition of the following loop.
- sub len, len, #256
-
- // While >= 128 data bytes remain (not counting q0-q7), fold the 128
- // bytes q0-q7 into them, storing the result back into q0-q7.
-.Lfold_128_bytes_loop\@:
- fold_32_bytes q0, q1, \p
- fold_32_bytes q2, q3, \p
- fold_32_bytes q4, q5, \p
- fold_32_bytes q6, q7, \p
- subs len, len, #128
- bge .Lfold_128_bytes_loop\@
-
- // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
-
- // Fold across 64 bytes.
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
- fold_16_bytes q0, q4, \p
- fold_16_bytes q1, q5, \p
- fold_16_bytes q2, q6, \p
- fold_16_bytes q3, q7, \p, 1
- // Fold across 32 bytes.
- fold_16_bytes q4, q6, \p
- fold_16_bytes q5, q7, \p, 1
- // Fold across 16 bytes.
- fold_16_bytes q6, q7, \p
-
- // Add 128 to get the correct number of data bytes remaining in 0...127
- // (not counting q7), following the previous extra subtraction by 128.
- // Then subtract 16 to simplify the termination condition of the
- // following loop.
- adds len, len, #(128-16)
-
- // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
- // into them, storing the result back into q7.
- blt .Lfold_16_bytes_loop_done\@
-.Lfold_16_bytes_loop\@:
- pmull16x64_\p FOLD_CONST, q7
- vld1.64 {q0}, [buf]!
-CPU_LE( vrev64.8 q0, q0 )
- vswp q0l, q0h
- veor.8 q7, q7, q0
- subs len, len, #16
- bge .Lfold_16_bytes_loop\@
-
-.Lfold_16_bytes_loop_done\@:
- // Add 16 to get the correct number of data bytes remaining in 0...15
- // (not counting q7), following the previous extra subtraction by 16.
- adds len, len, #16
- beq .Lreduce_final_16_bytes\@
-
-.Lhandle_partial_segment\@:
- // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
- // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
- // do this without needing a fold constant for each possible 'len',
- // redivide the bytes into a first chunk of 'len' bytes and a second
- // chunk of 16 bytes, then fold the first chunk into the second.
-
- // q0 = last 16 original data bytes
- add buf, buf, len
- sub buf, buf, #16
- vld1.64 {q0}, [buf]
-CPU_LE( vrev64.8 q0, q0 )
- vswp q0l, q0h
-
- // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
- mov_l r1, .Lbyteshift_table + 16
- sub r1, r1, len
- vld1.8 {q2}, [r1]
- vtbl.8 q1l, {q7l-q7h}, q2l
- vtbl.8 q1h, {q7l-q7h}, q2h
-
- // q3 = first chunk: q7 right-shifted by '16-len' bytes.
- vmov.i8 q3, #0x80
- veor.8 q2, q2, q3
- vtbl.8 q3l, {q7l-q7h}, q2l
- vtbl.8 q3h, {q7l-q7h}, q2h
-
- // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
- vshr.s8 q2, q2, #7
-
- // q2 = second chunk: 'len' bytes from q0 (low-order bytes),
- // then '16-len' bytes from q1 (high-order bytes).
- vbsl.8 q2, q1, q0
-
- // Fold the first chunk into the second chunk, storing the result in q7.
- pmull16x64_\p FOLD_CONST, q3
- veor.8 q7, q3, q2
- b .Lreduce_final_16_bytes\@
-
-.Lless_than_256_bytes\@:
- // Checksumming a buffer of length 16...255 bytes
-
- mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts
-
- // Load the first 16 data bytes.
- vld1.64 {q7}, [buf]!
-CPU_LE( vrev64.8 q7, q7 )
- vswp q7l, q7h
-
- // XOR the first 16 data *bits* with the initial CRC value.
- vmov.i8 q0h, #0
- vmov.u16 q0h[3], init_crc
- veor.8 q7h, q7h, q0h
-
- // Load the fold-across-16-bytes constants.
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
- cmp len, #16
- beq .Lreduce_final_16_bytes\@ // len == 16
- subs len, len, #32
- addlt len, len, #16
- blt .Lhandle_partial_segment\@ // 17 <= len <= 31
- b .Lfold_16_bytes_loop\@ // 32 <= len <= 255
-
-.Lreduce_final_16_bytes\@:
- .endm
-
-//
-// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-ENTRY(crc_t10dif_pmull64)
- crct10dif p64
-
- // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
-
- // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
- // Fold the high 64 bits into the low 64 bits, while also multiplying by
- // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
- // whose low 48 bits are 0.
- vmull.p64 q0, q7h, FOLD_CONST_H // high bits * x^48 * (x^80 mod G(x))
- veor.8 q0h, q0h, q7l // + low bits * x^64
-
- // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
- // value congruent to x^64 * M(x) and whose low 48 bits are 0.
- vmov.i8 q1, #0
- vmov s4, s3 // extract high 32 bits
- vmov s3, s5 // zero high 32 bits
- vmull.p64 q1, q1l, FOLD_CONST_L // high 32 bits * x^48 * (x^48 mod G(x))
- veor.8 q0, q0, q1 // + low bits
-
- // Load G(x) and floor(x^48 / G(x)).
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]
-
- // Use Barrett reduction to compute the final CRC value.
- vmull.p64 q1, q0h, FOLD_CONST_H // high 32 bits * floor(x^48 / G(x))
- vshr.u64 q1l, q1l, #32 // /= x^32
- vmull.p64 q1, q1l, FOLD_CONST_L // *= G(x)
- vshr.u64 q0l, q0l, #48
- veor.8 q0l, q0l, q1l // + low 16 nonzero bits
- // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0.
-
- vmov.u16 r0, q0l[0]
- bx lr
-ENDPROC(crc_t10dif_pmull64)
-
-ENTRY(crc_t10dif_pmull8)
- push {r4, lr}
- mov_l r4, .L16x64perm
-
- crct10dif p8
-
-CPU_LE( vrev64.8 q7, q7 )
- vswp q7l, q7h
- vst1.64 {q7}, [r3, :128]
- pop {r4, pc}
-ENDPROC(crc_t10dif_pmull8)
-
- .section ".rodata", "a"
- .align 4
-
-// Fold constants precomputed from the polynomial 0x18bb7
-// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
-.Lfold_across_128_bytes_consts:
- .quad 0x0000000000006123 // x^(8*128) mod G(x)
- .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
-// .Lfold_across_64_bytes_consts:
- .quad 0x0000000000001069 // x^(4*128) mod G(x)
- .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
-// .Lfold_across_32_bytes_consts:
- .quad 0x000000000000857d // x^(2*128) mod G(x)
- .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
-.Lfold_across_16_bytes_consts:
- .quad 0x000000000000a010 // x^(1*128) mod G(x)
- .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
-// .Lfinal_fold_consts:
- .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
- .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
-// .Lbarrett_reduction_consts:
- .quad 0x0000000000018bb7 // G(x)
- .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
-
-// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
-// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
-// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
-.Lbyteshift_table:
- .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
- .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
- .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
- .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
-
-.L16x64perm:
- .quad 0x808080800000000, 0x909090901010101
diff --git a/arch/arm/lib/crc-t10dif.c b/arch/arm/lib/crc-t10dif.c
deleted file mode 100644
index 1093f8ec13b0..000000000000
--- a/arch/arm/lib/crc-t10dif.c
+++ /dev/null
@@ -1,72 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/crc-t10dif.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/simd.h>
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
-
-#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-
-asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
-asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
- u8 out[16]);
-
-u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
-{
- if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
- if (static_branch_likely(&have_pmull)) {
- if (crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc_t10dif_pmull64(crc, data, length);
- kernel_neon_end();
- return crc;
- }
- } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
- static_branch_likely(&have_neon) &&
- crypto_simd_usable()) {
- u8 buf[16] __aligned(16);
-
- kernel_neon_begin();
- crc_t10dif_pmull8(crc, data, length, buf);
- kernel_neon_end();
-
- return crc_t10dif_generic(0, buf, sizeof(buf));
- }
- }
- return crc_t10dif_generic(crc, data, length);
-}
-EXPORT_SYMBOL(crc_t10dif_arch);
-
-static int __init crc_t10dif_arm_init(void)
-{
- if (elf_hwcap & HWCAP_NEON) {
- static_branch_enable(&have_neon);
- if (elf_hwcap2 & HWCAP2_PMULL)
- static_branch_enable(&have_pmull);
- }
- return 0;
-}
-subsys_initcall(crc_t10dif_arm_init);
-
-static void __exit crc_t10dif_arm_exit(void)
-{
-}
-module_exit(crc_t10dif_arm_exit);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_DESCRIPTION("Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm/lib/crc32-core.S b/arch/arm/lib/crc32-core.S
deleted file mode 100644
index 6f674f30c70b..000000000000
--- a/arch/arm/lib/crc32-core.S
+++ /dev/null
@@ -1,306 +0,0 @@
-/*
- * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-/* GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see http://www.gnu.org/licenses
- *
- * Please visit http://www.xyratex.com/contact if you need additional
- * information or have any questions.
- *
- * GPL HEADER END
- */
-
-/*
- * Copyright 2012 Xyratex Technology Limited
- *
- * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
- * calculation.
- * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
- * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
- * at:
- * https://www.intel.com/products/processor/manuals/
- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
- * Volume 2B: Instruction Set Reference, N-Z
- *
- * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
- * Alexander Boyko <Alexander_Boyko@xyratex.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .align 6
- .arch armv8-a
- .arch_extension crc
- .fpu crypto-neon-fp-armv8
-
-.Lcrc32_constants:
- /*
- * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
- * #define CONSTANT_R1 0x154442bd4LL
- *
- * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
- * #define CONSTANT_R2 0x1c6e41596LL
- */
- .quad 0x0000000154442bd4
- .quad 0x00000001c6e41596
-
- /*
- * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
- * #define CONSTANT_R3 0x1751997d0LL
- *
- * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
- * #define CONSTANT_R4 0x0ccaa009eLL
- */
- .quad 0x00000001751997d0
- .quad 0x00000000ccaa009e
-
- /*
- * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
- * #define CONSTANT_R5 0x163cd6124LL
- */
- .quad 0x0000000163cd6124
- .quad 0x00000000FFFFFFFF
-
- /*
- * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
- *
- * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
- * = 0x1F7011641LL
- * #define CONSTANT_RU 0x1F7011641LL
- */
- .quad 0x00000001DB710641
- .quad 0x00000001F7011641
-
-.Lcrc32c_constants:
- .quad 0x00000000740eef02
- .quad 0x000000009e4addf8
- .quad 0x00000000f20c0dfe
- .quad 0x000000014cd00bd6
- .quad 0x00000000dd45aab8
- .quad 0x00000000FFFFFFFF
- .quad 0x0000000105ec76f0
- .quad 0x00000000dea713f1
-
- dCONSTANTl .req d0
- dCONSTANTh .req d1
- qCONSTANT .req q0
-
- BUF .req r0
- LEN .req r1
- CRC .req r2
-
- qzr .req q9
-
- /**
- * Calculate crc32
- * BUF - buffer
- * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
- * CRC - initial crc32
- * return %eax crc32
- * uint crc32_pmull_le(unsigned char const *buffer,
- * size_t len, uint crc32)
- */
-SYM_FUNC_START(crc32_pmull_le)
- adr r3, .Lcrc32_constants
- b 0f
-SYM_FUNC_END(crc32_pmull_le)
-
-SYM_FUNC_START(crc32c_pmull_le)
- adr r3, .Lcrc32c_constants
-
-0: bic LEN, LEN, #15
- vld1.8 {q1-q2}, [BUF, :128]!
- vld1.8 {q3-q4}, [BUF, :128]!
- vmov.i8 qzr, #0
- vmov.i8 qCONSTANT, #0
- vmov.32 dCONSTANTl[0], CRC
- veor.8 d2, d2, dCONSTANTl
- sub LEN, LEN, #0x40
- cmp LEN, #0x40
- blt less_64
-
- vld1.64 {qCONSTANT}, [r3]
-
-loop_64: /* 64 bytes Full cache line folding */
- sub LEN, LEN, #0x40
-
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q6, d5, dCONSTANTh
- vmull.p64 q7, d7, dCONSTANTh
- vmull.p64 q8, d9, dCONSTANTh
-
- vmull.p64 q1, d2, dCONSTANTl
- vmull.p64 q2, d4, dCONSTANTl
- vmull.p64 q3, d6, dCONSTANTl
- vmull.p64 q4, d8, dCONSTANTl
-
- veor.8 q1, q1, q5
- vld1.8 {q5}, [BUF, :128]!
- veor.8 q2, q2, q6
- vld1.8 {q6}, [BUF, :128]!
- veor.8 q3, q3, q7
- vld1.8 {q7}, [BUF, :128]!
- veor.8 q4, q4, q8
- vld1.8 {q8}, [BUF, :128]!
-
- veor.8 q1, q1, q5
- veor.8 q2, q2, q6
- veor.8 q3, q3, q7
- veor.8 q4, q4, q8
-
- cmp LEN, #0x40
- bge loop_64
-
-less_64: /* Folding cache line into 128bit */
- vldr dCONSTANTl, [r3, #16]
- vldr dCONSTANTh, [r3, #24]
-
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q5
- veor.8 q1, q1, q2
-
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q5
- veor.8 q1, q1, q3
-
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q5
- veor.8 q1, q1, q4
-
- teq LEN, #0
- beq fold_64
-
-loop_16: /* Folding rest buffer into 128bit */
- subs LEN, LEN, #0x10
-
- vld1.8 {q2}, [BUF, :128]!
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q5
- veor.8 q1, q1, q2
-
- bne loop_16
-
-fold_64:
- /* perform the last 64 bit fold, also adds 32 zeroes
- * to the input stream */
- vmull.p64 q2, d2, dCONSTANTh
- vext.8 q1, q1, qzr, #8
- veor.8 q1, q1, q2
-
- /* final 32-bit fold */
- vldr dCONSTANTl, [r3, #32]
- vldr d6, [r3, #40]
- vmov.i8 d7, #0
-
- vext.8 q2, q1, qzr, #4
- vand.8 d2, d2, d6
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q2
-
- /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
- vldr dCONSTANTl, [r3, #48]
- vldr dCONSTANTh, [r3, #56]
-
- vand.8 q2, q1, q3
- vext.8 q2, qzr, q2, #8
- vmull.p64 q2, d5, dCONSTANTh
- vand.8 q2, q2, q3
- vmull.p64 q2, d4, dCONSTANTl
- veor.8 q1, q1, q2
- vmov r0, s5
-
- bx lr
-SYM_FUNC_END(crc32c_pmull_le)
-
- .macro __crc32, c
- subs ip, r2, #8
- bmi .Ltail\c
-
- tst r1, #3
- bne .Lunaligned\c
-
- teq ip, #0
-.Laligned8\c:
- ldrd r2, r3, [r1], #8
-ARM_BE8(rev r2, r2 )
-ARM_BE8(rev r3, r3 )
- crc32\c\()w r0, r0, r2
- crc32\c\()w r0, r0, r3
- bxeq lr
- subs ip, ip, #8
- bpl .Laligned8\c
-
-.Ltail\c:
- tst ip, #4
- beq 2f
- ldr r3, [r1], #4
-ARM_BE8(rev r3, r3 )
- crc32\c\()w r0, r0, r3
-
-2: tst ip, #2
- beq 1f
- ldrh r3, [r1], #2
-ARM_BE8(rev16 r3, r3 )
- crc32\c\()h r0, r0, r3
-
-1: tst ip, #1
- bxeq lr
- ldrb r3, [r1]
- crc32\c\()b r0, r0, r3
- bx lr
-
-.Lunaligned\c:
- tst r1, #1
- beq 2f
- ldrb r3, [r1], #1
- subs r2, r2, #1
- crc32\c\()b r0, r0, r3
-
- tst r1, #2
- beq 0f
-2: ldrh r3, [r1], #2
- subs r2, r2, #2
-ARM_BE8(rev16 r3, r3 )
- crc32\c\()h r0, r0, r3
-
-0: subs ip, r2, #8
- bpl .Laligned8\c
- b .Ltail\c
- .endm
-
- .align 5
-SYM_FUNC_START(crc32_armv8_le)
- __crc32
-SYM_FUNC_END(crc32_armv8_le)
-
- .align 5
-SYM_FUNC_START(crc32c_armv8_le)
- __crc32 c
-SYM_FUNC_END(crc32c_armv8_le)
diff --git a/arch/arm/lib/crc32.c b/arch/arm/lib/crc32.c
deleted file mode 100644
index f2bef8849c7c..000000000000
--- a/arch/arm/lib/crc32.c
+++ /dev/null
@@ -1,123 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/cpufeature.h>
-#include <linux/crc32.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/simd.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
-
-#define PMULL_MIN_LEN 64 /* min size of buffer for pmull functions */
-
-asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc);
-asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len);
-
-asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc);
-asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len);
-
-static u32 crc32_le_scalar(u32 crc, const u8 *p, size_t len)
-{
- if (static_branch_likely(&have_crc32))
- return crc32_armv8_le(crc, p, len);
- return crc32_le_base(crc, p, len);
-}
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- if (len >= PMULL_MIN_LEN + 15 &&
- static_branch_likely(&have_pmull) && crypto_simd_usable()) {
- size_t n = -(uintptr_t)p & 15;
-
- /* align p to 16-byte boundary */
- if (n) {
- crc = crc32_le_scalar(crc, p, n);
- p += n;
- len -= n;
- }
- n = round_down(len, 16);
- kernel_neon_begin();
- crc = crc32_pmull_le(p, n, crc);
- kernel_neon_end();
- p += n;
- len -= n;
- }
- return crc32_le_scalar(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-static u32 crc32c_scalar(u32 crc, const u8 *p, size_t len)
-{
- if (static_branch_likely(&have_crc32))
- return crc32c_armv8_le(crc, p, len);
- return crc32c_base(crc, p, len);
-}
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- if (len >= PMULL_MIN_LEN + 15 &&
- static_branch_likely(&have_pmull) && crypto_simd_usable()) {
- size_t n = -(uintptr_t)p & 15;
-
- /* align p to 16-byte boundary */
- if (n) {
- crc = crc32c_scalar(crc, p, n);
- p += n;
- len -= n;
- }
- n = round_down(len, 16);
- kernel_neon_begin();
- crc = crc32c_pmull_le(p, n, crc);
- kernel_neon_end();
- p += n;
- len -= n;
- }
- return crc32c_scalar(crc, p, len);
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- return crc32_be_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-static int __init crc32_arm_init(void)
-{
- if (elf_hwcap2 & HWCAP2_CRC32)
- static_branch_enable(&have_crc32);
- if (elf_hwcap2 & HWCAP2_PMULL)
- static_branch_enable(&have_pmull);
- return 0;
-}
-subsys_initcall(crc32_arm_init);
-
-static void __exit crc32_arm_exit(void)
-{
-}
-module_exit(crc32_arm_exit);
-
-u32 crc32_optimizations(void)
-{
- if (elf_hwcap2 & (HWCAP2_CRC32 | HWCAP2_PMULL))
- return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION;
- return 0;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_DESCRIPTION("Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm/lib/crypto/.gitignore b/arch/arm/lib/crypto/.gitignore
deleted file mode 100644
index 12d74d8b03d0..000000000000
--- a/arch/arm/lib/crypto/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-poly1305-core.S
-sha256-core.S
diff --git a/arch/arm/lib/crypto/Kconfig b/arch/arm/lib/crypto/Kconfig
deleted file mode 100644
index d1ad664f0c67..000000000000
--- a/arch/arm/lib/crypto/Kconfig
+++ /dev/null
@@ -1,31 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_BLAKE2S_ARM
- bool "Hash functions: BLAKE2s"
- select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- help
- BLAKE2s cryptographic hash function (RFC 7693)
-
- Architecture: arm
-
- This is faster than the generic implementations of BLAKE2s and
- BLAKE2b, but slower than the NEON implementation of BLAKE2b.
- There is no NEON implementation of BLAKE2s, since NEON doesn't
- really help with it.
-
-config CRYPTO_CHACHA20_NEON
- tristate
- default CRYPTO_LIB_CHACHA
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_ARM
- tristate
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
-
-config CRYPTO_SHA256_ARM
- tristate
- depends on !CPU_V7M
- default CRYPTO_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
diff --git a/arch/arm/lib/crypto/Makefile b/arch/arm/lib/crypto/Makefile
deleted file mode 100644
index 431f77c3ff6f..000000000000
--- a/arch/arm/lib/crypto/Makefile
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
-libblake2s-arm-y := blake2s-core.o blake2s-glue.o
-
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
-chacha-neon-y := chacha-scalar-core.o chacha-glue.o
-chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
-poly1305-arm-y := poly1305-core.o poly1305-glue.o
-
-obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
-sha256-arm-y := sha256.o sha256-core.o
-sha256-arm-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
-
-quiet_cmd_perl = PERL $@
- cmd_perl = $(PERL) $(<) > $(@)
-
-$(obj)/%-core.S: $(src)/%-armv4.pl
- $(call cmd,perl)
-
-clean-files += poly1305-core.S sha256-core.S
-
-aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1
-
-# massage the perlasm code a bit so we only get the NEON routine if we need it
-poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
-poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
-AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
-
-AFLAGS_sha256-core.o += $(aflags-thumb2-y)
diff --git a/arch/arm/lib/crypto/blake2s-core.S b/arch/arm/lib/crypto/blake2s-core.S
deleted file mode 100644
index df40e46601f1..000000000000
--- a/arch/arm/lib/crypto/blake2s-core.S
+++ /dev/null
@@ -1,306 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * BLAKE2s digest algorithm, ARM scalar implementation
- *
- * Copyright 2020 Google LLC
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- // Registers used to hold message words temporarily. There aren't
- // enough ARM registers to hold the whole message block, so we have to
- // load the words on-demand.
- M_0 .req r12
- M_1 .req r14
-
-// The BLAKE2s initialization vector
-.Lblake2s_IV:
- .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
- .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-
-.macro __ldrd a, b, src, offset
-#if __LINUX_ARM_ARCH__ >= 6
- ldrd \a, \b, [\src, #\offset]
-#else
- ldr \a, [\src, #\offset]
- ldr \b, [\src, #\offset + 4]
-#endif
-.endm
-
-.macro __strd a, b, dst, offset
-#if __LINUX_ARM_ARCH__ >= 6
- strd \a, \b, [\dst, #\offset]
-#else
- str \a, [\dst, #\offset]
- str \b, [\dst, #\offset + 4]
-#endif
-.endm
-
-.macro _le32_bswap a, tmp
-#ifdef __ARMEB__
- rev_l \a, \tmp
-#endif
-.endm
-
-.macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp
- _le32_bswap \a, \tmp
- _le32_bswap \b, \tmp
- _le32_bswap \c, \tmp
- _le32_bswap \d, \tmp
- _le32_bswap \e, \tmp
- _le32_bswap \f, \tmp
- _le32_bswap \g, \tmp
- _le32_bswap \h, \tmp
-.endm
-
-// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
-// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
-// columns/diagonals. s0-s1 are the word offsets to the message words the first
-// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
-// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
-//
-// Note that to save instructions, the rotations don't happen when the
-// pseudocode says they should, but rather they are delayed until the values are
-// used. See the comment above _blake2s_round().
-.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3
-
- ldr M_0, [sp, #32 + 4 * \s0]
- ldr M_1, [sp, #32 + 4 * \s2]
-
- // a += b + m[blake2s_sigma[r][2*i + 0]];
- add \a0, \a0, \b0, ror #brot
- add \a1, \a1, \b1, ror #brot
- add \a0, \a0, M_0
- add \a1, \a1, M_1
-
- // d = ror32(d ^ a, 16);
- eor \d0, \a0, \d0, ror #drot
- eor \d1, \a1, \d1, ror #drot
-
- // c += d;
- add \c0, \c0, \d0, ror #16
- add \c1, \c1, \d1, ror #16
-
- // b = ror32(b ^ c, 12);
- eor \b0, \c0, \b0, ror #brot
- eor \b1, \c1, \b1, ror #brot
-
- ldr M_0, [sp, #32 + 4 * \s1]
- ldr M_1, [sp, #32 + 4 * \s3]
-
- // a += b + m[blake2s_sigma[r][2*i + 1]];
- add \a0, \a0, \b0, ror #12
- add \a1, \a1, \b1, ror #12
- add \a0, \a0, M_0
- add \a1, \a1, M_1
-
- // d = ror32(d ^ a, 8);
- eor \d0, \a0, \d0, ror#16
- eor \d1, \a1, \d1, ror#16
-
- // c += d;
- add \c0, \c0, \d0, ror#8
- add \c1, \c1, \d1, ror#8
-
- // b = ror32(b ^ c, 7);
- eor \b0, \c0, \b0, ror#12
- eor \b1, \c1, \b1, ror#12
-.endm
-
-// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9]
-// are in r0..r9. The stack pointer points to 8 bytes of scratch space for
-// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and
-// r14 are free to use. The macro arguments s0-s15 give the order in which the
-// message words are used in this round.
-//
-// All rotates are performed using the implicit rotate operand accepted by the
-// 'add' and 'eor' instructions. This is faster than using explicit rotate
-// instructions. To make this work, we allow the values in the second and last
-// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
-// wrong rotation amount. The rotation amount is then fixed up just in time
-// when the values are used. 'brot' is the number of bits the values in row 'b'
-// need to be rotated right to arrive at the correct values, and 'drot'
-// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
-// that they end up as (7, 8) after every round.
-.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \
- s8, s9, s10, s11, s12, s13, s14, s15
-
- // Mix first two columns:
- // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
- __ldrd r10, r11, sp, 16 // load v[12] and v[13]
- _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \
- \s0, \s1, \s2, \s3
- __strd r8, r9, sp, 0
- __strd r10, r11, sp, 16
-
- // Mix second two columns:
- // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
- __ldrd r8, r9, sp, 8 // load v[10] and v[11]
- __ldrd r10, r11, sp, 24 // load v[14] and v[15]
- _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \
- \s4, \s5, \s6, \s7
- str r10, [sp, #24] // store v[14]
- // v[10], v[11], and v[15] are used below, so no need to store them yet.
-
- .set brot, 7
- .set drot, 8
-
- // Mix first two diagonals:
- // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
- ldr r10, [sp, #16] // load v[12]
- _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \
- \s8, \s9, \s10, \s11
- __strd r8, r9, sp, 8
- str r11, [sp, #28]
- str r10, [sp, #16]
-
- // Mix second two diagonals:
- // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
- __ldrd r8, r9, sp, 0 // load v[8] and v[9]
- __ldrd r10, r11, sp, 20 // load v[13] and v[14]
- _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \
- \s12, \s13, \s14, \s15
- __strd r10, r11, sp, 20
-.endm
-
-//
-// void blake2s_compress(struct blake2s_state *state,
-// const u8 *block, size_t nblocks, u32 inc);
-//
-// Only the first three fields of struct blake2s_state are used:
-// u32 h[8]; (inout)
-// u32 t[2]; (inout)
-// u32 f[2]; (in)
-//
- .align 5
-ENTRY(blake2s_compress)
- push {r0-r2,r4-r11,lr} // keep this an even number
-
-.Lnext_block:
- // r0 is 'state'
- // r1 is 'block'
- // r3 is 'inc'
-
- // Load and increment the counter t[0..1].
- __ldrd r10, r11, r0, 32
- adds r10, r10, r3
- adc r11, r11, #0
- __strd r10, r11, r0, 32
-
- // _blake2s_round is very short on registers, so copy the message block
- // to the stack to save a register during the rounds. This also has the
- // advantage that misalignment only needs to be dealt with in one place.
- sub sp, sp, #64
- mov r12, sp
- tst r1, #3
- bne .Lcopy_block_misaligned
- ldmia r1!, {r2-r9}
- _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
- stmia r12!, {r2-r9}
- ldmia r1!, {r2-r9}
- _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
- stmia r12, {r2-r9}
-.Lcopy_block_done:
- str r1, [sp, #68] // Update message pointer
-
- // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space
- // for spilling v[8..9]. Leave v[8..9] in r8-r9.
- mov r14, r0 // r14 = state
- adr r12, .Lblake2s_IV
- ldmia r12!, {r8-r9} // load IV[0..1]
- __ldrd r0, r1, r14, 40 // load f[0..1]
- ldm r12, {r2-r7} // load IV[3..7]
- eor r4, r4, r10 // v[12] = IV[4] ^ t[0]
- eor r5, r5, r11 // v[13] = IV[5] ^ t[1]
- eor r6, r6, r0 // v[14] = IV[6] ^ f[0]
- eor r7, r7, r1 // v[15] = IV[7] ^ f[1]
- push {r2-r7} // push v[9..15]
- sub sp, sp, #8 // leave space for v[8..9]
-
- // Load h[0..7] == v[0..7].
- ldm r14, {r0-r7}
-
- // Execute the rounds. Each round is provided the order in which it
- // needs to use the message words.
- .set brot, 0
- .set drot, 0
- _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
- _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
- _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
- _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
- _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
- _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
- _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
- _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
- _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
-
- // Fold the final state matrix into the hash chaining value:
- //
- // for (i = 0; i < 8; i++)
- // h[i] ^= v[i] ^ v[i + 8];
- //
- ldr r14, [sp, #96] // r14 = &h[0]
- add sp, sp, #8 // v[8..9] are already loaded.
- pop {r10-r11} // load v[10..11]
- eor r0, r0, r8
- eor r1, r1, r9
- eor r2, r2, r10
- eor r3, r3, r11
- ldm r14, {r8-r11} // load h[0..3]
- eor r0, r0, r8
- eor r1, r1, r9
- eor r2, r2, r10
- eor r3, r3, r11
- stmia r14!, {r0-r3} // store new h[0..3]
- ldm r14, {r0-r3} // load old h[4..7]
- pop {r8-r11} // load v[12..15]
- eor r0, r0, r4, ror #brot
- eor r1, r1, r5, ror #brot
- eor r2, r2, r6, ror #brot
- eor r3, r3, r7, ror #brot
- eor r0, r0, r8, ror #drot
- eor r1, r1, r9, ror #drot
- eor r2, r2, r10, ror #drot
- eor r3, r3, r11, ror #drot
- add sp, sp, #64 // skip copy of message block
- stm r14, {r0-r3} // store new h[4..7]
-
- // Advance to the next block, if there is one. Note that if there are
- // multiple blocks, then 'inc' (the counter increment amount) must be
- // 64. So we can simply set it to 64 without re-loading it.
- ldm sp, {r0, r1, r2} // load (state, block, nblocks)
- mov r3, #64 // set 'inc'
- subs r2, r2, #1 // nblocks--
- str r2, [sp, #8]
- bne .Lnext_block // nblocks != 0?
-
- pop {r0-r2,r4-r11,pc}
-
- // The next message block (pointed to by r1) isn't 4-byte aligned, so it
- // can't be loaded using ldmia. Copy it to the stack buffer (pointed to
- // by r12) using an alternative method. r2-r9 are free to use.
-.Lcopy_block_misaligned:
- mov r2, #64
-1:
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- ldr r3, [r1], #4
- _le32_bswap r3, r4
-#else
- ldrb r3, [r1, #0]
- ldrb r4, [r1, #1]
- ldrb r5, [r1, #2]
- ldrb r6, [r1, #3]
- add r1, r1, #4
- orr r3, r3, r4, lsl #8
- orr r3, r3, r5, lsl #16
- orr r3, r3, r6, lsl #24
-#endif
- subs r2, r2, #4
- str r3, [r12], #4
- bne 1b
- b .Lcopy_block_done
-ENDPROC(blake2s_compress)
diff --git a/arch/arm/lib/crypto/blake2s-glue.c b/arch/arm/lib/crypto/blake2s-glue.c
deleted file mode 100644
index 0238a70d9581..000000000000
--- a/arch/arm/lib/crypto/blake2s-glue.c
+++ /dev/null
@@ -1,7 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <crypto/internal/blake2s.h>
-#include <linux/module.h>
-
-/* defined in blake2s-core.S */
-EXPORT_SYMBOL(blake2s_compress);
diff --git a/arch/arm/lib/crypto/chacha-glue.c b/arch/arm/lib/crypto/chacha-glue.c
deleted file mode 100644
index 88ec96415283..000000000000
--- a/arch/arm/lib/crypto/chacha-glue.c
+++ /dev/null
@@ -1,138 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ChaCha and HChaCha functions (ARM optimized)
- *
- * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <crypto/chacha.h>
-#include <crypto/internal/simd.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/cputype.h>
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
- u8 *dst, const u8 *src, int nrounds);
-asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- int nrounds, unsigned int nbytes);
-asmlinkage void hchacha_block_arm(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
-asmlinkage void hchacha_block_neon(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
-
-asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
- const struct chacha_state *state, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
-
-static inline bool neon_usable(void)
-{
- return static_branch_likely(&use_neon) && crypto_simd_usable();
-}
-
-static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- u8 buf[CHACHA_BLOCK_SIZE];
-
- while (bytes > CHACHA_BLOCK_SIZE) {
- unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
-
- chacha_4block_xor_neon(state, dst, src, nrounds, l);
- bytes -= l;
- src += l;
- dst += l;
- state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
- }
- if (bytes) {
- const u8 *s = src;
- u8 *d = dst;
-
- if (bytes != CHACHA_BLOCK_SIZE)
- s = d = memcpy(buf, src, bytes);
- chacha_block_xor_neon(state, d, s, nrounds);
- if (d != dst)
- memcpy(dst, buf, bytes);
- state->x[12]++;
- }
-}
-
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
- hchacha_block_arm(state, out, nrounds);
- } else {
- kernel_neon_begin();
- hchacha_block_neon(state, out, nrounds);
- kernel_neon_end();
- }
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
- bytes <= CHACHA_BLOCK_SIZE) {
- chacha_doarm(dst, src, bytes, state, nrounds);
- state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
- return;
- }
-
- do {
- unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
- kernel_neon_begin();
- chacha_doneon(state, dst, src, todo, nrounds);
- kernel_neon_end();
-
- bytes -= todo;
- src += todo;
- dst += todo;
- } while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- /* We always can use at least the ARM scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_arm_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
- switch (read_cpuid_part()) {
- case ARM_CPU_PART_CORTEX_A7:
- case ARM_CPU_PART_CORTEX_A5:
- /*
- * The Cortex-A7 and Cortex-A5 do not perform well with
- * the NEON implementation but do incredibly with the
- * scalar one and use less power.
- */
- break;
- default:
- static_branch_enable(&use_neon);
- }
- }
- return 0;
-}
-subsys_initcall(chacha_arm_mod_init);
-
-static void __exit chacha_arm_mod_exit(void)
-{
-}
-module_exit(chacha_arm_mod_exit);
-
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm/lib/crypto/chacha-neon-core.S b/arch/arm/lib/crypto/chacha-neon-core.S
deleted file mode 100644
index ddd62b6294a5..000000000000
--- a/arch/arm/lib/crypto/chacha-neon-core.S
+++ /dev/null
@@ -1,643 +0,0 @@
-/*
- * ChaCha/HChaCha NEON helper functions
- *
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
- /*
- * NEON doesn't have a rotate instruction. The alternatives are, more or less:
- *
- * (a) vshl.u32 + vsri.u32 (needs temporary register)
- * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register)
- * (c) vrev32.16 (16-bit rotations only)
- * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only,
- * needs index vector)
- *
- * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
- * the only choices are (a) and (b). We use (a) since it takes two-thirds the
- * cycles of (b) on both Cortex-A7 and Cortex-A53.
- *
- * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
- * and doesn't need a temporary register.
- *
- * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
- * is twice as fast as (a), even when doing (a) on multiple registers
- * simultaneously to eliminate the stall between vshl and vsri. Also, it
- * parallelizes better when temporary registers are scarce.
- *
- * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
- * (a), so the need to load the rotation table actually makes the vtbl method
- * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
- * seems to be a good compromise to get a more significant speed boost on some
- * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
- */
-
-#include <linux/linkage.h>
-#include <asm/cache.h>
-
- .text
- .fpu neon
- .align 5
-
-/*
- * chacha_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is stored in the four NEON
- * registers q0-q3. It performs matrix operations on four words in parallel,
- * but requires shuffling to rearrange the words after each round.
- *
- * The round count is given in r3.
- *
- * Clobbers: r3, ip, q4-q5
- */
-chacha_permute:
-
- adr ip, .Lrol8_table
- vld1.8 {d10}, [ip, :64]
-
-.Ldoubleround:
- // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vadd.i32 q0, q0, q1
- veor q3, q3, q0
- vrev32.16 q3, q3
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #12
- vsri.u32 q1, q4, #20
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vadd.i32 q0, q0, q1
- veor q3, q3, q0
- vtbl.8 d6, {d6}, d10
- vtbl.8 d7, {d7}, d10
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #7
- vsri.u32 q1, q4, #25
-
- // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vext.8 q1, q1, q1, #4
- // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vext.8 q2, q2, q2, #8
- // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vext.8 q3, q3, q3, #12
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vadd.i32 q0, q0, q1
- veor q3, q3, q0
- vrev32.16 q3, q3
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #12
- vsri.u32 q1, q4, #20
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vadd.i32 q0, q0, q1
- veor q3, q3, q0
- vtbl.8 d6, {d6}, d10
- vtbl.8 d7, {d7}, d10
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #7
- vsri.u32 q1, q4, #25
-
- // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vext.8 q1, q1, q1, #12
- // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vext.8 q2, q2, q2, #8
- // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vext.8 q3, q3, q3, #4
-
- subs r3, r3, #2
- bne .Ldoubleround
-
- bx lr
-ENDPROC(chacha_permute)
-
-ENTRY(chacha_block_xor_neon)
- // r0: Input state matrix, s
- // r1: 1 data block output, o
- // r2: 1 data block input, i
- // r3: nrounds
- push {lr}
-
- // x0..3 = s0..3
- add ip, r0, #0x20
- vld1.32 {q0-q1}, [r0]
- vld1.32 {q2-q3}, [ip]
-
- vmov q8, q0
- vmov q9, q1
- vmov q10, q2
- vmov q11, q3
-
- bl chacha_permute
-
- add ip, r2, #0x20
- vld1.8 {q4-q5}, [r2]
- vld1.8 {q6-q7}, [ip]
-
- // o0 = i0 ^ (x0 + s0)
- vadd.i32 q0, q0, q8
- veor q0, q0, q4
-
- // o1 = i1 ^ (x1 + s1)
- vadd.i32 q1, q1, q9
- veor q1, q1, q5
-
- // o2 = i2 ^ (x2 + s2)
- vadd.i32 q2, q2, q10
- veor q2, q2, q6
-
- // o3 = i3 ^ (x3 + s3)
- vadd.i32 q3, q3, q11
- veor q3, q3, q7
-
- add ip, r1, #0x20
- vst1.8 {q0-q1}, [r1]
- vst1.8 {q2-q3}, [ip]
-
- pop {pc}
-ENDPROC(chacha_block_xor_neon)
-
-ENTRY(hchacha_block_neon)
- // r0: Input state matrix, s
- // r1: output (8 32-bit words)
- // r2: nrounds
- push {lr}
-
- vld1.32 {q0-q1}, [r0]!
- vld1.32 {q2-q3}, [r0]
-
- mov r3, r2
- bl chacha_permute
-
- vst1.32 {q0}, [r1]!
- vst1.32 {q3}, [r1]
-
- pop {pc}
-ENDPROC(hchacha_block_neon)
-
- .align 4
-.Lctrinc: .word 0, 1, 2, 3
-.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6
-
- .align 5
-ENTRY(chacha_4block_xor_neon)
- push {r4, lr}
- mov r4, sp // preserve the stack pointer
- sub ip, sp, #0x20 // allocate a 32 byte buffer
- bic ip, ip, #0x1f // aligned to 32 bytes
- mov sp, ip
-
- // r0: Input state matrix, s
- // r1: 4 data blocks output, o
- // r2: 4 data blocks input, i
- // r3: nrounds
-
- //
- // This function encrypts four consecutive ChaCha blocks by loading
- // the state matrix in NEON registers four times. The algorithm performs
- // each operation on the corresponding word of each state matrix, hence
- // requires no word shuffling. The words are re-interleaved before the
- // final addition of the original state and the XORing step.
- //
-
- // x0..15[0-3] = s0..15[0-3]
- add ip, r0, #0x20
- vld1.32 {q0-q1}, [r0]
- vld1.32 {q2-q3}, [ip]
-
- adr lr, .Lctrinc
- vdup.32 q15, d7[1]
- vdup.32 q14, d7[0]
- vld1.32 {q4}, [lr, :128]
- vdup.32 q13, d6[1]
- vdup.32 q12, d6[0]
- vdup.32 q11, d5[1]
- vdup.32 q10, d5[0]
- vadd.u32 q12, q12, q4 // x12 += counter values 0-3
- vdup.32 q9, d4[1]
- vdup.32 q8, d4[0]
- vdup.32 q7, d3[1]
- vdup.32 q6, d3[0]
- vdup.32 q5, d2[1]
- vdup.32 q4, d2[0]
- vdup.32 q3, d1[1]
- vdup.32 q2, d1[0]
- vdup.32 q1, d0[1]
- vdup.32 q0, d0[0]
-
- adr ip, .Lrol8_table
- b 1f
-
-.Ldoubleround4:
- vld1.32 {q8-q9}, [sp, :256]
-1:
- // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- vadd.i32 q0, q0, q4
- vadd.i32 q1, q1, q5
- vadd.i32 q2, q2, q6
- vadd.i32 q3, q3, q7
-
- veor q12, q12, q0
- veor q13, q13, q1
- veor q14, q14, q2
- veor q15, q15, q3
-
- vrev32.16 q12, q12
- vrev32.16 q13, q13
- vrev32.16 q14, q14
- vrev32.16 q15, q15
-
- // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- vadd.i32 q8, q8, q12
- vadd.i32 q9, q9, q13
- vadd.i32 q10, q10, q14
- vadd.i32 q11, q11, q15
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q4, q8
- veor q9, q5, q9
- vshl.u32 q4, q8, #12
- vshl.u32 q5, q9, #12
- vsri.u32 q4, q8, #20
- vsri.u32 q5, q9, #20
-
- veor q8, q6, q10
- veor q9, q7, q11
- vshl.u32 q6, q8, #12
- vshl.u32 q7, q9, #12
- vsri.u32 q6, q8, #20
- vsri.u32 q7, q9, #20
-
- // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- vld1.8 {d16}, [ip, :64]
- vadd.i32 q0, q0, q4
- vadd.i32 q1, q1, q5
- vadd.i32 q2, q2, q6
- vadd.i32 q3, q3, q7
-
- veor q12, q12, q0
- veor q13, q13, q1
- veor q14, q14, q2
- veor q15, q15, q3
-
- vtbl.8 d24, {d24}, d16
- vtbl.8 d25, {d25}, d16
- vtbl.8 d26, {d26}, d16
- vtbl.8 d27, {d27}, d16
- vtbl.8 d28, {d28}, d16
- vtbl.8 d29, {d29}, d16
- vtbl.8 d30, {d30}, d16
- vtbl.8 d31, {d31}, d16
-
- vld1.32 {q8-q9}, [sp, :256]
-
- // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- vadd.i32 q8, q8, q12
- vadd.i32 q9, q9, q13
- vadd.i32 q10, q10, q14
- vadd.i32 q11, q11, q15
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q4, q8
- veor q9, q5, q9
- vshl.u32 q4, q8, #7
- vshl.u32 q5, q9, #7
- vsri.u32 q4, q8, #25
- vsri.u32 q5, q9, #25
-
- veor q8, q6, q10
- veor q9, q7, q11
- vshl.u32 q6, q8, #7
- vshl.u32 q7, q9, #7
- vsri.u32 q6, q8, #25
- vsri.u32 q7, q9, #25
-
- vld1.32 {q8-q9}, [sp, :256]
-
- // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- vadd.i32 q0, q0, q5
- vadd.i32 q1, q1, q6
- vadd.i32 q2, q2, q7
- vadd.i32 q3, q3, q4
-
- veor q15, q15, q0
- veor q12, q12, q1
- veor q13, q13, q2
- veor q14, q14, q3
-
- vrev32.16 q15, q15
- vrev32.16 q12, q12
- vrev32.16 q13, q13
- vrev32.16 q14, q14
-
- // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- vadd.i32 q10, q10, q15
- vadd.i32 q11, q11, q12
- vadd.i32 q8, q8, q13
- vadd.i32 q9, q9, q14
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q7, q8
- veor q9, q4, q9
- vshl.u32 q7, q8, #12
- vshl.u32 q4, q9, #12
- vsri.u32 q7, q8, #20
- vsri.u32 q4, q9, #20
-
- veor q8, q5, q10
- veor q9, q6, q11
- vshl.u32 q5, q8, #12
- vshl.u32 q6, q9, #12
- vsri.u32 q5, q8, #20
- vsri.u32 q6, q9, #20
-
- // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- vld1.8 {d16}, [ip, :64]
- vadd.i32 q0, q0, q5
- vadd.i32 q1, q1, q6
- vadd.i32 q2, q2, q7
- vadd.i32 q3, q3, q4
-
- veor q15, q15, q0
- veor q12, q12, q1
- veor q13, q13, q2
- veor q14, q14, q3
-
- vtbl.8 d30, {d30}, d16
- vtbl.8 d31, {d31}, d16
- vtbl.8 d24, {d24}, d16
- vtbl.8 d25, {d25}, d16
- vtbl.8 d26, {d26}, d16
- vtbl.8 d27, {d27}, d16
- vtbl.8 d28, {d28}, d16
- vtbl.8 d29, {d29}, d16
-
- vld1.32 {q8-q9}, [sp, :256]
-
- // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- vadd.i32 q10, q10, q15
- vadd.i32 q11, q11, q12
- vadd.i32 q8, q8, q13
- vadd.i32 q9, q9, q14
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q7, q8
- veor q9, q4, q9
- vshl.u32 q7, q8, #7
- vshl.u32 q4, q9, #7
- vsri.u32 q7, q8, #25
- vsri.u32 q4, q9, #25
-
- veor q8, q5, q10
- veor q9, q6, q11
- vshl.u32 q5, q8, #7
- vshl.u32 q6, q9, #7
- vsri.u32 q5, q8, #25
- vsri.u32 q6, q9, #25
-
- subs r3, r3, #2
- bne .Ldoubleround4
-
- // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
- // x8..9[0-3] are on the stack.
-
- // Re-interleave the words in the first two rows of each block (x0..7).
- // Also add the counter values 0-3 to x12[0-3].
- vld1.32 {q8}, [lr, :128] // load counter values 0-3
- vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
- vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
- vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
- vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7)
- vadd.u32 q12, q8 // x12 += counter values 0-3
- vswp d1, d4
- vswp d3, d6
- vld1.32 {q8-q9}, [r0]! // load s0..7
- vswp d9, d12
- vswp d11, d14
-
- // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
- // after XORing the first 32 bytes.
- vswp q1, q4
-
- // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
-
- // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
- vadd.u32 q0, q0, q8
- vadd.u32 q2, q2, q8
- vadd.u32 q4, q4, q8
- vadd.u32 q3, q3, q8
-
- // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
- vadd.u32 q1, q1, q9
- vadd.u32 q6, q6, q9
- vadd.u32 q5, q5, q9
- vadd.u32 q7, q7, q9
-
- // XOR first 32 bytes using keystream from first two rows of first block
- vld1.8 {q8-q9}, [r2]!
- veor q8, q8, q0
- veor q9, q9, q1
- vst1.8 {q8-q9}, [r1]!
-
- // Re-interleave the words in the last two rows of each block (x8..15).
- vld1.32 {q8-q9}, [sp, :256]
- mov sp, r4 // restore original stack pointer
- ldr r4, [r4, #8] // load number of bytes
- vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
- vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
- vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
- vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11)
- vld1.32 {q0-q1}, [r0] // load s8..15
- vswp d25, d28
- vswp d27, d30
- vswp d17, d20
- vswp d19, d22
-
- // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
-
- // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
- vadd.u32 q8, q8, q0
- vadd.u32 q10, q10, q0
- vadd.u32 q9, q9, q0
- vadd.u32 q11, q11, q0
-
- // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
- vadd.u32 q12, q12, q1
- vadd.u32 q14, q14, q1
- vadd.u32 q13, q13, q1
- vadd.u32 q15, q15, q1
-
- // XOR the rest of the data with the keystream
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #96
- veor q0, q0, q8
- veor q1, q1, q12
- ble .Lle96
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q2
- veor q1, q1, q6
- ble .Lle128
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q10
- veor q1, q1, q14
- ble .Lle160
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q4
- veor q1, q1, q5
- ble .Lle192
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q9
- veor q1, q1, q13
- ble .Lle224
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q3
- veor q1, q1, q7
- blt .Llt256
-.Lout:
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]
- veor q0, q0, q11
- veor q1, q1, q15
- vst1.8 {q0-q1}, [r1]
-
- pop {r4, pc}
-
-.Lle192:
- vmov q4, q9
- vmov q5, q13
-
-.Lle160:
- // nothing to do
-
-.Lfinalblock:
- // Process the final block if processing less than 4 full blocks.
- // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
- // previous 32 byte output block that still needs to be written at
- // [r1] in q0-q1.
- beq .Lfullblock
-
-.Lpartialblock:
- adr lr, .Lpermute + 32
- add r2, r2, r4
- add lr, lr, r4
- add r4, r4, r1
-
- vld1.8 {q2-q3}, [lr]
- vld1.8 {q6-q7}, [r2]
-
- add r4, r4, #32
-
- vtbl.8 d4, {q4-q5}, d4
- vtbl.8 d5, {q4-q5}, d5
- vtbl.8 d6, {q4-q5}, d6
- vtbl.8 d7, {q4-q5}, d7
-
- veor q6, q6, q2
- veor q7, q7, q3
-
- vst1.8 {q6-q7}, [r4] // overlapping stores
- vst1.8 {q0-q1}, [r1]
- pop {r4, pc}
-
-.Lfullblock:
- vmov q11, q4
- vmov q15, q5
- b .Lout
-.Lle96:
- vmov q4, q2
- vmov q5, q6
- b .Lfinalblock
-.Lle128:
- vmov q4, q10
- vmov q5, q14
- b .Lfinalblock
-.Lle224:
- vmov q4, q3
- vmov q5, q7
- b .Lfinalblock
-.Llt256:
- vmov q4, q11
- vmov q5, q15
- b .Lpartialblock
-ENDPROC(chacha_4block_xor_neon)
-
- .align L1_CACHE_SHIFT
-.Lpermute:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
- .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
- .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
- .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
- .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
diff --git a/arch/arm/lib/crypto/chacha-scalar-core.S b/arch/arm/lib/crypto/chacha-scalar-core.S
deleted file mode 100644
index 4951df05c158..000000000000
--- a/arch/arm/lib/crypto/chacha-scalar-core.S
+++ /dev/null
@@ -1,444 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2018 Google, Inc.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-/*
- * Design notes:
- *
- * 16 registers would be needed to hold the state matrix, but only 14 are
- * available because 'sp' and 'pc' cannot be used. So we spill the elements
- * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
- * 'ldrd' and one 'strd' instruction per round.
- *
- * All rotates are performed using the implicit rotate operand accepted by the
- * 'add' and 'eor' instructions. This is faster than using explicit rotate
- * instructions. To make this work, we allow the values in the second and last
- * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
- * wrong rotation amount. The rotation amount is then fixed up just in time
- * when the values are used. 'brot' is the number of bits the values in row 'b'
- * need to be rotated right to arrive at the correct values, and 'drot'
- * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
- * that they end up as (25, 24) after every round.
- */
-
- // ChaCha state registers
- X0 .req r0
- X1 .req r1
- X2 .req r2
- X3 .req r3
- X4 .req r4
- X5 .req r5
- X6 .req r6
- X7 .req r7
- X8_X10 .req r8 // shared by x8 and x10
- X9_X11 .req r9 // shared by x9 and x11
- X12 .req r10
- X13 .req r11
- X14 .req r12
- X15 .req r14
-
-.macro _le32_bswap_4x a, b, c, d, tmp
-#ifdef __ARMEB__
- rev_l \a, \tmp
- rev_l \b, \tmp
- rev_l \c, \tmp
- rev_l \d, \tmp
-#endif
-.endm
-
-.macro __ldrd a, b, src, offset
-#if __LINUX_ARM_ARCH__ >= 6
- ldrd \a, \b, [\src, #\offset]
-#else
- ldr \a, [\src, #\offset]
- ldr \b, [\src, #\offset + 4]
-#endif
-.endm
-
-.macro __strd a, b, dst, offset
-#if __LINUX_ARM_ARCH__ >= 6
- strd \a, \b, [\dst, #\offset]
-#else
- str \a, [\dst, #\offset]
- str \b, [\dst, #\offset + 4]
-#endif
-.endm
-
-.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
-
- // a += b; d ^= a; d = rol(d, 16);
- add \a1, \a1, \b1, ror #brot
- add \a2, \a2, \b2, ror #brot
- eor \d1, \a1, \d1, ror #drot
- eor \d2, \a2, \d2, ror #drot
- // drot == 32 - 16 == 16
-
- // c += d; b ^= c; b = rol(b, 12);
- add \c1, \c1, \d1, ror #16
- add \c2, \c2, \d2, ror #16
- eor \b1, \c1, \b1, ror #brot
- eor \b2, \c2, \b2, ror #brot
- // brot == 32 - 12 == 20
-
- // a += b; d ^= a; d = rol(d, 8);
- add \a1, \a1, \b1, ror #20
- add \a2, \a2, \b2, ror #20
- eor \d1, \a1, \d1, ror #16
- eor \d2, \a2, \d2, ror #16
- // drot == 32 - 8 == 24
-
- // c += d; b ^= c; b = rol(b, 7);
- add \c1, \c1, \d1, ror #24
- add \c2, \c2, \d2, ror #24
- eor \b1, \c1, \b1, ror #20
- eor \b2, \c2, \b2, ror #20
- // brot == 32 - 7 == 25
-.endm
-
-.macro _doubleround
-
- // column round
-
- // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
- _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
-
- // save (x8, x9); restore (x10, x11)
- __strd X8_X10, X9_X11, sp, 0
- __ldrd X8_X10, X9_X11, sp, 8
-
- // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
- _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
-
- .set brot, 25
- .set drot, 24
-
- // diagonal round
-
- // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
- _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
-
- // save (x10, x11); restore (x8, x9)
- __strd X8_X10, X9_X11, sp, 8
- __ldrd X8_X10, X9_X11, sp, 0
-
- // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
- _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
-.endm
-
-.macro _chacha_permute nrounds
- .set brot, 0
- .set drot, 0
- .rept \nrounds / 2
- _doubleround
- .endr
-.endm
-
-.macro _chacha nrounds
-
-.Lnext_block\@:
- // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
- // Registers contain x0-x9,x12-x15.
-
- // Do the core ChaCha permutation to update x0-x15.
- _chacha_permute \nrounds
-
- add sp, #8
- // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers contain x0-x9,x12-x15.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
- push {X8_X10, X9_X11, X12, X13, X14, X15}
-
- // Load (OUT, IN, LEN).
- ldr r14, [sp, #96]
- ldr r12, [sp, #100]
- ldr r11, [sp, #104]
-
- orr r10, r14, r12
-
- // Use slow path if fewer than 64 bytes remain.
- cmp r11, #64
- blt .Lxor_slowpath\@
-
- // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
- // ARMv6+, since ldmia and stmia (used below) still require alignment.
- tst r10, #3
- bne .Lxor_slowpath\@
-
- // Fast path: XOR 64 bytes of aligned data.
-
- // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // x0-x3
- __ldrd r8, r9, sp, 32
- __ldrd r10, r11, sp, 40
- add X0, X0, r8
- add X1, X1, r9
- add X2, X2, r10
- add X3, X3, r11
- _le32_bswap_4x X0, X1, X2, X3, r8
- ldmia r12!, {r8-r11}
- eor X0, X0, r8
- eor X1, X1, r9
- eor X2, X2, r10
- eor X3, X3, r11
- stmia r14!, {X0-X3}
-
- // x4-x7
- __ldrd r8, r9, sp, 48
- __ldrd r10, r11, sp, 56
- add X4, r8, X4, ror #brot
- add X5, r9, X5, ror #brot
- ldmia r12!, {X0-X3}
- add X6, r10, X6, ror #brot
- add X7, r11, X7, ror #brot
- _le32_bswap_4x X4, X5, X6, X7, r8
- eor X4, X4, X0
- eor X5, X5, X1
- eor X6, X6, X2
- eor X7, X7, X3
- stmia r14!, {X4-X7}
-
- // x8-x15
- pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
- __ldrd r8, r9, sp, 32
- __ldrd r10, r11, sp, 40
- add r0, r0, r8 // x8
- add r1, r1, r9 // x9
- add r6, r6, r10 // x10
- add r7, r7, r11 // x11
- _le32_bswap_4x r0, r1, r6, r7, r8
- ldmia r12!, {r8-r11}
- eor r0, r0, r8 // x8
- eor r1, r1, r9 // x9
- eor r6, r6, r10 // x10
- eor r7, r7, r11 // x11
- stmia r14!, {r0,r1,r6,r7}
- ldmia r12!, {r0,r1,r6,r7}
- __ldrd r8, r9, sp, 48
- __ldrd r10, r11, sp, 56
- add r2, r8, r2, ror #drot // x12
- add r3, r9, r3, ror #drot // x13
- add r4, r10, r4, ror #drot // x14
- add r5, r11, r5, ror #drot // x15
- _le32_bswap_4x r2, r3, r4, r5, r9
- ldr r9, [sp, #72] // load LEN
- eor r2, r2, r0 // x12
- eor r3, r3, r1 // x13
- eor r4, r4, r6 // x14
- eor r5, r5, r7 // x15
- subs r9, #64 // decrement and check LEN
- stmia r14!, {r2-r5}
-
- beq .Ldone\@
-
-.Lprepare_for_next_block\@:
-
- // Stack: x0-x15 OUT IN LEN
-
- // Increment block counter (x12)
- add r8, #1
-
- // Store updated (OUT, IN, LEN)
- str r14, [sp, #64]
- str r12, [sp, #68]
- str r9, [sp, #72]
-
- mov r14, sp
-
- // Store updated block counter (x12)
- str r8, [sp, #48]
-
- sub sp, #16
-
- // Reload state and do next block
- ldmia r14!, {r0-r11} // load x0-x11
- __strd r10, r11, sp, 8 // store x10-x11 before state
- ldmia r14, {r10-r12,r14} // load x12-x15
- b .Lnext_block\@
-
-.Lxor_slowpath\@:
- // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
- // We handle it by storing the 64 bytes of keystream to the stack, then
- // XOR-ing the needed portion with the data.
-
- // Allocate keystream buffer
- sub sp, #64
- mov r14, sp
-
- // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // Save keystream for x0-x3
- __ldrd r8, r9, sp, 96
- __ldrd r10, r11, sp, 104
- add X0, X0, r8
- add X1, X1, r9
- add X2, X2, r10
- add X3, X3, r11
- _le32_bswap_4x X0, X1, X2, X3, r8
- stmia r14!, {X0-X3}
-
- // Save keystream for x4-x7
- __ldrd r8, r9, sp, 112
- __ldrd r10, r11, sp, 120
- add X4, r8, X4, ror #brot
- add X5, r9, X5, ror #brot
- add X6, r10, X6, ror #brot
- add X7, r11, X7, ror #brot
- _le32_bswap_4x X4, X5, X6, X7, r8
- add r8, sp, #64
- stmia r14!, {X4-X7}
-
- // Save keystream for x8-x15
- ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
- __ldrd r8, r9, sp, 128
- __ldrd r10, r11, sp, 136
- add r0, r0, r8 // x8
- add r1, r1, r9 // x9
- add r6, r6, r10 // x10
- add r7, r7, r11 // x11
- _le32_bswap_4x r0, r1, r6, r7, r8
- stmia r14!, {r0,r1,r6,r7}
- __ldrd r8, r9, sp, 144
- __ldrd r10, r11, sp, 152
- add r2, r8, r2, ror #drot // x12
- add r3, r9, r3, ror #drot // x13
- add r4, r10, r4, ror #drot // x14
- add r5, r11, r5, ror #drot // x15
- _le32_bswap_4x r2, r3, r4, r5, r9
- stmia r14, {r2-r5}
-
- // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
- // Registers: r8 is block counter, r12 is IN.
-
- ldr r9, [sp, #168] // LEN
- ldr r14, [sp, #160] // OUT
- cmp r9, #64
- mov r0, sp
- movle r1, r9
- movgt r1, #64
- // r1 is number of bytes to XOR, in range [1, 64]
-
-.if __LINUX_ARM_ARCH__ < 6
- orr r2, r12, r14
- tst r2, #3 // IN or OUT misaligned?
- bne .Lxor_next_byte\@
-.endif
-
- // XOR a word at a time
-.rept 16
- subs r1, #4
- blt .Lxor_words_done\@
- ldr r2, [r12], #4
- ldr r3, [r0], #4
- eor r2, r2, r3
- str r2, [r14], #4
-.endr
- b .Lxor_slowpath_done\@
-.Lxor_words_done\@:
- ands r1, r1, #3
- beq .Lxor_slowpath_done\@
-
- // XOR a byte at a time
-.Lxor_next_byte\@:
- ldrb r2, [r12], #1
- ldrb r3, [r0], #1
- eor r2, r2, r3
- strb r2, [r14], #1
- subs r1, #1
- bne .Lxor_next_byte\@
-
-.Lxor_slowpath_done\@:
- subs r9, #64
- add sp, #96
- bgt .Lprepare_for_next_block\@
-
-.Ldone\@:
-.endm // _chacha
-
-/*
- * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
- * const struct chacha_state *state, int nrounds);
- */
-ENTRY(chacha_doarm)
- cmp r2, #0 // len == 0?
- reteq lr
-
- ldr ip, [sp]
- cmp ip, #12
-
- push {r0-r2,r4-r11,lr}
-
- // Push state x0-x15 onto stack.
- // Also store an extra copy of x10-x11 just before the state.
-
- add X12, r3, #48
- ldm X12, {X12,X13,X14,X15}
- push {X12,X13,X14,X15}
- sub sp, sp, #64
-
- __ldrd X8_X10, X9_X11, r3, 40
- __strd X8_X10, X9_X11, sp, 8
- __strd X8_X10, X9_X11, sp, 56
- ldm r3, {X0-X9_X11}
- __strd X0, X1, sp, 16
- __strd X2, X3, sp, 24
- __strd X4, X5, sp, 32
- __strd X6, X7, sp, 40
- __strd X8_X10, X9_X11, sp, 48
-
- beq 1f
- _chacha 20
-
-0: add sp, #76
- pop {r4-r11, pc}
-
-1: _chacha 12
- b 0b
-ENDPROC(chacha_doarm)
-
-/*
- * void hchacha_block_arm(const struct chacha_state *state,
- * u32 out[HCHACHA_OUT_WORDS], int nrounds);
- */
-ENTRY(hchacha_block_arm)
- push {r1,r4-r11,lr}
-
- cmp r2, #12 // ChaCha12 ?
-
- mov r14, r0
- ldmia r14!, {r0-r11} // load x0-x11
- push {r10-r11} // store x10-x11 to stack
- ldm r14, {r10-r12,r14} // load x12-x15
- sub sp, #8
-
- beq 1f
- _chacha_permute 20
-
- // Skip over (unused0-unused1, x10-x11)
-0: add sp, #16
-
- // Fix up rotations of x12-x15
- ror X12, X12, #drot
- ror X13, X13, #drot
- pop {r4} // load 'out'
- ror X14, X14, #drot
- ror X15, X15, #drot
-
- // Store (x0-x3,x12-x15) to 'out'
- stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
-
- pop {r4-r11,pc}
-
-1: _chacha_permute 12
- b 0b
-ENDPROC(hchacha_block_arm)
diff --git a/arch/arm/lib/crypto/poly1305-armv4.pl b/arch/arm/lib/crypto/poly1305-armv4.pl
deleted file mode 100644
index d57c6e2fc84a..000000000000
--- a/arch/arm/lib/crypto/poly1305-armv4.pl
+++ /dev/null
@@ -1,1236 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
-#
-# ====================================================================
-# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
-# project.
-# ====================================================================
-#
-# IALU(*)/gcc-4.4 NEON
-#
-# ARM11xx(ARMv6) 7.78/+100% -
-# Cortex-A5 6.35/+130% 3.00
-# Cortex-A8 6.25/+115% 2.36
-# Cortex-A9 5.10/+95% 2.55
-# Cortex-A15 3.85/+85% 1.25(**)
-# Snapdragon S4 5.70/+100% 1.48(**)
-#
-# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
-# (**) these are trade-off results, they can be improved by ~8% but at
-# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
-# to improve Cortex-A9 result, but then A5/A7 loose more than 20%;
-
-$flavour = shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-# define poly1305_init poly1305_block_init_arch
-# define poly1305_blocks poly1305_blocks_arm
-# define poly1305_emit poly1305_emit_arch
-.globl poly1305_blocks_neon
-#endif
-
-#if defined(__thumb2__)
-.syntax unified
-.thumb
-#else
-.code 32
-#endif
-
-.text
-
-.globl poly1305_emit
-.globl poly1305_blocks
-.globl poly1305_init
-.type poly1305_init,%function
-.align 5
-poly1305_init:
-.Lpoly1305_init:
- stmdb sp!,{r4-r11}
-
- eor r3,r3,r3
- cmp $inp,#0
- str r3,[$ctx,#0] @ zero hash value
- str r3,[$ctx,#4]
- str r3,[$ctx,#8]
- str r3,[$ctx,#12]
- str r3,[$ctx,#16]
- str r3,[$ctx,#36] @ clear is_base2_26
- add $ctx,$ctx,#20
-
-#ifdef __thumb2__
- it eq
-#endif
- moveq r0,#0
- beq .Lno_key
-
-#if __ARM_MAX_ARCH__>=7
- mov r3,#-1
- str r3,[$ctx,#28] @ impossible key power value
-# ifndef __KERNEL__
- adr r11,.Lpoly1305_init
- ldr r12,.LOPENSSL_armcap
-# endif
-#endif
- ldrb r4,[$inp,#0]
- mov r10,#0x0fffffff
- ldrb r5,[$inp,#1]
- and r3,r10,#-4 @ 0x0ffffffc
- ldrb r6,[$inp,#2]
- ldrb r7,[$inp,#3]
- orr r4,r4,r5,lsl#8
- ldrb r5,[$inp,#4]
- orr r4,r4,r6,lsl#16
- ldrb r6,[$inp,#5]
- orr r4,r4,r7,lsl#24
- ldrb r7,[$inp,#6]
- and r4,r4,r10
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-# if !defined(_WIN32)
- ldr r12,[r11,r12] @ OPENSSL_armcap_P
-# endif
-# if defined(__APPLE__) || defined(_WIN32)
- ldr r12,[r12]
-# endif
-#endif
- ldrb r8,[$inp,#7]
- orr r5,r5,r6,lsl#8
- ldrb r6,[$inp,#8]
- orr r5,r5,r7,lsl#16
- ldrb r7,[$inp,#9]
- orr r5,r5,r8,lsl#24
- ldrb r8,[$inp,#10]
- and r5,r5,r3
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- tst r12,#ARMV7_NEON @ check for NEON
-# ifdef __thumb2__
- adr r9,.Lpoly1305_blocks_neon
- adr r11,.Lpoly1305_blocks
- it ne
- movne r11,r9
- adr r12,.Lpoly1305_emit
- orr r11,r11,#1 @ thumb-ify addresses
- orr r12,r12,#1
-# else
- add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
- ite eq
- addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
- addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
-# endif
-#endif
- ldrb r9,[$inp,#11]
- orr r6,r6,r7,lsl#8
- ldrb r7,[$inp,#12]
- orr r6,r6,r8,lsl#16
- ldrb r8,[$inp,#13]
- orr r6,r6,r9,lsl#24
- ldrb r9,[$inp,#14]
- and r6,r6,r3
-
- ldrb r10,[$inp,#15]
- orr r7,r7,r8,lsl#8
- str r4,[$ctx,#0]
- orr r7,r7,r9,lsl#16
- str r5,[$ctx,#4]
- orr r7,r7,r10,lsl#24
- str r6,[$ctx,#8]
- and r7,r7,r3
- str r7,[$ctx,#12]
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- stmia r2,{r11,r12} @ fill functions table
- mov r0,#1
-#else
- mov r0,#0
-#endif
-.Lno_key:
- ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
- ret @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_init,.-poly1305_init
-___
-{
-my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
-my ($s1,$s2,$s3)=($r1,$r2,$r3);
-
-$code.=<<___;
-.type poly1305_blocks,%function
-.align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
- stmdb sp!,{r3-r11,lr}
-
- ands $len,$len,#-16
- beq .Lno_data
-
- add $len,$len,$inp @ end pointer
- sub sp,sp,#32
-
-#if __ARM_ARCH__<7
- ldmia $ctx,{$h0-$r3} @ load context
- add $ctx,$ctx,#20
- str $len,[sp,#16] @ offload stuff
- str $ctx,[sp,#12]
-#else
- ldr lr,[$ctx,#36] @ is_base2_26
- ldmia $ctx!,{$h0-$h4} @ load hash value
- str $len,[sp,#16] @ offload stuff
- str $ctx,[sp,#12]
-
- adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
- mov $r1,$h1,lsr#6
- adcs $r1,$r1,$h2,lsl#20
- mov $r2,$h2,lsr#12
- adcs $r2,$r2,$h3,lsl#14
- mov $r3,$h3,lsr#18
- adcs $r3,$r3,$h4,lsl#8
- mov $len,#0
- teq lr,#0
- str $len,[$ctx,#16] @ clear is_base2_26
- adc $len,$len,$h4,lsr#24
-
- itttt ne
- movne $h0,$r0 @ choose between radixes
- movne $h1,$r1
- movne $h2,$r2
- movne $h3,$r3
- ldmia $ctx,{$r0-$r3} @ load key
- it ne
- movne $h4,$len
-#endif
-
- mov lr,$inp
- cmp $padbit,#0
- str $r1,[sp,#20]
- str $r2,[sp,#24]
- str $r3,[sp,#28]
- b .Loop
-
-.align 4
-.Loop:
-#if __ARM_ARCH__<7
- ldrb r0,[lr],#16 @ load input
-# ifdef __thumb2__
- it hi
-# endif
- addhi $h4,$h4,#1 @ 1<<128
- ldrb r1,[lr,#-15]
- ldrb r2,[lr,#-14]
- ldrb r3,[lr,#-13]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-12]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-11]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-10]
- adds $h0,$h0,r3 @ accumulate input
-
- ldrb r3,[lr,#-9]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-8]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-7]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-6]
- adcs $h1,$h1,r3
-
- ldrb r3,[lr,#-5]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-4]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-3]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-2]
- adcs $h2,$h2,r3
-
- ldrb r3,[lr,#-1]
- orr r1,r0,r1,lsl#8
- str lr,[sp,#8] @ offload input pointer
- orr r2,r1,r2,lsl#16
- add $s1,$r1,$r1,lsr#2
- orr r3,r2,r3,lsl#24
-#else
- ldr r0,[lr],#16 @ load input
- it hi
- addhi $h4,$h4,#1 @ padbit
- ldr r1,[lr,#-12]
- ldr r2,[lr,#-8]
- ldr r3,[lr,#-4]
-# ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-# endif
- adds $h0,$h0,r0 @ accumulate input
- str lr,[sp,#8] @ offload input pointer
- adcs $h1,$h1,r1
- add $s1,$r1,$r1,lsr#2
- adcs $h2,$h2,r2
-#endif
- add $s2,$r2,$r2,lsr#2
- adcs $h3,$h3,r3
- add $s3,$r3,$r3,lsr#2
-
- umull r2,r3,$h1,$r0
- adc $h4,$h4,#0
- umull r0,r1,$h0,$r0
- umlal r2,r3,$h4,$s1
- umlal r0,r1,$h3,$s1
- ldr $r1,[sp,#20] @ reload $r1
- umlal r2,r3,$h2,$s3
- umlal r0,r1,$h1,$s3
- umlal r2,r3,$h3,$s2
- umlal r0,r1,$h2,$s2
- umlal r2,r3,$h0,$r1
- str r0,[sp,#0] @ future $h0
- mul r0,$s2,$h4
- ldr $r2,[sp,#24] @ reload $r2
- adds r2,r2,r1 @ d1+=d0>>32
- eor r1,r1,r1
- adc lr,r3,#0 @ future $h2
- str r2,[sp,#4] @ future $h1
-
- mul r2,$s3,$h4
- eor r3,r3,r3
- umlal r0,r1,$h3,$s3
- ldr $r3,[sp,#28] @ reload $r3
- umlal r2,r3,$h3,$r0
- umlal r0,r1,$h2,$r0
- umlal r2,r3,$h2,$r1
- umlal r0,r1,$h1,$r1
- umlal r2,r3,$h1,$r2
- umlal r0,r1,$h0,$r2
- umlal r2,r3,$h0,$r3
- ldr $h0,[sp,#0]
- mul $h4,$r0,$h4
- ldr $h1,[sp,#4]
-
- adds $h2,lr,r0 @ d2+=d1>>32
- ldr lr,[sp,#8] @ reload input pointer
- adc r1,r1,#0
- adds $h3,r2,r1 @ d3+=d2>>32
- ldr r0,[sp,#16] @ reload end pointer
- adc r3,r3,#0
- add $h4,$h4,r3 @ h4+=d3>>32
-
- and r1,$h4,#-4
- and $h4,$h4,#3
- add r1,r1,r1,lsr#2 @ *=5
- adds $h0,$h0,r1
- adcs $h1,$h1,#0
- adcs $h2,$h2,#0
- adcs $h3,$h3,#0
- adc $h4,$h4,#0
-
- cmp r0,lr @ done yet?
- bhi .Loop
-
- ldr $ctx,[sp,#12]
- add sp,sp,#32
- stmdb $ctx,{$h0-$h4} @ store the result
-
-.Lno_data:
-#if __ARM_ARCH__>=5
- ldmia sp!,{r3-r11,pc}
-#else
- ldmia sp!,{r3-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_blocks,.-poly1305_blocks
-___
-}
-{
-my ($ctx,$mac,$nonce)=map("r$_",(0..2));
-my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
-my $g4=$ctx;
-
-$code.=<<___;
-.type poly1305_emit,%function
-.align 5
-poly1305_emit:
-.Lpoly1305_emit:
- stmdb sp!,{r4-r11}
-
- ldmia $ctx,{$h0-$h4}
-
-#if __ARM_ARCH__>=7
- ldr ip,[$ctx,#36] @ is_base2_26
-
- adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
- mov $g1,$h1,lsr#6
- adcs $g1,$g1,$h2,lsl#20
- mov $g2,$h2,lsr#12
- adcs $g2,$g2,$h3,lsl#14
- mov $g3,$h3,lsr#18
- adcs $g3,$g3,$h4,lsl#8
- mov $g4,#0
- adc $g4,$g4,$h4,lsr#24
-
- tst ip,ip
- itttt ne
- movne $h0,$g0
- movne $h1,$g1
- movne $h2,$g2
- movne $h3,$g3
- it ne
- movne $h4,$g4
-#endif
-
- adds $g0,$h0,#5 @ compare to modulus
- adcs $g1,$h1,#0
- adcs $g2,$h2,#0
- adcs $g3,$h3,#0
- adc $g4,$h4,#0
- tst $g4,#4 @ did it carry/borrow?
-
-#ifdef __thumb2__
- it ne
-#endif
- movne $h0,$g0
- ldr $g0,[$nonce,#0]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h1,$g1
- ldr $g1,[$nonce,#4]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h2,$g2
- ldr $g2,[$nonce,#8]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h3,$g3
- ldr $g3,[$nonce,#12]
-
- adds $h0,$h0,$g0
- adcs $h1,$h1,$g1
- adcs $h2,$h2,$g2
- adc $h3,$h3,$g3
-
-#if __ARM_ARCH__>=7
-# ifdef __ARMEB__
- rev $h0,$h0
- rev $h1,$h1
- rev $h2,$h2
- rev $h3,$h3
-# endif
- str $h0,[$mac,#0]
- str $h1,[$mac,#4]
- str $h2,[$mac,#8]
- str $h3,[$mac,#12]
-#else
- strb $h0,[$mac,#0]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#4]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#8]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#12]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#1]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#5]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#9]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#13]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#2]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#6]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#10]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#14]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#3]
- strb $h1,[$mac,#7]
- strb $h2,[$mac,#11]
- strb $h3,[$mac,#15]
-#endif
- ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
- ret @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_emit,.-poly1305_emit
-___
-{
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
-my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
-my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
-
-my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.fpu neon
-
-.type poly1305_init_neon,%function
-.align 5
-poly1305_init_neon:
-.Lpoly1305_init_neon:
- ldr r3,[$ctx,#48] @ first table element
- cmp r3,#-1 @ is value impossible?
- bne .Lno_init_neon
-
- ldr r4,[$ctx,#20] @ load key base 2^32
- ldr r5,[$ctx,#24]
- ldr r6,[$ctx,#28]
- ldr r7,[$ctx,#32]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- and r3,r3,#0x03ffffff
- and r4,r4,#0x03ffffff
- and r5,r5,#0x03ffffff
-
- vdup.32 $R0,r2 @ r^1 in both lanes
- add r2,r3,r3,lsl#2 @ *5
- vdup.32 $R1,r3
- add r3,r4,r4,lsl#2
- vdup.32 $S1,r2
- vdup.32 $R2,r4
- add r4,r5,r5,lsl#2
- vdup.32 $S2,r3
- vdup.32 $R3,r5
- add r5,r6,r6,lsl#2
- vdup.32 $S3,r4
- vdup.32 $R4,r6
- vdup.32 $S4,r5
-
- mov $zeros,#2 @ counter
-
-.Lsquare_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-
- vmull.u32 $D0,$R0,${R0}[1]
- vmull.u32 $D1,$R1,${R0}[1]
- vmull.u32 $D2,$R2,${R0}[1]
- vmull.u32 $D3,$R3,${R0}[1]
- vmull.u32 $D4,$R4,${R0}[1]
-
- vmlal.u32 $D0,$R4,${S1}[1]
- vmlal.u32 $D1,$R0,${R1}[1]
- vmlal.u32 $D2,$R1,${R1}[1]
- vmlal.u32 $D3,$R2,${R1}[1]
- vmlal.u32 $D4,$R3,${R1}[1]
-
- vmlal.u32 $D0,$R3,${S2}[1]
- vmlal.u32 $D1,$R4,${S2}[1]
- vmlal.u32 $D3,$R1,${R2}[1]
- vmlal.u32 $D2,$R0,${R2}[1]
- vmlal.u32 $D4,$R2,${R2}[1]
-
- vmlal.u32 $D0,$R2,${S3}[1]
- vmlal.u32 $D3,$R0,${R3}[1]
- vmlal.u32 $D1,$R3,${S3}[1]
- vmlal.u32 $D2,$R4,${S3}[1]
- vmlal.u32 $D4,$R1,${R3}[1]
-
- vmlal.u32 $D3,$R4,${S4}[1]
- vmlal.u32 $D0,$R1,${S4}[1]
- vmlal.u32 $D1,$R2,${S4}[1]
- vmlal.u32 $D2,$R3,${S4}[1]
- vmlal.u32 $D4,$R0,${R4}[1]
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- @ and P. Schwabe
- @
- @ H0>>+H1>>+H2>>+H3>>+H4
- @ H3>>+H4>>*5+H0>>+H1
- @
- @ Trivia.
- @
- @ Result of multiplication of n-bit number by m-bit number is
- @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
- @ m-bit number multiplied by 2^n is still n+m bits wide.
- @
- @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
- @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
- @ one is n+1 bits wide.
- @
- @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
- @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
- @ can be 27. However! In cases when their width exceeds 26 bits
- @ they are limited by 2^26+2^6. This in turn means that *sum*
- @ of the products with these values can still be viewed as sum
- @ of 52-bit numbers as long as the amount of addends is not a
- @ power of 2. For example,
- @
- @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
- @
- @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
- @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
- @ 8 * (2^52) or 2^55. However, the value is then multiplied by
- @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
- @ which is less than 32 * (2^52) or 2^57. And when processing
- @ data we are looking at triple as many addends...
- @
- @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
- @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
- @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
- @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
- @ instruction accepts 2x32-bit input and writes 2x64-bit result.
- @ This means that result of reduction have to be compressed upon
- @ loop wrap-around. This can be done in the process of reduction
- @ to minimize amount of instructions [as well as amount of
- @ 128-bit instructions, which benefits low-end processors], but
- @ one has to watch for H2 (which is narrower than H0) and 5*H4
- @ not being wider than 58 bits, so that result of right shift
- @ by 26 bits fits in 32 bits. This is also useful on x86,
- @ because it allows to use paddd in place for paddq, which
- @ benefits Atom, where paddq is ridiculously slow.
-
- vshr.u64 $T0,$D3,#26
- vmovn.i64 $D3#lo,$D3
- vshr.u64 $T1,$D0,#26
- vmovn.i64 $D0#lo,$D0
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
- vbic.i32 $D0#lo,#0xfc000000
-
- vshrn.u64 $T0#lo,$D4,#26
- vmovn.i64 $D4#lo,$D4
- vshr.u64 $T1,$D1,#26
- vmovn.i64 $D1#lo,$D1
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
- vbic.i32 $D4#lo,#0xfc000000
- vbic.i32 $D1#lo,#0xfc000000
-
- vadd.i32 $D0#lo,$D0#lo,$T0#lo
- vshl.u32 $T0#lo,$T0#lo,#2
- vshrn.u64 $T1#lo,$D2,#26
- vmovn.i64 $D2#lo,$D2
- vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
- vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
- vbic.i32 $D2#lo,#0xfc000000
-
- vshr.u32 $T0#lo,$D0#lo,#26
- vbic.i32 $D0#lo,#0xfc000000
- vshr.u32 $T1#lo,$D3#lo,#26
- vbic.i32 $D3#lo,#0xfc000000
- vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
- vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
-
- subs $zeros,$zeros,#1
- beq .Lsquare_break_neon
-
- add $tbl0,$ctx,#(48+0*9*4)
- add $tbl1,$ctx,#(48+1*9*4)
-
- vtrn.32 $R0,$D0#lo @ r^2:r^1
- vtrn.32 $R2,$D2#lo
- vtrn.32 $R3,$D3#lo
- vtrn.32 $R1,$D1#lo
- vtrn.32 $R4,$D4#lo
-
- vshl.u32 $S2,$R2,#2 @ *5
- vshl.u32 $S3,$R3,#2
- vshl.u32 $S1,$R1,#2
- vshl.u32 $S4,$R4,#2
- vadd.i32 $S2,$S2,$R2
- vadd.i32 $S1,$S1,$R1
- vadd.i32 $S3,$S3,$R3
- vadd.i32 $S4,$S4,$R4
-
- vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
- vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
- vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vst1.32 {${S4}[0]},[$tbl0,:32]
- vst1.32 {${S4}[1]},[$tbl1,:32]
-
- b .Lsquare_neon
-
-.align 4
-.Lsquare_break_neon:
- add $tbl0,$ctx,#(48+2*4*9)
- add $tbl1,$ctx,#(48+3*4*9)
-
- vmov $R0,$D0#lo @ r^4:r^3
- vshl.u32 $S1,$D1#lo,#2 @ *5
- vmov $R1,$D1#lo
- vshl.u32 $S2,$D2#lo,#2
- vmov $R2,$D2#lo
- vshl.u32 $S3,$D3#lo,#2
- vmov $R3,$D3#lo
- vshl.u32 $S4,$D4#lo,#2
- vmov $R4,$D4#lo
- vadd.i32 $S1,$S1,$D1#lo
- vadd.i32 $S2,$S2,$D2#lo
- vadd.i32 $S3,$S3,$D3#lo
- vadd.i32 $S4,$S4,$D4#lo
-
- vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
- vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
- vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vst1.32 {${S4}[0]},[$tbl0]
- vst1.32 {${S4}[1]},[$tbl1]
-
-.Lno_init_neon:
- ret @ bx lr
-.size poly1305_init_neon,.-poly1305_init_neon
-
-.type poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
-.Lpoly1305_blocks_neon:
- ldr ip,[$ctx,#36] @ is_base2_26
-
- cmp $len,#64
- blo .Lpoly1305_blocks
-
- stmdb sp!,{r4-r7}
- vstmdb sp!,{d8-d15} @ ABI specification says so
-
- tst ip,ip @ is_base2_26?
- bne .Lbase2_26_neon
-
- stmdb sp!,{r1-r3,lr}
- bl .Lpoly1305_init_neon
-
- ldr r4,[$ctx,#0] @ load hash value base 2^32
- ldr r5,[$ctx,#4]
- ldr r6,[$ctx,#8]
- ldr r7,[$ctx,#12]
- ldr ip,[$ctx,#16]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- veor $D0#lo,$D0#lo,$D0#lo
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- veor $D1#lo,$D1#lo,$D1#lo
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- veor $D2#lo,$D2#lo,$D2#lo
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- veor $D3#lo,$D3#lo,$D3#lo
- and r3,r3,#0x03ffffff
- orr r6,r6,ip,lsl#24
- veor $D4#lo,$D4#lo,$D4#lo
- and r4,r4,#0x03ffffff
- mov r1,#1
- and r5,r5,#0x03ffffff
- str r1,[$ctx,#36] @ set is_base2_26
-
- vmov.32 $D0#lo[0],r2
- vmov.32 $D1#lo[0],r3
- vmov.32 $D2#lo[0],r4
- vmov.32 $D3#lo[0],r5
- vmov.32 $D4#lo[0],r6
- adr $zeros,.Lzeros
-
- ldmia sp!,{r1-r3,lr}
- b .Lhash_loaded
-
-.align 4
-.Lbase2_26_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ load hash value
-
- veor $D0#lo,$D0#lo,$D0#lo
- veor $D1#lo,$D1#lo,$D1#lo
- veor $D2#lo,$D2#lo,$D2#lo
- veor $D3#lo,$D3#lo,$D3#lo
- veor $D4#lo,$D4#lo,$D4#lo
- vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
- adr $zeros,.Lzeros
- vld1.32 {$D4#lo[0]},[$ctx]
- sub $ctx,$ctx,#16 @ rewind
-
-.Lhash_loaded:
- add $in2,$inp,#32
- mov $padbit,$padbit,lsl#24
- tst $len,#31
- beq .Leven
-
- vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
- vmov.32 $H4#lo[0],$padbit
- sub $len,$len,#16
- add $in2,$inp,#32
-
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H3,$H3
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
-# endif
- vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
- vshl.u32 $H3#lo,$H3#lo,#18
-
- vsri.u32 $H3#lo,$H2#lo,#14
- vshl.u32 $H2#lo,$H2#lo,#12
- vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
-
- vbic.i32 $H3#lo,#0xfc000000
- vsri.u32 $H2#lo,$H1#lo,#20
- vshl.u32 $H1#lo,$H1#lo,#6
-
- vbic.i32 $H2#lo,#0xfc000000
- vsri.u32 $H1#lo,$H0#lo,#26
- vadd.i32 $H3#hi,$H3#lo,$D3#lo
-
- vbic.i32 $H0#lo,#0xfc000000
- vbic.i32 $H1#lo,#0xfc000000
- vadd.i32 $H2#hi,$H2#lo,$D2#lo
-
- vadd.i32 $H0#hi,$H0#lo,$D0#lo
- vadd.i32 $H1#hi,$H1#lo,$D1#lo
-
- mov $tbl1,$zeros
- add $tbl0,$ctx,#48
-
- cmp $len,$len
- b .Long_tail
-
-.align 4
-.Leven:
- subs $len,$len,#64
- it lo
- movlo $in2,$zeros
-
- vmov.i32 $H4,#1<<24 @ padbit, yes, always
- vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
- add $inp,$inp,#64
- vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
- add $in2,$in2,#64
- itt hi
- addhi $tbl1,$ctx,#(48+1*9*4)
- addhi $tbl0,$ctx,#(48+3*9*4)
-
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H3,$H3
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
-# endif
- vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
- vshl.u32 $H3,$H3,#18
-
- vsri.u32 $H3,$H2,#14
- vshl.u32 $H2,$H2,#12
-
- vbic.i32 $H3,#0xfc000000
- vsri.u32 $H2,$H1,#20
- vshl.u32 $H1,$H1,#6
-
- vbic.i32 $H2,#0xfc000000
- vsri.u32 $H1,$H0,#26
-
- vbic.i32 $H0,#0xfc000000
- vbic.i32 $H1,#0xfc000000
-
- bls .Lskip_loop
-
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- b .Loop_neon
-
-.align 5
-.Loop_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- @ \___________________/
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- @ \___________________/ \____________________/
- @
- @ Note that we start with inp[2:3]*r^2. This is because it
- @ doesn't depend on reduction in previous iteration.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ inp[2:3]*r^2
-
- vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
- vmull.u32 $D2,$H2#hi,${R0}[1]
- vadd.i32 $H0#lo,$H0#lo,$D0#lo
- vmull.u32 $D0,$H0#hi,${R0}[1]
- vadd.i32 $H3#lo,$H3#lo,$D3#lo
- vmull.u32 $D3,$H3#hi,${R0}[1]
- vmlal.u32 $D2,$H1#hi,${R1}[1]
- vadd.i32 $H1#lo,$H1#lo,$D1#lo
- vmull.u32 $D1,$H1#hi,${R0}[1]
-
- vadd.i32 $H4#lo,$H4#lo,$D4#lo
- vmull.u32 $D4,$H4#hi,${R0}[1]
- subs $len,$len,#64
- vmlal.u32 $D0,$H4#hi,${S1}[1]
- it lo
- movlo $in2,$zeros
- vmlal.u32 $D3,$H2#hi,${R1}[1]
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D1,$H0#hi,${R1}[1]
- vmlal.u32 $D4,$H3#hi,${R1}[1]
-
- vmlal.u32 $D0,$H3#hi,${S2}[1]
- vmlal.u32 $D3,$H1#hi,${R2}[1]
- vmlal.u32 $D4,$H2#hi,${R2}[1]
- vmlal.u32 $D1,$H4#hi,${S2}[1]
- vmlal.u32 $D2,$H0#hi,${R2}[1]
-
- vmlal.u32 $D3,$H0#hi,${R3}[1]
- vmlal.u32 $D0,$H2#hi,${S3}[1]
- vmlal.u32 $D4,$H1#hi,${R3}[1]
- vmlal.u32 $D1,$H3#hi,${S3}[1]
- vmlal.u32 $D2,$H4#hi,${S3}[1]
-
- vmlal.u32 $D3,$H4#hi,${S4}[1]
- vmlal.u32 $D0,$H1#hi,${S4}[1]
- vmlal.u32 $D4,$H0#hi,${R4}[1]
- vmlal.u32 $D1,$H2#hi,${S4}[1]
- vmlal.u32 $D2,$H3#hi,${S4}[1]
-
- vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
- add $in2,$in2,#64
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4 and accumulate
-
- vmlal.u32 $D3,$H3#lo,${R0}[0]
- vmlal.u32 $D0,$H0#lo,${R0}[0]
- vmlal.u32 $D4,$H4#lo,${R0}[0]
- vmlal.u32 $D1,$H1#lo,${R0}[0]
- vmlal.u32 $D2,$H2#lo,${R0}[0]
- vld1.32 ${S4}[0],[$tbl0,:32]
-
- vmlal.u32 $D3,$H2#lo,${R1}[0]
- vmlal.u32 $D0,$H4#lo,${S1}[0]
- vmlal.u32 $D4,$H3#lo,${R1}[0]
- vmlal.u32 $D1,$H0#lo,${R1}[0]
- vmlal.u32 $D2,$H1#lo,${R1}[0]
-
- vmlal.u32 $D3,$H1#lo,${R2}[0]
- vmlal.u32 $D0,$H3#lo,${S2}[0]
- vmlal.u32 $D4,$H2#lo,${R2}[0]
- vmlal.u32 $D1,$H4#lo,${S2}[0]
- vmlal.u32 $D2,$H0#lo,${R2}[0]
-
- vmlal.u32 $D3,$H0#lo,${R3}[0]
- vmlal.u32 $D0,$H2#lo,${S3}[0]
- vmlal.u32 $D4,$H1#lo,${R3}[0]
- vmlal.u32 $D1,$H3#lo,${S3}[0]
- vmlal.u32 $D3,$H4#lo,${S4}[0]
-
- vmlal.u32 $D2,$H4#lo,${S3}[0]
- vmlal.u32 $D0,$H1#lo,${S4}[0]
- vmlal.u32 $D4,$H0#lo,${R4}[0]
- vmov.i32 $H4,#1<<24 @ padbit, yes, always
- vmlal.u32 $D1,$H2#lo,${S4}[0]
- vmlal.u32 $D2,$H3#lo,${S4}[0]
-
- vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
- add $inp,$inp,#64
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
- vrev32.8 $H3,$H3
-# endif
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction interleaved with base 2^32 -> base 2^26 of
- @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
-
- vshr.u64 $T0,$D3,#26
- vmovn.i64 $D3#lo,$D3
- vshr.u64 $T1,$D0,#26
- vmovn.i64 $D0#lo,$D0
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vbic.i32 $D3#lo,#0xfc000000
- vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
- vshl.u32 $H3,$H3,#18
- vbic.i32 $D0#lo,#0xfc000000
-
- vshrn.u64 $T0#lo,$D4,#26
- vmovn.i64 $D4#lo,$D4
- vshr.u64 $T1,$D1,#26
- vmovn.i64 $D1#lo,$D1
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
- vsri.u32 $H3,$H2,#14
- vbic.i32 $D4#lo,#0xfc000000
- vshl.u32 $H2,$H2,#12
- vbic.i32 $D1#lo,#0xfc000000
-
- vadd.i32 $D0#lo,$D0#lo,$T0#lo
- vshl.u32 $T0#lo,$T0#lo,#2
- vbic.i32 $H3,#0xfc000000
- vshrn.u64 $T1#lo,$D2,#26
- vmovn.i64 $D2#lo,$D2
- vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
- vsri.u32 $H2,$H1,#20
- vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
- vshl.u32 $H1,$H1,#6
- vbic.i32 $D2#lo,#0xfc000000
- vbic.i32 $H2,#0xfc000000
-
- vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
- vmovn.i64 $D0#lo,$D0
- vsri.u32 $H1,$H0,#26
- vbic.i32 $H0,#0xfc000000
- vshr.u32 $T1#lo,$D3#lo,#26
- vbic.i32 $D3#lo,#0xfc000000
- vbic.i32 $D0#lo,#0xfc000000
- vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
- vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
- vbic.i32 $H1,#0xfc000000
-
- bhi .Loop_neon
-
-.Lskip_loop:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- add $tbl1,$ctx,#(48+0*9*4)
- add $tbl0,$ctx,#(48+1*9*4)
- adds $len,$len,#32
- it ne
- movne $len,#0
- bne .Long_tail
-
- vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
- vadd.i32 $H0#hi,$H0#lo,$D0#lo
- vadd.i32 $H3#hi,$H3#lo,$D3#lo
- vadd.i32 $H1#hi,$H1#lo,$D1#lo
- vadd.i32 $H4#hi,$H4#lo,$D4#lo
-
-.Long_tail:
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
-
- vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
- vmull.u32 $D2,$H2#hi,$R0
- vadd.i32 $H0#lo,$H0#lo,$D0#lo
- vmull.u32 $D0,$H0#hi,$R0
- vadd.i32 $H3#lo,$H3#lo,$D3#lo
- vmull.u32 $D3,$H3#hi,$R0
- vadd.i32 $H1#lo,$H1#lo,$D1#lo
- vmull.u32 $D1,$H1#hi,$R0
- vadd.i32 $H4#lo,$H4#lo,$D4#lo
- vmull.u32 $D4,$H4#hi,$R0
-
- vmlal.u32 $D0,$H4#hi,$S1
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vmlal.u32 $D3,$H2#hi,$R1
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vmlal.u32 $D1,$H0#hi,$R1
- vmlal.u32 $D4,$H3#hi,$R1
- vmlal.u32 $D2,$H1#hi,$R1
-
- vmlal.u32 $D3,$H1#hi,$R2
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D0,$H3#hi,$S2
- vld1.32 ${S4}[0],[$tbl0,:32]
- vmlal.u32 $D4,$H2#hi,$R2
- vmlal.u32 $D1,$H4#hi,$S2
- vmlal.u32 $D2,$H0#hi,$R2
-
- vmlal.u32 $D3,$H0#hi,$R3
- it ne
- addne $tbl1,$ctx,#(48+2*9*4)
- vmlal.u32 $D0,$H2#hi,$S3
- it ne
- addne $tbl0,$ctx,#(48+3*9*4)
- vmlal.u32 $D4,$H1#hi,$R3
- vmlal.u32 $D1,$H3#hi,$S3
- vmlal.u32 $D2,$H4#hi,$S3
-
- vmlal.u32 $D3,$H4#hi,$S4
- vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
- vmlal.u32 $D0,$H1#hi,$S4
- vshr.u64 $MASK,$MASK,#38
- vmlal.u32 $D4,$H0#hi,$R4
- vmlal.u32 $D1,$H2#hi,$S4
- vmlal.u32 $D2,$H3#hi,$S4
-
- beq .Lshort_tail
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4:r^3 and accumulate
-
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
-
- vmlal.u32 $D2,$H2#lo,$R0
- vmlal.u32 $D0,$H0#lo,$R0
- vmlal.u32 $D3,$H3#lo,$R0
- vmlal.u32 $D1,$H1#lo,$R0
- vmlal.u32 $D4,$H4#lo,$R0
-
- vmlal.u32 $D0,$H4#lo,$S1
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vmlal.u32 $D3,$H2#lo,$R1
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vmlal.u32 $D1,$H0#lo,$R1
- vmlal.u32 $D4,$H3#lo,$R1
- vmlal.u32 $D2,$H1#lo,$R1
-
- vmlal.u32 $D3,$H1#lo,$R2
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D0,$H3#lo,$S2
- vld1.32 ${S4}[0],[$tbl0,:32]
- vmlal.u32 $D4,$H2#lo,$R2
- vmlal.u32 $D1,$H4#lo,$S2
- vmlal.u32 $D2,$H0#lo,$R2
-
- vmlal.u32 $D3,$H0#lo,$R3
- vmlal.u32 $D0,$H2#lo,$S3
- vmlal.u32 $D4,$H1#lo,$R3
- vmlal.u32 $D1,$H3#lo,$S3
- vmlal.u32 $D2,$H4#lo,$S3
-
- vmlal.u32 $D3,$H4#lo,$S4
- vorn $MASK,$MASK,$MASK @ all-ones
- vmlal.u32 $D0,$H1#lo,$S4
- vshr.u64 $MASK,$MASK,#38
- vmlal.u32 $D4,$H0#lo,$R4
- vmlal.u32 $D1,$H2#lo,$S4
- vmlal.u32 $D2,$H3#lo,$S4
-
-.Lshort_tail:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ horizontal addition
-
- vadd.i64 $D3#lo,$D3#lo,$D3#hi
- vadd.i64 $D0#lo,$D0#lo,$D0#hi
- vadd.i64 $D4#lo,$D4#lo,$D4#hi
- vadd.i64 $D1#lo,$D1#lo,$D1#hi
- vadd.i64 $D2#lo,$D2#lo,$D2#hi
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction, but without narrowing
-
- vshr.u64 $T0,$D3,#26
- vand.i64 $D3,$D3,$MASK
- vshr.u64 $T1,$D0,#26
- vand.i64 $D0,$D0,$MASK
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
-
- vshr.u64 $T0,$D4,#26
- vand.i64 $D4,$D4,$MASK
- vshr.u64 $T1,$D1,#26
- vand.i64 $D1,$D1,$MASK
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
-
- vadd.i64 $D0,$D0,$T0
- vshl.u64 $T0,$T0,#2
- vshr.u64 $T1,$D2,#26
- vand.i64 $D2,$D2,$MASK
- vadd.i64 $D0,$D0,$T0 @ h4 -> h0
- vadd.i64 $D3,$D3,$T1 @ h2 -> h3
-
- vshr.u64 $T0,$D0,#26
- vand.i64 $D0,$D0,$MASK
- vshr.u64 $T1,$D3,#26
- vand.i64 $D3,$D3,$MASK
- vadd.i64 $D1,$D1,$T0 @ h0 -> h1
- vadd.i64 $D4,$D4,$T1 @ h3 -> h4
-
- cmp $len,#0
- bne .Leven
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ store hash value
-
- vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
- vst1.32 {$D4#lo[0]},[$ctx]
-
- vldmia sp!,{d8-d15} @ epilogue
- ldmia sp!,{r4-r7}
- ret @ bx lr
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-#ifndef __KERNEL__
-.LOPENSSL_armcap:
-# ifdef _WIN32
-.word OPENSSL_armcap_P
-# else
-.word OPENSSL_armcap_P-.Lpoly1305_init
-# endif
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
-#endif
-___
-} }
-$code.=<<___;
-.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
-.align 2
-___
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
- s/\bret\b/bx lr/go or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
-
- print $_,"\n";
-}
-close STDOUT; # enforce flush
diff --git a/arch/arm/lib/crypto/poly1305-glue.c b/arch/arm/lib/crypto/poly1305-glue.c
deleted file mode 100644
index 2603b0771f2c..000000000000
--- a/arch/arm/lib/crypto/poly1305-glue.c
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/cpufeature.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-asmlinkage void poly1305_block_init_arch(
- struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
- u8 digest[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-void __weak poly1305_blocks_neon(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit)
-{
-}
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
- unsigned int len, u32 padbit)
-{
- len = round_down(len, POLY1305_BLOCK_SIZE);
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- static_branch_likely(&have_neon)) {
- do {
- unsigned int todo = min_t(unsigned int, len, SZ_4K);
-
- kernel_neon_begin();
- poly1305_blocks_neon(state, src, todo, padbit);
- kernel_neon_end();
-
- len -= todo;
- src += todo;
- } while (len);
- } else
- poly1305_blocks_arm(state, src, len, padbit);
-}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
- /* We always can use at least the ARM scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init arm_poly1305_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- (elf_hwcap & HWCAP_NEON))
- static_branch_enable(&have_neon);
- return 0;
-}
-subsys_initcall(arm_poly1305_mod_init);
-
-static void __exit arm_poly1305_mod_exit(void)
-{
-}
-module_exit(arm_poly1305_mod_exit);
-
-MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm/lib/crypto/sha256-armv4.pl b/arch/arm/lib/crypto/sha256-armv4.pl
deleted file mode 100644
index 8122db7fd599..000000000000
--- a/arch/arm/lib/crypto/sha256-armv4.pl
+++ /dev/null
@@ -1,724 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from the OpenSSL project but the author (Andy Polyakov)
-# has relicensed it under the GPLv2. Therefore this program is free software;
-# you can redistribute it and/or modify it under the terms of the GNU General
-# Public License version 2 as published by the Free Software Foundation.
-#
-# The original headers, including the original license headers, are
-# included below for completeness.
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see https://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# SHA256 block procedure for ARMv4. May 2007.
-
-# Performance is ~2x better than gcc 3.4 generated code and in "abso-
-# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-# byte [on single-issue Xscale PXA250 core].
-
-# July 2010.
-#
-# Rescheduling for dual-issue pipeline resulted in 22% improvement on
-# Cortex A8 core and ~20 cycles per processed byte.
-
-# February 2011.
-#
-# Profiler-assisted and platform-specific optimization resulted in 16%
-# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
-
-# September 2013.
-#
-# Add NEON implementation. On Cortex A8 it was measured to process one
-# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
-# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
-# code (meaning that latter performs sub-optimally, nothing was done
-# about it).
-
-# May 2014.
-#
-# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-$ctx="r0"; $t0="r0";
-$inp="r1"; $t4="r1";
-$len="r2"; $t1="r2";
-$T1="r3"; $t3="r3";
-$A="r4";
-$B="r5";
-$C="r6";
-$D="r7";
-$E="r8";
-$F="r9";
-$G="r10";
-$H="r11";
-@V=($A,$B,$C,$D,$E,$F,$G,$H);
-$t2="r12";
-$Ktbl="r14";
-
-@Sigma0=( 2,13,22);
-@Sigma1=( 6,11,25);
-@sigma0=( 7,18, 3);
-@sigma1=(17,19,10);
-
-sub BODY_00_15 {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
-
-$code.=<<___ if ($i<16);
-#if __ARM_ARCH__>=7
- @ ldr $t1,[$inp],#4 @ $i
-# if $i==15
- str $inp,[sp,#17*4] @ make room for $t4
-# endif
- eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
- add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
- eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
-# ifndef __ARMEB__
- rev $t1,$t1
-# endif
-#else
- @ ldrb $t1,[$inp,#3] @ $i
- add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
- ldrb $t2,[$inp,#2]
- ldrb $t0,[$inp,#1]
- orr $t1,$t1,$t2,lsl#8
- ldrb $t2,[$inp],#4
- orr $t1,$t1,$t0,lsl#16
-# if $i==15
- str $inp,[sp,#17*4] @ make room for $t4
-# endif
- eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
- orr $t1,$t1,$t2,lsl#24
- eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
-#endif
-___
-$code.=<<___;
- ldr $t2,[$Ktbl],#4 @ *K256++
- add $h,$h,$t1 @ h+=X[i]
- str $t1,[sp,#`$i%16`*4]
- eor $t1,$f,$g
- add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
- and $t1,$t1,$e
- add $h,$h,$t2 @ h+=K256[i]
- eor $t1,$t1,$g @ Ch(e,f,g)
- eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
- add $h,$h,$t1 @ h+=Ch(e,f,g)
-#if $i==31
- and $t2,$t2,#0xff
- cmp $t2,#0xf2 @ done?
-#endif
-#if $i<15
-# if __ARM_ARCH__>=7
- ldr $t1,[$inp],#4 @ prefetch
-# else
- ldrb $t1,[$inp,#3]
-# endif
- eor $t2,$a,$b @ a^b, b^c in next round
-#else
- ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
- eor $t2,$a,$b @ a^b, b^c in next round
- ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
-#endif
- eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
- and $t3,$t3,$t2 @ (b^c)&=(a^b)
- add $d,$d,$h @ d+=h
- eor $t3,$t3,$b @ Maj(a,b,c)
- add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
- @ add $h,$h,$t3 @ h+=Maj(a,b,c)
-___
- ($t2,$t3)=($t3,$t2);
-}
-
-sub BODY_16_XX {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
-
-$code.=<<___;
- @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
- @ ldr $t4,[sp,#`($i+14)%16`*4]
- mov $t0,$t1,ror#$sigma0[0]
- add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
- mov $t2,$t4,ror#$sigma1[0]
- eor $t0,$t0,$t1,ror#$sigma0[1]
- eor $t2,$t2,$t4,ror#$sigma1[1]
- eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
- ldr $t1,[sp,#`($i+0)%16`*4]
- eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
- ldr $t4,[sp,#`($i+9)%16`*4]
-
- add $t2,$t2,$t0
- eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
- add $t1,$t1,$t2
- eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
- add $t1,$t1,$t4 @ X[i]
-___
- &BODY_00_15(@_);
-}
-
-$code=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-.text
-#if __ARM_ARCH__<7
-.code 32
-#else
-.syntax unified
-# ifdef __thumb2__
-.thumb
-# else
-.code 32
-# endif
-#endif
-
-.type K256,%object
-.align 5
-K256:
-.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.size K256,.-K256
-.word 0 @ terminator
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha256_blocks_arch
-#endif
-.align 5
-
-.global sha256_blocks_arch
-.type sha256_blocks_arch,%function
-sha256_blocks_arch:
-.Lsha256_blocks_arch:
-#if __ARM_ARCH__<7
- sub r3,pc,#8 @ sha256_blocks_arch
-#else
- adr r3,.Lsha256_blocks_arch
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
- tst r12,#ARMV8_SHA256
- bne .LARMv8
- tst r12,#ARMV7_NEON
- bne .LNEON
-#endif
- add $len,$inp,$len,lsl#6 @ len to point at the end of inp
- stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
- ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
- sub $Ktbl,r3,#256+32 @ K256
- sub sp,sp,#16*4 @ alloca(X[16])
-.Loop:
-# if __ARM_ARCH__>=7
- ldr $t1,[$inp],#4
-# else
- ldrb $t1,[$inp,#3]
-# endif
- eor $t3,$B,$C @ magic
- eor $t2,$t2,$t2
-___
-for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
-$code.=".Lrounds_16_xx:\n";
-for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
-#if __ARM_ARCH__>=7
- ite eq @ Thumb2 thing, sanity check in ARM
-#endif
- ldreq $t3,[sp,#16*4] @ pull ctx
- bne .Lrounds_16_xx
-
- add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
- ldr $t0,[$t3,#0]
- ldr $t1,[$t3,#4]
- ldr $t2,[$t3,#8]
- add $A,$A,$t0
- ldr $t0,[$t3,#12]
- add $B,$B,$t1
- ldr $t1,[$t3,#16]
- add $C,$C,$t2
- ldr $t2,[$t3,#20]
- add $D,$D,$t0
- ldr $t0,[$t3,#24]
- add $E,$E,$t1
- ldr $t1,[$t3,#28]
- add $F,$F,$t2
- ldr $inp,[sp,#17*4] @ pull inp
- ldr $t2,[sp,#18*4] @ pull inp+len
- add $G,$G,$t0
- add $H,$H,$t1
- stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
- cmp $inp,$t2
- sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
- bne .Loop
-
- add sp,sp,#`16+3`*4 @ destroy frame
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r11,pc}
-#else
- ldmia sp!,{r4-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size sha256_blocks_arch,.-sha256_blocks_arch
-___
-######################################################################
-# NEON stuff
-#
-{{{
-my @X=map("q$_",(0..3));
-my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
-my $Xfer=$t4;
-my $j=0;
-
-sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
-sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
- my $arg = pop;
- $arg = "#$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-sub Xupdate()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body);
- my ($a,$b,$c,$d,$e,$f,$g,$h);
-
- &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T2,$T0,$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T1,$T0,$sigma0[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T2,$T0,32-$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T3,$T0,$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T1,$T1,$T2);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T3,$T0,32-$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T1,$T1,$T3); # sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4); # sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vld1_32 ("{$T0}","[$Ktbl,:128]!");
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4); # sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 ($T0,$T0,@X[0]);
- while($#insns>=2) { eval(shift(@insns)); }
- &vst1_32 ("{$T0}","[$Xfer,:128]!");
- eval(shift(@insns));
- eval(shift(@insns));
-
- push(@X,shift(@X)); # "rotate" X[]
-}
-
-sub Xpreload()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body);
- my ($a,$b,$c,$d,$e,$f,$g,$h);
-
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vld1_32 ("{$T0}","[$Ktbl,:128]!");
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vrev32_8 (@X[0],@X[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 ($T0,$T0,@X[0]);
- foreach (@insns) { eval; } # remaining instructions
- &vst1_32 ("{$T0}","[$Xfer,:128]!");
-
- push(@X,shift(@X)); # "rotate" X[]
-}
-
-sub body_00_15 () {
- (
- '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
- '&add ($h,$h,$t1)', # h+=X[i]+K[i]
- '&eor ($t1,$f,$g)',
- '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
- '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
- '&and ($t1,$t1,$e)',
- '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
- '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
- '&eor ($t1,$t1,$g)', # Ch(e,f,g)
- '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
- '&eor ($t2,$a,$b)', # a^b, b^c in next round
- '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
- '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
- '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
- '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
- '&ldr ($t1,"[sp,#64]") if ($j==31)',
- '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
- '&add ($d,$d,$h)', # d+=h
- '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
- '&eor ($t3,$t3,$b)', # Maj(a,b,c)
- '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
- )
-}
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.arch armv7-a
-.fpu neon
-
-.global sha256_block_data_order_neon
-.type sha256_block_data_order_neon,%function
-.align 4
-sha256_block_data_order_neon:
-.LNEON:
- stmdb sp!,{r4-r12,lr}
-
- sub $H,sp,#16*4+16
- adr $Ktbl,.Lsha256_blocks_arch
- sub $Ktbl,$Ktbl,#.Lsha256_blocks_arch-K256
- bic $H,$H,#15 @ align for 128-bit stores
- mov $t2,sp
- mov sp,$H @ alloca
- add $len,$inp,$len,lsl#6 @ len to point at the end of inp
-
- vld1.8 {@X[0]},[$inp]!
- vld1.8 {@X[1]},[$inp]!
- vld1.8 {@X[2]},[$inp]!
- vld1.8 {@X[3]},[$inp]!
- vld1.32 {$T0},[$Ktbl,:128]!
- vld1.32 {$T1},[$Ktbl,:128]!
- vld1.32 {$T2},[$Ktbl,:128]!
- vld1.32 {$T3},[$Ktbl,:128]!
- vrev32.8 @X[0],@X[0] @ yes, even on
- str $ctx,[sp,#64]
- vrev32.8 @X[1],@X[1] @ big-endian
- str $inp,[sp,#68]
- mov $Xfer,sp
- vrev32.8 @X[2],@X[2]
- str $len,[sp,#72]
- vrev32.8 @X[3],@X[3]
- str $t2,[sp,#76] @ save original sp
- vadd.i32 $T0,$T0,@X[0]
- vadd.i32 $T1,$T1,@X[1]
- vst1.32 {$T0},[$Xfer,:128]!
- vadd.i32 $T2,$T2,@X[2]
- vst1.32 {$T1},[$Xfer,:128]!
- vadd.i32 $T3,$T3,@X[3]
- vst1.32 {$T2},[$Xfer,:128]!
- vst1.32 {$T3},[$Xfer,:128]!
-
- ldmia $ctx,{$A-$H}
- sub $Xfer,$Xfer,#64
- ldr $t1,[sp,#0]
- eor $t2,$t2,$t2
- eor $t3,$B,$C
- b .L_00_48
-
-.align 4
-.L_00_48:
-___
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
-$code.=<<___;
- teq $t1,#0 @ check for K256 terminator
- ldr $t1,[sp,#0]
- sub $Xfer,$Xfer,#64
- bne .L_00_48
-
- ldr $inp,[sp,#68]
- ldr $t0,[sp,#72]
- sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
- teq $inp,$t0
- it eq
- subeq $inp,$inp,#64 @ avoid SEGV
- vld1.8 {@X[0]},[$inp]! @ load next input block
- vld1.8 {@X[1]},[$inp]!
- vld1.8 {@X[2]},[$inp]!
- vld1.8 {@X[3]},[$inp]!
- it ne
- strne $inp,[sp,#68]
- mov $Xfer,sp
-___
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
-$code.=<<___;
- ldr $t0,[$t1,#0]
- add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
- ldr $t2,[$t1,#4]
- ldr $t3,[$t1,#8]
- ldr $t4,[$t1,#12]
- add $A,$A,$t0 @ accumulate
- ldr $t0,[$t1,#16]
- add $B,$B,$t2
- ldr $t2,[$t1,#20]
- add $C,$C,$t3
- ldr $t3,[$t1,#24]
- add $D,$D,$t4
- ldr $t4,[$t1,#28]
- add $E,$E,$t0
- str $A,[$t1],#4
- add $F,$F,$t2
- str $B,[$t1],#4
- add $G,$G,$t3
- str $C,[$t1],#4
- add $H,$H,$t4
- str $D,[$t1],#4
- stmia $t1,{$E-$H}
-
- ittte ne
- movne $Xfer,sp
- ldrne $t1,[sp,#0]
- eorne $t2,$t2,$t2
- ldreq sp,[sp,#76] @ restore original sp
- itt ne
- eorne $t3,$B,$C
- bne .L_00_48
-
- ldmia sp!,{r4-r12,pc}
-.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
-#endif
-___
-}}}
-######################################################################
-# ARMv8 stuff
-#
-{{{
-my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
-my @MSG=map("q$_",(8..11));
-my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
-my $Ktbl="r3";
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-
-# ifdef __thumb2__
-# define INST(a,b,c,d) .byte c,d|0xc,a,b
-# else
-# define INST(a,b,c,d) .byte a,b,c,d
-# endif
-
-.type sha256_block_data_order_armv8,%function
-.align 5
-sha256_block_data_order_armv8:
-.LARMv8:
- vld1.32 {$ABCD,$EFGH},[$ctx]
-# ifdef __thumb2__
- adr $Ktbl,.LARMv8
- sub $Ktbl,$Ktbl,#.LARMv8-K256
-# else
- adrl $Ktbl,K256
-# endif
- add $len,$inp,$len,lsl#6 @ len to point at the end of inp
-
-.Loop_v8:
- vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
- vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
- vld1.32 {$W0},[$Ktbl]!
- vrev32.8 @MSG[0],@MSG[0]
- vrev32.8 @MSG[1],@MSG[1]
- vrev32.8 @MSG[2],@MSG[2]
- vrev32.8 @MSG[3],@MSG[3]
- vmov $ABCD_SAVE,$ABCD @ offload
- vmov $EFGH_SAVE,$EFGH
- teq $inp,$len
-___
-for($i=0;$i<12;$i++) {
-$code.=<<___;
- vld1.32 {$W1},[$Ktbl]!
- vadd.i32 $W0,$W0,@MSG[0]
- sha256su0 @MSG[0],@MSG[1]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
- sha256su1 @MSG[0],@MSG[2],@MSG[3]
-___
- ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
-}
-$code.=<<___;
- vld1.32 {$W1},[$Ktbl]!
- vadd.i32 $W0,$W0,@MSG[0]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- vld1.32 {$W0},[$Ktbl]!
- vadd.i32 $W1,$W1,@MSG[1]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- vld1.32 {$W1},[$Ktbl]
- vadd.i32 $W0,$W0,@MSG[2]
- sub $Ktbl,$Ktbl,#256-16 @ rewind
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- vadd.i32 $W1,$W1,@MSG[3]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
- vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
- it ne
- bne .Loop_v8
-
- vst1.32 {$ABCD,$EFGH},[$ctx]
-
- ret @ bx lr
-.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
-#endif
-___
-}}}
-$code.=<<___;
-.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-#endif
-___
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/@/ and !/^$/);
- print;
-}
-close SELF;
-
-{ my %opcode = (
- "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
- "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
-
- sub unsha256 {
- my ($mnemonic,$arg)=@_;
-
- if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
- my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
- |(($2&7)<<17)|(($2&8)<<4)
- |(($3&7)<<1) |(($3&8)<<2);
- # since ARMv7 instructions are always encoded little-endian.
- # correct solution is to use .inst directive, but older
- # assemblers don't implement it:-(
- sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
- $word&0xff,($word>>8)&0xff,
- ($word>>16)&0xff,($word>>24)&0xff,
- $mnemonic,$arg;
- }
- }
-}
-
-foreach (split($/,$code)) {
-
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
-
- s/\bret\b/bx lr/go or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
-
- print $_,"\n";
-}
-
-close STDOUT; # enforce flush
diff --git a/arch/arm/lib/crypto/sha256-ce.S b/arch/arm/lib/crypto/sha256-ce.S
deleted file mode 100644
index ac2c9b01b22d..000000000000
--- a/arch/arm/lib/crypto/sha256-ce.S
+++ /dev/null
@@ -1,123 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd.
- * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a
- .fpu crypto-neon-fp-armv8
-
- k0 .req q7
- k1 .req q8
- rk .req r3
-
- ta0 .req q9
- ta1 .req q10
- tb0 .req q10
- tb1 .req q9
-
- dga .req q11
- dgb .req q12
-
- dg0 .req q13
- dg1 .req q14
- dg2 .req q15
-
- .macro add_only, ev, s0
- vmov dg2, dg0
- .ifnb \s0
- vld1.32 {k\ev}, [rk, :128]!
- .endif
- sha256h.32 dg0, dg1, tb\ev
- sha256h2.32 dg1, dg2, tb\ev
- .ifnb \s0
- vadd.u32 ta\ev, q\s0, k\ev
- .endif
- .endm
-
- .macro add_update, ev, s0, s1, s2, s3
- sha256su0.32 q\s0, q\s1
- add_only \ev, \s1
- sha256su1.32 q\s0, q\s2, q\s3
- .endm
-
- .align 6
-.Lsha256_rcon:
- .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
- .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
- .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
- .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
- .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
- .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
- .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
- .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
- .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
- .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
- .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
- .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
- .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
- .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
- .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
- .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-
- /*
- * void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
- * const u8 *data, size_t nblocks);
- */
-ENTRY(sha256_ce_transform)
- /* load state */
- vld1.32 {dga-dgb}, [r0]
-
- /* load input */
-0: vld1.32 {q0-q1}, [r1]!
- vld1.32 {q2-q3}, [r1]!
- subs r2, r2, #1
-
-#ifndef CONFIG_CPU_BIG_ENDIAN
- vrev32.8 q0, q0
- vrev32.8 q1, q1
- vrev32.8 q2, q2
- vrev32.8 q3, q3
-#endif
-
- /* load first round constant */
- adr rk, .Lsha256_rcon
- vld1.32 {k0}, [rk, :128]!
-
- vadd.u32 ta0, q0, k0
- vmov dg0, dga
- vmov dg1, dgb
-
- add_update 1, 0, 1, 2, 3
- add_update 0, 1, 2, 3, 0
- add_update 1, 2, 3, 0, 1
- add_update 0, 3, 0, 1, 2
- add_update 1, 0, 1, 2, 3
- add_update 0, 1, 2, 3, 0
- add_update 1, 2, 3, 0, 1
- add_update 0, 3, 0, 1, 2
- add_update 1, 0, 1, 2, 3
- add_update 0, 1, 2, 3, 0
- add_update 1, 2, 3, 0, 1
- add_update 0, 3, 0, 1, 2
-
- add_only 1, 1
- add_only 0, 2
- add_only 1, 3
- add_only 0
-
- /* update state */
- vadd.u32 dga, dga, dg0
- vadd.u32 dgb, dgb, dg1
- bne 0b
-
- /* store new state */
- vst1.32 {dga-dgb}, [r0]
- bx lr
-ENDPROC(sha256_ce_transform)
diff --git a/arch/arm/lib/crypto/sha256.c b/arch/arm/lib/crypto/sha256.c
deleted file mode 100644
index 109192e54b0f..000000000000
--- a/arch/arm/lib/crypto/sha256.c
+++ /dev/null
@@ -1,64 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized for ARM
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/neon.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
-
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- static_branch_likely(&have_neon)) {
- kernel_neon_begin();
- if (static_branch_likely(&have_ce))
- sha256_ce_transform(state, data, nblocks);
- else
- sha256_block_data_order_neon(state, data, nblocks);
- kernel_neon_end();
- } else {
- sha256_blocks_arch(state, data, nblocks);
- }
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
-
-bool sha256_is_arch_optimized(void)
-{
- /* We always can use at least the ARM scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init sha256_arm_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
- static_branch_enable(&have_neon);
- if (elf_hwcap2 & HWCAP2_SHA2)
- static_branch_enable(&have_ce);
- }
- return 0;
-}
-subsys_initcall(sha256_arm_mod_init);
-
-static void __exit sha256_arm_mod_exit(void)
-{
-}
-module_exit(sha256_arm_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 optimized for ARM");
diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c
index 25dbd84a1aaf..2bfefb252ffd 100644
--- a/arch/arm/mm/cache-feroceon-l2.c
+++ b/arch/arm/mm/cache-feroceon-l2.c
@@ -295,7 +295,7 @@ static inline u32 read_extra_features(void)
return u;
}
-static inline void write_extra_features(u32 u)
+static inline void __init write_extra_features(u32 u)
{
__asm__("mcr p15, 1, %0, c15, c1, 0" : : "r" (u));
}
diff --git a/arch/arm/mm/cache-tauros2.c b/arch/arm/mm/cache-tauros2.c
index b1e1aba602f7..bfe166ccace0 100644
--- a/arch/arm/mm/cache-tauros2.c
+++ b/arch/arm/mm/cache-tauros2.c
@@ -177,7 +177,7 @@ static inline void __init write_actlr(u32 actlr)
__asm__("mcr p15, 0, %0, c1, c0, 1\n" : : "r" (actlr));
}
-static void enable_extra_feature(unsigned int features)
+static void __init enable_extra_feature(unsigned int features)
{
u32 u;
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 27c1d5ebcd91..b07e699aaa3c 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -482,3 +482,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile
index cb044bfd145d..cf8cd39ab804 100644
--- a/arch/arm/vdso/Makefile
+++ b/arch/arm/vdso/Makefile
@@ -26,7 +26,7 @@ CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
CFLAGS_REMOVE_vdso.o = -pg
# Force -O2 to avoid libgcc dependencies
-CFLAGS_REMOVE_vgettimeofday.o = -pg -Os $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS)
+CFLAGS_REMOVE_vgettimeofday.o = -pg -Os $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) $(GCC_PLUGINS_CFLAGS)
ifeq ($(c-gettimeofday-y),)
CFLAGS_vgettimeofday.o = -O2
else
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 393d71124f5d..7c456aa838b9 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -21,8 +21,6 @@ config ARM64
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_CC_PLATFORM
- select ARCH_HAS_CRC32
- select ARCH_HAS_CRC_T10DIF if KERNEL_MODE_NEON
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE
@@ -187,12 +185,12 @@ config ARM64
select HAVE_ARCH_KCSAN if EXPERT
select HAVE_ARCH_KFENCE
select HAVE_ARCH_KGDB
+ select HAVE_ARCH_KSTACK_ERASE
select HAVE_ARCH_MMAP_RND_BITS
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
select HAVE_ARCH_PREL32_RELOCATIONS
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP_FILTER
- select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
diff --git a/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi b/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi
index 8b7cbc2e78f5..51cd148f4227 100644
--- a/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi
@@ -131,7 +131,7 @@
"PH5", "PH6", "PH7", "PH9", "PH10",
"PH14", "PH15", "PH16", "PH17", "PH18";
allwinner,pinmux = <5>;
- function = "emac0";
+ function = "gmac0";
drive-strength = <40>;
bias-disable;
};
@@ -540,8 +540,8 @@
status = "disabled";
};
- emac0: ethernet@4500000 {
- compatible = "allwinner,sun55i-a523-emac0",
+ gmac0: ethernet@4500000 {
+ compatible = "allwinner,sun55i-a523-gmac0",
"allwinner,sun50i-a64-emac";
reg = <0x04500000 0x10000>;
clocks = <&ccu CLK_BUS_EMAC0>;
diff --git a/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts b/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts
index 0f58d92a6adc..8bc0f2c72a24 100644
--- a/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts
+++ b/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts
@@ -12,7 +12,7 @@
compatible = "radxa,cubie-a5e", "allwinner,sun55i-a527";
aliases {
- ethernet0 = &emac0;
+ ethernet0 = &gmac0;
serial0 = &uart0;
};
@@ -55,7 +55,7 @@
status = "okay";
};
-&emac0 {
+&gmac0 {
phy-mode = "rgmii-id";
phy-handle = <&ext_rgmii_phy>;
phy-supply = <&reg_cldo3>;
diff --git a/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts b/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts
index 08127f0cdd35..142177c1f737 100644
--- a/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts
+++ b/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts
@@ -12,7 +12,7 @@
compatible = "yuzukihd,avaota-a1", "allwinner,sun55i-t527";
aliases {
- ethernet0 = &emac0;
+ ethernet0 = &gmac0;
serial0 = &uart0;
};
@@ -65,7 +65,7 @@
status = "okay";
};
-&emac0 {
+&gmac0 {
phy-mode = "rgmii-id";
phy-handle = <&ext_rgmii_phy>;
phy-supply = <&reg_dcdc4>;
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
index 0baf256b4400..983b2f0e8797 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
@@ -687,11 +687,12 @@
};
wdog0: watchdog@2ad0000 {
- compatible = "fsl,imx21-wdt";
+ compatible = "fsl,ls1046a-wdt", "fsl,imx21-wdt";
reg = <0x0 0x2ad0000 0x0 0x10000>;
interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clockgen QORIQ_CLK_PLATFORM_PLL
QORIQ_CLK_PLL_DIV(2)>;
+ big-endian;
};
edma0: dma-controller@2c00000 {
diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
index d29710772569..1594ce9182a5 100644
--- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
@@ -464,6 +464,7 @@
};
reg_nvcc_sd: LDO5 {
+ regulator-always-on;
regulator-max-microvolt = <3300000>;
regulator-min-microvolt = <1800000>;
regulator-name = "On-module +V3.3_1.8_SD (LDO5)";
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi
index 2f740d74707b..4bf818873fe3 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi
@@ -70,7 +70,7 @@
tpm@1 {
compatible = "atmel,attpm20p", "tcg,tpm_tis-spi";
reg = <0x1>;
- spi-max-frequency = <36000000>;
+ spi-max-frequency = <25000000>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi
index 5ab3ffe9931d..cf747ec6fa16 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi
@@ -110,7 +110,7 @@
tpm@1 {
compatible = "atmel,attpm20p", "tcg,tpm_tis-spi";
reg = <0x1>;
- spi-max-frequency = <36000000>;
+ spi-max-frequency = <25000000>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi
index e2b5e7ac3e46..5eb114d2360a 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi
@@ -122,7 +122,7 @@
tpm@1 {
compatible = "atmel,attpm20p", "tcg,tpm_tis-spi";
reg = <0x1>;
- spi-max-frequency = <36000000>;
+ spi-max-frequency = <25000000>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts
index 6daa2313f879..568d24265ddf 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts
+++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts
@@ -201,7 +201,7 @@
tpm@0 {
compatible = "atmel,attpm20p", "tcg,tpm_tis-spi";
reg = <0x0>;
- spi-max-frequency = <36000000>;
+ spi-max-frequency = <25000000>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts b/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts
index 6c47f4b47356..9f4d0899a94d 100644
--- a/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts
+++ b/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts
@@ -574,17 +574,17 @@
&scmi_iomuxc {
pinctrl_emdio: emdiogrp {
fsl,pins = <
- IMX95_PAD_ENET2_MDC__NETCMIX_TOP_NETC_MDC 0x57e
- IMX95_PAD_ENET2_MDIO__NETCMIX_TOP_NETC_MDIO 0x97e
+ IMX95_PAD_ENET2_MDC__NETCMIX_TOP_NETC_MDC 0x50e
+ IMX95_PAD_ENET2_MDIO__NETCMIX_TOP_NETC_MDIO 0x90e
>;
};
pinctrl_enetc0: enetc0grp {
fsl,pins = <
- IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x57e
- IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x57e
- IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x57e
- IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x57e
+ IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x50e
+ IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x50e
+ IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x50e
+ IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x50e
IMX95_PAD_ENET1_TX_CTL__NETCMIX_TOP_ETH0_RGMII_TX_CTL 0x57e
IMX95_PAD_ENET1_TXC__NETCMIX_TOP_ETH0_RGMII_TX_CLK 0x58e
IMX95_PAD_ENET1_RX_CTL__NETCMIX_TOP_ETH0_RGMII_RX_CTL 0x57e
@@ -598,10 +598,10 @@
pinctrl_enetc1: enetc1grp {
fsl,pins = <
- IMX95_PAD_ENET2_TD3__NETCMIX_TOP_ETH1_RGMII_TD3 0x57e
- IMX95_PAD_ENET2_TD2__NETCMIX_TOP_ETH1_RGMII_TD2 0x57e
- IMX95_PAD_ENET2_TD1__NETCMIX_TOP_ETH1_RGMII_TD1 0x57e
- IMX95_PAD_ENET2_TD0__NETCMIX_TOP_ETH1_RGMII_TD0 0x57e
+ IMX95_PAD_ENET2_TD3__NETCMIX_TOP_ETH1_RGMII_TD3 0x50e
+ IMX95_PAD_ENET2_TD2__NETCMIX_TOP_ETH1_RGMII_TD2 0x50e
+ IMX95_PAD_ENET2_TD1__NETCMIX_TOP_ETH1_RGMII_TD1 0x50e
+ IMX95_PAD_ENET2_TD0__NETCMIX_TOP_ETH1_RGMII_TD0 0x50e
IMX95_PAD_ENET2_TX_CTL__NETCMIX_TOP_ETH1_RGMII_TX_CTL 0x57e
IMX95_PAD_ENET2_TXC__NETCMIX_TOP_ETH1_RGMII_TX_CLK 0x58e
IMX95_PAD_ENET2_RX_CTL__NETCMIX_TOP_ETH1_RGMII_RX_CTL 0x57e
diff --git a/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts b/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts
index 6886ea766655..d7d845231312 100644
--- a/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts
+++ b/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts
@@ -566,17 +566,17 @@
&scmi_iomuxc {
pinctrl_emdio: emdiogrp{
fsl,pins = <
- IMX95_PAD_ENET1_MDC__NETCMIX_TOP_NETC_MDC 0x57e
- IMX95_PAD_ENET1_MDIO__NETCMIX_TOP_NETC_MDIO 0x97e
+ IMX95_PAD_ENET1_MDC__NETCMIX_TOP_NETC_MDC 0x50e
+ IMX95_PAD_ENET1_MDIO__NETCMIX_TOP_NETC_MDIO 0x90e
>;
};
pinctrl_enetc0: enetc0grp {
fsl,pins = <
- IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x57e
- IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x57e
- IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x57e
- IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x57e
+ IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x50e
+ IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x50e
+ IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x50e
+ IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x50e
IMX95_PAD_ENET1_TX_CTL__NETCMIX_TOP_ETH0_RGMII_TX_CTL 0x57e
IMX95_PAD_ENET1_TXC__NETCMIX_TOP_ETH0_RGMII_TX_CLK 0x58e
IMX95_PAD_ENET1_RX_CTL__NETCMIX_TOP_ETH0_RGMII_RX_CTL 0x57e
diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi
index 632631a29112..5aecdd9b62ff 100644
--- a/arch/arm64/boot/dts/freescale/imx95.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx95.dtsi
@@ -1708,7 +1708,7 @@
<0x9 0 1 0>;
reg-names = "dbi","atu", "dbi2", "app", "dma", "addr_space";
num-lanes = <1>;
- interrupts = <GIC_SPI 317 IRQ_TYPE_LEVEL_HIGH>;
+ interrupts = <GIC_SPI 311 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "dma";
clocks = <&scmi_clk IMX95_CLK_HSIO>,
<&scmi_clk IMX95_CLK_HSIOPLL>,
diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
index ae7a275fd223..cefecb7a23cf 100644
--- a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
+++ b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
@@ -1090,6 +1090,8 @@
};
&pmk8280_rtc {
+ qcom,uefi-rtc-info;
+
status = "okay";
};
diff --git a/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi b/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi
index c02fd4d15c96..e3888bc143a0 100644
--- a/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi
+++ b/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi
@@ -224,6 +224,7 @@
reg-names = "rtc", "alarm";
interrupts = <0x0 0x62 0x1 IRQ_TYPE_EDGE_RISING>;
qcom,no-alarm; /* alarm owned by ADSP */
+ qcom,uefi-rtc-info;
};
pmk8550_sdam_2: nvram@7100 {
diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
index ab232e5c7ad6..4203b335a263 100644
--- a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
+++ b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
@@ -379,6 +379,18 @@
<0 RK_PA7 RK_FUNC_GPIO &pcfg_pull_up>;
};
};
+
+ spi1 {
+ spi1_csn0_gpio_pin: spi1-csn0-gpio-pin {
+ rockchip,pins =
+ <3 RK_PB1 RK_FUNC_GPIO &pcfg_pull_up_4ma>;
+ };
+
+ spi1_csn1_gpio_pin: spi1-csn1-gpio-pin {
+ rockchip,pins =
+ <3 RK_PB2 RK_FUNC_GPIO &pcfg_pull_up_4ma>;
+ };
+ };
};
&pmu_io_domains {
@@ -396,6 +408,17 @@
vqmmc-supply = <&vccio_sd>;
};
+&spi1 {
+ /*
+ * Hardware CS has a very slow rise time of about 6us,
+ * causing transmission errors.
+ * With cs-gpios we have a rise time of about 20ns.
+ */
+ cs-gpios = <&gpio3 RK_PB1 GPIO_ACTIVE_LOW>, <&gpio3 RK_PB2 GPIO_ACTIVE_LOW>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&spi1_clk &spi1_csn0_gpio_pin &spi1_csn1_gpio_pin &spi1_miso &spi1_mosi>;
+};
+
&tsadc {
status = "okay";
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts
index 3c127c5c2607..a9021c524afb 100644
--- a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts
@@ -30,6 +30,7 @@
fan: gpio_fan {
compatible = "gpio-fan";
+ fan-supply = <&vcc12v_dcin>;
gpios = <&gpio0 RK_PD5 GPIO_ACTIVE_HIGH>;
gpio-fan,speed-map =
< 0 0>,
diff --git a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts
index 3b31f0dd8f3b..539edc3c535f 100644
--- a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts
@@ -29,7 +29,6 @@
function-enumerator = <1>;
gpios = <&gpio3 RK_PD6 GPIO_ACTIVE_HIGH>;
label = "LAN-1";
- linux,default-trigger = "netdev";
};
led-lan2 {
@@ -39,7 +38,6 @@
function-enumerator = <2>;
gpios = <&gpio3 RK_PD7 GPIO_ACTIVE_HIGH>;
label = "LAN-2";
- linux,default-trigger = "netdev";
};
power_led: led-sys {
@@ -56,7 +54,6 @@
function = LED_FUNCTION_WAN;
gpios = <&gpio2 RK_PC1 GPIO_ACTIVE_HIGH>;
label = "WAN";
- linux,default-trigger = "netdev";
};
};
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts b/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
index b09e789c75c4..801b40fea4e8 100644
--- a/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
@@ -211,10 +211,38 @@
status = "okay";
};
+&cpu_b0 {
+ cpu-supply = <&vdd_cpu_big_s0>;
+};
+
+&cpu_b1 {
+ cpu-supply = <&vdd_cpu_big_s0>;
+};
+
+&cpu_b2 {
+ cpu-supply = <&vdd_cpu_big_s0>;
+};
+
+&cpu_b3 {
+ cpu-supply = <&vdd_cpu_big_s0>;
+};
+
&cpu_l0 {
cpu-supply = <&vdd_cpu_lit_s0>;
};
+&cpu_l1 {
+ cpu-supply = <&vdd_cpu_lit_s0>;
+};
+
+&cpu_l2 {
+ cpu-supply = <&vdd_cpu_lit_s0>;
+};
+
+&cpu_l3 {
+ cpu-supply = <&vdd_cpu_lit_s0>;
+};
+
&gmac0 {
phy-mode = "rgmii-id";
clock_in_out = "output";
diff --git a/arch/arm64/boot/dts/rockchip/rk3576.dtsi b/arch/arm64/boot/dts/rockchip/rk3576.dtsi
index 1086482f0479..64812e3bcb61 100644
--- a/arch/arm64/boot/dts/rockchip/rk3576.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3576.dtsi
@@ -615,7 +615,7 @@
<0 0 0 2 &pcie1_intc 1>,
<0 0 0 3 &pcie1_intc 2>,
<0 0 0 4 &pcie1_intc 3>;
- linux,pci-domain = <0>;
+ linux,pci-domain = <1>;
max-link-speed = <2>;
num-ib-windows = <8>;
num-viewport = <8>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi
index 7f874c77410c..6584d73660f6 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi
@@ -578,14 +578,14 @@
hdmim0_tx0_scl: hdmim0-tx0-scl {
rockchip,pins =
/* hdmim0_tx0_scl */
- <4 RK_PB7 5 &pcfg_pull_none>;
+ <4 RK_PB7 5 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim0_tx0_sda: hdmim0-tx0-sda {
rockchip,pins =
/* hdmim0_tx0_sda */
- <4 RK_PC0 5 &pcfg_pull_none>;
+ <4 RK_PC0 5 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
@@ -640,14 +640,14 @@
hdmim1_tx0_scl: hdmim1-tx0-scl {
rockchip,pins =
/* hdmim1_tx0_scl */
- <0 RK_PD5 11 &pcfg_pull_none>;
+ <0 RK_PD5 11 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim1_tx0_sda: hdmim1-tx0-sda {
rockchip,pins =
/* hdmim1_tx0_sda */
- <0 RK_PD4 11 &pcfg_pull_none>;
+ <0 RK_PD4 11 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
@@ -668,14 +668,14 @@
hdmim1_tx1_scl: hdmim1-tx1-scl {
rockchip,pins =
/* hdmim1_tx1_scl */
- <3 RK_PC6 5 &pcfg_pull_none>;
+ <3 RK_PC6 5 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim1_tx1_sda: hdmim1-tx1-sda {
rockchip,pins =
/* hdmim1_tx1_sda */
- <3 RK_PC5 5 &pcfg_pull_none>;
+ <3 RK_PC5 5 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
hdmim2_rx_cec: hdmim2-rx-cec {
@@ -709,14 +709,14 @@
hdmim2_tx0_scl: hdmim2-tx0-scl {
rockchip,pins =
/* hdmim2_tx0_scl */
- <3 RK_PC7 5 &pcfg_pull_none>;
+ <3 RK_PC7 5 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim2_tx0_sda: hdmim2-tx0-sda {
rockchip,pins =
/* hdmim2_tx0_sda */
- <3 RK_PD0 5 &pcfg_pull_none>;
+ <3 RK_PD0 5 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
@@ -730,14 +730,14 @@
hdmim2_tx1_scl: hdmim2-tx1-scl {
rockchip,pins =
/* hdmim2_tx1_scl */
- <1 RK_PA4 5 &pcfg_pull_none>;
+ <1 RK_PA4 5 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim2_tx1_sda: hdmim2-tx1-sda {
rockchip,pins =
/* hdmim2_tx1_sda */
- <1 RK_PA3 5 &pcfg_pull_none>;
+ <1 RK_PA3 5 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi
index cc37f082adea..b07543315f87 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi
@@ -321,6 +321,7 @@
bus-width = <4>;
cap-mmc-highspeed;
cap-sd-highspeed;
+ cd-gpios = <&gpio0 RK_PA4 GPIO_ACTIVE_LOW>;
disable-wp;
max-frequency = <150000000>;
no-sdio;
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi
index 244c66faa161..fb48ddc04bcb 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi
@@ -160,14 +160,15 @@
hdmim0_tx1_scl: hdmim0-tx1-scl {
rockchip,pins =
/* hdmim0_tx1_scl */
- <2 RK_PB5 4 &pcfg_pull_none>;
+ <2 RK_PB5 4 &pcfg_pull_none_drv_level_3_smt>;
};
/omit-if-no-ref/
hdmim0_tx1_sda: hdmim0-tx1-sda {
rockchip,pins =
/* hdmim0_tx1_sda */
- <2 RK_PB4 4 &pcfg_pull_none>;
+ <2 RK_PB4 4 &pcfg_pull_none_drv_level_1_smt>;
+
};
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts b/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts
index 8b717c4017a4..b2947b36fada 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts
@@ -474,6 +474,7 @@
bus-width = <4>;
cap-mmc-highspeed;
cap-sd-highspeed;
+ cd-gpios = <&gpio0 RK_PA4 GPIO_ACTIVE_LOW>;
disable-wp;
max-frequency = <150000000>;
no-sdio;
diff --git a/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi b/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi
index 5c645437b507..b0475b7c655a 100644
--- a/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi
@@ -333,6 +333,41 @@
};
/omit-if-no-ref/
+ pcfg_pull_none_drv_level_1_smt: pcfg-pull-none-drv-level-1-smt {
+ bias-disable;
+ drive-strength = <1>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
+ pcfg_pull_none_drv_level_2_smt: pcfg-pull-none-drv-level-2-smt {
+ bias-disable;
+ drive-strength = <2>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
+ pcfg_pull_none_drv_level_3_smt: pcfg-pull-none-drv-level-3-smt {
+ bias-disable;
+ drive-strength = <3>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
+ pcfg_pull_none_drv_level_4_smt: pcfg-pull-none-drv-level-4-smt {
+ bias-disable;
+ drive-strength = <4>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
+ pcfg_pull_none_drv_level_5_smt: pcfg-pull-none-drv-level-5-smt {
+ bias-disable;
+ drive-strength = <5>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
pcfg_output_high: pcfg-output-high {
output-high;
};
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 7e04a2905ce4..03117405b6ce 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1444,6 +1444,7 @@ CONFIG_PLATFORM_MHU=y
CONFIG_BCM2835_MBOX=y
CONFIG_QCOM_APCS_IPC=y
CONFIG_MTK_ADSP_MBOX=m
+CONFIG_QCOM_CPUCP_MBOX=m
CONFIG_QCOM_IPCC=y
CONFIG_ROCKCHIP_IOMMU=y
CONFIG_TEGRA_IOMMU_SMMU=y
@@ -1743,8 +1744,6 @@ CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_ANSI_CPRNG=y
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_GHASH_ARM64_CE=y
-CONFIG_CRYPTO_SHA1_ARM64_CE=y
-CONFIG_CRYPTO_SHA512_ARM64_CE=m
CONFIG_CRYPTO_SHA3_ARM64=m
CONFIG_CRYPTO_SM3_ARM64_CE=m
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index c44b0f202a1f..3bb5b513d5ae 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -25,36 +25,6 @@ config CRYPTO_NHPOLY1305_NEON
Architecture: arm64 using:
- NEON (Advanced SIMD) extensions
-config CRYPTO_SHA1_ARM64_CE
- tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
- select CRYPTO_HASH
- select CRYPTO_SHA1
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: arm64 using:
- - ARMv8 Crypto Extensions
-
-config CRYPTO_SHA512_ARM64
- tristate "Hash functions: SHA-384 and SHA-512"
- select CRYPTO_HASH
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: arm64
-
-config CRYPTO_SHA512_ARM64_CE
- tristate "Hash functions: SHA-384 and SHA-512 (ARMv8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
- select CRYPTO_HASH
- select CRYPTO_SHA512_ARM64
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: arm64 using:
- - ARMv8 Crypto Extensions
-
config CRYPTO_SHA3_ARM64
tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index c231c980c514..a8b2cdbe202c 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -5,12 +5,6 @@
# Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
#
-obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
-sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
-
-obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o
-sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o
-
obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o
sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o
@@ -53,9 +47,6 @@ aes-ce-blk-y := aes-glue-ce.o aes-ce.o
obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
aes-neon-blk-y := aes-glue-neon.o aes-neon.o
-obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
-sha512-arm64-y := sha512-glue.o sha512-core.o
-
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
@@ -64,11 +55,3 @@ aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o
aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
-
-quiet_cmd_perlasm = PERLASM $@
- cmd_perlasm = $(PERL) $(<) void $(@)
-
-$(obj)/sha512-core.S: $(src)/../lib/crypto/sha2-armv8.pl
- $(call cmd,perlasm)
-
-clean-files += sha512-core.S
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
deleted file mode 100644
index 9b1f2d82a6fe..000000000000
--- a/arch/arm64/crypto/sha1-ce-core.S
+++ /dev/null
@@ -1,150 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a+crypto
-
- k0 .req v0
- k1 .req v1
- k2 .req v2
- k3 .req v3
-
- t0 .req v4
- t1 .req v5
-
- dga .req q6
- dgav .req v6
- dgb .req s7
- dgbv .req v7
-
- dg0q .req q12
- dg0s .req s12
- dg0v .req v12
- dg1s .req s13
- dg1v .req v13
- dg2s .req s14
-
- .macro add_only, op, ev, rc, s0, dg1
- .ifc \ev, ev
- add t1.4s, v\s0\().4s, \rc\().4s
- sha1h dg2s, dg0s
- .ifnb \dg1
- sha1\op dg0q, \dg1, t0.4s
- .else
- sha1\op dg0q, dg1s, t0.4s
- .endif
- .else
- .ifnb \s0
- add t0.4s, v\s0\().4s, \rc\().4s
- .endif
- sha1h dg1s, dg0s
- sha1\op dg0q, dg2s, t1.4s
- .endif
- .endm
-
- .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
- sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s
- add_only \op, \ev, \rc, \s1, \dg1
- sha1su1 v\s0\().4s, v\s3\().4s
- .endm
-
- .macro loadrc, k, val, tmp
- movz \tmp, :abs_g0_nc:\val
- movk \tmp, :abs_g1:\val
- dup \k, \tmp
- .endm
-
- /*
- * int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
- * int blocks)
- */
-SYM_FUNC_START(__sha1_ce_transform)
- /* load round constants */
- loadrc k0.4s, 0x5a827999, w6
- loadrc k1.4s, 0x6ed9eba1, w6
- loadrc k2.4s, 0x8f1bbcdc, w6
- loadrc k3.4s, 0xca62c1d6, w6
-
- /* load state */
- ld1 {dgav.4s}, [x0]
- ldr dgb, [x0, #16]
-
- /* load sha1_ce_state::finalize */
- ldr_l w4, sha1_ce_offsetof_finalize, x4
- ldr w4, [x0, x4]
-
- /* load input */
-0: ld1 {v8.4s-v11.4s}, [x1], #64
- sub w2, w2, #1
-
-CPU_LE( rev32 v8.16b, v8.16b )
-CPU_LE( rev32 v9.16b, v9.16b )
-CPU_LE( rev32 v10.16b, v10.16b )
-CPU_LE( rev32 v11.16b, v11.16b )
-
-1: add t0.4s, v8.4s, k0.4s
- mov dg0v.16b, dgav.16b
-
- add_update c, ev, k0, 8, 9, 10, 11, dgb
- add_update c, od, k0, 9, 10, 11, 8
- add_update c, ev, k0, 10, 11, 8, 9
- add_update c, od, k0, 11, 8, 9, 10
- add_update c, ev, k1, 8, 9, 10, 11
-
- add_update p, od, k1, 9, 10, 11, 8
- add_update p, ev, k1, 10, 11, 8, 9
- add_update p, od, k1, 11, 8, 9, 10
- add_update p, ev, k1, 8, 9, 10, 11
- add_update p, od, k2, 9, 10, 11, 8
-
- add_update m, ev, k2, 10, 11, 8, 9
- add_update m, od, k2, 11, 8, 9, 10
- add_update m, ev, k2, 8, 9, 10, 11
- add_update m, od, k2, 9, 10, 11, 8
- add_update m, ev, k3, 10, 11, 8, 9
-
- add_update p, od, k3, 11, 8, 9, 10
- add_only p, ev, k3, 9
- add_only p, od, k3, 10
- add_only p, ev, k3, 11
- add_only p, od
-
- /* update state */
- add dgbv.2s, dgbv.2s, dg1v.2s
- add dgav.4s, dgav.4s, dg0v.4s
-
- cbz w2, 2f
- cond_yield 3f, x5, x6
- b 0b
-
- /*
- * Final block: add padding and total bit count.
- * Skip if the input size was not a round multiple of the block size,
- * the padding is handled by the C code in that case.
- */
-2: cbz x4, 3f
- ldr_l w4, sha1_ce_offsetof_count, x4
- ldr x4, [x0, x4]
- movi v9.2d, #0
- mov x8, #0x80000000
- movi v10.2d, #0
- ror x7, x4, #29 // ror(lsl(x4, 3), 32)
- fmov d8, x8
- mov x4, #0
- mov v11.d[0], xzr
- mov v11.d[1], x7
- b 1b
-
- /* store new state */
-3: st1 {dgav.4s}, [x0]
- str dgb, [x0, #16]
- mov w0, w2
- ret
-SYM_FUNC_END(__sha1_ce_transform)
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
deleted file mode 100644
index 65b6980817e5..000000000000
--- a/arch/arm64/crypto/sha1-ce-glue.c
+++ /dev/null
@@ -1,118 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("sha1");
-
-struct sha1_ce_state {
- struct sha1_state sst;
- u32 finalize;
-};
-
-extern const u32 sha1_ce_offsetof_count;
-extern const u32 sha1_ce_offsetof_finalize;
-
-asmlinkage int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
- int blocks);
-
-static void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
- int blocks)
-{
- while (blocks) {
- int rem;
-
- kernel_neon_begin();
- rem = __sha1_ce_transform(container_of(sst,
- struct sha1_ce_state,
- sst), src, blocks);
- kernel_neon_end();
- src += (blocks - rem) * SHA1_BLOCK_SIZE;
- blocks = rem;
- }
-}
-
-const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count);
-const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize);
-
-static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- struct sha1_ce_state *sctx = shash_desc_ctx(desc);
-
- sctx->finalize = 0;
- return sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform);
-}
-
-static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- struct sha1_ce_state *sctx = shash_desc_ctx(desc);
- bool finalized = false;
-
- /*
- * Allow the asm code to perform the finalization if there is no
- * partial data and the input is a round multiple of the block size.
- */
- if (len >= SHA1_BLOCK_SIZE) {
- unsigned int remain = len - round_down(len, SHA1_BLOCK_SIZE);
-
- finalized = !remain;
- sctx->finalize = finalized;
- sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform);
- data += len - remain;
- len = remain;
- }
- if (!finalized) {
- sctx->finalize = 0;
- sha1_base_do_finup(desc, data, len, sha1_ce_transform);
- }
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .init = sha1_base_init,
- .update = sha1_ce_update,
- .finup = sha1_ce_finup,
- .descsize = sizeof(struct sha1_ce_state),
- .statesize = SHA1_STATE_SIZE,
- .digestsize = SHA1_DIGEST_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-ce",
- .cra_priority = 200,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init sha1_ce_mod_init(void)
-{
- return crypto_register_shash(&alg);
-}
-
-static void __exit sha1_ce_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_cpu_feature_match(SHA1, sha1_ce_mod_init);
-module_exit(sha1_ce_mod_fini);
diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S
deleted file mode 100644
index 91ef68b15fcc..000000000000
--- a/arch/arm64/crypto/sha512-ce-core.S
+++ /dev/null
@@ -1,206 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions
- *
- * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
- .set .Lq\b, \b
- .set .Lv\b\().2d, \b
- .endr
-
- .macro sha512h, rd, rn, rm
- .inst 0xce608000 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
- .endm
-
- .macro sha512h2, rd, rn, rm
- .inst 0xce608400 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
- .endm
-
- .macro sha512su0, rd, rn
- .inst 0xcec08000 | .L\rd | (.L\rn << 5)
- .endm
-
- .macro sha512su1, rd, rn, rm
- .inst 0xce608800 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
- .endm
-
- /*
- * The SHA-512 round constants
- */
- .section ".rodata", "a"
- .align 4
-.Lsha512_rcon:
- .quad 0x428a2f98d728ae22, 0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538, 0x59f111f1b605d019
- .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242, 0x12835b0145706fbe
- .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235, 0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
- .quad 0x983e5152ee66dfab, 0xa831c66d2db43210
- .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
- .quad 0x06ca6351e003826f, 0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
- .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6, 0x92722c851482353b
- .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
- .quad 0xd192e819d6ef5218, 0xd69906245565a910
- .quad 0xf40e35855771202a, 0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
- .quad 0x90befffa23631e28, 0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
- .quad 0xca273eceea26619c, 0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae, 0x1b710b35131c471b
- .quad 0x28db77f523047d84, 0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
-
- .macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4
- .ifnb \rc1
- ld1 {v\rc1\().2d}, [x4], #16
- .endif
- add v5.2d, v\rc0\().2d, v\in0\().2d
- ext v6.16b, v\i2\().16b, v\i3\().16b, #8
- ext v5.16b, v5.16b, v5.16b, #8
- ext v7.16b, v\i1\().16b, v\i2\().16b, #8
- add v\i3\().2d, v\i3\().2d, v5.2d
- .ifnb \in1
- ext v5.16b, v\in3\().16b, v\in4\().16b, #8
- sha512su0 v\in0\().2d, v\in1\().2d
- .endif
- sha512h q\i3, q6, v7.2d
- .ifnb \in1
- sha512su1 v\in0\().2d, v\in2\().2d, v5.2d
- .endif
- add v\i4\().2d, v\i1\().2d, v\i3\().2d
- sha512h2 q\i3, q\i1, v\i0\().2d
- .endm
-
- /*
- * int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
- * int blocks)
- */
- .text
-SYM_FUNC_START(__sha512_ce_transform)
- /* load state */
- ld1 {v8.2d-v11.2d}, [x0]
-
- /* load first 4 round constants */
- adr_l x3, .Lsha512_rcon
- ld1 {v20.2d-v23.2d}, [x3], #64
-
- /* load input */
-0: ld1 {v12.2d-v15.2d}, [x1], #64
- ld1 {v16.2d-v19.2d}, [x1], #64
- sub w2, w2, #1
-
-CPU_LE( rev64 v12.16b, v12.16b )
-CPU_LE( rev64 v13.16b, v13.16b )
-CPU_LE( rev64 v14.16b, v14.16b )
-CPU_LE( rev64 v15.16b, v15.16b )
-CPU_LE( rev64 v16.16b, v16.16b )
-CPU_LE( rev64 v17.16b, v17.16b )
-CPU_LE( rev64 v18.16b, v18.16b )
-CPU_LE( rev64 v19.16b, v19.16b )
-
- mov x4, x3 // rc pointer
-
- mov v0.16b, v8.16b
- mov v1.16b, v9.16b
- mov v2.16b, v10.16b
- mov v3.16b, v11.16b
-
- // v0 ab cd -- ef gh ab
- // v1 cd -- ef gh ab cd
- // v2 ef gh ab cd -- ef
- // v3 gh ab cd -- ef gh
- // v4 -- ef gh ab cd --
-
- dround 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17
- dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18
- dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19
- dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12
- dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13
-
- dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14
- dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15
- dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16
- dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17
- dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18
-
- dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19
- dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12
- dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13
- dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14
- dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15
-
- dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16
- dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17
- dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18
- dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19
- dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12
-
- dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13
- dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14
- dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15
- dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16
- dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17
-
- dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18
- dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19
- dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12
- dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13
- dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14
-
- dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15
- dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16
- dround 2, 3, 1, 4, 0, 28, 24, 12
- dround 4, 2, 0, 1, 3, 29, 25, 13
- dround 1, 4, 3, 0, 2, 30, 26, 14
-
- dround 0, 1, 2, 3, 4, 31, 27, 15
- dround 3, 0, 4, 2, 1, 24, , 16
- dround 2, 3, 1, 4, 0, 25, , 17
- dround 4, 2, 0, 1, 3, 26, , 18
- dround 1, 4, 3, 0, 2, 27, , 19
-
- /* update state */
- add v8.2d, v8.2d, v0.2d
- add v9.2d, v9.2d, v1.2d
- add v10.2d, v10.2d, v2.2d
- add v11.2d, v11.2d, v3.2d
-
- cond_yield 3f, x4, x5
- /* handled all input blocks? */
- cbnz w2, 0b
-
- /* store new state */
-3: st1 {v8.2d-v11.2d}, [x0]
- mov w0, w2
- ret
-SYM_FUNC_END(__sha512_ce_transform)
diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c
deleted file mode 100644
index 6fb3001fa2c9..000000000000
--- a/arch/arm64/crypto/sha512-ce-glue.c
+++ /dev/null
@@ -1,96 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * sha512-ce-glue.c - SHA-384/SHA-512 using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha512");
-
-asmlinkage int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
- int blocks);
-
-static void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
- int blocks)
-{
- do {
- int rem;
-
- kernel_neon_begin();
- rem = __sha512_ce_transform(sst, src, blocks);
- kernel_neon_end();
- src += (blocks - rem) * SHA512_BLOCK_SIZE;
- blocks = rem;
- } while (blocks);
-}
-
-static int sha512_ce_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_base_do_update_blocks(desc, data, len,
- sha512_ce_transform);
-}
-
-static int sha512_ce_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- sha512_base_do_finup(desc, data, len, sha512_ce_transform);
- return sha512_base_finish(desc, out);
-}
-
-static struct shash_alg algs[] = { {
- .init = sha384_base_init,
- .update = sha512_ce_update,
- .finup = sha512_ce_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA384_DIGEST_SIZE,
- .base.cra_name = "sha384",
- .base.cra_driver_name = "sha384-ce",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .base.cra_blocksize = SHA512_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-}, {
- .init = sha512_base_init,
- .update = sha512_ce_update,
- .finup = sha512_ce_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA512_DIGEST_SIZE,
- .base.cra_name = "sha512",
- .base.cra_driver_name = "sha512-ce",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .base.cra_blocksize = SHA512_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-} };
-
-static int __init sha512_ce_mod_init(void)
-{
- return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit sha512_ce_mod_fini(void)
-{
- crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_cpu_feature_match(SHA512, sha512_ce_mod_init);
-module_exit(sha512_ce_mod_fini);
diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c
deleted file mode 100644
index 15aa9d8b7b2c..000000000000
--- a/arch/arm64/crypto/sha512-glue.c
+++ /dev/null
@@ -1,83 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Linux/arm64 port of the OpenSSL SHA512 implementation for AArch64
- *
- * Copyright (c) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash for arm64");
-MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha512");
-
-asmlinkage void sha512_blocks_arch(u64 *digest, const void *data,
- unsigned int num_blks);
-
-static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src,
- int blocks)
-{
- sha512_blocks_arch(sst->state, src, blocks);
-}
-
-static int sha512_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_base_do_update_blocks(desc, data, len,
- sha512_arm64_transform);
-}
-
-static int sha512_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- sha512_base_do_finup(desc, data, len, sha512_arm64_transform);
- return sha512_base_finish(desc, out);
-}
-
-static struct shash_alg algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_update,
- .finup = sha512_finup,
- .descsize = SHA512_STATE_SIZE,
- .base.cra_name = "sha512",
- .base.cra_driver_name = "sha512-arm64",
- .base.cra_priority = 150,
- .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .base.cra_blocksize = SHA512_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_update,
- .finup = sha512_finup,
- .descsize = SHA512_STATE_SIZE,
- .base.cra_name = "sha384",
- .base.cra_driver_name = "sha384-arm64",
- .base.cra_priority = 150,
- .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .base.cra_blocksize = SHA384_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-} };
-
-static int __init sha512_mod_init(void)
-{
- return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit sha512_mod_fini(void)
-{
- crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_init(sha512_mod_init);
-module_exit(sha512_mod_fini);
diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index a407f9cd549e..c07a58b96329 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -150,7 +150,7 @@ acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor)
{}
#endif
-static inline const char *acpi_get_enable_method(int cpu)
+static __always_inline const char *acpi_get_enable_method(int cpu)
{
if (acpi_psci_present())
return "psci";
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index ad63457a05c5..c56c21bb1eec 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -41,6 +41,11 @@
/*
* Save/restore interrupts.
*/
+ .macro save_and_disable_daif, flags
+ mrs \flags, daif
+ msr daifset, #0xf
+ .endm
+
.macro save_and_disable_irq, flags
mrs \flags, daif
msr daifset, #3
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 5ae2a34b50bd..fe62218b8e0e 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -614,7 +614,7 @@ SYM_CODE_END(ret_to_kernel)
SYM_CODE_START_LOCAL(ret_to_user)
ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step
enable_step_tsk x19, x2
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
bl stackleak_erase_on_task_stack
#endif
kernel_exit 0
@@ -825,6 +825,7 @@ SYM_CODE_END(__bp_harden_el1_vectors)
*
*/
SYM_FUNC_START(cpu_switch_to)
+ save_and_disable_daif x11
mov x10, #THREAD_CPU_CONTEXT
add x8, x0, x10
mov x9, sp
@@ -848,6 +849,7 @@ SYM_FUNC_START(cpu_switch_to)
ptrauth_keys_install_kernel x1, x8, x9, x10
scs_save x0
scs_load_current
+ restore_irq x11
ret
SYM_FUNC_END(cpu_switch_to)
NOKPROBE(cpu_switch_to)
@@ -874,6 +876,7 @@ NOKPROBE(ret_from_fork)
* Calls func(regs) using this CPU's irq stack and shadow irq stack.
*/
SYM_FUNC_START(call_on_irq_stack)
+ save_and_disable_daif x9
#ifdef CONFIG_SHADOW_CALL_STACK
get_current_task x16
scs_save x16
@@ -888,8 +891,10 @@ SYM_FUNC_START(call_on_irq_stack)
/* Move to the new stack and call the function there */
add sp, x16, #IRQ_STACK_SIZE
+ restore_irq x9
blr x1
+ save_and_disable_daif x9
/*
* Restore the SP from the FP, and restore the FP and LR from the frame
* record.
@@ -897,6 +902,7 @@ SYM_FUNC_START(call_on_irq_stack)
mov sp, x29
ldp x29, x30, [sp], #16
scs_load_current
+ restore_irq x9
ret
SYM_FUNC_END(call_on_irq_stack)
NOKPROBE(call_on_irq_stack)
diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile
index 4d11a8c29181..f440bf57b1a5 100644
--- a/arch/arm64/kernel/pi/Makefile
+++ b/arch/arm64/kernel/pi/Makefile
@@ -2,7 +2,7 @@
# Copyright 2022 Google LLC
KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \
- -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \
+ -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_KSTACK_ERASE) \
$(DISABLE_LATENT_ENTROPY_PLUGIN) \
$(call cc-option,-mbranch-protection=none) \
-I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index ee94b72bf8fb..4b001121c72d 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -1586,7 +1586,7 @@ enum aarch64_regset {
static const struct user_regset aarch64_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(struct user_pt_regs) / sizeof(u64),
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1594,7 +1594,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = gpr_set
},
[REGSET_FPR] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_fpsimd_state) / sizeof(u32),
/*
* We pretend we have 32-bit registers because the fpsr and
@@ -1607,7 +1607,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = fpr_set
},
[REGSET_TLS] = {
- .core_note_type = NT_ARM_TLS,
+ USER_REGSET_NOTE_TYPE(ARM_TLS),
.n = 2,
.size = sizeof(void *),
.align = sizeof(void *),
@@ -1616,7 +1616,7 @@ static const struct user_regset aarch64_regsets[] = {
},
#ifdef CONFIG_HAVE_HW_BREAKPOINT
[REGSET_HW_BREAK] = {
- .core_note_type = NT_ARM_HW_BREAK,
+ USER_REGSET_NOTE_TYPE(ARM_HW_BREAK),
.n = sizeof(struct user_hwdebug_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1624,7 +1624,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = hw_break_set,
},
[REGSET_HW_WATCH] = {
- .core_note_type = NT_ARM_HW_WATCH,
+ USER_REGSET_NOTE_TYPE(ARM_HW_WATCH),
.n = sizeof(struct user_hwdebug_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1633,7 +1633,7 @@ static const struct user_regset aarch64_regsets[] = {
},
#endif
[REGSET_SYSTEM_CALL] = {
- .core_note_type = NT_ARM_SYSTEM_CALL,
+ USER_REGSET_NOTE_TYPE(ARM_SYSTEM_CALL),
.n = 1,
.size = sizeof(int),
.align = sizeof(int),
@@ -1641,7 +1641,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = system_call_set,
},
[REGSET_FPMR] = {
- .core_note_type = NT_ARM_FPMR,
+ USER_REGSET_NOTE_TYPE(ARM_FPMR),
.n = 1,
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1650,7 +1650,7 @@ static const struct user_regset aarch64_regsets[] = {
},
#ifdef CONFIG_ARM64_SVE
[REGSET_SVE] = { /* Scalable Vector Extension */
- .core_note_type = NT_ARM_SVE,
+ USER_REGSET_NOTE_TYPE(ARM_SVE),
.n = DIV_ROUND_UP(SVE_PT_SIZE(ARCH_SVE_VQ_MAX,
SVE_PT_REGS_SVE),
SVE_VQ_BYTES),
@@ -1662,7 +1662,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_SME
[REGSET_SSVE] = { /* Streaming mode SVE */
- .core_note_type = NT_ARM_SSVE,
+ USER_REGSET_NOTE_TYPE(ARM_SSVE),
.n = DIV_ROUND_UP(SVE_PT_SIZE(SME_VQ_MAX, SVE_PT_REGS_SVE),
SVE_VQ_BYTES),
.size = SVE_VQ_BYTES,
@@ -1671,7 +1671,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = ssve_set,
},
[REGSET_ZA] = { /* SME ZA */
- .core_note_type = NT_ARM_ZA,
+ USER_REGSET_NOTE_TYPE(ARM_ZA),
/*
* ZA is a single register but it's variably sized and
* the ptrace core requires that the size of any data
@@ -1687,7 +1687,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = za_set,
},
[REGSET_ZT] = { /* SME ZT */
- .core_note_type = NT_ARM_ZT,
+ USER_REGSET_NOTE_TYPE(ARM_ZT),
.n = 1,
.size = ZT_SIG_REG_BYTES,
.align = sizeof(u64),
@@ -1697,7 +1697,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_PTR_AUTH
[REGSET_PAC_MASK] = {
- .core_note_type = NT_ARM_PAC_MASK,
+ USER_REGSET_NOTE_TYPE(ARM_PAC_MASK),
.n = sizeof(struct user_pac_mask) / sizeof(u64),
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1705,7 +1705,7 @@ static const struct user_regset aarch64_regsets[] = {
/* this cannot be set dynamically */
},
[REGSET_PAC_ENABLED_KEYS] = {
- .core_note_type = NT_ARM_PAC_ENABLED_KEYS,
+ USER_REGSET_NOTE_TYPE(ARM_PAC_ENABLED_KEYS),
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
@@ -1714,7 +1714,7 @@ static const struct user_regset aarch64_regsets[] = {
},
#ifdef CONFIG_CHECKPOINT_RESTORE
[REGSET_PACA_KEYS] = {
- .core_note_type = NT_ARM_PACA_KEYS,
+ USER_REGSET_NOTE_TYPE(ARM_PACA_KEYS),
.n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t),
.size = sizeof(__uint128_t),
.align = sizeof(__uint128_t),
@@ -1722,7 +1722,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = pac_address_keys_set,
},
[REGSET_PACG_KEYS] = {
- .core_note_type = NT_ARM_PACG_KEYS,
+ USER_REGSET_NOTE_TYPE(ARM_PACG_KEYS),
.n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t),
.size = sizeof(__uint128_t),
.align = sizeof(__uint128_t),
@@ -1733,7 +1733,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
[REGSET_TAGGED_ADDR_CTRL] = {
- .core_note_type = NT_ARM_TAGGED_ADDR_CTRL,
+ USER_REGSET_NOTE_TYPE(ARM_TAGGED_ADDR_CTRL),
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
@@ -1743,7 +1743,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_POE
[REGSET_POE] = {
- .core_note_type = NT_ARM_POE,
+ USER_REGSET_NOTE_TYPE(ARM_POE),
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
@@ -1753,7 +1753,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_GCS
[REGSET_GCS] = {
- .core_note_type = NT_ARM_GCS,
+ USER_REGSET_NOTE_TYPE(ARM_GCS),
.n = sizeof(struct user_gcs) / sizeof(u64),
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1943,7 +1943,7 @@ static int compat_tls_set(struct task_struct *target,
static const struct user_regset aarch32_regsets[] = {
[REGSET_COMPAT_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = COMPAT_ELF_NGREG,
.size = sizeof(compat_elf_greg_t),
.align = sizeof(compat_elf_greg_t),
@@ -1951,7 +1951,7 @@ static const struct user_regset aarch32_regsets[] = {
.set = compat_gpr_set
},
[REGSET_COMPAT_VFP] = {
- .core_note_type = NT_ARM_VFP,
+ USER_REGSET_NOTE_TYPE(ARM_VFP),
.n = VFP_STATE_SIZE / sizeof(compat_ulong_t),
.size = sizeof(compat_ulong_t),
.align = sizeof(compat_ulong_t),
@@ -1968,7 +1968,7 @@ static const struct user_regset_view user_aarch32_view = {
static const struct user_regset aarch32_ptrace_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = COMPAT_ELF_NGREG,
.size = sizeof(compat_elf_greg_t),
.align = sizeof(compat_elf_greg_t),
@@ -1976,7 +1976,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
.set = compat_gpr_set
},
[REGSET_FPR] = {
- .core_note_type = NT_ARM_VFP,
+ USER_REGSET_NOTE_TYPE(ARM_VFP),
.n = VFP_STATE_SIZE / sizeof(compat_ulong_t),
.size = sizeof(compat_ulong_t),
.align = sizeof(compat_ulong_t),
@@ -1984,7 +1984,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
.set = compat_vfp_set
},
[REGSET_TLS] = {
- .core_note_type = NT_ARM_TLS,
+ USER_REGSET_NOTE_TYPE(ARM_TLS),
.n = 1,
.size = sizeof(compat_ulong_t),
.align = sizeof(compat_ulong_t),
@@ -1993,7 +1993,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
},
#ifdef CONFIG_HAVE_HW_BREAKPOINT
[REGSET_HW_BREAK] = {
- .core_note_type = NT_ARM_HW_BREAK,
+ USER_REGSET_NOTE_TYPE(ARM_HW_BREAK),
.n = sizeof(struct user_hwdebug_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -2001,7 +2001,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
.set = hw_break_set,
},
[REGSET_HW_WATCH] = {
- .core_note_type = NT_ARM_HW_WATCH,
+ USER_REGSET_NOTE_TYPE(ARM_HW_WATCH),
.n = sizeof(struct user_hwdebug_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -2010,7 +2010,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
},
#endif
[REGSET_SYSTEM_CALL] = {
- .core_note_type = NT_ARM_SYSTEM_CALL,
+ USER_REGSET_NOTE_TYPE(ARM_SYSTEM_CALL),
.n = 1,
.size = sizeof(int),
.align = sizeof(int),
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index 5e27e46aa496..7dec05dd33b7 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -36,7 +36,8 @@ ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
# -Wmissing-prototypes and -Wmissing-declarations are removed from
# the CFLAGS to make possible to build the kernel with CONFIG_WERROR enabled.
CC_FLAGS_REMOVE_VDSO := $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \
- $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \
+ $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) \
+ $(GCC_PLUGINS_CFLAGS) \
$(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \
-Wmissing-prototypes -Wmissing-declarations
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index a76522d63c3e..0b0a68b663d4 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -12,7 +12,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__
ccflags-y += -fno-stack-protector \
-DDISABLE_BRANCH_PROFILING \
- $(DISABLE_STACKLEAK_PLUGIN)
+ $(DISABLE_KSTACK_ERASE)
hostprogs := gen-hyprel
HOST_EXTRACFLAGS += -I$(objtree)/include
diff --git a/arch/arm64/lib/.gitignore b/arch/arm64/lib/.gitignore
new file mode 100644
index 000000000000..647d7a922e68
--- /dev/null
+++ b/arch/arm64/lib/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 027bfa9689c6..633e5223d944 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,7 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-
-obj-y += crypto/
-
lib-y := clear_user.o delay.o copy_from_user.o \
copy_to_user.o copy_page.o \
clear_page.o csum.o insn.o memchr.o memcpy.o \
@@ -16,12 +13,6 @@ endif
lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
-obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o
-crc32-arm64-y := crc32.o crc32-core.o
-
-obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm64.o
-crc-t10dif-arm64-y := crc-t10dif.o crc-t10dif-core.o
-
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
obj-$(CONFIG_ARM64_MTE) += mte.o
diff --git a/arch/arm64/lib/crc-t10dif-core.S b/arch/arm64/lib/crc-t10dif-core.S
deleted file mode 100644
index 87dd6d46224d..000000000000
--- a/arch/arm64/lib/crc-t10dif-core.S
+++ /dev/null
@@ -1,469 +0,0 @@
-//
-// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
-//
-// Copyright (C) 2016 Linaro Ltd
-// Copyright (C) 2019-2024 Google LLC
-//
-// Authors: Ard Biesheuvel <ardb@google.com>
-// Eric Biggers <ebiggers@google.com>
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License version 2 as
-// published by the Free Software Foundation.
-//
-
-// Derived from the x86 version:
-//
-// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
-//
-// Copyright (c) 2013, Intel Corporation
-//
-// Authors:
-// Erdinc Ozturk <erdinc.ozturk@intel.com>
-// Vinodh Gopal <vinodh.gopal@intel.com>
-// James Guilford <james.guilford@intel.com>
-// Tim Chen <tim.c.chen@linux.intel.com>
-//
-// This software is available to you under a choice of one of two
-// licenses. You may choose to be licensed under the terms of the GNU
-// General Public License (GPL) Version 2, available from the file
-// COPYING in the main directory of this source tree, or the
-// OpenIB.org BSD license below:
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the
-// distribution.
-//
-// * Neither the name of the Intel Corporation nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-//
-// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Reference paper titled "Fast CRC Computation for Generic
-// Polynomials Using PCLMULQDQ Instruction"
-// URL: http://www.intel.com/content/dam/www/public/us/en/documents
-// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-//
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a+crypto
-
- init_crc .req w0
- buf .req x1
- len .req x2
- fold_consts_ptr .req x5
-
- fold_consts .req v10
-
- t3 .req v17
- t4 .req v18
- t5 .req v19
- t6 .req v20
- t7 .req v21
- t8 .req v22
-
- perm .req v27
-
- .macro pmull16x64_p64, a16, b64, c64
- pmull2 \c64\().1q, \a16\().2d, \b64\().2d
- pmull \b64\().1q, \a16\().1d, \b64\().1d
- .endm
-
- /*
- * Pairwise long polynomial multiplication of two 16-bit values
- *
- * { w0, w1 }, { y0, y1 }
- *
- * by two 64-bit values
- *
- * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
- *
- * where each vector element is a byte, ordered from least to most
- * significant.
- *
- * This can be implemented using 8x8 long polynomial multiplication, by
- * reorganizing the input so that each pairwise 8x8 multiplication
- * produces one of the terms from the decomposition below, and
- * combining the results of each rank and shifting them into place.
- *
- * Rank
- * 0 w0*x0 ^ | y0*z0 ^
- * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
- * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
- * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
- * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
- * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
- * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
- * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
- * 8 w1*x7 << 64 | y1*z7 << 64
- *
- * The inputs can be reorganized into
- *
- * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
- * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
- *
- * and after performing 8x8->16 bit long polynomial multiplication of
- * each of the halves of the first vector with those of the second one,
- * we obtain the following four vectors of 16-bit elements:
- *
- * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
- * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
- * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
- * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
- *
- * Results b and c can be XORed together, as the vector elements have
- * matching ranks. Then, the final XOR (*) can be pulled forward, and
- * applied between the halves of each of the remaining three vectors,
- * which are then shifted into place, and combined to produce two
- * 80-bit results.
- *
- * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
- * to the 64x64 bit one above, but XOR'ing the outputs together will
- * produce the expected result, and this is sufficient in the context of
- * this algorithm.
- */
- .macro pmull16x64_p8, a16, b64, c64
- ext t7.16b, \b64\().16b, \b64\().16b, #1
- tbl t5.16b, {\a16\().16b}, perm.16b
- uzp1 t7.16b, \b64\().16b, t7.16b
- bl __pmull_p8_16x64
- ext \b64\().16b, t4.16b, t4.16b, #15
- eor \c64\().16b, t8.16b, t5.16b
- .endm
-
-SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
- ext t6.16b, t5.16b, t5.16b, #8
-
- pmull t3.8h, t7.8b, t5.8b
- pmull t4.8h, t7.8b, t6.8b
- pmull2 t5.8h, t7.16b, t5.16b
- pmull2 t6.8h, t7.16b, t6.16b
-
- ext t8.16b, t3.16b, t3.16b, #8
- eor t4.16b, t4.16b, t6.16b
- ext t7.16b, t5.16b, t5.16b, #8
- ext t6.16b, t4.16b, t4.16b, #8
- eor t8.8b, t8.8b, t3.8b
- eor t5.8b, t5.8b, t7.8b
- eor t4.8b, t4.8b, t6.8b
- ext t5.16b, t5.16b, t5.16b, #14
- ret
-SYM_FUNC_END(__pmull_p8_16x64)
-
-
- // Fold reg1, reg2 into the next 32 data bytes, storing the result back
- // into reg1, reg2.
- .macro fold_32_bytes, p, reg1, reg2
- ldp q11, q12, [buf], #0x20
-
- pmull16x64_\p fold_consts, \reg1, v8
-
-CPU_LE( rev64 v11.16b, v11.16b )
-CPU_LE( rev64 v12.16b, v12.16b )
-
- pmull16x64_\p fold_consts, \reg2, v9
-
-CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
-CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
-
- eor \reg1\().16b, \reg1\().16b, v8.16b
- eor \reg2\().16b, \reg2\().16b, v9.16b
- eor \reg1\().16b, \reg1\().16b, v11.16b
- eor \reg2\().16b, \reg2\().16b, v12.16b
- .endm
-
- // Fold src_reg into dst_reg, optionally loading the next fold constants
- .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
- pmull16x64_\p fold_consts, \src_reg, v8
- .ifnb \load_next_consts
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- .endif
- eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
- eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
- .endm
-
- .macro crc_t10dif_pmull, p
-
- // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
- cmp len, #256
- b.lt .Lless_than_256_bytes_\@
-
- adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts
-
- // Load the first 128 data bytes. Byte swapping is necessary to make
- // the bit order match the polynomial coefficient order.
- ldp q0, q1, [buf]
- ldp q2, q3, [buf, #0x20]
- ldp q4, q5, [buf, #0x40]
- ldp q6, q7, [buf, #0x60]
- add buf, buf, #0x80
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( rev64 v1.16b, v1.16b )
-CPU_LE( rev64 v2.16b, v2.16b )
-CPU_LE( rev64 v3.16b, v3.16b )
-CPU_LE( rev64 v4.16b, v4.16b )
-CPU_LE( rev64 v5.16b, v5.16b )
-CPU_LE( rev64 v6.16b, v6.16b )
-CPU_LE( rev64 v7.16b, v7.16b )
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
-CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
-CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
-CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
-CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
-CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
-CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
-
- // XOR the first 16 data *bits* with the initial CRC value.
- movi v8.16b, #0
- mov v8.h[7], init_crc
- eor v0.16b, v0.16b, v8.16b
-
- // Load the constants for folding across 128 bytes.
- ld1 {fold_consts.2d}, [fold_consts_ptr]
-
- // Subtract 128 for the 128 data bytes just consumed. Subtract another
- // 128 to simplify the termination condition of the following loop.
- sub len, len, #256
-
- // While >= 128 data bytes remain (not counting v0-v7), fold the 128
- // bytes v0-v7 into them, storing the result back into v0-v7.
-.Lfold_128_bytes_loop_\@:
- fold_32_bytes \p, v0, v1
- fold_32_bytes \p, v2, v3
- fold_32_bytes \p, v4, v5
- fold_32_bytes \p, v6, v7
-
- subs len, len, #128
- b.ge .Lfold_128_bytes_loop_\@
-
- // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
-
- // Fold across 64 bytes.
- add fold_consts_ptr, fold_consts_ptr, #16
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- fold_16_bytes \p, v0, v4
- fold_16_bytes \p, v1, v5
- fold_16_bytes \p, v2, v6
- fold_16_bytes \p, v3, v7, 1
- // Fold across 32 bytes.
- fold_16_bytes \p, v4, v6
- fold_16_bytes \p, v5, v7, 1
- // Fold across 16 bytes.
- fold_16_bytes \p, v6, v7
-
- // Add 128 to get the correct number of data bytes remaining in 0...127
- // (not counting v7), following the previous extra subtraction by 128.
- // Then subtract 16 to simplify the termination condition of the
- // following loop.
- adds len, len, #(128-16)
-
- // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
- // into them, storing the result back into v7.
- b.lt .Lfold_16_bytes_loop_done_\@
-.Lfold_16_bytes_loop_\@:
- pmull16x64_\p fold_consts, v7, v8
- eor v7.16b, v7.16b, v8.16b
- ldr q0, [buf], #16
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
- eor v7.16b, v7.16b, v0.16b
- subs len, len, #16
- b.ge .Lfold_16_bytes_loop_\@
-
-.Lfold_16_bytes_loop_done_\@:
- // Add 16 to get the correct number of data bytes remaining in 0...15
- // (not counting v7), following the previous extra subtraction by 16.
- adds len, len, #16
- b.eq .Lreduce_final_16_bytes_\@
-
-.Lhandle_partial_segment_\@:
- // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
- // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To
- // do this without needing a fold constant for each possible 'len',
- // redivide the bytes into a first chunk of 'len' bytes and a second
- // chunk of 16 bytes, then fold the first chunk into the second.
-
- // v0 = last 16 original data bytes
- add buf, buf, len
- ldr q0, [buf, #-16]
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
-
- // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
- adr_l x4, .Lbyteshift_table + 16
- sub x4, x4, len
- ld1 {v2.16b}, [x4]
- tbl v1.16b, {v7.16b}, v2.16b
-
- // v3 = first chunk: v7 right-shifted by '16-len' bytes.
- movi v3.16b, #0x80
- eor v2.16b, v2.16b, v3.16b
- tbl v3.16b, {v7.16b}, v2.16b
-
- // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
- sshr v2.16b, v2.16b, #7
-
- // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
- // then '16-len' bytes from v1 (high-order bytes).
- bsl v2.16b, v1.16b, v0.16b
-
- // Fold the first chunk into the second chunk, storing the result in v7.
- pmull16x64_\p fold_consts, v3, v0
- eor v7.16b, v3.16b, v0.16b
- eor v7.16b, v7.16b, v2.16b
- b .Lreduce_final_16_bytes_\@
-
-.Lless_than_256_bytes_\@:
- // Checksumming a buffer of length 16...255 bytes
-
- adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts
-
- // Load the first 16 data bytes.
- ldr q7, [buf], #0x10
-CPU_LE( rev64 v7.16b, v7.16b )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
-
- // XOR the first 16 data *bits* with the initial CRC value.
- movi v0.16b, #0
- mov v0.h[7], init_crc
- eor v7.16b, v7.16b, v0.16b
-
- // Load the fold-across-16-bytes constants.
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
-
- cmp len, #16
- b.eq .Lreduce_final_16_bytes_\@ // len == 16
- subs len, len, #32
- b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
- add len, len, #16
- b .Lhandle_partial_segment_\@ // 17 <= len <= 31
-
-.Lreduce_final_16_bytes_\@:
- .endm
-
-//
-// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-SYM_FUNC_START(crc_t10dif_pmull_p8)
- frame_push 1
-
- // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
- movi perm.4h, #8, lsl #8
- orr perm.2s, #1, lsl #16
- orr perm.2s, #1, lsl #24
- zip1 perm.16b, perm.16b, perm.16b
- zip1 perm.16b, perm.16b, perm.16b
-
- crc_t10dif_pmull p8
-
-CPU_LE( rev64 v7.16b, v7.16b )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
- str q7, [x3]
-
- frame_pop
- ret
-SYM_FUNC_END(crc_t10dif_pmull_p8)
-
- .align 5
-//
-// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-SYM_FUNC_START(crc_t10dif_pmull_p64)
- crc_t10dif_pmull p64
-
- // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
-
- movi v2.16b, #0 // init zero register
-
- // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
-
- // Fold the high 64 bits into the low 64 bits, while also multiplying by
- // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
- // whose low 48 bits are 0.
- ext v0.16b, v2.16b, v7.16b, #8
- pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x))
- eor v0.16b, v0.16b, v7.16b // + low bits * x^64
-
- // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
- // value congruent to x^64 * M(x) and whose low 48 bits are 0.
- ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
- mov v0.s[3], v2.s[0] // zero high 32 bits
- pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x))
- eor v0.16b, v0.16b, v1.16b // + low bits
-
- // Load G(x) and floor(x^48 / G(x)).
- ld1 {fold_consts.2d}, [fold_consts_ptr]
-
- // Use Barrett reduction to compute the final CRC value.
- pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x))
- ushr v1.2d, v1.2d, #32 // /= x^32
- pmull v1.1q, v1.1d, fold_consts.1d // *= G(x)
- ushr v0.2d, v0.2d, #48
- eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
- // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
-
- umov w0, v0.h[0]
- ret
-SYM_FUNC_END(crc_t10dif_pmull_p64)
-
- .section ".rodata", "a"
- .align 4
-
-// Fold constants precomputed from the polynomial 0x18bb7
-// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
-.Lfold_across_128_bytes_consts:
- .quad 0x0000000000006123 // x^(8*128) mod G(x)
- .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
-// .Lfold_across_64_bytes_consts:
- .quad 0x0000000000001069 // x^(4*128) mod G(x)
- .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
-// .Lfold_across_32_bytes_consts:
- .quad 0x000000000000857d // x^(2*128) mod G(x)
- .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
-.Lfold_across_16_bytes_consts:
- .quad 0x000000000000a010 // x^(1*128) mod G(x)
- .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
-// .Lfinal_fold_consts:
- .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
- .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
-// .Lbarrett_reduction_consts:
- .quad 0x0000000000018bb7 // G(x)
- .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
-
-// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
-// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
-// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
-.Lbyteshift_table:
- .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
- .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
- .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
- .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
diff --git a/arch/arm64/lib/crc-t10dif.c b/arch/arm64/lib/crc-t10dif.c
deleted file mode 100644
index c2ffe4fdb59d..000000000000
--- a/arch/arm64/lib/crc-t10dif.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/cpufeature.h>
-#include <linux/crc-t10dif.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/simd.h>
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
-
-#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-
-asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
- u8 out[16]);
-asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
-
-u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
-{
- if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
- if (static_branch_likely(&have_pmull)) {
- if (crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc_t10dif_pmull_p64(crc, data, length);
- kernel_neon_end();
- return crc;
- }
- } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
- static_branch_likely(&have_asimd) &&
- crypto_simd_usable()) {
- u8 buf[16];
-
- kernel_neon_begin();
- crc_t10dif_pmull_p8(crc, data, length, buf);
- kernel_neon_end();
-
- return crc_t10dif_generic(0, buf, sizeof(buf));
- }
- }
- return crc_t10dif_generic(crc, data, length);
-}
-EXPORT_SYMBOL(crc_t10dif_arch);
-
-static int __init crc_t10dif_arm64_init(void)
-{
- if (cpu_have_named_feature(ASIMD)) {
- static_branch_enable(&have_asimd);
- if (cpu_have_named_feature(PMULL))
- static_branch_enable(&have_pmull);
- }
- return 0;
-}
-subsys_initcall(crc_t10dif_arm64_init);
-
-static void __exit crc_t10dif_arm64_exit(void)
-{
-}
-module_exit(crc_t10dif_arm64_exit);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/lib/crc32-core.S b/arch/arm64/lib/crc32-core.S
deleted file mode 100644
index 68825317460f..000000000000
--- a/arch/arm64/lib/crc32-core.S
+++ /dev/null
@@ -1,362 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions
- *
- * Copyright (C) 2016 - 2018 Linaro Ltd.
- * Copyright (C) 2024 Google LLC
- *
- * Author: Ard Biesheuvel <ardb@kernel.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .cpu generic+crc+crypto
-
- .macro bitle, reg
- .endm
-
- .macro bitbe, reg
- rbit \reg, \reg
- .endm
-
- .macro bytele, reg
- .endm
-
- .macro bytebe, reg
- rbit \reg, \reg
- lsr \reg, \reg, #24
- .endm
-
- .macro hwordle, reg
-CPU_BE( rev16 \reg, \reg )
- .endm
-
- .macro hwordbe, reg
-CPU_LE( rev \reg, \reg )
- rbit \reg, \reg
-CPU_BE( lsr \reg, \reg, #16 )
- .endm
-
- .macro le, regs:vararg
- .irp r, \regs
-CPU_BE( rev \r, \r )
- .endr
- .endm
-
- .macro be, regs:vararg
- .irp r, \regs
-CPU_LE( rev \r, \r )
- .endr
- .irp r, \regs
- rbit \r, \r
- .endr
- .endm
-
- .macro __crc32, c, order=le
- bit\order w0
- cmp x2, #16
- b.lt 8f // less than 16 bytes
-
- and x7, x2, #0x1f
- and x2, x2, #~0x1f
- cbz x7, 32f // multiple of 32 bytes
-
- and x8, x7, #0xf
- ldp x3, x4, [x1]
- add x8, x8, x1
- add x1, x1, x7
- ldp x5, x6, [x8]
- \order x3, x4, x5, x6
-
- tst x7, #8
- crc32\c\()x w8, w0, x3
- csel x3, x3, x4, eq
- csel w0, w0, w8, eq
- tst x7, #4
- lsr x4, x3, #32
- crc32\c\()w w8, w0, w3
- csel x3, x3, x4, eq
- csel w0, w0, w8, eq
- tst x7, #2
- lsr w4, w3, #16
- crc32\c\()h w8, w0, w3
- csel w3, w3, w4, eq
- csel w0, w0, w8, eq
- tst x7, #1
- crc32\c\()b w8, w0, w3
- csel w0, w0, w8, eq
- tst x7, #16
- crc32\c\()x w8, w0, x5
- crc32\c\()x w8, w8, x6
- csel w0, w0, w8, eq
- cbz x2, 0f
-
-32: ldp x3, x4, [x1], #32
- sub x2, x2, #32
- ldp x5, x6, [x1, #-16]
- \order x3, x4, x5, x6
- crc32\c\()x w0, w0, x3
- crc32\c\()x w0, w0, x4
- crc32\c\()x w0, w0, x5
- crc32\c\()x w0, w0, x6
- cbnz x2, 32b
-0: bit\order w0
- ret
-
-8: tbz x2, #3, 4f
- ldr x3, [x1], #8
- \order x3
- crc32\c\()x w0, w0, x3
-4: tbz x2, #2, 2f
- ldr w3, [x1], #4
- \order w3
- crc32\c\()w w0, w0, w3
-2: tbz x2, #1, 1f
- ldrh w3, [x1], #2
- hword\order w3
- crc32\c\()h w0, w0, w3
-1: tbz x2, #0, 0f
- ldrb w3, [x1]
- byte\order w3
- crc32\c\()b w0, w0, w3
-0: bit\order w0
- ret
- .endm
-
- .align 5
-SYM_FUNC_START(crc32_le_arm64)
- __crc32
-SYM_FUNC_END(crc32_le_arm64)
-
- .align 5
-SYM_FUNC_START(crc32c_le_arm64)
- __crc32 c
-SYM_FUNC_END(crc32c_le_arm64)
-
- .align 5
-SYM_FUNC_START(crc32_be_arm64)
- __crc32 order=be
-SYM_FUNC_END(crc32_be_arm64)
-
- in .req x1
- len .req x2
-
- /*
- * w0: input CRC at entry, output CRC at exit
- * x1: pointer to input buffer
- * x2: length of input in bytes
- */
- .macro crc4way, insn, table, order=le
- bit\order w0
- lsr len, len, #6 // len := # of 64-byte blocks
-
- /* Process up to 64 blocks of 64 bytes at a time */
-.La\@: mov x3, #64
- cmp len, #64
- csel x3, x3, len, hi // x3 := min(len, 64)
- sub len, len, x3
-
- /* Divide the input into 4 contiguous blocks */
- add x4, x3, x3, lsl #1 // x4 := 3 * x3
- add x7, in, x3, lsl #4 // x7 := in + 16 * x3
- add x8, in, x3, lsl #5 // x8 := in + 32 * x3
- add x9, in, x4, lsl #4 // x9 := in + 16 * x4
-
- /* Load the folding coefficients from the lookup table */
- adr_l x5, \table - 12 // entry 0 omitted
- add x5, x5, x4, lsl #2 // x5 += 12 * x3
- ldp s0, s1, [x5]
- ldr s2, [x5, #8]
-
- /* Zero init partial CRCs for this iteration */
- mov w4, wzr
- mov w5, wzr
- mov w6, wzr
- mov x17, xzr
-
-.Lb\@: sub x3, x3, #1
- \insn w6, w6, x17
- ldp x10, x11, [in], #16
- ldp x12, x13, [x7], #16
- ldp x14, x15, [x8], #16
- ldp x16, x17, [x9], #16
-
- \order x10, x11, x12, x13, x14, x15, x16, x17
-
- /* Apply the CRC transform to 4 16-byte blocks in parallel */
- \insn w0, w0, x10
- \insn w4, w4, x12
- \insn w5, w5, x14
- \insn w6, w6, x16
- \insn w0, w0, x11
- \insn w4, w4, x13
- \insn w5, w5, x15
- cbnz x3, .Lb\@
-
- /* Combine the 4 partial results into w0 */
- mov v3.d[0], x0
- mov v4.d[0], x4
- mov v5.d[0], x5
- pmull v0.1q, v0.1d, v3.1d
- pmull v1.1q, v1.1d, v4.1d
- pmull v2.1q, v2.1d, v5.1d
- eor v0.8b, v0.8b, v1.8b
- eor v0.8b, v0.8b, v2.8b
- mov x5, v0.d[0]
- eor x5, x5, x17
- \insn w0, w6, x5
-
- mov in, x9
- cbnz len, .La\@
-
- bit\order w0
- ret
- .endm
-
- .align 5
-SYM_FUNC_START(crc32c_le_arm64_4way)
- crc4way crc32cx, .L0
-SYM_FUNC_END(crc32c_le_arm64_4way)
-
- .align 5
-SYM_FUNC_START(crc32_le_arm64_4way)
- crc4way crc32x, .L1
-SYM_FUNC_END(crc32_le_arm64_4way)
-
- .align 5
-SYM_FUNC_START(crc32_be_arm64_4way)
- crc4way crc32x, .L1, be
-SYM_FUNC_END(crc32_be_arm64_4way)
-
- .section .rodata, "a", %progbits
- .align 6
-.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27
- .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e
- .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b
- .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8
- .long 0x299847d5, 0x878a92a7, 0x39d3b296
- .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53
- .long 0xa60ce07b, 0x83348832, 0x47db8317
- .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092
- .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0
- .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7
- .long 0xf285651c, 0xce7f39f4, 0xdaece73e
- .long 0x271d9844, 0xd270f1a2, 0xab7aff2a
- .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385
- .long 0xcec3662e, 0x1b03397f, 0x83348832
- .long 0x8227bb8a, 0xb3e32c28, 0x299847d5
- .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86
- .long 0xf6076544, 0x10746f3c, 0x18b33a4e
- .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b
- .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7
- .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b
- .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b
- .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4
- .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56
- .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2
- .long 0xa90fd27a, 0x0167d312, 0xc619809d
- .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d
- .long 0x4597456a, 0x98d8d9cb, 0x65863b64
- .long 0xc9c8b782, 0x68bce87a, 0x1b03397f
- .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd
- .long 0x2342001e, 0x3771e98f, 0xb3e32c28
- .long 0xe8b6368b, 0x2178513a, 0x064f7f26
- .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c
- .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c
- .long 0x02ee03b2, 0xff0dba97, 0x10746f3c
- .long 0x135c83fd, 0xf872e54c, 0xc7a68855
- .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844
- .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c
- .long 0xded288f8, 0xb3af077a, 0x93a5f730
- .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c
- .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2
- .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203
- .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e
- .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb
- .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a
- .long 0x8e1450f7, 0x2342001e, 0x8227bb8a
- .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768
- .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35
- .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c
- .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b
- .long 0xd6c3a807, 0x2664fd8b, 0x0167d312
- .long 0x1d31175f, 0x02ee03b2, 0xf6076544
- .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a
- .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf
- .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb
- .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c
- .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a
- .long 0x8a074012, 0xded288f8, 0x57a3d037
- .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b
- .long 0x3be3c09b, 0x6353c1cc, 0x42d98888
- .long 0x465a4eee, 0xf48642e9, 0x3771e98f
- .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9
- .long 0xa52f58ec, 0x9a5ede41, 0x2178513a
- .long 0x47972100, 0x45cddf4e, 0xe0ac139e
- .long 0x359674f7, 0xa51b6135, 0x170076fa
-
-.L1: .long 0xaf449247, 0x81256527, 0xccaa009e
- .long 0x57c54819, 0x1d9513d7, 0x81256527
- .long 0x3f41287a, 0x57c54819, 0xaf449247
- .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7
- .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394
- .long 0x71d54a59, 0xf5e48c85, 0x57c54819
- .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed
- .long 0xd31343ea, 0xe95c1271, 0x910eeec1
- .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a
- .long 0x9ee62949, 0xcec97417, 0x9026d5b1
- .long 0xa55d1514, 0xf183c71b, 0xd1df2327
- .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85
- .long 0x9d842b80, 0xeea395c4, 0x3c656ced
- .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd
- .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd
- .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271
- .long 0xef82aa68, 0xdb3935ea, 0xb918a347
- .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59
- .long 0x99cce860, 0x356d209f, 0xff6f2fc2
- .long 0xd8af8e46, 0xc352f6de, 0xcec97417
- .long 0xf1996890, 0xd8110ff1, 0x1c63267b
- .long 0x631bc508, 0xe95c7216, 0xf183c71b
- .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0
- .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea
- .long 0x7a92fffb, 0xf7003835, 0x4470ac44
- .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4
- .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee
- .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40
- .long 0x60290934, 0x81b6f443, 0x6d40f445
- .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949
- .long 0xdcf5088a, 0x9dbdc100, 0x145575d5
- .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d
- .long 0x255b139e, 0x631bc508, 0xa55d1514
- .long 0xd784eaa8, 0xce26786c, 0xdb3935ea
- .long 0x6d2c864a, 0x8068c345, 0x2586d334
- .long 0x02072e24, 0xdb3839f3, 0x21aa2b26
- .long 0x06689b0a, 0x5efd72f5, 0xe0575528
- .long 0x1e52f5ea, 0x4117915b, 0x356d209f
- .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80
- .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de
- .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c
- .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1
- .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0
- .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216
- .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356
- .long 0x0c540e7b, 0x753c81ff, 0x8e031a19
- .long 0x9924c781, 0xb9220208, 0x3edcde65
- .long 0x3954de39, 0x1753ab84, 0x1d6708a0
- .long 0xf32238b5, 0xbec81497, 0x9e70b943
- .long 0xbbd2cd2c, 0x0925d861, 0xf7003835
- .long 0xcc401304, 0xd784eaa8, 0xef82aa68
- .long 0x4987e684, 0x6044fbb0, 0x00eba0c8
- .long 0x3aa11427, 0x18fe3b4a, 0x87441142
- .long 0x297aad60, 0x02072e24, 0xd14bcc9b
- .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a
- .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8
- .long 0x25b8822a, 0x1e52f5ea, 0x99cce860
- .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443
- .long 0x5690aa32, 0xa91fdefb, 0x688a110e
- .long 0x1357a093, 0x3796455c, 0xd8af8e46
- .long 0x798fdd33, 0xaaa18a37, 0x357b9517
- .long 0xc2815395, 0x54d42691, 0x9dbdc100
- .long 0x21cfc0f7, 0x28ae0976, 0xf1996890
- .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6
diff --git a/arch/arm64/lib/crc32.c b/arch/arm64/lib/crc32.c
deleted file mode 100644
index ed3acd71178f..000000000000
--- a/arch/arm64/lib/crc32.c
+++ /dev/null
@@ -1,99 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/crc32.h>
-#include <linux/linkage.h>
-#include <linux/module.h>
-
-#include <asm/alternative.h>
-#include <asm/cpufeature.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-#include <crypto/internal/simd.h>
-
-// The minimum input length to consider the 4-way interleaved code path
-static const size_t min_len = 1024;
-
-asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len);
-
-asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len);
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
- return crc32_le_base(crc, p, len);
-
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc32_le_arm64_4way(crc, p, len);
- kernel_neon_end();
-
- p += round_down(len, 64);
- len %= 64;
-
- if (!len)
- return crc;
- }
-
- return crc32_le_arm64(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
- return crc32c_base(crc, p, len);
-
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc32c_le_arm64_4way(crc, p, len);
- kernel_neon_end();
-
- p += round_down(len, 64);
- len %= 64;
-
- if (!len)
- return crc;
- }
-
- return crc32c_le_arm64(crc, p, len);
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
- return crc32_be_base(crc, p, len);
-
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc32_be_arm64_4way(crc, p, len);
- kernel_neon_end();
-
- p += round_down(len, 64);
- len %= 64;
-
- if (!len)
- return crc;
- }
-
- return crc32_be_arm64(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-u32 crc32_optimizations(void)
-{
- if (alternative_has_cap_likely(ARM64_HAS_CRC32))
- return CRC32_LE_OPTIMIZATION |
- CRC32_BE_OPTIMIZATION |
- CRC32C_OPTIMIZATION;
- return 0;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("arm64-optimized CRC32 functions");
diff --git a/arch/arm64/lib/crypto/.gitignore b/arch/arm64/lib/crypto/.gitignore
deleted file mode 100644
index 12d74d8b03d0..000000000000
--- a/arch/arm64/lib/crypto/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-poly1305-core.S
-sha256-core.S
diff --git a/arch/arm64/lib/crypto/Kconfig b/arch/arm64/lib/crypto/Kconfig
deleted file mode 100644
index 129a7685cb4c..000000000000
--- a/arch/arm64/lib/crypto/Kconfig
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_CHACHA20_NEON
- tristate
- depends on KERNEL_MODE_NEON
- default CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_NEON
- tristate
- depends on KERNEL_MODE_NEON
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
-
-config CRYPTO_SHA256_ARM64
- tristate
- default CRYPTO_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
diff --git a/arch/arm64/lib/crypto/Makefile b/arch/arm64/lib/crypto/Makefile
deleted file mode 100644
index 946c09903711..000000000000
--- a/arch/arm64/lib/crypto/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
-chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
-poly1305-neon-y := poly1305-core.o poly1305-glue.o
-AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch
-AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch
-
-obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o
-sha256-arm64-y := sha256.o sha256-core.o
-sha256-arm64-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
-
-quiet_cmd_perlasm = PERLASM $@
- cmd_perlasm = $(PERL) $(<) void $(@)
-
-$(obj)/%-core.S: $(src)/%-armv8.pl
- $(call cmd,perlasm)
-
-$(obj)/sha256-core.S: $(src)/sha2-armv8.pl
- $(call cmd,perlasm)
-
-clean-files += poly1305-core.S sha256-core.S
diff --git a/arch/arm64/lib/crypto/chacha-neon-core.S b/arch/arm64/lib/crypto/chacha-neon-core.S
deleted file mode 100644
index 80079586ecc7..000000000000
--- a/arch/arm64/lib/crypto/chacha-neon-core.S
+++ /dev/null
@@ -1,805 +0,0 @@
-/*
- * ChaCha/HChaCha NEON helper functions
- *
- * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Originally based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/cache.h>
-
- .text
- .align 6
-
-/*
- * chacha_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is stored in the four NEON
- * registers v0-v3. It performs matrix operations on four words in parallel,
- * but requires shuffling to rearrange the words after each round.
- *
- * The round count is given in w3.
- *
- * Clobbers: w3, x10, v4, v12
- */
-SYM_FUNC_START_LOCAL(chacha_permute)
-
- adr_l x10, ROT8
- ld1 {v12.4s}, [x10]
-
-.Ldoubleround:
- // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- add v0.4s, v0.4s, v1.4s
- eor v3.16b, v3.16b, v0.16b
- rev32 v3.8h, v3.8h
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- add v2.4s, v2.4s, v3.4s
- eor v4.16b, v1.16b, v2.16b
- shl v1.4s, v4.4s, #12
- sri v1.4s, v4.4s, #20
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- add v0.4s, v0.4s, v1.4s
- eor v3.16b, v3.16b, v0.16b
- tbl v3.16b, {v3.16b}, v12.16b
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- add v2.4s, v2.4s, v3.4s
- eor v4.16b, v1.16b, v2.16b
- shl v1.4s, v4.4s, #7
- sri v1.4s, v4.4s, #25
-
- // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- ext v1.16b, v1.16b, v1.16b, #4
- // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- ext v2.16b, v2.16b, v2.16b, #8
- // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- ext v3.16b, v3.16b, v3.16b, #12
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- add v0.4s, v0.4s, v1.4s
- eor v3.16b, v3.16b, v0.16b
- rev32 v3.8h, v3.8h
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- add v2.4s, v2.4s, v3.4s
- eor v4.16b, v1.16b, v2.16b
- shl v1.4s, v4.4s, #12
- sri v1.4s, v4.4s, #20
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- add v0.4s, v0.4s, v1.4s
- eor v3.16b, v3.16b, v0.16b
- tbl v3.16b, {v3.16b}, v12.16b
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- add v2.4s, v2.4s, v3.4s
- eor v4.16b, v1.16b, v2.16b
- shl v1.4s, v4.4s, #7
- sri v1.4s, v4.4s, #25
-
- // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- ext v1.16b, v1.16b, v1.16b, #12
- // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- ext v2.16b, v2.16b, v2.16b, #8
- // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- ext v3.16b, v3.16b, v3.16b, #4
-
- subs w3, w3, #2
- b.ne .Ldoubleround
-
- ret
-SYM_FUNC_END(chacha_permute)
-
-SYM_FUNC_START(chacha_block_xor_neon)
- // x0: Input state matrix, s
- // x1: 1 data block output, o
- // x2: 1 data block input, i
- // w3: nrounds
-
- stp x29, x30, [sp, #-16]!
- mov x29, sp
-
- // x0..3 = s0..3
- ld1 {v0.4s-v3.4s}, [x0]
- ld1 {v8.4s-v11.4s}, [x0]
-
- bl chacha_permute
-
- ld1 {v4.16b-v7.16b}, [x2]
-
- // o0 = i0 ^ (x0 + s0)
- add v0.4s, v0.4s, v8.4s
- eor v0.16b, v0.16b, v4.16b
-
- // o1 = i1 ^ (x1 + s1)
- add v1.4s, v1.4s, v9.4s
- eor v1.16b, v1.16b, v5.16b
-
- // o2 = i2 ^ (x2 + s2)
- add v2.4s, v2.4s, v10.4s
- eor v2.16b, v2.16b, v6.16b
-
- // o3 = i3 ^ (x3 + s3)
- add v3.4s, v3.4s, v11.4s
- eor v3.16b, v3.16b, v7.16b
-
- st1 {v0.16b-v3.16b}, [x1]
-
- ldp x29, x30, [sp], #16
- ret
-SYM_FUNC_END(chacha_block_xor_neon)
-
-SYM_FUNC_START(hchacha_block_neon)
- // x0: Input state matrix, s
- // x1: output (8 32-bit words)
- // w2: nrounds
-
- stp x29, x30, [sp, #-16]!
- mov x29, sp
-
- ld1 {v0.4s-v3.4s}, [x0]
-
- mov w3, w2
- bl chacha_permute
-
- st1 {v0.4s}, [x1], #16
- st1 {v3.4s}, [x1]
-
- ldp x29, x30, [sp], #16
- ret
-SYM_FUNC_END(hchacha_block_neon)
-
- a0 .req w12
- a1 .req w13
- a2 .req w14
- a3 .req w15
- a4 .req w16
- a5 .req w17
- a6 .req w19
- a7 .req w20
- a8 .req w21
- a9 .req w22
- a10 .req w23
- a11 .req w24
- a12 .req w25
- a13 .req w26
- a14 .req w27
- a15 .req w28
-
- .align 6
-SYM_FUNC_START(chacha_4block_xor_neon)
- frame_push 10
-
- // x0: Input state matrix, s
- // x1: 4 data blocks output, o
- // x2: 4 data blocks input, i
- // w3: nrounds
- // x4: byte count
-
- adr_l x10, .Lpermute
- and x5, x4, #63
- add x10, x10, x5
-
- //
- // This function encrypts four consecutive ChaCha blocks by loading
- // the state matrix in NEON registers four times. The algorithm performs
- // each operation on the corresponding word of each state matrix, hence
- // requires no word shuffling. For final XORing step we transpose the
- // matrix by interleaving 32- and then 64-bit words, which allows us to
- // do XOR in NEON registers.
- //
- // At the same time, a fifth block is encrypted in parallel using
- // scalar registers
- //
- adr_l x9, CTRINC // ... and ROT8
- ld1 {v30.4s-v31.4s}, [x9]
-
- // x0..15[0-3] = s0..3[0..3]
- add x8, x0, #16
- ld4r { v0.4s- v3.4s}, [x0]
- ld4r { v4.4s- v7.4s}, [x8], #16
- ld4r { v8.4s-v11.4s}, [x8], #16
- ld4r {v12.4s-v15.4s}, [x8]
-
- mov a0, v0.s[0]
- mov a1, v1.s[0]
- mov a2, v2.s[0]
- mov a3, v3.s[0]
- mov a4, v4.s[0]
- mov a5, v5.s[0]
- mov a6, v6.s[0]
- mov a7, v7.s[0]
- mov a8, v8.s[0]
- mov a9, v9.s[0]
- mov a10, v10.s[0]
- mov a11, v11.s[0]
- mov a12, v12.s[0]
- mov a13, v13.s[0]
- mov a14, v14.s[0]
- mov a15, v15.s[0]
-
- // x12 += counter values 1-4
- add v12.4s, v12.4s, v30.4s
-
-.Ldoubleround4:
- // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- add v0.4s, v0.4s, v4.4s
- add a0, a0, a4
- add v1.4s, v1.4s, v5.4s
- add a1, a1, a5
- add v2.4s, v2.4s, v6.4s
- add a2, a2, a6
- add v3.4s, v3.4s, v7.4s
- add a3, a3, a7
-
- eor v12.16b, v12.16b, v0.16b
- eor a12, a12, a0
- eor v13.16b, v13.16b, v1.16b
- eor a13, a13, a1
- eor v14.16b, v14.16b, v2.16b
- eor a14, a14, a2
- eor v15.16b, v15.16b, v3.16b
- eor a15, a15, a3
-
- rev32 v12.8h, v12.8h
- ror a12, a12, #16
- rev32 v13.8h, v13.8h
- ror a13, a13, #16
- rev32 v14.8h, v14.8h
- ror a14, a14, #16
- rev32 v15.8h, v15.8h
- ror a15, a15, #16
-
- // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- add v8.4s, v8.4s, v12.4s
- add a8, a8, a12
- add v9.4s, v9.4s, v13.4s
- add a9, a9, a13
- add v10.4s, v10.4s, v14.4s
- add a10, a10, a14
- add v11.4s, v11.4s, v15.4s
- add a11, a11, a15
-
- eor v16.16b, v4.16b, v8.16b
- eor a4, a4, a8
- eor v17.16b, v5.16b, v9.16b
- eor a5, a5, a9
- eor v18.16b, v6.16b, v10.16b
- eor a6, a6, a10
- eor v19.16b, v7.16b, v11.16b
- eor a7, a7, a11
-
- shl v4.4s, v16.4s, #12
- shl v5.4s, v17.4s, #12
- shl v6.4s, v18.4s, #12
- shl v7.4s, v19.4s, #12
-
- sri v4.4s, v16.4s, #20
- ror a4, a4, #20
- sri v5.4s, v17.4s, #20
- ror a5, a5, #20
- sri v6.4s, v18.4s, #20
- ror a6, a6, #20
- sri v7.4s, v19.4s, #20
- ror a7, a7, #20
-
- // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- add v0.4s, v0.4s, v4.4s
- add a0, a0, a4
- add v1.4s, v1.4s, v5.4s
- add a1, a1, a5
- add v2.4s, v2.4s, v6.4s
- add a2, a2, a6
- add v3.4s, v3.4s, v7.4s
- add a3, a3, a7
-
- eor v12.16b, v12.16b, v0.16b
- eor a12, a12, a0
- eor v13.16b, v13.16b, v1.16b
- eor a13, a13, a1
- eor v14.16b, v14.16b, v2.16b
- eor a14, a14, a2
- eor v15.16b, v15.16b, v3.16b
- eor a15, a15, a3
-
- tbl v12.16b, {v12.16b}, v31.16b
- ror a12, a12, #24
- tbl v13.16b, {v13.16b}, v31.16b
- ror a13, a13, #24
- tbl v14.16b, {v14.16b}, v31.16b
- ror a14, a14, #24
- tbl v15.16b, {v15.16b}, v31.16b
- ror a15, a15, #24
-
- // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- add v8.4s, v8.4s, v12.4s
- add a8, a8, a12
- add v9.4s, v9.4s, v13.4s
- add a9, a9, a13
- add v10.4s, v10.4s, v14.4s
- add a10, a10, a14
- add v11.4s, v11.4s, v15.4s
- add a11, a11, a15
-
- eor v16.16b, v4.16b, v8.16b
- eor a4, a4, a8
- eor v17.16b, v5.16b, v9.16b
- eor a5, a5, a9
- eor v18.16b, v6.16b, v10.16b
- eor a6, a6, a10
- eor v19.16b, v7.16b, v11.16b
- eor a7, a7, a11
-
- shl v4.4s, v16.4s, #7
- shl v5.4s, v17.4s, #7
- shl v6.4s, v18.4s, #7
- shl v7.4s, v19.4s, #7
-
- sri v4.4s, v16.4s, #25
- ror a4, a4, #25
- sri v5.4s, v17.4s, #25
- ror a5, a5, #25
- sri v6.4s, v18.4s, #25
- ror a6, a6, #25
- sri v7.4s, v19.4s, #25
- ror a7, a7, #25
-
- // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- add v0.4s, v0.4s, v5.4s
- add a0, a0, a5
- add v1.4s, v1.4s, v6.4s
- add a1, a1, a6
- add v2.4s, v2.4s, v7.4s
- add a2, a2, a7
- add v3.4s, v3.4s, v4.4s
- add a3, a3, a4
-
- eor v15.16b, v15.16b, v0.16b
- eor a15, a15, a0
- eor v12.16b, v12.16b, v1.16b
- eor a12, a12, a1
- eor v13.16b, v13.16b, v2.16b
- eor a13, a13, a2
- eor v14.16b, v14.16b, v3.16b
- eor a14, a14, a3
-
- rev32 v15.8h, v15.8h
- ror a15, a15, #16
- rev32 v12.8h, v12.8h
- ror a12, a12, #16
- rev32 v13.8h, v13.8h
- ror a13, a13, #16
- rev32 v14.8h, v14.8h
- ror a14, a14, #16
-
- // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- add v10.4s, v10.4s, v15.4s
- add a10, a10, a15
- add v11.4s, v11.4s, v12.4s
- add a11, a11, a12
- add v8.4s, v8.4s, v13.4s
- add a8, a8, a13
- add v9.4s, v9.4s, v14.4s
- add a9, a9, a14
-
- eor v16.16b, v5.16b, v10.16b
- eor a5, a5, a10
- eor v17.16b, v6.16b, v11.16b
- eor a6, a6, a11
- eor v18.16b, v7.16b, v8.16b
- eor a7, a7, a8
- eor v19.16b, v4.16b, v9.16b
- eor a4, a4, a9
-
- shl v5.4s, v16.4s, #12
- shl v6.4s, v17.4s, #12
- shl v7.4s, v18.4s, #12
- shl v4.4s, v19.4s, #12
-
- sri v5.4s, v16.4s, #20
- ror a5, a5, #20
- sri v6.4s, v17.4s, #20
- ror a6, a6, #20
- sri v7.4s, v18.4s, #20
- ror a7, a7, #20
- sri v4.4s, v19.4s, #20
- ror a4, a4, #20
-
- // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- add v0.4s, v0.4s, v5.4s
- add a0, a0, a5
- add v1.4s, v1.4s, v6.4s
- add a1, a1, a6
- add v2.4s, v2.4s, v7.4s
- add a2, a2, a7
- add v3.4s, v3.4s, v4.4s
- add a3, a3, a4
-
- eor v15.16b, v15.16b, v0.16b
- eor a15, a15, a0
- eor v12.16b, v12.16b, v1.16b
- eor a12, a12, a1
- eor v13.16b, v13.16b, v2.16b
- eor a13, a13, a2
- eor v14.16b, v14.16b, v3.16b
- eor a14, a14, a3
-
- tbl v15.16b, {v15.16b}, v31.16b
- ror a15, a15, #24
- tbl v12.16b, {v12.16b}, v31.16b
- ror a12, a12, #24
- tbl v13.16b, {v13.16b}, v31.16b
- ror a13, a13, #24
- tbl v14.16b, {v14.16b}, v31.16b
- ror a14, a14, #24
-
- // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- add v10.4s, v10.4s, v15.4s
- add a10, a10, a15
- add v11.4s, v11.4s, v12.4s
- add a11, a11, a12
- add v8.4s, v8.4s, v13.4s
- add a8, a8, a13
- add v9.4s, v9.4s, v14.4s
- add a9, a9, a14
-
- eor v16.16b, v5.16b, v10.16b
- eor a5, a5, a10
- eor v17.16b, v6.16b, v11.16b
- eor a6, a6, a11
- eor v18.16b, v7.16b, v8.16b
- eor a7, a7, a8
- eor v19.16b, v4.16b, v9.16b
- eor a4, a4, a9
-
- shl v5.4s, v16.4s, #7
- shl v6.4s, v17.4s, #7
- shl v7.4s, v18.4s, #7
- shl v4.4s, v19.4s, #7
-
- sri v5.4s, v16.4s, #25
- ror a5, a5, #25
- sri v6.4s, v17.4s, #25
- ror a6, a6, #25
- sri v7.4s, v18.4s, #25
- ror a7, a7, #25
- sri v4.4s, v19.4s, #25
- ror a4, a4, #25
-
- subs w3, w3, #2
- b.ne .Ldoubleround4
-
- ld4r {v16.4s-v19.4s}, [x0], #16
- ld4r {v20.4s-v23.4s}, [x0], #16
-
- // x12 += counter values 0-3
- add v12.4s, v12.4s, v30.4s
-
- // x0[0-3] += s0[0]
- // x1[0-3] += s0[1]
- // x2[0-3] += s0[2]
- // x3[0-3] += s0[3]
- add v0.4s, v0.4s, v16.4s
- mov w6, v16.s[0]
- mov w7, v17.s[0]
- add v1.4s, v1.4s, v17.4s
- mov w8, v18.s[0]
- mov w9, v19.s[0]
- add v2.4s, v2.4s, v18.4s
- add a0, a0, w6
- add a1, a1, w7
- add v3.4s, v3.4s, v19.4s
- add a2, a2, w8
- add a3, a3, w9
-CPU_BE( rev a0, a0 )
-CPU_BE( rev a1, a1 )
-CPU_BE( rev a2, a2 )
-CPU_BE( rev a3, a3 )
-
- ld4r {v24.4s-v27.4s}, [x0], #16
- ld4r {v28.4s-v31.4s}, [x0]
-
- // x4[0-3] += s1[0]
- // x5[0-3] += s1[1]
- // x6[0-3] += s1[2]
- // x7[0-3] += s1[3]
- add v4.4s, v4.4s, v20.4s
- mov w6, v20.s[0]
- mov w7, v21.s[0]
- add v5.4s, v5.4s, v21.4s
- mov w8, v22.s[0]
- mov w9, v23.s[0]
- add v6.4s, v6.4s, v22.4s
- add a4, a4, w6
- add a5, a5, w7
- add v7.4s, v7.4s, v23.4s
- add a6, a6, w8
- add a7, a7, w9
-CPU_BE( rev a4, a4 )
-CPU_BE( rev a5, a5 )
-CPU_BE( rev a6, a6 )
-CPU_BE( rev a7, a7 )
-
- // x8[0-3] += s2[0]
- // x9[0-3] += s2[1]
- // x10[0-3] += s2[2]
- // x11[0-3] += s2[3]
- add v8.4s, v8.4s, v24.4s
- mov w6, v24.s[0]
- mov w7, v25.s[0]
- add v9.4s, v9.4s, v25.4s
- mov w8, v26.s[0]
- mov w9, v27.s[0]
- add v10.4s, v10.4s, v26.4s
- add a8, a8, w6
- add a9, a9, w7
- add v11.4s, v11.4s, v27.4s
- add a10, a10, w8
- add a11, a11, w9
-CPU_BE( rev a8, a8 )
-CPU_BE( rev a9, a9 )
-CPU_BE( rev a10, a10 )
-CPU_BE( rev a11, a11 )
-
- // x12[0-3] += s3[0]
- // x13[0-3] += s3[1]
- // x14[0-3] += s3[2]
- // x15[0-3] += s3[3]
- add v12.4s, v12.4s, v28.4s
- mov w6, v28.s[0]
- mov w7, v29.s[0]
- add v13.4s, v13.4s, v29.4s
- mov w8, v30.s[0]
- mov w9, v31.s[0]
- add v14.4s, v14.4s, v30.4s
- add a12, a12, w6
- add a13, a13, w7
- add v15.4s, v15.4s, v31.4s
- add a14, a14, w8
- add a15, a15, w9
-CPU_BE( rev a12, a12 )
-CPU_BE( rev a13, a13 )
-CPU_BE( rev a14, a14 )
-CPU_BE( rev a15, a15 )
-
- // interleave 32-bit words in state n, n+1
- ldp w6, w7, [x2], #64
- zip1 v16.4s, v0.4s, v1.4s
- ldp w8, w9, [x2, #-56]
- eor a0, a0, w6
- zip2 v17.4s, v0.4s, v1.4s
- eor a1, a1, w7
- zip1 v18.4s, v2.4s, v3.4s
- eor a2, a2, w8
- zip2 v19.4s, v2.4s, v3.4s
- eor a3, a3, w9
- ldp w6, w7, [x2, #-48]
- zip1 v20.4s, v4.4s, v5.4s
- ldp w8, w9, [x2, #-40]
- eor a4, a4, w6
- zip2 v21.4s, v4.4s, v5.4s
- eor a5, a5, w7
- zip1 v22.4s, v6.4s, v7.4s
- eor a6, a6, w8
- zip2 v23.4s, v6.4s, v7.4s
- eor a7, a7, w9
- ldp w6, w7, [x2, #-32]
- zip1 v24.4s, v8.4s, v9.4s
- ldp w8, w9, [x2, #-24]
- eor a8, a8, w6
- zip2 v25.4s, v8.4s, v9.4s
- eor a9, a9, w7
- zip1 v26.4s, v10.4s, v11.4s
- eor a10, a10, w8
- zip2 v27.4s, v10.4s, v11.4s
- eor a11, a11, w9
- ldp w6, w7, [x2, #-16]
- zip1 v28.4s, v12.4s, v13.4s
- ldp w8, w9, [x2, #-8]
- eor a12, a12, w6
- zip2 v29.4s, v12.4s, v13.4s
- eor a13, a13, w7
- zip1 v30.4s, v14.4s, v15.4s
- eor a14, a14, w8
- zip2 v31.4s, v14.4s, v15.4s
- eor a15, a15, w9
-
- add x3, x2, x4
- sub x3, x3, #128 // start of last block
-
- subs x5, x4, #128
- csel x2, x2, x3, ge
-
- // interleave 64-bit words in state n, n+2
- zip1 v0.2d, v16.2d, v18.2d
- zip2 v4.2d, v16.2d, v18.2d
- stp a0, a1, [x1], #64
- zip1 v8.2d, v17.2d, v19.2d
- zip2 v12.2d, v17.2d, v19.2d
- stp a2, a3, [x1, #-56]
-
- subs x6, x4, #192
- ld1 {v16.16b-v19.16b}, [x2], #64
- csel x2, x2, x3, ge
-
- zip1 v1.2d, v20.2d, v22.2d
- zip2 v5.2d, v20.2d, v22.2d
- stp a4, a5, [x1, #-48]
- zip1 v9.2d, v21.2d, v23.2d
- zip2 v13.2d, v21.2d, v23.2d
- stp a6, a7, [x1, #-40]
-
- subs x7, x4, #256
- ld1 {v20.16b-v23.16b}, [x2], #64
- csel x2, x2, x3, ge
-
- zip1 v2.2d, v24.2d, v26.2d
- zip2 v6.2d, v24.2d, v26.2d
- stp a8, a9, [x1, #-32]
- zip1 v10.2d, v25.2d, v27.2d
- zip2 v14.2d, v25.2d, v27.2d
- stp a10, a11, [x1, #-24]
-
- subs x8, x4, #320
- ld1 {v24.16b-v27.16b}, [x2], #64
- csel x2, x2, x3, ge
-
- zip1 v3.2d, v28.2d, v30.2d
- zip2 v7.2d, v28.2d, v30.2d
- stp a12, a13, [x1, #-16]
- zip1 v11.2d, v29.2d, v31.2d
- zip2 v15.2d, v29.2d, v31.2d
- stp a14, a15, [x1, #-8]
-
- tbnz x5, #63, .Lt128
- ld1 {v28.16b-v31.16b}, [x2]
-
- // xor with corresponding input, write to output
- eor v16.16b, v16.16b, v0.16b
- eor v17.16b, v17.16b, v1.16b
- eor v18.16b, v18.16b, v2.16b
- eor v19.16b, v19.16b, v3.16b
-
- tbnz x6, #63, .Lt192
-
- eor v20.16b, v20.16b, v4.16b
- eor v21.16b, v21.16b, v5.16b
- eor v22.16b, v22.16b, v6.16b
- eor v23.16b, v23.16b, v7.16b
-
- st1 {v16.16b-v19.16b}, [x1], #64
- tbnz x7, #63, .Lt256
-
- eor v24.16b, v24.16b, v8.16b
- eor v25.16b, v25.16b, v9.16b
- eor v26.16b, v26.16b, v10.16b
- eor v27.16b, v27.16b, v11.16b
-
- st1 {v20.16b-v23.16b}, [x1], #64
- tbnz x8, #63, .Lt320
-
- eor v28.16b, v28.16b, v12.16b
- eor v29.16b, v29.16b, v13.16b
- eor v30.16b, v30.16b, v14.16b
- eor v31.16b, v31.16b, v15.16b
-
- st1 {v24.16b-v27.16b}, [x1], #64
- st1 {v28.16b-v31.16b}, [x1]
-
-.Lout: frame_pop
- ret
-
- // fewer than 192 bytes of in/output
-.Lt192: cbz x5, 1f // exactly 128 bytes?
- ld1 {v28.16b-v31.16b}, [x10]
- add x5, x5, x1
- tbl v28.16b, {v4.16b-v7.16b}, v28.16b
- tbl v29.16b, {v4.16b-v7.16b}, v29.16b
- tbl v30.16b, {v4.16b-v7.16b}, v30.16b
- tbl v31.16b, {v4.16b-v7.16b}, v31.16b
-
-0: eor v20.16b, v20.16b, v28.16b
- eor v21.16b, v21.16b, v29.16b
- eor v22.16b, v22.16b, v30.16b
- eor v23.16b, v23.16b, v31.16b
- st1 {v20.16b-v23.16b}, [x5] // overlapping stores
-1: st1 {v16.16b-v19.16b}, [x1]
- b .Lout
-
- // fewer than 128 bytes of in/output
-.Lt128: ld1 {v28.16b-v31.16b}, [x10]
- add x5, x5, x1
- sub x1, x1, #64
- tbl v28.16b, {v0.16b-v3.16b}, v28.16b
- tbl v29.16b, {v0.16b-v3.16b}, v29.16b
- tbl v30.16b, {v0.16b-v3.16b}, v30.16b
- tbl v31.16b, {v0.16b-v3.16b}, v31.16b
- ld1 {v16.16b-v19.16b}, [x1] // reload first output block
- b 0b
-
- // fewer than 256 bytes of in/output
-.Lt256: cbz x6, 2f // exactly 192 bytes?
- ld1 {v4.16b-v7.16b}, [x10]
- add x6, x6, x1
- tbl v0.16b, {v8.16b-v11.16b}, v4.16b
- tbl v1.16b, {v8.16b-v11.16b}, v5.16b
- tbl v2.16b, {v8.16b-v11.16b}, v6.16b
- tbl v3.16b, {v8.16b-v11.16b}, v7.16b
-
- eor v28.16b, v28.16b, v0.16b
- eor v29.16b, v29.16b, v1.16b
- eor v30.16b, v30.16b, v2.16b
- eor v31.16b, v31.16b, v3.16b
- st1 {v28.16b-v31.16b}, [x6] // overlapping stores
-2: st1 {v20.16b-v23.16b}, [x1]
- b .Lout
-
- // fewer than 320 bytes of in/output
-.Lt320: cbz x7, 3f // exactly 256 bytes?
- ld1 {v4.16b-v7.16b}, [x10]
- add x7, x7, x1
- tbl v0.16b, {v12.16b-v15.16b}, v4.16b
- tbl v1.16b, {v12.16b-v15.16b}, v5.16b
- tbl v2.16b, {v12.16b-v15.16b}, v6.16b
- tbl v3.16b, {v12.16b-v15.16b}, v7.16b
-
- eor v28.16b, v28.16b, v0.16b
- eor v29.16b, v29.16b, v1.16b
- eor v30.16b, v30.16b, v2.16b
- eor v31.16b, v31.16b, v3.16b
- st1 {v28.16b-v31.16b}, [x7] // overlapping stores
-3: st1 {v24.16b-v27.16b}, [x1]
- b .Lout
-SYM_FUNC_END(chacha_4block_xor_neon)
-
- .section ".rodata", "a", %progbits
- .align L1_CACHE_SHIFT
-.Lpermute:
- .set .Li, 0
- .rept 128
- .byte (.Li - 64)
- .set .Li, .Li + 1
- .endr
-
-CTRINC: .word 1, 2, 3, 4
-ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
diff --git a/arch/arm64/lib/crypto/chacha-neon-glue.c b/arch/arm64/lib/crypto/chacha-neon-glue.c
deleted file mode 100644
index d0188f974ca5..000000000000
--- a/arch/arm64/lib/crypto/chacha-neon-glue.c
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * ChaCha and HChaCha functions (ARM64 optimized)
- *
- * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/chacha.h>
-#include <crypto/internal/simd.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
- u8 *dst, const u8 *src, int nrounds);
-asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- int nrounds, int bytes);
-asmlinkage void hchacha_block_neon(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
- int bytes, int nrounds)
-{
- while (bytes > 0) {
- int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
-
- if (l <= CHACHA_BLOCK_SIZE) {
- u8 buf[CHACHA_BLOCK_SIZE];
-
- memcpy(buf, src, l);
- chacha_block_xor_neon(state, buf, buf, nrounds);
- memcpy(dst, buf, l);
- state->x[12] += 1;
- break;
- }
- chacha_4block_xor_neon(state, dst, src, nrounds, l);
- bytes -= l;
- src += l;
- dst += l;
- state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
- }
-}
-
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
- hchacha_block_generic(state, out, nrounds);
- } else {
- kernel_neon_begin();
- hchacha_block_neon(state, out, nrounds);
- kernel_neon_end();
- }
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
- !crypto_simd_usable())
- return chacha_crypt_generic(state, dst, src, bytes, nrounds);
-
- do {
- unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
- kernel_neon_begin();
- chacha_doneon(state, dst, src, todo, nrounds);
- kernel_neon_end();
-
- bytes -= todo;
- src += todo;
- dst += todo;
- } while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- return static_key_enabled(&have_neon);
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_simd_mod_init(void)
-{
- if (cpu_have_named_feature(ASIMD))
- static_branch_enable(&have_neon);
- return 0;
-}
-subsys_initcall(chacha_simd_mod_init);
-
-static void __exit chacha_simd_mod_exit(void)
-{
-}
-module_exit(chacha_simd_mod_exit);
-
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/lib/crypto/poly1305-armv8.pl b/arch/arm64/lib/crypto/poly1305-armv8.pl
deleted file mode 100644
index 22c9069c0650..000000000000
--- a/arch/arm64/lib/crypto/poly1305-armv8.pl
+++ /dev/null
@@ -1,917 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
-#
-# ====================================================================
-# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
-# project.
-# ====================================================================
-#
-# This module implements Poly1305 hash for ARMv8.
-#
-# June 2015
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone.
-#
-# IALU/gcc-4.9 NEON
-#
-# Apple A7 1.86/+5% 0.72
-# Cortex-A53 2.69/+58% 1.47
-# Cortex-A57 2.70/+7% 1.14
-# Denver 1.64/+50% 1.18(*)
-# X-Gene 2.13/+68% 2.27
-# Mongoose 1.77/+75% 1.12
-# Kryo 2.70/+55% 1.13
-# ThunderX2 1.17/+95% 1.36
-#
-# (*) estimate based on resources availability is less than 1.0,
-# i.e. measured result is worse than expected, presumably binary
-# translator is not almighty;
-
-$flavour=shift;
-$output=shift;
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
-my ($mac,$nonce)=($inp,$len);
-
-my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-.extern OPENSSL_armcap_P
-#endif
-
-.text
-
-// forward "declarations" are required for Apple
-.globl poly1305_blocks
-.globl poly1305_emit
-
-.globl poly1305_init
-.type poly1305_init,%function
-.align 5
-poly1305_init:
- cmp $inp,xzr
- stp xzr,xzr,[$ctx] // zero hash value
- stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
-
- csel x0,xzr,x0,eq
- b.eq .Lno_key
-
-#ifndef __KERNEL__
- adrp x17,OPENSSL_armcap_P
- ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
-#endif
-
- ldp $r0,$r1,[$inp] // load key
- mov $s1,#0xfffffffc0fffffff
- movk $s1,#0x0fff,lsl#48
-#ifdef __AARCH64EB__
- rev $r0,$r0 // flip bytes
- rev $r1,$r1
-#endif
- and $r0,$r0,$s1 // &=0ffffffc0fffffff
- and $s1,$s1,#-4
- and $r1,$r1,$s1 // &=0ffffffc0ffffffc
- mov w#$s1,#-1
- stp $r0,$r1,[$ctx,#32] // save key value
- str w#$s1,[$ctx,#48] // impossible key power value
-
-#ifndef __KERNEL__
- tst w17,#ARMV7_NEON
-
- adr $d0,.Lpoly1305_blocks
- adr $r0,.Lpoly1305_blocks_neon
- adr $d1,.Lpoly1305_emit
-
- csel $d0,$d0,$r0,eq
-
-# ifdef __ILP32__
- stp w#$d0,w#$d1,[$len]
-# else
- stp $d0,$d1,[$len]
-# endif
-#endif
- mov x0,#1
-.Lno_key:
- ret
-.size poly1305_init,.-poly1305_init
-
-.type poly1305_blocks,%function
-.align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
- ands $len,$len,#-16
- b.eq .Lno_data
-
- ldp $h0,$h1,[$ctx] // load hash value
- ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
- ldp $r0,$r1,[$ctx,#32] // load key value
-
-#ifdef __AARCH64EB__
- lsr $d0,$h0,#32
- mov w#$d1,w#$h0
- lsr $d2,$h1,#32
- mov w15,w#$h1
- lsr x16,$h2,#32
-#else
- mov w#$d0,w#$h0
- lsr $d1,$h0,#32
- mov w#$d2,w#$h1
- lsr x15,$h1,#32
- mov w16,w#$h2
-#endif
-
- add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
- lsr $d1,$d2,#12
- adds $d0,$d0,$d2,lsl#52
- add $d1,$d1,x15,lsl#14
- adc $d1,$d1,xzr
- lsr $d2,x16,#24
- adds $d1,$d1,x16,lsl#40
- adc $d2,$d2,xzr
-
- cmp x17,#0 // is_base2_26?
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
- csel $h0,$h0,$d0,eq // choose between radixes
- csel $h1,$h1,$d1,eq
- csel $h2,$h2,$d2,eq
-
-.Loop:
- ldp $t0,$t1,[$inp],#16 // load input
- sub $len,$len,#16
-#ifdef __AARCH64EB__
- rev $t0,$t0
- rev $t1,$t1
-#endif
- adds $h0,$h0,$t0 // accumulate input
- adcs $h1,$h1,$t1
-
- mul $d0,$h0,$r0 // h0*r0
- adc $h2,$h2,$padbit
- umulh $d1,$h0,$r0
-
- mul $t0,$h1,$s1 // h1*5*r1
- umulh $t1,$h1,$s1
-
- adds $d0,$d0,$t0
- mul $t0,$h0,$r1 // h0*r1
- adc $d1,$d1,$t1
- umulh $d2,$h0,$r1
-
- adds $d1,$d1,$t0
- mul $t0,$h1,$r0 // h1*r0
- adc $d2,$d2,xzr
- umulh $t1,$h1,$r0
-
- adds $d1,$d1,$t0
- mul $t0,$h2,$s1 // h2*5*r1
- adc $d2,$d2,$t1
- mul $t1,$h2,$r0 // h2*r0
-
- adds $d1,$d1,$t0
- adc $d2,$d2,$t1
-
- and $t0,$d2,#-4 // final reduction
- and $h2,$d2,#3
- add $t0,$t0,$d2,lsr#2
- adds $h0,$d0,$t0
- adcs $h1,$d1,xzr
- adc $h2,$h2,xzr
-
- cbnz $len,.Loop
-
- stp $h0,$h1,[$ctx] // store hash value
- stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
-
-.Lno_data:
- ret
-.size poly1305_blocks,.-poly1305_blocks
-
-.type poly1305_emit,%function
-.align 5
-poly1305_emit:
-.Lpoly1305_emit:
- ldp $h0,$h1,[$ctx] // load hash base 2^64
- ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
- ldp $t0,$t1,[$nonce] // load nonce
-
-#ifdef __AARCH64EB__
- lsr $d0,$h0,#32
- mov w#$d1,w#$h0
- lsr $d2,$h1,#32
- mov w15,w#$h1
- lsr x16,$h2,#32
-#else
- mov w#$d0,w#$h0
- lsr $d1,$h0,#32
- mov w#$d2,w#$h1
- lsr x15,$h1,#32
- mov w16,w#$h2
-#endif
-
- add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
- lsr $d1,$d2,#12
- adds $d0,$d0,$d2,lsl#52
- add $d1,$d1,x15,lsl#14
- adc $d1,$d1,xzr
- lsr $d2,x16,#24
- adds $d1,$d1,x16,lsl#40
- adc $d2,$d2,xzr
-
- cmp $r0,#0 // is_base2_26?
- csel $h0,$h0,$d0,eq // choose between radixes
- csel $h1,$h1,$d1,eq
- csel $h2,$h2,$d2,eq
-
- adds $d0,$h0,#5 // compare to modulus
- adcs $d1,$h1,xzr
- adc $d2,$h2,xzr
-
- tst $d2,#-4 // see if it's carried/borrowed
-
- csel $h0,$h0,$d0,eq
- csel $h1,$h1,$d1,eq
-
-#ifdef __AARCH64EB__
- ror $t0,$t0,#32 // flip nonce words
- ror $t1,$t1,#32
-#endif
- adds $h0,$h0,$t0 // accumulate nonce
- adc $h1,$h1,$t1
-#ifdef __AARCH64EB__
- rev $h0,$h0 // flip output bytes
- rev $h1,$h1
-#endif
- stp $h0,$h1,[$mac] // write result
-
- ret
-.size poly1305_emit,.-poly1305_emit
-___
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
-my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
-my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
-my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
-my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
-my ($T0,$T1,$MASK) = map("v$_",(29..31));
-
-my ($in2,$zeros)=("x16","x17");
-my $is_base2_26 = $zeros; # borrow
-
-$code.=<<___;
-.type poly1305_mult,%function
-.align 5
-poly1305_mult:
- mul $d0,$h0,$r0 // h0*r0
- umulh $d1,$h0,$r0
-
- mul $t0,$h1,$s1 // h1*5*r1
- umulh $t1,$h1,$s1
-
- adds $d0,$d0,$t0
- mul $t0,$h0,$r1 // h0*r1
- adc $d1,$d1,$t1
- umulh $d2,$h0,$r1
-
- adds $d1,$d1,$t0
- mul $t0,$h1,$r0 // h1*r0
- adc $d2,$d2,xzr
- umulh $t1,$h1,$r0
-
- adds $d1,$d1,$t0
- mul $t0,$h2,$s1 // h2*5*r1
- adc $d2,$d2,$t1
- mul $t1,$h2,$r0 // h2*r0
-
- adds $d1,$d1,$t0
- adc $d2,$d2,$t1
-
- and $t0,$d2,#-4 // final reduction
- and $h2,$d2,#3
- add $t0,$t0,$d2,lsr#2
- adds $h0,$d0,$t0
- adcs $h1,$d1,xzr
- adc $h2,$h2,xzr
-
- ret
-.size poly1305_mult,.-poly1305_mult
-
-.type poly1305_splat,%function
-.align 4
-poly1305_splat:
- and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x13,$h0,#26,#26
- extr x14,$h1,$h0,#52
- and x14,x14,#0x03ffffff
- ubfx x15,$h1,#14,#26
- extr x16,$h2,$h1,#40
-
- str w12,[$ctx,#16*0] // r0
- add w12,w13,w13,lsl#2 // r1*5
- str w13,[$ctx,#16*1] // r1
- add w13,w14,w14,lsl#2 // r2*5
- str w12,[$ctx,#16*2] // s1
- str w14,[$ctx,#16*3] // r2
- add w14,w15,w15,lsl#2 // r3*5
- str w13,[$ctx,#16*4] // s2
- str w15,[$ctx,#16*5] // r3
- add w15,w16,w16,lsl#2 // r4*5
- str w14,[$ctx,#16*6] // s3
- str w16,[$ctx,#16*7] // r4
- str w15,[$ctx,#16*8] // s4
-
- ret
-.size poly1305_splat,.-poly1305_splat
-
-#ifdef __KERNEL__
-.globl poly1305_blocks_neon
-#endif
-.type poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
-.Lpoly1305_blocks_neon:
- ldr $is_base2_26,[$ctx,#24]
- cmp $len,#128
- b.lo .Lpoly1305_blocks
-
- .inst 0xd503233f // paciasp
- stp x29,x30,[sp,#-80]!
- add x29,sp,#0
-
- stp d8,d9,[sp,#16] // meet ABI requirements
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
-
- cbz $is_base2_26,.Lbase2_64_neon
-
- ldp w10,w11,[$ctx] // load hash value base 2^26
- ldp w12,w13,[$ctx,#8]
- ldr w14,[$ctx,#16]
-
- tst $len,#31
- b.eq .Leven_neon
-
- ldp $r0,$r1,[$ctx,#32] // load key value
-
- add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
- lsr $h1,x12,#12
- adds $h0,$h0,x12,lsl#52
- add $h1,$h1,x13,lsl#14
- adc $h1,$h1,xzr
- lsr $h2,x14,#24
- adds $h1,$h1,x14,lsl#40
- adc $d2,$h2,xzr // can be partially reduced...
-
- ldp $d0,$d1,[$inp],#16 // load input
- sub $len,$len,#16
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
-
-#ifdef __AARCH64EB__
- rev $d0,$d0
- rev $d1,$d1
-#endif
- adds $h0,$h0,$d0 // accumulate input
- adcs $h1,$h1,$d1
- adc $h2,$h2,$padbit
-
- bl poly1305_mult
-
- and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,$h0,#26,#26
- extr x12,$h1,$h0,#52
- and x12,x12,#0x03ffffff
- ubfx x13,$h1,#14,#26
- extr x14,$h2,$h1,#40
-
- b .Leven_neon
-
-.align 4
-.Lbase2_64_neon:
- ldp $r0,$r1,[$ctx,#32] // load key value
-
- ldp $h0,$h1,[$ctx] // load hash value base 2^64
- ldr $h2,[$ctx,#16]
-
- tst $len,#31
- b.eq .Linit_neon
-
- ldp $d0,$d1,[$inp],#16 // load input
- sub $len,$len,#16
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
-#ifdef __AARCH64EB__
- rev $d0,$d0
- rev $d1,$d1
-#endif
- adds $h0,$h0,$d0 // accumulate input
- adcs $h1,$h1,$d1
- adc $h2,$h2,$padbit
-
- bl poly1305_mult
-
-.Linit_neon:
- ldr w17,[$ctx,#48] // first table element
- and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,$h0,#26,#26
- extr x12,$h1,$h0,#52
- and x12,x12,#0x03ffffff
- ubfx x13,$h1,#14,#26
- extr x14,$h2,$h1,#40
-
- cmp w17,#-1 // is value impossible?
- b.ne .Leven_neon
-
- fmov ${H0},x10
- fmov ${H1},x11
- fmov ${H2},x12
- fmov ${H3},x13
- fmov ${H4},x14
-
- ////////////////////////////////// initialize r^n table
- mov $h0,$r0 // r^1
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
- mov $h1,$r1
- mov $h2,xzr
- add $ctx,$ctx,#48+12
- bl poly1305_splat
-
- bl poly1305_mult // r^2
- sub $ctx,$ctx,#4
- bl poly1305_splat
-
- bl poly1305_mult // r^3
- sub $ctx,$ctx,#4
- bl poly1305_splat
-
- bl poly1305_mult // r^4
- sub $ctx,$ctx,#4
- bl poly1305_splat
- sub $ctx,$ctx,#48 // restore original $ctx
- b .Ldo_neon
-
-.align 4
-.Leven_neon:
- fmov ${H0},x10
- fmov ${H1},x11
- fmov ${H2},x12
- fmov ${H3},x13
- fmov ${H4},x14
-
-.Ldo_neon:
- ldp x8,x12,[$inp,#32] // inp[2:3]
- subs $len,$len,#64
- ldp x9,x13,[$inp,#48]
- add $in2,$inp,#96
- adrp $zeros,.Lzeros
- add $zeros,$zeros,#:lo12:.Lzeros
-
- lsl $padbit,$padbit,#24
- add x15,$ctx,#48
-
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov $IN23_0,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,$padbit,x12,lsr#40
- add x13,$padbit,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov $IN23_1,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- fmov $IN23_2,x8
- fmov $IN23_3,x10
- fmov $IN23_4,x12
-
- ldp x8,x12,[$inp],#16 // inp[0:1]
- ldp x9,x13,[$inp],#48
-
- ld1 {$R0,$R1,$S1,$R2},[x15],#64
- ld1 {$S2,$R3,$S3,$R4},[x15],#64
- ld1 {$S4},[x15]
-
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov $IN01_0,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,$padbit,x12,lsr#40
- add x13,$padbit,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov $IN01_1,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- movi $MASK.2d,#-1
- fmov $IN01_2,x8
- fmov $IN01_3,x10
- fmov $IN01_4,x12
- ushr $MASK.2d,$MASK.2d,#38
-
- b.ls .Lskip_loop
-
-.align 4
-.Loop_neon:
- ////////////////////////////////////////////////////////////////
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- // \___________________/
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- // \___________________/ \____________________/
- //
- // Note that we start with inp[2:3]*r^2. This is because it
- // doesn't depend on reduction in previous iteration.
- ////////////////////////////////////////////////////////////////
- // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
- // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
- // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
- // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
- // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
- subs $len,$len,#64
- umull $ACC4,$IN23_0,${R4}[2]
- csel $in2,$zeros,$in2,lo
- umull $ACC3,$IN23_0,${R3}[2]
- umull $ACC2,$IN23_0,${R2}[2]
- ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
- umull $ACC1,$IN23_0,${R1}[2]
- ldp x9,x13,[$in2],#48
- umull $ACC0,$IN23_0,${R0}[2]
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- umlal $ACC4,$IN23_1,${R3}[2]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal $ACC3,$IN23_1,${R2}[2]
- and x5,x9,#0x03ffffff
- umlal $ACC2,$IN23_1,${R1}[2]
- ubfx x6,x8,#26,#26
- umlal $ACC1,$IN23_1,${R0}[2]
- ubfx x7,x9,#26,#26
- umlal $ACC0,$IN23_1,${S4}[2]
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
-
- umlal $ACC4,$IN23_2,${R2}[2]
- extr x8,x12,x8,#52
- umlal $ACC3,$IN23_2,${R1}[2]
- extr x9,x13,x9,#52
- umlal $ACC2,$IN23_2,${R0}[2]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal $ACC1,$IN23_2,${S4}[2]
- fmov $IN23_0,x4
- umlal $ACC0,$IN23_2,${S3}[2]
- and x8,x8,#0x03ffffff
-
- umlal $ACC4,$IN23_3,${R1}[2]
- and x9,x9,#0x03ffffff
- umlal $ACC3,$IN23_3,${R0}[2]
- ubfx x10,x12,#14,#26
- umlal $ACC2,$IN23_3,${S4}[2]
- ubfx x11,x13,#14,#26
- umlal $ACC1,$IN23_3,${S3}[2]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal $ACC0,$IN23_3,${S2}[2]
- fmov $IN23_1,x6
-
- add $IN01_2,$IN01_2,$H2
- add x12,$padbit,x12,lsr#40
- umlal $ACC4,$IN23_4,${R0}[2]
- add x13,$padbit,x13,lsr#40
- umlal $ACC3,$IN23_4,${S4}[2]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal $ACC2,$IN23_4,${S3}[2]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal $ACC1,$IN23_4,${S2}[2]
- fmov $IN23_2,x8
- umlal $ACC0,$IN23_4,${S1}[2]
- fmov $IN23_3,x10
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4 and accumulate
-
- add $IN01_0,$IN01_0,$H0
- fmov $IN23_4,x12
- umlal $ACC3,$IN01_2,${R1}[0]
- ldp x8,x12,[$inp],#16 // inp[0:1]
- umlal $ACC0,$IN01_2,${S3}[0]
- ldp x9,x13,[$inp],#48
- umlal $ACC4,$IN01_2,${R2}[0]
- umlal $ACC1,$IN01_2,${S4}[0]
- umlal $ACC2,$IN01_2,${R0}[0]
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- add $IN01_1,$IN01_1,$H1
- umlal $ACC3,$IN01_0,${R3}[0]
- umlal $ACC4,$IN01_0,${R4}[0]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal $ACC2,$IN01_0,${R2}[0]
- and x5,x9,#0x03ffffff
- umlal $ACC0,$IN01_0,${R0}[0]
- ubfx x6,x8,#26,#26
- umlal $ACC1,$IN01_0,${R1}[0]
- ubfx x7,x9,#26,#26
-
- add $IN01_3,$IN01_3,$H3
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- umlal $ACC3,$IN01_1,${R2}[0]
- extr x8,x12,x8,#52
- umlal $ACC4,$IN01_1,${R3}[0]
- extr x9,x13,x9,#52
- umlal $ACC0,$IN01_1,${S4}[0]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal $ACC2,$IN01_1,${R1}[0]
- fmov $IN01_0,x4
- umlal $ACC1,$IN01_1,${R0}[0]
- and x8,x8,#0x03ffffff
-
- add $IN01_4,$IN01_4,$H4
- and x9,x9,#0x03ffffff
- umlal $ACC3,$IN01_3,${R0}[0]
- ubfx x10,x12,#14,#26
- umlal $ACC0,$IN01_3,${S2}[0]
- ubfx x11,x13,#14,#26
- umlal $ACC4,$IN01_3,${R1}[0]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal $ACC1,$IN01_3,${S3}[0]
- fmov $IN01_1,x6
- umlal $ACC2,$IN01_3,${S4}[0]
- add x12,$padbit,x12,lsr#40
-
- umlal $ACC3,$IN01_4,${S4}[0]
- add x13,$padbit,x13,lsr#40
- umlal $ACC0,$IN01_4,${S1}[0]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal $ACC4,$IN01_4,${R0}[0]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal $ACC1,$IN01_4,${S2}[0]
- fmov $IN01_2,x8
- umlal $ACC2,$IN01_4,${S3}[0]
- fmov $IN01_3,x10
- fmov $IN01_4,x12
-
- /////////////////////////////////////////////////////////////////
- // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- // and P. Schwabe
- //
- // [see discussion in poly1305-armv4 module]
-
- ushr $T0.2d,$ACC3,#26
- xtn $H3,$ACC3
- ushr $T1.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
- add $ACC4,$ACC4,$T0.2d // h3 -> h4
- bic $H3,#0xfc,lsl#24 // &=0x03ffffff
- add $ACC1,$ACC1,$T1.2d // h0 -> h1
-
- ushr $T0.2d,$ACC4,#26
- xtn $H4,$ACC4
- ushr $T1.2d,$ACC1,#26
- xtn $H1,$ACC1
- bic $H4,#0xfc,lsl#24
- add $ACC2,$ACC2,$T1.2d // h1 -> h2
-
- add $ACC0,$ACC0,$T0.2d
- shl $T0.2d,$T0.2d,#2
- shrn $T1.2s,$ACC2,#26
- xtn $H2,$ACC2
- add $ACC0,$ACC0,$T0.2d // h4 -> h0
- bic $H1,#0xfc,lsl#24
- add $H3,$H3,$T1.2s // h2 -> h3
- bic $H2,#0xfc,lsl#24
-
- shrn $T0.2s,$ACC0,#26
- xtn $H0,$ACC0
- ushr $T1.2s,$H3,#26
- bic $H3,#0xfc,lsl#24
- bic $H0,#0xfc,lsl#24
- add $H1,$H1,$T0.2s // h0 -> h1
- add $H4,$H4,$T1.2s // h3 -> h4
-
- b.hi .Loop_neon
-
-.Lskip_loop:
- dup $IN23_2,${IN23_2}[0]
- add $IN01_2,$IN01_2,$H2
-
- ////////////////////////////////////////////////////////////////
- // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- adds $len,$len,#32
- b.ne .Long_tail
-
- dup $IN23_2,${IN01_2}[0]
- add $IN23_0,$IN01_0,$H0
- add $IN23_3,$IN01_3,$H3
- add $IN23_1,$IN01_1,$H1
- add $IN23_4,$IN01_4,$H4
-
-.Long_tail:
- dup $IN23_0,${IN23_0}[0]
- umull2 $ACC0,$IN23_2,${S3}
- umull2 $ACC3,$IN23_2,${R1}
- umull2 $ACC4,$IN23_2,${R2}
- umull2 $ACC2,$IN23_2,${R0}
- umull2 $ACC1,$IN23_2,${S4}
-
- dup $IN23_1,${IN23_1}[0]
- umlal2 $ACC0,$IN23_0,${R0}
- umlal2 $ACC2,$IN23_0,${R2}
- umlal2 $ACC3,$IN23_0,${R3}
- umlal2 $ACC4,$IN23_0,${R4}
- umlal2 $ACC1,$IN23_0,${R1}
-
- dup $IN23_3,${IN23_3}[0]
- umlal2 $ACC0,$IN23_1,${S4}
- umlal2 $ACC3,$IN23_1,${R2}
- umlal2 $ACC2,$IN23_1,${R1}
- umlal2 $ACC4,$IN23_1,${R3}
- umlal2 $ACC1,$IN23_1,${R0}
-
- dup $IN23_4,${IN23_4}[0]
- umlal2 $ACC3,$IN23_3,${R0}
- umlal2 $ACC4,$IN23_3,${R1}
- umlal2 $ACC0,$IN23_3,${S2}
- umlal2 $ACC1,$IN23_3,${S3}
- umlal2 $ACC2,$IN23_3,${S4}
-
- umlal2 $ACC3,$IN23_4,${S4}
- umlal2 $ACC0,$IN23_4,${S1}
- umlal2 $ACC4,$IN23_4,${R0}
- umlal2 $ACC1,$IN23_4,${S2}
- umlal2 $ACC2,$IN23_4,${S3}
-
- b.eq .Lshort_tail
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4:r^3 and accumulate
-
- add $IN01_0,$IN01_0,$H0
- umlal $ACC3,$IN01_2,${R1}
- umlal $ACC0,$IN01_2,${S3}
- umlal $ACC4,$IN01_2,${R2}
- umlal $ACC1,$IN01_2,${S4}
- umlal $ACC2,$IN01_2,${R0}
-
- add $IN01_1,$IN01_1,$H1
- umlal $ACC3,$IN01_0,${R3}
- umlal $ACC0,$IN01_0,${R0}
- umlal $ACC4,$IN01_0,${R4}
- umlal $ACC1,$IN01_0,${R1}
- umlal $ACC2,$IN01_0,${R2}
-
- add $IN01_3,$IN01_3,$H3
- umlal $ACC3,$IN01_1,${R2}
- umlal $ACC0,$IN01_1,${S4}
- umlal $ACC4,$IN01_1,${R3}
- umlal $ACC1,$IN01_1,${R0}
- umlal $ACC2,$IN01_1,${R1}
-
- add $IN01_4,$IN01_4,$H4
- umlal $ACC3,$IN01_3,${R0}
- umlal $ACC0,$IN01_3,${S2}
- umlal $ACC4,$IN01_3,${R1}
- umlal $ACC1,$IN01_3,${S3}
- umlal $ACC2,$IN01_3,${S4}
-
- umlal $ACC3,$IN01_4,${S4}
- umlal $ACC0,$IN01_4,${S1}
- umlal $ACC4,$IN01_4,${R0}
- umlal $ACC1,$IN01_4,${S2}
- umlal $ACC2,$IN01_4,${S3}
-
-.Lshort_tail:
- ////////////////////////////////////////////////////////////////
- // horizontal add
-
- addp $ACC3,$ACC3,$ACC3
- ldp d8,d9,[sp,#16] // meet ABI requirements
- addp $ACC0,$ACC0,$ACC0
- ldp d10,d11,[sp,#32]
- addp $ACC4,$ACC4,$ACC4
- ldp d12,d13,[sp,#48]
- addp $ACC1,$ACC1,$ACC1
- ldp d14,d15,[sp,#64]
- addp $ACC2,$ACC2,$ACC2
- ldr x30,[sp,#8]
-
- ////////////////////////////////////////////////////////////////
- // lazy reduction, but without narrowing
-
- ushr $T0.2d,$ACC3,#26
- and $ACC3,$ACC3,$MASK.2d
- ushr $T1.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
-
- add $ACC4,$ACC4,$T0.2d // h3 -> h4
- add $ACC1,$ACC1,$T1.2d // h0 -> h1
-
- ushr $T0.2d,$ACC4,#26
- and $ACC4,$ACC4,$MASK.2d
- ushr $T1.2d,$ACC1,#26
- and $ACC1,$ACC1,$MASK.2d
- add $ACC2,$ACC2,$T1.2d // h1 -> h2
-
- add $ACC0,$ACC0,$T0.2d
- shl $T0.2d,$T0.2d,#2
- ushr $T1.2d,$ACC2,#26
- and $ACC2,$ACC2,$MASK.2d
- add $ACC0,$ACC0,$T0.2d // h4 -> h0
- add $ACC3,$ACC3,$T1.2d // h2 -> h3
-
- ushr $T0.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
- ushr $T1.2d,$ACC3,#26
- and $ACC3,$ACC3,$MASK.2d
- add $ACC1,$ACC1,$T0.2d // h0 -> h1
- add $ACC4,$ACC4,$T1.2d // h3 -> h4
-
- ////////////////////////////////////////////////////////////////
- // write the result, can be partially reduced
-
- st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
- mov x4,#1
- st1 {$ACC4}[0],[$ctx]
- str x4,[$ctx,#8] // set is_base2_26
-
- ldr x29,[sp],#80
- .inst 0xd50323bf // autiasp
- ret
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.pushsection .rodata
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0
-.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
-.popsection
-
-.align 2
-#if !defined(__KERNEL__) && !defined(_WIN64)
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
-___
-
-foreach (split("\n",$code)) {
- s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
- s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
- (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
- (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
- (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
- (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
- (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
-
- s/\.[124]([sd])\[/.$1\[/;
- s/w#x([0-9]+)/w$1/g;
-
- print $_,"\n";
-}
-close STDOUT;
diff --git a/arch/arm64/lib/crypto/poly1305-glue.c b/arch/arm64/lib/crypto/poly1305-glue.c
deleted file mode 100644
index c9a74766785b..000000000000
--- a/arch/arm64/lib/crypto/poly1305-glue.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/cpufeature.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-asmlinkage void poly1305_block_init_arch(
- struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
- u8 digest[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
- unsigned int len, u32 padbit)
-{
- len = round_down(len, POLY1305_BLOCK_SIZE);
- if (static_branch_likely(&have_neon)) {
- do {
- unsigned int todo = min_t(unsigned int, len, SZ_4K);
-
- kernel_neon_begin();
- poly1305_blocks_neon(state, src, todo, padbit);
- kernel_neon_end();
-
- len -= todo;
- src += todo;
- } while (len);
- } else
- poly1305_blocks(state, src, len, padbit);
-}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
- /* We always can use at least the ARM64 scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init neon_poly1305_mod_init(void)
-{
- if (cpu_have_named_feature(ASIMD))
- static_branch_enable(&have_neon);
- return 0;
-}
-subsys_initcall(neon_poly1305_mod_init);
-
-static void __exit neon_poly1305_mod_exit(void)
-{
-}
-module_exit(neon_poly1305_mod_exit);
-
-MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/lib/crypto/sha2-armv8.pl b/arch/arm64/lib/crypto/sha2-armv8.pl
deleted file mode 100644
index 4aebd20c498b..000000000000
--- a/arch/arm64/lib/crypto/sha2-armv8.pl
+++ /dev/null
@@ -1,786 +0,0 @@
-#! /usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from the OpenSSL project but the author (Andy Polyakov)
-# has relicensed it under the GPLv2. Therefore this program is free software;
-# you can redistribute it and/or modify it under the terms of the GNU General
-# Public License version 2 as published by the Free Software Foundation.
-#
-# The original headers, including the original license headers, are
-# included below for completeness.
-
-# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License"). You may not use
-# this file except in compliance with the License. You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# SHA256/512 for ARMv8.
-#
-# Performance in cycles per processed byte and improvement coefficient
-# over code generated with "default" compiler:
-#
-# SHA256-hw SHA256(*) SHA512
-# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
-# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
-# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
-# Denver 2.01 10.5 (+26%) 6.70 (+8%)
-# X-Gene 20.0 (+100%) 12.8 (+300%(***))
-# Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
-#
-# (*) Software SHA256 results are of lesser relevance, presented
-# mostly for informational purposes.
-# (**) The result is a trade-off: it's possible to improve it by
-# 10% (or by 1 cycle per round), but at the cost of 20% loss
-# on Cortex-A53 (or by 4 cycles per round).
-# (***) Super-impressive coefficients over gcc-generated code are
-# indication of some compiler "pathology", most notably code
-# generated with -mgeneral-regs-only is significantly faster
-# and the gap is only 40-90%.
-#
-# October 2016.
-#
-# Originally it was reckoned that it makes no sense to implement NEON
-# version of SHA256 for 64-bit processors. This is because performance
-# improvement on most wide-spread Cortex-A5x processors was observed
-# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
-# observed that 32-bit NEON SHA256 performs significantly better than
-# 64-bit scalar version on *some* of the more recent processors. As
-# result 64-bit NEON version of SHA256 was added to provide best
-# all-round performance. For example it executes ~30% faster on X-Gene
-# and Mongoose. [For reference, NEON version of SHA512 is bound to
-# deliver much less improvement, likely *negative* on Cortex-A5x.
-# Which is why NEON support is limited to SHA256.]
-
-$output=pop;
-$flavour=pop;
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open OUT,"| \"$^X\" $xlate $flavour $output";
- *STDOUT=*OUT;
-} else {
- open STDOUT,">$output";
-}
-
-if ($output =~ /512/) {
- $BITS=512;
- $SZ=8;
- @Sigma0=(28,34,39);
- @Sigma1=(14,18,41);
- @sigma0=(1, 8, 7);
- @sigma1=(19,61, 6);
- $rounds=80;
- $reg_t="x";
-} else {
- $BITS=256;
- $SZ=4;
- @Sigma0=( 2,13,22);
- @Sigma1=( 6,11,25);
- @sigma0=( 7,18, 3);
- @sigma1=(17,19,10);
- $rounds=64;
- $reg_t="w";
-}
-
-$func="sha${BITS}_blocks_arch";
-
-($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
-
-@X=map("$reg_t$_",(3..15,0..2));
-@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
-($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
-
-sub BODY_00_xx {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
-my $j=($i+1)&15;
-my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
- $T0=@X[$i+3] if ($i<11);
-
-$code.=<<___ if ($i<16);
-#ifndef __AARCH64EB__
- rev @X[$i],@X[$i] // $i
-#endif
-___
-$code.=<<___ if ($i<13 && ($i&1));
- ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
-___
-$code.=<<___ if ($i==13);
- ldp @X[14],@X[15],[$inp]
-___
-$code.=<<___ if ($i>=14);
- ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
-___
-$code.=<<___ if ($i>0 && $i<16);
- add $a,$a,$t1 // h+=Sigma0(a)
-___
-$code.=<<___ if ($i>=11);
- str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
-___
-# While ARMv8 specifies merged rotate-n-logical operation such as
-# 'eor x,y,z,ror#n', it was found to negatively affect performance
-# on Apple A7. The reason seems to be that it requires even 'y' to
-# be available earlier. This means that such merged instruction is
-# not necessarily best choice on critical path... On the other hand
-# Cortex-A5x handles merged instructions much better than disjoint
-# rotate and logical... See (**) footnote above.
-$code.=<<___ if ($i<15);
- ror $t0,$e,#$Sigma1[0]
- add $h,$h,$t2 // h+=K[i]
- eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
- and $t1,$f,$e
- bic $t2,$g,$e
- add $h,$h,@X[$i&15] // h+=X[i]
- orr $t1,$t1,$t2 // Ch(e,f,g)
- eor $t2,$a,$b // a^b, b^c in next round
- eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
- ror $T0,$a,#$Sigma0[0]
- add $h,$h,$t1 // h+=Ch(e,f,g)
- eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
- add $h,$h,$t0 // h+=Sigma1(e)
- and $t3,$t3,$t2 // (b^c)&=(a^b)
- add $d,$d,$h // d+=h
- eor $t3,$t3,$b // Maj(a,b,c)
- eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
- add $h,$h,$t3 // h+=Maj(a,b,c)
- ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
- //add $h,$h,$t1 // h+=Sigma0(a)
-___
-$code.=<<___ if ($i>=15);
- ror $t0,$e,#$Sigma1[0]
- add $h,$h,$t2 // h+=K[i]
- ror $T1,@X[($j+1)&15],#$sigma0[0]
- and $t1,$f,$e
- ror $T2,@X[($j+14)&15],#$sigma1[0]
- bic $t2,$g,$e
- ror $T0,$a,#$Sigma0[0]
- add $h,$h,@X[$i&15] // h+=X[i]
- eor $t0,$t0,$e,ror#$Sigma1[1]
- eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
- orr $t1,$t1,$t2 // Ch(e,f,g)
- eor $t2,$a,$b // a^b, b^c in next round
- eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
- eor $T0,$T0,$a,ror#$Sigma0[1]
- add $h,$h,$t1 // h+=Ch(e,f,g)
- and $t3,$t3,$t2 // (b^c)&=(a^b)
- eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
- eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
- add $h,$h,$t0 // h+=Sigma1(e)
- eor $t3,$t3,$b // Maj(a,b,c)
- eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
- eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
- add @X[$j],@X[$j],@X[($j+9)&15]
- add $d,$d,$h // d+=h
- add $h,$h,$t3 // h+=Maj(a,b,c)
- ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
- add @X[$j],@X[$j],$T1
- add $h,$h,$t1 // h+=Sigma0(a)
- add @X[$j],@X[$j],$T2
-___
- ($t2,$t3)=($t3,$t2);
-}
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#endif
-
-.text
-
-.extern OPENSSL_armcap_P
-.globl $func
-.type $func,%function
-.align 6
-$func:
-___
-$code.=<<___ if ($SZ==4);
-#ifndef __KERNEL__
-# ifdef __ILP32__
- ldrsw x16,.LOPENSSL_armcap_P
-# else
- ldr x16,.LOPENSSL_armcap_P
-# endif
- adr x17,.LOPENSSL_armcap_P
- add x16,x16,x17
- ldr w16,[x16]
- tst w16,#ARMV8_SHA256
- b.ne .Lv8_entry
- tst w16,#ARMV7_NEON
- b.ne .Lneon_entry
-#endif
-___
-$code.=<<___;
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
-
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#4*$SZ
-
- ldp $A,$B,[$ctx] // load context
- ldp $C,$D,[$ctx,#2*$SZ]
- ldp $E,$F,[$ctx,#4*$SZ]
- add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
- ldp $G,$H,[$ctx,#6*$SZ]
- adr $Ktbl,.LK$BITS
- stp $ctx,$num,[x29,#96]
-
-.Loop:
- ldp @X[0],@X[1],[$inp],#2*$SZ
- ldr $t2,[$Ktbl],#$SZ // *K++
- eor $t3,$B,$C // magic seed
- str $inp,[x29,#112]
-___
-for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
-$code.=".Loop_16_xx:\n";
-for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
- cbnz $t2,.Loop_16_xx
-
- ldp $ctx,$num,[x29,#96]
- ldr $inp,[x29,#112]
- sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
-
- ldp @X[0],@X[1],[$ctx]
- ldp @X[2],@X[3],[$ctx,#2*$SZ]
- add $inp,$inp,#14*$SZ // advance input pointer
- ldp @X[4],@X[5],[$ctx,#4*$SZ]
- add $A,$A,@X[0]
- ldp @X[6],@X[7],[$ctx,#6*$SZ]
- add $B,$B,@X[1]
- add $C,$C,@X[2]
- add $D,$D,@X[3]
- stp $A,$B,[$ctx]
- add $E,$E,@X[4]
- add $F,$F,@X[5]
- stp $C,$D,[$ctx,#2*$SZ]
- add $G,$G,@X[6]
- add $H,$H,@X[7]
- cmp $inp,$num
- stp $E,$F,[$ctx,#4*$SZ]
- stp $G,$H,[$ctx,#6*$SZ]
- b.ne .Loop
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#4*$SZ
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#128
- ret
-.size $func,.-$func
-
-.align 6
-.type .LK$BITS,%object
-.LK$BITS:
-___
-$code.=<<___ if ($SZ==8);
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
- .quad 0 // terminator
-___
-$code.=<<___ if ($SZ==4);
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
- .long 0 //terminator
-___
-$code.=<<___;
-.size .LK$BITS,.-.LK$BITS
-#ifndef __KERNEL__
-.align 3
-.LOPENSSL_armcap_P:
-# ifdef __ILP32__
- .long OPENSSL_armcap_P-.
-# else
- .quad OPENSSL_armcap_P-.
-# endif
-#endif
-.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-___
-
-if ($SZ==4) {
-my $Ktbl="x3";
-
-my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
-my @MSG=map("v$_.16b",(4..7));
-my ($W0,$W1)=("v16.4s","v17.4s");
-my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
-
-$code.=<<___;
-#ifndef __KERNEL__
-.type sha256_block_armv8,%function
-.align 6
-sha256_block_armv8:
-.Lv8_entry:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- ld1.32 {$ABCD,$EFGH},[$ctx]
- adr $Ktbl,.LK256
-
-.Loop_hw:
- ld1 {@MSG[0]-@MSG[3]},[$inp],#64
- sub $num,$num,#1
- ld1.32 {$W0},[$Ktbl],#16
- rev32 @MSG[0],@MSG[0]
- rev32 @MSG[1],@MSG[1]
- rev32 @MSG[2],@MSG[2]
- rev32 @MSG[3],@MSG[3]
- orr $ABCD_SAVE,$ABCD,$ABCD // offload
- orr $EFGH_SAVE,$EFGH,$EFGH
-___
-for($i=0;$i<12;$i++) {
-$code.=<<___;
- ld1.32 {$W1},[$Ktbl],#16
- add.i32 $W0,$W0,@MSG[0]
- sha256su0 @MSG[0],@MSG[1]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
- sha256su1 @MSG[0],@MSG[2],@MSG[3]
-___
- ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
-}
-$code.=<<___;
- ld1.32 {$W1},[$Ktbl],#16
- add.i32 $W0,$W0,@MSG[0]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- ld1.32 {$W0},[$Ktbl],#16
- add.i32 $W1,$W1,@MSG[1]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- ld1.32 {$W1},[$Ktbl]
- add.i32 $W0,$W0,@MSG[2]
- sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- add.i32 $W1,$W1,@MSG[3]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- add.i32 $ABCD,$ABCD,$ABCD_SAVE
- add.i32 $EFGH,$EFGH,$EFGH_SAVE
-
- cbnz $num,.Loop_hw
-
- st1.32 {$ABCD,$EFGH},[$ctx]
-
- ldr x29,[sp],#16
- ret
-.size sha256_block_armv8,.-sha256_block_armv8
-#endif
-___
-}
-
-if ($SZ==4) { ######################################### NEON stuff #
-# You'll surely note a lot of similarities with sha256-armv4 module,
-# and of course it's not a coincidence. sha256-armv4 was used as
-# initial template, but was adapted for ARMv8 instruction set and
-# extensively re-tuned for all-round performance.
-
-my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
-my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
-my $Ktbl="x16";
-my $Xfer="x17";
-my @X = map("q$_",(0..3));
-my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
-my $j=0;
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
- my $arg = pop;
- $arg = "#$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
-sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
-sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
-
-sub Xupdate()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body);
- my ($a,$b,$c,$d,$e,$f,$g,$h);
-
- &ext_8 ($T0,@X[0],@X[1],4); # X[1..4]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &ext_8 ($T3,@X[2],@X[3],4); # X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15]
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T2,$T0,$sigma0[0]);
- eval(shift(@insns));
- &ushr_32 ($T1,$T0,$sigma0[2]);
- eval(shift(@insns));
- &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12]
- eval(shift(@insns));
- &sli_32 ($T2,$T0,32-$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T3,$T0,$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T1,$T1,$T2);
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_32 ($T3,$T0,32-$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T4,$T7,$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_32 ($T4,$T7,32-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T5,$T7,$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T3,$T7,$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_u32 ($T3,$T7,32-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T5,$T5,$T4);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T6,@X[0],$sigma1[0]);
- eval(shift(@insns));
- &ushr_32 ($T7,@X[0],$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_32 ($T6,@X[0],32-$sigma1[0]);
- eval(shift(@insns));
- &ushr_32 ($T5,@X[0],$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T7,$T7,$T6);
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_32 ($T5,@X[0],32-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ld1_32 ("{$T0}","[$Ktbl], #16");
- eval(shift(@insns));
- &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T5,$T5,$T5);
- eval(shift(@insns));
- eval(shift(@insns));
- &mov (&Dhi($T5), &Dlo($T7));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 ($T0,$T0,@X[0]);
- while($#insns>=1) { eval(shift(@insns)); }
- &st1_32 ("{$T0}","[$Xfer], #16");
- eval(shift(@insns));
-
- push(@X,shift(@X)); # "rotate" X[]
-}
-
-sub Xpreload()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body);
- my ($a,$b,$c,$d,$e,$f,$g,$h);
-
- eval(shift(@insns));
- eval(shift(@insns));
- &ld1_8 ("{@X[0]}","[$inp],#16");
- eval(shift(@insns));
- eval(shift(@insns));
- &ld1_32 ("{$T0}","[$Ktbl],#16");
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &rev32 (@X[0],@X[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 ($T0,$T0,@X[0]);
- foreach (@insns) { eval; } # remaining instructions
- &st1_32 ("{$T0}","[$Xfer], #16");
-
- push(@X,shift(@X)); # "rotate" X[]
-}
-
-sub body_00_15 () {
- (
- '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
- '&add ($h,$h,$t1)', # h+=X[i]+K[i]
- '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past
- '&and ($t1,$f,$e)',
- '&bic ($t4,$g,$e)',
- '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
- '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
- '&orr ($t1,$t1,$t4)', # Ch(e,f,g)
- '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
- '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
- '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
- '&ror ($t0,$t0,"#$Sigma1[0]")',
- '&eor ($t2,$a,$b)', # a^b, b^c in next round
- '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
- '&add ($h,$h,$t0)', # h+=Sigma1(e)
- '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
- '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
- '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
- '&ror ($t4,$t4,"#$Sigma0[0]")',
- '&add ($d,$d,$h)', # d+=h
- '&eor ($t3,$t3,$b)', # Maj(a,b,c)
- '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
- )
-}
-
-$code.=<<___;
-#ifdef __KERNEL__
-.globl sha256_block_neon
-#endif
-.type sha256_block_neon,%function
-.align 4
-sha256_block_neon:
-.Lneon_entry:
- stp x29, x30, [sp, #-16]!
- mov x29, sp
- sub sp,sp,#16*4
-
- adr $Ktbl,.LK256
- add $num,$inp,$num,lsl#6 // len to point at the end of inp
-
- ld1.8 {@X[0]},[$inp], #16
- ld1.8 {@X[1]},[$inp], #16
- ld1.8 {@X[2]},[$inp], #16
- ld1.8 {@X[3]},[$inp], #16
- ld1.32 {$T0},[$Ktbl], #16
- ld1.32 {$T1},[$Ktbl], #16
- ld1.32 {$T2},[$Ktbl], #16
- ld1.32 {$T3},[$Ktbl], #16
- rev32 @X[0],@X[0] // yes, even on
- rev32 @X[1],@X[1] // big-endian
- rev32 @X[2],@X[2]
- rev32 @X[3],@X[3]
- mov $Xfer,sp
- add.32 $T0,$T0,@X[0]
- add.32 $T1,$T1,@X[1]
- add.32 $T2,$T2,@X[2]
- st1.32 {$T0-$T1},[$Xfer], #32
- add.32 $T3,$T3,@X[3]
- st1.32 {$T2-$T3},[$Xfer]
- sub $Xfer,$Xfer,#32
-
- ldp $A,$B,[$ctx]
- ldp $C,$D,[$ctx,#8]
- ldp $E,$F,[$ctx,#16]
- ldp $G,$H,[$ctx,#24]
- ldr $t1,[sp,#0]
- mov $t2,wzr
- eor $t3,$B,$C
- mov $t4,wzr
- b .L_00_48
-
-.align 4
-.L_00_48:
-___
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
-$code.=<<___;
- cmp $t1,#0 // check for K256 terminator
- ldr $t1,[sp,#0]
- sub $Xfer,$Xfer,#64
- bne .L_00_48
-
- sub $Ktbl,$Ktbl,#256 // rewind $Ktbl
- cmp $inp,$num
- mov $Xfer, #64
- csel $Xfer, $Xfer, xzr, eq
- sub $inp,$inp,$Xfer // avoid SEGV
- mov $Xfer,sp
-___
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
-$code.=<<___;
- add $A,$A,$t4 // h+=Sigma0(a) from the past
- ldp $t0,$t1,[$ctx,#0]
- add $A,$A,$t2 // h+=Maj(a,b,c) from the past
- ldp $t2,$t3,[$ctx,#8]
- add $A,$A,$t0 // accumulate
- add $B,$B,$t1
- ldp $t0,$t1,[$ctx,#16]
- add $C,$C,$t2
- add $D,$D,$t3
- ldp $t2,$t3,[$ctx,#24]
- add $E,$E,$t0
- add $F,$F,$t1
- ldr $t1,[sp,#0]
- stp $A,$B,[$ctx,#0]
- add $G,$G,$t2
- mov $t2,wzr
- stp $C,$D,[$ctx,#8]
- add $H,$H,$t3
- stp $E,$F,[$ctx,#16]
- eor $t3,$B,$C
- stp $G,$H,[$ctx,#24]
- mov $t4,wzr
- mov $Xfer,sp
- b.ne .L_00_48
-
- ldr x29,[x29]
- add sp,sp,#16*4+16
- ret
-.size sha256_block_neon,.-sha256_block_neon
-___
-}
-
-$code.=<<___;
-#ifndef __KERNEL__
-.comm OPENSSL_armcap_P,4,4
-#endif
-___
-
-{ my %opcode = (
- "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
- "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
-
- sub unsha256 {
- my ($mnemonic,$arg)=@_;
-
- $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
- &&
- sprintf ".inst\t0x%08x\t//%s %s",
- $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
- $mnemonic,$arg;
- }
-}
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-foreach(split("\n",$code)) {
-
- s/\`([^\`]*)\`/eval($1)/ge;
-
- s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
-
- s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
-
- s/\.[ui]?8(\s)/$1/;
- s/\.\w?32\b// and s/\.16b/\.4s/g;
- m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;
-
- print $_,"\n";
-}
-
-close STDOUT;
diff --git a/arch/arm64/lib/crypto/sha256-ce.S b/arch/arm64/lib/crypto/sha256-ce.S
deleted file mode 100644
index f3e21c6d87d2..000000000000
--- a/arch/arm64/lib/crypto/sha256-ce.S
+++ /dev/null
@@ -1,136 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
- *
- * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a+crypto
-
- dga .req q20
- dgav .req v20
- dgb .req q21
- dgbv .req v21
-
- t0 .req v22
- t1 .req v23
-
- dg0q .req q24
- dg0v .req v24
- dg1q .req q25
- dg1v .req v25
- dg2q .req q26
- dg2v .req v26
-
- .macro add_only, ev, rc, s0
- mov dg2v.16b, dg0v.16b
- .ifeq \ev
- add t1.4s, v\s0\().4s, \rc\().4s
- sha256h dg0q, dg1q, t0.4s
- sha256h2 dg1q, dg2q, t0.4s
- .else
- .ifnb \s0
- add t0.4s, v\s0\().4s, \rc\().4s
- .endif
- sha256h dg0q, dg1q, t1.4s
- sha256h2 dg1q, dg2q, t1.4s
- .endif
- .endm
-
- .macro add_update, ev, rc, s0, s1, s2, s3
- sha256su0 v\s0\().4s, v\s1\().4s
- add_only \ev, \rc, \s1
- sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s
- .endm
-
- /*
- * The SHA-256 round constants
- */
- .section ".rodata", "a"
- .align 4
-.Lsha2_rcon:
- .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
- .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
- .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
- .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
- .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
- .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
- .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
- .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
- .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
- .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
- .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
- .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
- .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
- .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
- .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
- .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-
- /*
- * size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
- * const u8 *data, size_t nblocks);
- */
- .text
-SYM_FUNC_START(__sha256_ce_transform)
- /* load round constants */
- adr_l x8, .Lsha2_rcon
- ld1 { v0.4s- v3.4s}, [x8], #64
- ld1 { v4.4s- v7.4s}, [x8], #64
- ld1 { v8.4s-v11.4s}, [x8], #64
- ld1 {v12.4s-v15.4s}, [x8]
-
- /* load state */
- ld1 {dgav.4s, dgbv.4s}, [x0]
-
- /* load input */
-0: ld1 {v16.4s-v19.4s}, [x1], #64
- sub x2, x2, #1
-
-CPU_LE( rev32 v16.16b, v16.16b )
-CPU_LE( rev32 v17.16b, v17.16b )
-CPU_LE( rev32 v18.16b, v18.16b )
-CPU_LE( rev32 v19.16b, v19.16b )
-
- add t0.4s, v16.4s, v0.4s
- mov dg0v.16b, dgav.16b
- mov dg1v.16b, dgbv.16b
-
- add_update 0, v1, 16, 17, 18, 19
- add_update 1, v2, 17, 18, 19, 16
- add_update 0, v3, 18, 19, 16, 17
- add_update 1, v4, 19, 16, 17, 18
-
- add_update 0, v5, 16, 17, 18, 19
- add_update 1, v6, 17, 18, 19, 16
- add_update 0, v7, 18, 19, 16, 17
- add_update 1, v8, 19, 16, 17, 18
-
- add_update 0, v9, 16, 17, 18, 19
- add_update 1, v10, 17, 18, 19, 16
- add_update 0, v11, 18, 19, 16, 17
- add_update 1, v12, 19, 16, 17, 18
-
- add_only 0, v13, 17
- add_only 1, v14, 18
- add_only 0, v15, 19
- add_only 1
-
- /* update state */
- add dgav.4s, dgav.4s, dg0v.4s
- add dgbv.4s, dgbv.4s, dg1v.4s
-
- /* return early if voluntary preemption is needed */
- cond_yield 1f, x5, x6
-
- /* handled all input blocks? */
- cbnz x2, 0b
-
- /* store new state */
-1: st1 {dgav.4s, dgbv.4s}, [x0]
- mov x0, x2
- ret
-SYM_FUNC_END(__sha256_ce_transform)
diff --git a/arch/arm64/lib/crypto/sha256.c b/arch/arm64/lib/crypto/sha256.c
deleted file mode 100644
index bcf7a3adc0c4..000000000000
--- a/arch/arm64/lib/crypto/sha256.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized for ARM64
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/neon.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-asmlinkage void sha256_block_neon(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
-
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- static_branch_likely(&have_neon)) {
- if (static_branch_likely(&have_ce)) {
- do {
- size_t rem;
-
- kernel_neon_begin();
- rem = __sha256_ce_transform(state,
- data, nblocks);
- kernel_neon_end();
- data += (nblocks - rem) * SHA256_BLOCK_SIZE;
- nblocks = rem;
- } while (nblocks);
- } else {
- kernel_neon_begin();
- sha256_block_neon(state, data, nblocks);
- kernel_neon_end();
- }
- } else {
- sha256_blocks_arch(state, data, nblocks);
- }
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
-
-bool sha256_is_arch_optimized(void)
-{
- /* We always can use at least the ARM64 scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init sha256_arm64_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- cpu_have_named_feature(ASIMD)) {
- static_branch_enable(&have_neon);
- if (cpu_have_named_feature(SHA2))
- static_branch_enable(&have_ce);
- }
- return 0;
-}
-subsys_initcall(sha256_arm64_mod_init);
-
-static void __exit sha256_arm64_mod_exit(void)
-{
-}
-module_exit(sha256_arm64_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 optimized for ARM64");
diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl
index 0765b3a8d6d6..8d9088bc577d 100644
--- a/arch/arm64/tools/syscall_32.tbl
+++ b/arch/arm64/tools/syscall_32.tbl
@@ -479,3 +479,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/arch/csky/kernel/ptrace.c b/arch/csky/kernel/ptrace.c
index 0f7e7b653c72..6bb685a2646b 100644
--- a/arch/csky/kernel/ptrace.c
+++ b/arch/csky/kernel/ptrace.c
@@ -166,7 +166,7 @@ static int fpr_set(struct task_struct *target,
static const struct user_regset csky_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(struct pt_regs) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -174,7 +174,7 @@ static const struct user_regset csky_regsets[] = {
.set = gpr_set,
},
[REGSET_FPR] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_fp) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
diff --git a/arch/hexagon/kernel/ptrace.c b/arch/hexagon/kernel/ptrace.c
index 905b06790ab7..2093eee143e1 100644
--- a/arch/hexagon/kernel/ptrace.c
+++ b/arch/hexagon/kernel/ptrace.c
@@ -137,7 +137,7 @@ enum hexagon_regset {
static const struct user_regset hexagon_regsets[] = {
[REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = ELF_NGREG,
.size = sizeof(unsigned long),
.align = sizeof(unsigned long),
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 4b19f93379a1..fa3eb87f0999 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -15,7 +15,6 @@ config LOONGARCH
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_CPU_FINALIZE_INIT
- select ARCH_HAS_CRC32
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VM_PGTABLE
select ARCH_HAS_FAST_MULTIPLIER
@@ -120,11 +119,11 @@ config LOONGARCH
select HAVE_ARCH_KASAN
select HAVE_ARCH_KFENCE
select HAVE_ARCH_KGDB if PERF_EVENTS
+ select HAVE_ARCH_KSTACK_ERASE
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP
select HAVE_ARCH_SECCOMP_FILTER
- select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD
diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
index 80ddb5edb845..b04d2cef935f 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -10,5 +10,4 @@ generic-y += user.h
generic-y += ioctl.h
generic-y += mmzone.h
generic-y += statfs.h
-generic-y += param.h
generic-y += text-patching.h
diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c
index 5e2402cfcab0..8edd0954e55a 100644
--- a/arch/loongarch/kernel/ptrace.c
+++ b/arch/loongarch/kernel/ptrace.c
@@ -864,7 +864,7 @@ enum loongarch_regset {
static const struct user_regset loongarch64_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = ELF_NGREG,
.size = sizeof(elf_greg_t),
.align = sizeof(elf_greg_t),
@@ -872,7 +872,7 @@ static const struct user_regset loongarch64_regsets[] = {
.set = gpr_set,
},
[REGSET_FPR] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = ELF_NFPREG,
.size = sizeof(elf_fpreg_t),
.align = sizeof(elf_fpreg_t),
@@ -880,7 +880,7 @@ static const struct user_regset loongarch64_regsets[] = {
.set = fpr_set,
},
[REGSET_CPUCFG] = {
- .core_note_type = NT_LOONGARCH_CPUCFG,
+ USER_REGSET_NOTE_TYPE(LOONGARCH_CPUCFG),
.n = 64,
.size = sizeof(u32),
.align = sizeof(u32),
@@ -889,7 +889,7 @@ static const struct user_regset loongarch64_regsets[] = {
},
#ifdef CONFIG_CPU_HAS_LSX
[REGSET_LSX] = {
- .core_note_type = NT_LOONGARCH_LSX,
+ USER_REGSET_NOTE_TYPE(LOONGARCH_LSX),
.n = NUM_FPU_REGS,
.size = 16,
.align = 16,
@@ -899,7 +899,7 @@ static const struct user_regset loongarch64_regsets[] = {
#endif
#ifdef CONFIG_CPU_HAS_LASX
[REGSET_LASX] = {
- .core_note_type = NT_LOONGARCH_LASX,
+ USER_REGSET_NOTE_TYPE(LOONGARCH_LASX),
.n = NUM_FPU_REGS,
.size = 32,
.align = 32,
@@ -909,7 +909,7 @@ static const struct user_regset loongarch64_regsets[] = {
#endif
#ifdef CONFIG_CPU_HAS_LBT
[REGSET_LBT] = {
- .core_note_type = NT_LOONGARCH_LBT,
+ USER_REGSET_NOTE_TYPE(LOONGARCH_LBT),
.n = 5,
.size = sizeof(u64),
.align = sizeof(u64),
@@ -919,7 +919,7 @@ static const struct user_regset loongarch64_regsets[] = {
#endif
#ifdef CONFIG_HAVE_HW_BREAKPOINT
[REGSET_HW_BREAK] = {
- .core_note_type = NT_LOONGARCH_HW_BREAK,
+ USER_REGSET_NOTE_TYPE(LOONGARCH_HW_BREAK),
.n = sizeof(struct user_watch_state_v2) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -927,7 +927,7 @@ static const struct user_regset loongarch64_regsets[] = {
.set = hw_break_set,
},
[REGSET_HW_WATCH] = {
- .core_note_type = NT_LOONGARCH_HW_WATCH,
+ USER_REGSET_NOTE_TYPE(LOONGARCH_HW_WATCH),
.n = sizeof(struct user_watch_state_v2) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
diff --git a/arch/loongarch/lib/Makefile b/arch/loongarch/lib/Makefile
index fae77809048b..ccea3bbd4353 100644
--- a/arch/loongarch/lib/Makefile
+++ b/arch/loongarch/lib/Makefile
@@ -11,5 +11,3 @@ obj-$(CONFIG_ARCH_SUPPORTS_INT128) += tishift.o
obj-$(CONFIG_CPU_HAS_LSX) += xor_simd.o xor_simd_glue.o
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
-
-obj-$(CONFIG_CRC32_ARCH) += crc32-loongarch.o
diff --git a/arch/loongarch/lib/crc32-loongarch.c b/arch/loongarch/lib/crc32-loongarch.c
deleted file mode 100644
index db22c2ec55e2..000000000000
--- a/arch/loongarch/lib/crc32-loongarch.c
+++ /dev/null
@@ -1,136 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * CRC32 and CRC32C using LoongArch crc* instructions
- *
- * Module based on mips/crypto/crc32-mips.c
- *
- * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org>
- * Copyright (C) 2018 MIPS Tech, LLC
- * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
- */
-
-#include <asm/cpu-features.h>
-#include <linux/crc32.h>
-#include <linux/export.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-#define _CRC32(crc, value, size, type) \
-do { \
- __asm__ __volatile__( \
- #type ".w." #size ".w" " %0, %1, %0\n\t"\
- : "+r" (crc) \
- : "r" (value) \
- : "memory"); \
-} while (0)
-
-#define CRC32(crc, value, size) _CRC32(crc, value, size, crc)
-#define CRC32C(crc, value, size) _CRC32(crc, value, size, crcc)
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!static_branch_likely(&have_crc32))
- return crc32_le_base(crc, p, len);
-
- while (len >= sizeof(u64)) {
- u64 value = get_unaligned_le64(p);
-
- CRC32(crc, value, d);
- p += sizeof(u64);
- len -= sizeof(u64);
- }
-
- if (len & sizeof(u32)) {
- u32 value = get_unaligned_le32(p);
-
- CRC32(crc, value, w);
- p += sizeof(u32);
- }
-
- if (len & sizeof(u16)) {
- u16 value = get_unaligned_le16(p);
-
- CRC32(crc, value, h);
- p += sizeof(u16);
- }
-
- if (len & sizeof(u8)) {
- u8 value = *p++;
-
- CRC32(crc, value, b);
- }
-
- return crc;
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!static_branch_likely(&have_crc32))
- return crc32c_base(crc, p, len);
-
- while (len >= sizeof(u64)) {
- u64 value = get_unaligned_le64(p);
-
- CRC32C(crc, value, d);
- p += sizeof(u64);
- len -= sizeof(u64);
- }
-
- if (len & sizeof(u32)) {
- u32 value = get_unaligned_le32(p);
-
- CRC32C(crc, value, w);
- p += sizeof(u32);
- }
-
- if (len & sizeof(u16)) {
- u16 value = get_unaligned_le16(p);
-
- CRC32C(crc, value, h);
- p += sizeof(u16);
- }
-
- if (len & sizeof(u8)) {
- u8 value = *p++;
-
- CRC32C(crc, value, b);
- }
-
- return crc;
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- return crc32_be_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-static int __init crc32_loongarch_init(void)
-{
- if (cpu_has_crc32)
- static_branch_enable(&have_crc32);
- return 0;
-}
-subsys_initcall(crc32_loongarch_init);
-
-static void __exit crc32_loongarch_exit(void)
-{
-}
-module_exit(crc32_loongarch_exit);
-
-u32 crc32_optimizations(void)
-{
- if (static_key_enabled(&have_crc32))
- return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION;
- return 0;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_AUTHOR("Min Zhou <zhoumin@loongson.cn>");
-MODULE_AUTHOR("Huacai Chen <chenhuacai@loongson.cn>");
-MODULE_DESCRIPTION("CRC32 and CRC32C using LoongArch crc* instructions");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c
index c20d590e4297..cfa2df24eced 100644
--- a/arch/m68k/kernel/ptrace.c
+++ b/arch/m68k/kernel/ptrace.c
@@ -319,7 +319,7 @@ enum m68k_regset {
static const struct user_regset m68k_user_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = ELF_NGREG,
.size = sizeof(u32),
.align = sizeof(u16),
@@ -327,7 +327,7 @@ static const struct user_regset m68k_user_regsets[] = {
},
#ifdef CONFIG_FPU
[REGSET_FPU] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_m68kfp_struct) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 9fe47112c586..f41d38dfbf13 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -467,3 +467,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 7b6e97828e55..580af574fe73 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -473,3 +473,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 1e48184ecf1e..934eb961bd0d 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2024,7 +2024,6 @@ config CPU_MIPSR5
config CPU_MIPSR6
bool
default y if CPU_MIPS32_R6 || CPU_MIPS64_R6
- select ARCH_HAS_CRC32
select CPU_HAS_RIXI
select CPU_HAS_DIEI if !CPU_DIEI_BROKEN
select HAVE_ARCH_BITREVERSE
diff --git a/arch/mips/cavium-octeon/Kconfig b/arch/mips/cavium-octeon/Kconfig
index 11f4aa6e80e9..450e979ef5d9 100644
--- a/arch/mips/cavium-octeon/Kconfig
+++ b/arch/mips/cavium-octeon/Kconfig
@@ -23,12 +23,6 @@ config CAVIUM_OCTEON_CVMSEG_SIZE
legally range is from zero to 54 cache blocks (i.e. CVMSEG LM is
between zero and 6192 bytes).
-config CRYPTO_SHA256_OCTEON
- tristate
- default CRYPTO_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256
- select CRYPTO_LIB_SHA256_GENERIC
-
endif # CPU_CAVIUM_OCTEON
if CAVIUM_OCTEON_SOC
diff --git a/arch/mips/cavium-octeon/crypto/Makefile b/arch/mips/cavium-octeon/crypto/Makefile
index db26c73fa0ed..83f2f5dd93cc 100644
--- a/arch/mips/cavium-octeon/crypto/Makefile
+++ b/arch/mips/cavium-octeon/crypto/Makefile
@@ -6,6 +6,3 @@
obj-y += octeon-crypto.o
obj-$(CONFIG_CRYPTO_MD5_OCTEON) += octeon-md5.o
-obj-$(CONFIG_CRYPTO_SHA1_OCTEON) += octeon-sha1.o
-obj-$(CONFIG_CRYPTO_SHA256_OCTEON) += octeon-sha256.o
-obj-$(CONFIG_CRYPTO_SHA512_OCTEON) += octeon-sha512.o
diff --git a/arch/mips/cavium-octeon/crypto/octeon-crypto.c b/arch/mips/cavium-octeon/crypto/octeon-crypto.c
index cfb4a146cf17..0ff8559391f5 100644
--- a/arch/mips/cavium-octeon/crypto/octeon-crypto.c
+++ b/arch/mips/cavium-octeon/crypto/octeon-crypto.c
@@ -7,12 +7,11 @@
*/
#include <asm/cop2.h>
+#include <asm/octeon/crypto.h>
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/sched/task_stack.h>
-#include "octeon-crypto.h"
-
/**
* Enable access to Octeon's COP2 crypto hardware for kernel use. Wrap any
* crypto operations in calls to octeon_crypto_enable/disable in order to make
diff --git a/arch/mips/cavium-octeon/crypto/octeon-md5.c b/arch/mips/cavium-octeon/crypto/octeon-md5.c
index fbc84eb7fedf..a8ce831e2ceb 100644
--- a/arch/mips/cavium-octeon/crypto/octeon-md5.c
+++ b/arch/mips/cavium-octeon/crypto/octeon-md5.c
@@ -19,6 +19,7 @@
* any later version.
*/
+#include <asm/octeon/crypto.h>
#include <asm/octeon/octeon.h>
#include <crypto/internal/hash.h>
#include <crypto/md5.h>
@@ -27,8 +28,6 @@
#include <linux/string.h>
#include <linux/unaligned.h>
-#include "octeon-crypto.h"
-
struct octeon_md5_state {
__le32 hash[MD5_HASH_WORDS];
u64 byte_count;
diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha1.c b/arch/mips/cavium-octeon/crypto/octeon-sha1.c
deleted file mode 100644
index e70f21a473da..000000000000
--- a/arch/mips/cavium-octeon/crypto/octeon-sha1.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API.
- *
- * SHA1 Secure Hash Algorithm.
- *
- * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
- *
- * Based on crypto/sha1_generic.c, which is:
- *
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- */
-
-#include <asm/octeon/octeon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "octeon-crypto.h"
-
-/*
- * We pass everything as 64-bit. OCTEON can handle misaligned data.
- */
-
-static void octeon_sha1_store_hash(struct sha1_state *sctx)
-{
- u64 *hash = (u64 *)sctx->state;
- union {
- u32 word[2];
- u64 dword;
- } hash_tail = { { sctx->state[4], } };
-
- write_octeon_64bit_hash_dword(hash[0], 0);
- write_octeon_64bit_hash_dword(hash[1], 1);
- write_octeon_64bit_hash_dword(hash_tail.dword, 2);
- memzero_explicit(&hash_tail.word[0], sizeof(hash_tail.word[0]));
-}
-
-static void octeon_sha1_read_hash(struct sha1_state *sctx)
-{
- u64 *hash = (u64 *)sctx->state;
- union {
- u32 word[2];
- u64 dword;
- } hash_tail;
-
- hash[0] = read_octeon_64bit_hash_dword(0);
- hash[1] = read_octeon_64bit_hash_dword(1);
- hash_tail.dword = read_octeon_64bit_hash_dword(2);
- sctx->state[4] = hash_tail.word[0];
- memzero_explicit(&hash_tail.dword, sizeof(hash_tail.dword));
-}
-
-static void octeon_sha1_transform(struct sha1_state *sctx, const u8 *src,
- int blocks)
-{
- do {
- const u64 *block = (const u64 *)src;
-
- write_octeon_64bit_block_dword(block[0], 0);
- write_octeon_64bit_block_dword(block[1], 1);
- write_octeon_64bit_block_dword(block[2], 2);
- write_octeon_64bit_block_dword(block[3], 3);
- write_octeon_64bit_block_dword(block[4], 4);
- write_octeon_64bit_block_dword(block[5], 5);
- write_octeon_64bit_block_dword(block[6], 6);
- octeon_sha1_start(block[7]);
-
- src += SHA1_BLOCK_SIZE;
- } while (--blocks);
-}
-
-static int octeon_sha1_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- struct sha1_state *sctx = shash_desc_ctx(desc);
- struct octeon_cop2_state state;
- unsigned long flags;
- int remain;
-
- flags = octeon_crypto_enable(&state);
- octeon_sha1_store_hash(sctx);
-
- remain = sha1_base_do_update_blocks(desc, data, len,
- octeon_sha1_transform);
-
- octeon_sha1_read_hash(sctx);
- octeon_crypto_disable(&state, flags);
- return remain;
-}
-
-static int octeon_sha1_finup(struct shash_desc *desc, const u8 *src,
- unsigned int len, u8 *out)
-{
- struct sha1_state *sctx = shash_desc_ctx(desc);
- struct octeon_cop2_state state;
- unsigned long flags;
-
- flags = octeon_crypto_enable(&state);
- octeon_sha1_store_hash(sctx);
-
- sha1_base_do_finup(desc, src, len, octeon_sha1_transform);
-
- octeon_sha1_read_hash(sctx);
- octeon_crypto_disable(&state, flags);
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg octeon_sha1_alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = octeon_sha1_update,
- .finup = octeon_sha1_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name= "octeon-sha1",
- .cra_priority = OCTEON_CR_OPCODE_PRIORITY,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init octeon_sha1_mod_init(void)
-{
- if (!octeon_has_crypto())
- return -ENOTSUPP;
- return crypto_register_shash(&octeon_sha1_alg);
-}
-
-static void __exit octeon_sha1_mod_fini(void)
-{
- crypto_unregister_shash(&octeon_sha1_alg);
-}
-
-module_init(octeon_sha1_mod_init);
-module_exit(octeon_sha1_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm (OCTEON)");
-MODULE_AUTHOR("Aaro Koskinen <aaro.koskinen@iki.fi>");
diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha256.c b/arch/mips/cavium-octeon/crypto/octeon-sha256.c
deleted file mode 100644
index f93faaf1f4af..000000000000
--- a/arch/mips/cavium-octeon/crypto/octeon-sha256.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 Secure Hash Algorithm.
- *
- * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
- *
- * Based on crypto/sha256_generic.c, which is:
- *
- * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
- * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
- */
-
-#include <asm/octeon/octeon.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "octeon-crypto.h"
-
-/*
- * We pass everything as 64-bit. OCTEON can handle misaligned data.
- */
-
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- struct octeon_cop2_state cop2_state;
- u64 *state64 = (u64 *)state;
- unsigned long flags;
-
- if (!octeon_has_crypto())
- return sha256_blocks_generic(state, data, nblocks);
-
- flags = octeon_crypto_enable(&cop2_state);
- write_octeon_64bit_hash_dword(state64[0], 0);
- write_octeon_64bit_hash_dword(state64[1], 1);
- write_octeon_64bit_hash_dword(state64[2], 2);
- write_octeon_64bit_hash_dword(state64[3], 3);
-
- do {
- const u64 *block = (const u64 *)data;
-
- write_octeon_64bit_block_dword(block[0], 0);
- write_octeon_64bit_block_dword(block[1], 1);
- write_octeon_64bit_block_dword(block[2], 2);
- write_octeon_64bit_block_dword(block[3], 3);
- write_octeon_64bit_block_dword(block[4], 4);
- write_octeon_64bit_block_dword(block[5], 5);
- write_octeon_64bit_block_dword(block[6], 6);
- octeon_sha256_start(block[7]);
-
- data += SHA256_BLOCK_SIZE;
- } while (--nblocks);
-
- state64[0] = read_octeon_64bit_hash_dword(0);
- state64[1] = read_octeon_64bit_hash_dword(1);
- state64[2] = read_octeon_64bit_hash_dword(2);
- state64[3] = read_octeon_64bit_hash_dword(3);
- octeon_crypto_disable(&cop2_state, flags);
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-bool sha256_is_arch_optimized(void)
-{
- return octeon_has_crypto();
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm (OCTEON)");
-MODULE_AUTHOR("Aaro Koskinen <aaro.koskinen@iki.fi>");
diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha512.c b/arch/mips/cavium-octeon/crypto/octeon-sha512.c
deleted file mode 100644
index 215311053db3..000000000000
--- a/arch/mips/cavium-octeon/crypto/octeon-sha512.c
+++ /dev/null
@@ -1,167 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API.
- *
- * SHA-512 and SHA-384 Secure Hash Algorithm.
- *
- * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
- *
- * Based on crypto/sha512_generic.c, which is:
- *
- * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
- */
-
-#include <asm/octeon/octeon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "octeon-crypto.h"
-
-/*
- * We pass everything as 64-bit. OCTEON can handle misaligned data.
- */
-
-static void octeon_sha512_store_hash(struct sha512_state *sctx)
-{
- write_octeon_64bit_hash_sha512(sctx->state[0], 0);
- write_octeon_64bit_hash_sha512(sctx->state[1], 1);
- write_octeon_64bit_hash_sha512(sctx->state[2], 2);
- write_octeon_64bit_hash_sha512(sctx->state[3], 3);
- write_octeon_64bit_hash_sha512(sctx->state[4], 4);
- write_octeon_64bit_hash_sha512(sctx->state[5], 5);
- write_octeon_64bit_hash_sha512(sctx->state[6], 6);
- write_octeon_64bit_hash_sha512(sctx->state[7], 7);
-}
-
-static void octeon_sha512_read_hash(struct sha512_state *sctx)
-{
- sctx->state[0] = read_octeon_64bit_hash_sha512(0);
- sctx->state[1] = read_octeon_64bit_hash_sha512(1);
- sctx->state[2] = read_octeon_64bit_hash_sha512(2);
- sctx->state[3] = read_octeon_64bit_hash_sha512(3);
- sctx->state[4] = read_octeon_64bit_hash_sha512(4);
- sctx->state[5] = read_octeon_64bit_hash_sha512(5);
- sctx->state[6] = read_octeon_64bit_hash_sha512(6);
- sctx->state[7] = read_octeon_64bit_hash_sha512(7);
-}
-
-static void octeon_sha512_transform(struct sha512_state *sctx,
- const u8 *src, int blocks)
-{
- do {
- const u64 *block = (const u64 *)src;
-
- write_octeon_64bit_block_sha512(block[0], 0);
- write_octeon_64bit_block_sha512(block[1], 1);
- write_octeon_64bit_block_sha512(block[2], 2);
- write_octeon_64bit_block_sha512(block[3], 3);
- write_octeon_64bit_block_sha512(block[4], 4);
- write_octeon_64bit_block_sha512(block[5], 5);
- write_octeon_64bit_block_sha512(block[6], 6);
- write_octeon_64bit_block_sha512(block[7], 7);
- write_octeon_64bit_block_sha512(block[8], 8);
- write_octeon_64bit_block_sha512(block[9], 9);
- write_octeon_64bit_block_sha512(block[10], 10);
- write_octeon_64bit_block_sha512(block[11], 11);
- write_octeon_64bit_block_sha512(block[12], 12);
- write_octeon_64bit_block_sha512(block[13], 13);
- write_octeon_64bit_block_sha512(block[14], 14);
- octeon_sha512_start(block[15]);
-
- src += SHA512_BLOCK_SIZE;
- } while (--blocks);
-}
-
-static int octeon_sha512_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- struct sha512_state *sctx = shash_desc_ctx(desc);
- struct octeon_cop2_state state;
- unsigned long flags;
- int remain;
-
- flags = octeon_crypto_enable(&state);
- octeon_sha512_store_hash(sctx);
-
- remain = sha512_base_do_update_blocks(desc, data, len,
- octeon_sha512_transform);
-
- octeon_sha512_read_hash(sctx);
- octeon_crypto_disable(&state, flags);
- return remain;
-}
-
-static int octeon_sha512_finup(struct shash_desc *desc, const u8 *src,
- unsigned int len, u8 *hash)
-{
- struct sha512_state *sctx = shash_desc_ctx(desc);
- struct octeon_cop2_state state;
- unsigned long flags;
-
- flags = octeon_crypto_enable(&state);
- octeon_sha512_store_hash(sctx);
-
- sha512_base_do_finup(desc, src, len, octeon_sha512_transform);
-
- octeon_sha512_read_hash(sctx);
- octeon_crypto_disable(&state, flags);
- return sha512_base_finish(desc, hash);
-}
-
-static struct shash_alg octeon_sha512_algs[2] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = octeon_sha512_update,
- .finup = octeon_sha512_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name= "octeon-sha512",
- .cra_priority = OCTEON_CR_OPCODE_PRIORITY,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = octeon_sha512_update,
- .finup = octeon_sha512_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name= "octeon-sha384",
- .cra_priority = OCTEON_CR_OPCODE_PRIORITY,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int __init octeon_sha512_mod_init(void)
-{
- if (!octeon_has_crypto())
- return -ENOTSUPP;
- return crypto_register_shashes(octeon_sha512_algs,
- ARRAY_SIZE(octeon_sha512_algs));
-}
-
-static void __exit octeon_sha512_mod_fini(void)
-{
- crypto_unregister_shashes(octeon_sha512_algs,
- ARRAY_SIZE(octeon_sha512_algs));
-}
-
-module_init(octeon_sha512_mod_init);
-module_exit(octeon_sha512_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-512 and SHA-384 Secure Hash Algorithms (OCTEON)");
-MODULE_AUTHOR("Aaro Koskinen <aaro.koskinen@iki.fi>");
diff --git a/arch/mips/configs/cavium_octeon_defconfig b/arch/mips/configs/cavium_octeon_defconfig
index 88ae0aa85364..3f50e1d78894 100644
--- a/arch/mips/configs/cavium_octeon_defconfig
+++ b/arch/mips/configs/cavium_octeon_defconfig
@@ -156,8 +156,6 @@ CONFIG_SECURITY_NETWORK=y
CONFIG_CRYPTO_CBC=y
CONFIG_CRYPTO_HMAC=y
CONFIG_CRYPTO_MD5_OCTEON=y
-CONFIG_CRYPTO_SHA1_OCTEON=m
-CONFIG_CRYPTO_SHA512_OCTEON=m
CONFIG_CRYPTO_DES=y
CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
CONFIG_DEBUG_FS=y
diff --git a/arch/mips/crypto/Kconfig b/arch/mips/crypto/Kconfig
index 6bf073ae7613..7b91f4ec65bf 100644
--- a/arch/mips/crypto/Kconfig
+++ b/arch/mips/crypto/Kconfig
@@ -12,24 +12,4 @@ config CRYPTO_MD5_OCTEON
Architecture: mips OCTEON using crypto instructions, when available
-config CRYPTO_SHA1_OCTEON
- tristate "Hash functions: SHA-1 (OCTEON)"
- depends on CPU_CAVIUM_OCTEON
- select CRYPTO_SHA1
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: mips OCTEON
-
-config CRYPTO_SHA512_OCTEON
- tristate "Hash functions: SHA-384 and SHA-512 (OCTEON)"
- depends on CPU_CAVIUM_OCTEON
- select CRYPTO_SHA512
- select CRYPTO_HASH
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: mips OCTEON using crypto instructions, when available
-
endmenu
diff --git a/arch/mips/cavium-octeon/crypto/octeon-crypto.h b/arch/mips/include/asm/octeon/crypto.h
index cb68f9e284bb..cb68f9e284bb 100644
--- a/arch/mips/cavium-octeon/crypto/octeon-crypto.h
+++ b/arch/mips/include/asm/octeon/crypto.h
diff --git a/arch/mips/include/asm/time.h b/arch/mips/include/asm/time.h
index e855a3611d92..5e7193b759f3 100644
--- a/arch/mips/include/asm/time.h
+++ b/arch/mips/include/asm/time.h
@@ -55,7 +55,7 @@ static inline int mips_clockevent_init(void)
*/
extern int init_r4k_clocksource(void);
-static inline int init_mips_clocksource(void)
+static inline __init int init_mips_clocksource(void)
{
#ifdef CONFIG_CSRC_R4K
return init_r4k_clocksource();
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index b890d64d352c..3f4c94c88124 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -935,7 +935,7 @@ int regs_query_register_offset(const char *name)
static const struct user_regset mips_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = ELF_NGREG,
.size = sizeof(unsigned int),
.align = sizeof(unsigned int),
@@ -943,7 +943,7 @@ static const struct user_regset mips_regsets[] = {
.set = gpr32_set,
},
[REGSET_DSP] = {
- .core_note_type = NT_MIPS_DSP,
+ USER_REGSET_NOTE_TYPE(MIPS_DSP),
.n = NUM_DSP_REGS + 1,
.size = sizeof(u32),
.align = sizeof(u32),
@@ -953,7 +953,7 @@ static const struct user_regset mips_regsets[] = {
},
#ifdef CONFIG_MIPS_FP_SUPPORT
[REGSET_FPR] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = ELF_NFPREG,
.size = sizeof(elf_fpreg_t),
.align = sizeof(elf_fpreg_t),
@@ -961,7 +961,7 @@ static const struct user_regset mips_regsets[] = {
.set = fpr_set,
},
[REGSET_FP_MODE] = {
- .core_note_type = NT_MIPS_FP_MODE,
+ USER_REGSET_NOTE_TYPE(MIPS_FP_MODE),
.n = 1,
.size = sizeof(int),
.align = sizeof(int),
@@ -971,7 +971,7 @@ static const struct user_regset mips_regsets[] = {
#endif
#ifdef CONFIG_CPU_HAS_MSA
[REGSET_MSA] = {
- .core_note_type = NT_MIPS_MSA,
+ USER_REGSET_NOTE_TYPE(MIPS_MSA),
.n = NUM_FPU_REGS + 1,
.size = 16,
.align = 16,
@@ -995,7 +995,7 @@ static const struct user_regset_view user_mips_view = {
static const struct user_regset mips64_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = ELF_NGREG,
.size = sizeof(unsigned long),
.align = sizeof(unsigned long),
@@ -1003,7 +1003,7 @@ static const struct user_regset mips64_regsets[] = {
.set = gpr64_set,
},
[REGSET_DSP] = {
- .core_note_type = NT_MIPS_DSP,
+ USER_REGSET_NOTE_TYPE(MIPS_DSP),
.n = NUM_DSP_REGS + 1,
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1013,7 +1013,7 @@ static const struct user_regset mips64_regsets[] = {
},
#ifdef CONFIG_MIPS_FP_SUPPORT
[REGSET_FP_MODE] = {
- .core_note_type = NT_MIPS_FP_MODE,
+ USER_REGSET_NOTE_TYPE(MIPS_FP_MODE),
.n = 1,
.size = sizeof(int),
.align = sizeof(int),
@@ -1021,7 +1021,7 @@ static const struct user_regset mips64_regsets[] = {
.set = fp_mode_set,
},
[REGSET_FPR] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = ELF_NFPREG,
.size = sizeof(elf_fpreg_t),
.align = sizeof(elf_fpreg_t),
@@ -1031,7 +1031,7 @@ static const struct user_regset mips64_regsets[] = {
#endif
#ifdef CONFIG_CPU_HAS_MSA
[REGSET_MSA] = {
- .core_note_type = NT_MIPS_MSA,
+ USER_REGSET_NOTE_TYPE(MIPS_MSA),
.n = NUM_FPU_REGS + 1,
.size = 16,
.align = 16,
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index aa70e371bb54..d824ffe9a014 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -406,3 +406,5 @@
465 n32 listxattrat sys_listxattrat
466 n32 removexattrat sys_removexattrat
467 n32 open_tree_attr sys_open_tree_attr
+468 n32 file_getattr sys_file_getattr
+469 n32 file_setattr sys_file_setattr
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 1e8c44c7b614..7a7049c2c307 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -382,3 +382,5 @@
465 n64 listxattrat sys_listxattrat
466 n64 removexattrat sys_removexattrat
467 n64 open_tree_attr sys_open_tree_attr
+468 n64 file_getattr sys_file_getattr
+469 n64 file_setattr sys_file_setattr
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 114a5a1a6230..d330274f0601 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -455,3 +455,5 @@
465 o32 listxattrat sys_listxattrat
466 o32 removexattrat sys_removexattrat
467 o32 open_tree_attr sys_open_tree_attr
+468 o32 file_getattr sys_file_getattr
+469 o32 file_setattr sys_file_setattr
diff --git a/arch/mips/lib/.gitignore b/arch/mips/lib/.gitignore
new file mode 100644
index 000000000000..647d7a922e68
--- /dev/null
+++ b/arch/mips/lib/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile
index 9d75845ef78e..5d5b993cbc2b 100644
--- a/arch/mips/lib/Makefile
+++ b/arch/mips/lib/Makefile
@@ -3,8 +3,6 @@
# Makefile for MIPS-specific library files..
#
-obj-y += crypto/
-
lib-y += bitops.o csum_partial.o delay.o memcpy.o memset.o \
mips-atomic.o strncpy_user.o \
strnlen_user.o uncached.o
@@ -16,7 +14,5 @@ lib-$(CONFIG_GENERIC_CSUM) := $(filter-out csum_partial.o, $(lib-y))
obj-$(CONFIG_CPU_GENERIC_DUMP_TLB) += dump_tlb.o
obj-$(CONFIG_CPU_R3000) += r3k_dump_tlb.o
-obj-$(CONFIG_CRC32_ARCH) += crc32-mips.o
-
# libgcc-style stuff needed in the kernel
obj-y += bswapsi.o bswapdi.o multi3.o
diff --git a/arch/mips/lib/crc32-mips.c b/arch/mips/lib/crc32-mips.c
deleted file mode 100644
index 45e4d2c9fbf5..000000000000
--- a/arch/mips/lib/crc32-mips.c
+++ /dev/null
@@ -1,183 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * crc32-mips.c - CRC32 and CRC32C using optional MIPSr6 instructions
- *
- * Module based on arm64/crypto/crc32-arm.c
- *
- * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org>
- * Copyright (C) 2018 MIPS Tech, LLC
- */
-
-#include <linux/cpufeature.h>
-#include <linux/crc32.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <asm/mipsregs.h>
-#include <linux/unaligned.h>
-
-#ifndef TOOLCHAIN_SUPPORTS_CRC
-#define _ASM_SET_CRC(OP, SZ, TYPE) \
-_ASM_MACRO_3R(OP, rt, rs, rt2, \
- ".ifnc \\rt, \\rt2\n\t" \
- ".error \"invalid operands \\\"" #OP " \\rt,\\rs,\\rt2\\\"\"\n\t" \
- ".endif\n\t" \
- _ASM_INSN_IF_MIPS(0x7c00000f | (__rt << 16) | (__rs << 21) | \
- ((SZ) << 6) | ((TYPE) << 8)) \
- _ASM_INSN32_IF_MM(0x00000030 | (__rs << 16) | (__rt << 21) | \
- ((SZ) << 14) | ((TYPE) << 3)))
-#define _ASM_UNSET_CRC(op, SZ, TYPE) ".purgem " #op "\n\t"
-#else /* !TOOLCHAIN_SUPPORTS_CRC */
-#define _ASM_SET_CRC(op, SZ, TYPE) ".set\tcrc\n\t"
-#define _ASM_UNSET_CRC(op, SZ, TYPE)
-#endif
-
-#define __CRC32(crc, value, op, SZ, TYPE) \
-do { \
- __asm__ __volatile__( \
- ".set push\n\t" \
- _ASM_SET_CRC(op, SZ, TYPE) \
- #op " %0, %1, %0\n\t" \
- _ASM_UNSET_CRC(op, SZ, TYPE) \
- ".set pop" \
- : "+r" (crc) \
- : "r" (value)); \
-} while (0)
-
-#define _CRC32_crc32b(crc, value) __CRC32(crc, value, crc32b, 0, 0)
-#define _CRC32_crc32h(crc, value) __CRC32(crc, value, crc32h, 1, 0)
-#define _CRC32_crc32w(crc, value) __CRC32(crc, value, crc32w, 2, 0)
-#define _CRC32_crc32d(crc, value) __CRC32(crc, value, crc32d, 3, 0)
-#define _CRC32_crc32cb(crc, value) __CRC32(crc, value, crc32cb, 0, 1)
-#define _CRC32_crc32ch(crc, value) __CRC32(crc, value, crc32ch, 1, 1)
-#define _CRC32_crc32cw(crc, value) __CRC32(crc, value, crc32cw, 2, 1)
-#define _CRC32_crc32cd(crc, value) __CRC32(crc, value, crc32cd, 3, 1)
-
-#define _CRC32(crc, value, size, op) \
- _CRC32_##op##size(crc, value)
-
-#define CRC32(crc, value, size) \
- _CRC32(crc, value, size, crc32)
-
-#define CRC32C(crc, value, size) \
- _CRC32(crc, value, size, crc32c)
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!static_branch_likely(&have_crc32))
- return crc32_le_base(crc, p, len);
-
- if (IS_ENABLED(CONFIG_64BIT)) {
- for (; len >= sizeof(u64); p += sizeof(u64), len -= sizeof(u64)) {
- u64 value = get_unaligned_le64(p);
-
- CRC32(crc, value, d);
- }
-
- if (len & sizeof(u32)) {
- u32 value = get_unaligned_le32(p);
-
- CRC32(crc, value, w);
- p += sizeof(u32);
- }
- } else {
- for (; len >= sizeof(u32); len -= sizeof(u32)) {
- u32 value = get_unaligned_le32(p);
-
- CRC32(crc, value, w);
- p += sizeof(u32);
- }
- }
-
- if (len & sizeof(u16)) {
- u16 value = get_unaligned_le16(p);
-
- CRC32(crc, value, h);
- p += sizeof(u16);
- }
-
- if (len & sizeof(u8)) {
- u8 value = *p++;
-
- CRC32(crc, value, b);
- }
-
- return crc;
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!static_branch_likely(&have_crc32))
- return crc32c_base(crc, p, len);
-
- if (IS_ENABLED(CONFIG_64BIT)) {
- for (; len >= sizeof(u64); p += sizeof(u64), len -= sizeof(u64)) {
- u64 value = get_unaligned_le64(p);
-
- CRC32C(crc, value, d);
- }
-
- if (len & sizeof(u32)) {
- u32 value = get_unaligned_le32(p);
-
- CRC32C(crc, value, w);
- p += sizeof(u32);
- }
- } else {
- for (; len >= sizeof(u32); len -= sizeof(u32)) {
- u32 value = get_unaligned_le32(p);
-
- CRC32C(crc, value, w);
- p += sizeof(u32);
- }
- }
-
- if (len & sizeof(u16)) {
- u16 value = get_unaligned_le16(p);
-
- CRC32C(crc, value, h);
- p += sizeof(u16);
- }
-
- if (len & sizeof(u8)) {
- u8 value = *p++;
-
- CRC32C(crc, value, b);
- }
- return crc;
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- return crc32_be_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-static int __init crc32_mips_init(void)
-{
- if (cpu_have_feature(cpu_feature(MIPS_CRC32)))
- static_branch_enable(&have_crc32);
- return 0;
-}
-subsys_initcall(crc32_mips_init);
-
-static void __exit crc32_mips_exit(void)
-{
-}
-module_exit(crc32_mips_exit);
-
-u32 crc32_optimizations(void)
-{
- if (static_key_enabled(&have_crc32))
- return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION;
- return 0;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_AUTHOR("Marcin Nowakowski <marcin.nowakowski@mips.com");
-MODULE_DESCRIPTION("CRC32 and CRC32C using optional MIPS instructions");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/mips/lib/crypto/.gitignore b/arch/mips/lib/crypto/.gitignore
deleted file mode 100644
index 0d47d4f21c6d..000000000000
--- a/arch/mips/lib/crypto/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-poly1305-core.S
diff --git a/arch/mips/lib/crypto/Kconfig b/arch/mips/lib/crypto/Kconfig
deleted file mode 100644
index 0670a170c1be..000000000000
--- a/arch/mips/lib/crypto/Kconfig
+++ /dev/null
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_CHACHA_MIPS
- tristate
- depends on CPU_MIPS32_R2
- default CRYPTO_LIB_CHACHA
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_MIPS
- tristate
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
diff --git a/arch/mips/lib/crypto/Makefile b/arch/mips/lib/crypto/Makefile
deleted file mode 100644
index 804488c7aded..000000000000
--- a/arch/mips/lib/crypto/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
-chacha-mips-y := chacha-core.o chacha-glue.o
-AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
-
-obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
-poly1305-mips-y := poly1305-core.o poly1305-glue.o
-
-perlasm-flavour-$(CONFIG_32BIT) := o32
-perlasm-flavour-$(CONFIG_64BIT) := 64
-
-quiet_cmd_perlasm = PERLASM $@
- cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
-
-$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
- $(call if_changed,perlasm)
-
-targets += poly1305-core.S
diff --git a/arch/mips/lib/crypto/chacha-core.S b/arch/mips/lib/crypto/chacha-core.S
deleted file mode 100644
index 5755f69cfe00..000000000000
--- a/arch/mips/lib/crypto/chacha-core.S
+++ /dev/null
@@ -1,497 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#define MASK_U32 0x3c
-#define CHACHA20_BLOCK_SIZE 64
-#define STACK_SIZE 32
-
-#define X0 $t0
-#define X1 $t1
-#define X2 $t2
-#define X3 $t3
-#define X4 $t4
-#define X5 $t5
-#define X6 $t6
-#define X7 $t7
-#define X8 $t8
-#define X9 $t9
-#define X10 $v1
-#define X11 $s6
-#define X12 $s5
-#define X13 $s4
-#define X14 $s3
-#define X15 $s2
-/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
-#define T0 $s1
-#define T1 $s0
-#define T(n) T ## n
-#define X(n) X ## n
-
-/* Input arguments */
-#define STATE $a0
-#define OUT $a1
-#define IN $a2
-#define BYTES $a3
-
-/* Output argument */
-/* NONCE[0] is kept in a register and not in memory.
- * We don't want to touch original value in memory.
- * Must be incremented every loop iteration.
- */
-#define NONCE_0 $v0
-
-/* SAVED_X and SAVED_CA are set in the jump table.
- * Use regs which are overwritten on exit else we don't leak clear data.
- * They are used to handling the last bytes which are not multiple of 4.
- */
-#define SAVED_X X15
-#define SAVED_CA $s7
-
-#define IS_UNALIGNED $s7
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#define MSB 0
-#define LSB 3
-#define ROTx rotl
-#define ROTR(n) rotr n, 24
-#define CPU_TO_LE32(n) \
- wsbh n; \
- rotr n, 16;
-#else
-#define MSB 3
-#define LSB 0
-#define ROTx rotr
-#define CPU_TO_LE32(n)
-#define ROTR(n)
-#endif
-
-#define FOR_EACH_WORD(x) \
- x( 0); \
- x( 1); \
- x( 2); \
- x( 3); \
- x( 4); \
- x( 5); \
- x( 6); \
- x( 7); \
- x( 8); \
- x( 9); \
- x(10); \
- x(11); \
- x(12); \
- x(13); \
- x(14); \
- x(15);
-
-#define FOR_EACH_WORD_REV(x) \
- x(15); \
- x(14); \
- x(13); \
- x(12); \
- x(11); \
- x(10); \
- x( 9); \
- x( 8); \
- x( 7); \
- x( 6); \
- x( 5); \
- x( 4); \
- x( 3); \
- x( 2); \
- x( 1); \
- x( 0);
-
-#define PLUS_ONE_0 1
-#define PLUS_ONE_1 2
-#define PLUS_ONE_2 3
-#define PLUS_ONE_3 4
-#define PLUS_ONE_4 5
-#define PLUS_ONE_5 6
-#define PLUS_ONE_6 7
-#define PLUS_ONE_7 8
-#define PLUS_ONE_8 9
-#define PLUS_ONE_9 10
-#define PLUS_ONE_10 11
-#define PLUS_ONE_11 12
-#define PLUS_ONE_12 13
-#define PLUS_ONE_13 14
-#define PLUS_ONE_14 15
-#define PLUS_ONE_15 16
-#define PLUS_ONE(x) PLUS_ONE_ ## x
-#define _CONCAT3(a,b,c) a ## b ## c
-#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
-
-#define STORE_UNALIGNED(x) \
-CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
- .if (x != 12); \
- lw T0, (x*4)(STATE); \
- .endif; \
- lwl T1, (x*4)+MSB ## (IN); \
- lwr T1, (x*4)+LSB ## (IN); \
- .if (x == 12); \
- addu X ## x, NONCE_0; \
- .else; \
- addu X ## x, T0; \
- .endif; \
- CPU_TO_LE32(X ## x); \
- xor X ## x, T1; \
- swl X ## x, (x*4)+MSB ## (OUT); \
- swr X ## x, (x*4)+LSB ## (OUT);
-
-#define STORE_ALIGNED(x) \
-CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
- .if (x != 12); \
- lw T0, (x*4)(STATE); \
- .endif; \
- lw T1, (x*4) ## (IN); \
- .if (x == 12); \
- addu X ## x, NONCE_0; \
- .else; \
- addu X ## x, T0; \
- .endif; \
- CPU_TO_LE32(X ## x); \
- xor X ## x, T1; \
- sw X ## x, (x*4) ## (OUT);
-
-/* Jump table macro.
- * Used for setup and handling the last bytes, which are not multiple of 4.
- * X15 is free to store Xn
- * Every jumptable entry must be equal in size.
- */
-#define JMPTBL_ALIGNED(x) \
-.Lchacha_mips_jmptbl_aligned_ ## x: ; \
- .set noreorder; \
- b .Lchacha_mips_xor_aligned_ ## x ## _b; \
- .if (x == 12); \
- addu SAVED_X, X ## x, NONCE_0; \
- .else; \
- addu SAVED_X, X ## x, SAVED_CA; \
- .endif; \
- .set reorder
-
-#define JMPTBL_UNALIGNED(x) \
-.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
- .set noreorder; \
- b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
- .if (x == 12); \
- addu SAVED_X, X ## x, NONCE_0; \
- .else; \
- addu SAVED_X, X ## x, SAVED_CA; \
- .endif; \
- .set reorder
-
-#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
- addu X(A), X(K); \
- addu X(B), X(L); \
- addu X(C), X(M); \
- addu X(D), X(N); \
- xor X(V), X(A); \
- xor X(W), X(B); \
- xor X(Y), X(C); \
- xor X(Z), X(D); \
- rotl X(V), S; \
- rotl X(W), S; \
- rotl X(Y), S; \
- rotl X(Z), S;
-
-.text
-.set reorder
-.set noat
-.globl chacha_crypt_arch
-.ent chacha_crypt_arch
-chacha_crypt_arch:
- .frame $sp, STACK_SIZE, $ra
-
- /* Load number of rounds */
- lw $at, 16($sp)
-
- addiu $sp, -STACK_SIZE
-
- /* Return bytes = 0. */
- beqz BYTES, .Lchacha_mips_end
-
- lw NONCE_0, 48(STATE)
-
- /* Save s0-s7 */
- sw $s0, 0($sp)
- sw $s1, 4($sp)
- sw $s2, 8($sp)
- sw $s3, 12($sp)
- sw $s4, 16($sp)
- sw $s5, 20($sp)
- sw $s6, 24($sp)
- sw $s7, 28($sp)
-
- /* Test IN or OUT is unaligned.
- * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
- */
- or IS_UNALIGNED, IN, OUT
- andi IS_UNALIGNED, 0x3
-
- b .Lchacha_rounds_start
-
-.align 4
-.Loop_chacha_rounds:
- addiu IN, CHACHA20_BLOCK_SIZE
- addiu OUT, CHACHA20_BLOCK_SIZE
- addiu NONCE_0, 1
-
-.Lchacha_rounds_start:
- lw X0, 0(STATE)
- lw X1, 4(STATE)
- lw X2, 8(STATE)
- lw X3, 12(STATE)
-
- lw X4, 16(STATE)
- lw X5, 20(STATE)
- lw X6, 24(STATE)
- lw X7, 28(STATE)
- lw X8, 32(STATE)
- lw X9, 36(STATE)
- lw X10, 40(STATE)
- lw X11, 44(STATE)
-
- move X12, NONCE_0
- lw X13, 52(STATE)
- lw X14, 56(STATE)
- lw X15, 60(STATE)
-
-.Loop_chacha_xor_rounds:
- addiu $at, -2
- AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
- AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
- AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
- AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
- AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
- AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
- AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
- AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
- bnez $at, .Loop_chacha_xor_rounds
-
- addiu BYTES, -(CHACHA20_BLOCK_SIZE)
-
- /* Is data src/dst unaligned? Jump */
- bnez IS_UNALIGNED, .Loop_chacha_unaligned
-
- /* Set number rounds here to fill delayslot. */
- lw $at, (STACK_SIZE+16)($sp)
-
- /* BYTES < 0, it has no full block. */
- bltz BYTES, .Lchacha_mips_no_full_block_aligned
-
- FOR_EACH_WORD_REV(STORE_ALIGNED)
-
- /* BYTES > 0? Loop again. */
- bgtz BYTES, .Loop_chacha_rounds
-
- /* Place this here to fill delay slot */
- addiu NONCE_0, 1
-
- /* BYTES < 0? Handle last bytes */
- bltz BYTES, .Lchacha_mips_xor_bytes
-
-.Lchacha_mips_xor_done:
- /* Restore used registers */
- lw $s0, 0($sp)
- lw $s1, 4($sp)
- lw $s2, 8($sp)
- lw $s3, 12($sp)
- lw $s4, 16($sp)
- lw $s5, 20($sp)
- lw $s6, 24($sp)
- lw $s7, 28($sp)
-
- /* Write NONCE_0 back to right location in state */
- sw NONCE_0, 48(STATE)
-
-.Lchacha_mips_end:
- addiu $sp, STACK_SIZE
- jr $ra
-
-.Lchacha_mips_no_full_block_aligned:
- /* Restore the offset on BYTES */
- addiu BYTES, CHACHA20_BLOCK_SIZE
-
- /* Get number of full WORDS */
- andi $at, BYTES, MASK_U32
-
- /* Load upper half of jump table addr */
- lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
-
- /* Calculate lower half jump table offset */
- ins T0, $at, 1, 6
-
- /* Add offset to STATE */
- addu T1, STATE, $at
-
- /* Add lower half jump table addr */
- addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
-
- /* Read value from STATE */
- lw SAVED_CA, 0(T1)
-
- /* Store remaining bytecounter as negative value */
- subu BYTES, $at, BYTES
-
- jr T0
-
- /* Jump table */
- FOR_EACH_WORD(JMPTBL_ALIGNED)
-
-
-.Loop_chacha_unaligned:
- /* Set number rounds here to fill delayslot. */
- lw $at, (STACK_SIZE+16)($sp)
-
- /* BYTES > 0, it has no full block. */
- bltz BYTES, .Lchacha_mips_no_full_block_unaligned
-
- FOR_EACH_WORD_REV(STORE_UNALIGNED)
-
- /* BYTES > 0? Loop again. */
- bgtz BYTES, .Loop_chacha_rounds
-
- /* Write NONCE_0 back to right location in state */
- sw NONCE_0, 48(STATE)
-
- .set noreorder
- /* Fall through to byte handling */
- bgez BYTES, .Lchacha_mips_xor_done
-.Lchacha_mips_xor_unaligned_0_b:
-.Lchacha_mips_xor_aligned_0_b:
- /* Place this here to fill delay slot */
- addiu NONCE_0, 1
- .set reorder
-
-.Lchacha_mips_xor_bytes:
- addu IN, $at
- addu OUT, $at
- /* First byte */
- lbu T1, 0(IN)
- addiu $at, BYTES, 1
- CPU_TO_LE32(SAVED_X)
- ROTR(SAVED_X)
- xor T1, SAVED_X
- sb T1, 0(OUT)
- beqz $at, .Lchacha_mips_xor_done
- /* Second byte */
- lbu T1, 1(IN)
- addiu $at, BYTES, 2
- ROTx SAVED_X, 8
- xor T1, SAVED_X
- sb T1, 1(OUT)
- beqz $at, .Lchacha_mips_xor_done
- /* Third byte */
- lbu T1, 2(IN)
- ROTx SAVED_X, 8
- xor T1, SAVED_X
- sb T1, 2(OUT)
- b .Lchacha_mips_xor_done
-
-.Lchacha_mips_no_full_block_unaligned:
- /* Restore the offset on BYTES */
- addiu BYTES, CHACHA20_BLOCK_SIZE
-
- /* Get number of full WORDS */
- andi $at, BYTES, MASK_U32
-
- /* Load upper half of jump table addr */
- lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
-
- /* Calculate lower half jump table offset */
- ins T0, $at, 1, 6
-
- /* Add offset to STATE */
- addu T1, STATE, $at
-
- /* Add lower half jump table addr */
- addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
-
- /* Read value from STATE */
- lw SAVED_CA, 0(T1)
-
- /* Store remaining bytecounter as negative value */
- subu BYTES, $at, BYTES
-
- jr T0
-
- /* Jump table */
- FOR_EACH_WORD(JMPTBL_UNALIGNED)
-.end chacha_crypt_arch
-.set at
-
-/* Input arguments
- * STATE $a0
- * OUT $a1
- * NROUND $a2
- */
-
-#undef X12
-#undef X13
-#undef X14
-#undef X15
-
-#define X12 $a3
-#define X13 $at
-#define X14 $v0
-#define X15 STATE
-
-.set noat
-.globl hchacha_block_arch
-.ent hchacha_block_arch
-hchacha_block_arch:
- .frame $sp, STACK_SIZE, $ra
-
- addiu $sp, -STACK_SIZE
-
- /* Save X11(s6) */
- sw X11, 0($sp)
-
- lw X0, 0(STATE)
- lw X1, 4(STATE)
- lw X2, 8(STATE)
- lw X3, 12(STATE)
- lw X4, 16(STATE)
- lw X5, 20(STATE)
- lw X6, 24(STATE)
- lw X7, 28(STATE)
- lw X8, 32(STATE)
- lw X9, 36(STATE)
- lw X10, 40(STATE)
- lw X11, 44(STATE)
- lw X12, 48(STATE)
- lw X13, 52(STATE)
- lw X14, 56(STATE)
- lw X15, 60(STATE)
-
-.Loop_hchacha_xor_rounds:
- addiu $a2, -2
- AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
- AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
- AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
- AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
- AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
- AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
- AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
- AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
- bnez $a2, .Loop_hchacha_xor_rounds
-
- /* Restore used register */
- lw X11, 0($sp)
-
- sw X0, 0(OUT)
- sw X1, 4(OUT)
- sw X2, 8(OUT)
- sw X3, 12(OUT)
- sw X12, 16(OUT)
- sw X13, 20(OUT)
- sw X14, 24(OUT)
- sw X15, 28(OUT)
-
- addiu $sp, STACK_SIZE
- jr $ra
-.end hchacha_block_arch
-.set at
diff --git a/arch/mips/lib/crypto/chacha-glue.c b/arch/mips/lib/crypto/chacha-glue.c
deleted file mode 100644
index 88c097594eb0..000000000000
--- a/arch/mips/lib/crypto/chacha-glue.c
+++ /dev/null
@@ -1,29 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ChaCha and HChaCha functions (MIPS optimized)
- *
- * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <crypto/chacha.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void chacha_crypt_arch(struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds);
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-asmlinkage void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
-EXPORT_SYMBOL(hchacha_block_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- return true;
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/mips/lib/crypto/poly1305-glue.c b/arch/mips/lib/crypto/poly1305-glue.c
deleted file mode 100644
index 764a38a65200..000000000000
--- a/arch/mips/lib/crypto/poly1305-glue.c
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <crypto/internal/poly1305.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-asmlinkage void poly1305_block_init_arch(
- struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks_arch(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
- u8 digest[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
- return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/mips/lib/crypto/poly1305-mips.pl b/arch/mips/lib/crypto/poly1305-mips.pl
deleted file mode 100644
index 399f10c3e385..000000000000
--- a/arch/mips/lib/crypto/poly1305-mips.pl
+++ /dev/null
@@ -1,1273 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
-#
-# ====================================================================
-# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
-# project.
-# ====================================================================
-
-# Poly1305 hash for MIPS.
-#
-# May 2016
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone.
-#
-# IALU/gcc
-# R1x000 ~5.5/+130% (big-endian)
-# Octeon II 2.50/+70% (little-endian)
-#
-# March 2019
-#
-# Add 32-bit code path.
-#
-# October 2019
-#
-# Modulo-scheduling reduction allows to omit dependency chain at the
-# end of inner loop and improve performance. Also optimize MIPS32R2
-# code path for MIPS 1004K core. Per René von Dorst's suggestions.
-#
-# IALU/gcc
-# R1x000 ~9.8/? (big-endian)
-# Octeon II 3.65/+140% (little-endian)
-# MT7621/1004K 4.75/? (little-endian)
-#
-######################################################################
-# There is a number of MIPS ABI in use, O32 and N32/64 are most
-# widely used. Then there is a new contender: NUBI. It appears that if
-# one picks the latter, it's possible to arrange code in ABI neutral
-# manner. Therefore let's stick to NUBI register layout:
-#
-($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
-($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
-($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
-#
-# The return value is placed in $a0. Following coding rules facilitate
-# interoperability:
-#
-# - never ever touch $tp, "thread pointer", former $gp [o32 can be
-# excluded from the rule, because it's specified volatile];
-# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
-# old code];
-# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
-#
-# For reference here is register layout for N32/64 MIPS ABIs:
-#
-# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
-# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
-# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
-# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
-#
-# <appro@openssl.org>
-#
-######################################################################
-
-$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
-
-$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
-
-if ($flavour =~ /64|n32/i) {{{
-######################################################################
-# 64-bit code path
-#
-
-my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
-my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
-
-$code.=<<___;
-#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
- defined(_MIPS_ARCH_MIPS64R6)) \\
- && !defined(_MIPS_ARCH_MIPS64R2)
-# define _MIPS_ARCH_MIPS64R2
-#endif
-
-#if defined(_MIPS_ARCH_MIPS64R6)
-# define dmultu(rs,rt)
-# define mflo(rd,rs,rt) dmulu rd,rs,rt
-# define mfhi(rd,rs,rt) dmuhu rd,rs,rt
-#else
-# define dmultu(rs,rt) dmultu rs,rt
-# define mflo(rd,rs,rt) mflo rd
-# define mfhi(rd,rs,rt) mfhi rd
-#endif
-
-#ifdef __KERNEL__
-# define poly1305_init poly1305_block_init_arch
-# define poly1305_blocks poly1305_blocks_arch
-# define poly1305_emit poly1305_emit_arch
-#endif
-
-#if defined(__MIPSEB__) && !defined(MIPSEB)
-# define MIPSEB
-#endif
-
-#ifdef MIPSEB
-# define MSB 0
-# define LSB 7
-#else
-# define MSB 7
-# define LSB 0
-#endif
-
-.text
-.set noat
-.set noreorder
-
-.align 5
-.globl poly1305_init
-.ent poly1305_init
-poly1305_init:
- .frame $sp,0,$ra
- .set reorder
-
- sd $zero,0($ctx)
- sd $zero,8($ctx)
- sd $zero,16($ctx)
-
- beqz $inp,.Lno_key
-
-#if defined(_MIPS_ARCH_MIPS64R6)
- andi $tmp0,$inp,7 # $inp % 8
- dsubu $inp,$inp,$tmp0 # align $inp
- sll $tmp0,$tmp0,3 # byte to bit offset
- ld $in0,0($inp)
- ld $in1,8($inp)
- beqz $tmp0,.Laligned_key
- ld $tmp2,16($inp)
-
- subu $tmp1,$zero,$tmp0
-# ifdef MIPSEB
- dsllv $in0,$in0,$tmp0
- dsrlv $tmp3,$in1,$tmp1
- dsllv $in1,$in1,$tmp0
- dsrlv $tmp2,$tmp2,$tmp1
-# else
- dsrlv $in0,$in0,$tmp0
- dsllv $tmp3,$in1,$tmp1
- dsrlv $in1,$in1,$tmp0
- dsllv $tmp2,$tmp2,$tmp1
-# endif
- or $in0,$in0,$tmp3
- or $in1,$in1,$tmp2
-.Laligned_key:
-#else
- ldl $in0,0+MSB($inp)
- ldl $in1,8+MSB($inp)
- ldr $in0,0+LSB($inp)
- ldr $in1,8+LSB($inp)
-#endif
-#ifdef MIPSEB
-# if defined(_MIPS_ARCH_MIPS64R2)
- dsbh $in0,$in0 # byte swap
- dsbh $in1,$in1
- dshd $in0,$in0
- dshd $in1,$in1
-# else
- ori $tmp0,$zero,0xFF
- dsll $tmp2,$tmp0,32
- or $tmp0,$tmp2 # 0x000000FF000000FF
-
- and $tmp1,$in0,$tmp0 # byte swap
- and $tmp3,$in1,$tmp0
- dsrl $tmp2,$in0,24
- dsrl $tmp4,$in1,24
- dsll $tmp1,24
- dsll $tmp3,24
- and $tmp2,$tmp0
- and $tmp4,$tmp0
- dsll $tmp0,8 # 0x0000FF000000FF00
- or $tmp1,$tmp2
- or $tmp3,$tmp4
- and $tmp2,$in0,$tmp0
- and $tmp4,$in1,$tmp0
- dsrl $in0,8
- dsrl $in1,8
- dsll $tmp2,8
- dsll $tmp4,8
- and $in0,$tmp0
- and $in1,$tmp0
- or $tmp1,$tmp2
- or $tmp3,$tmp4
- or $in0,$tmp1
- or $in1,$tmp3
- dsrl $tmp1,$in0,32
- dsrl $tmp3,$in1,32
- dsll $in0,32
- dsll $in1,32
- or $in0,$tmp1
- or $in1,$tmp3
-# endif
-#endif
- li $tmp0,1
- dsll $tmp0,32 # 0x0000000100000000
- daddiu $tmp0,-63 # 0x00000000ffffffc1
- dsll $tmp0,28 # 0x0ffffffc10000000
- daddiu $tmp0,-1 # 0x0ffffffc0fffffff
-
- and $in0,$tmp0
- daddiu $tmp0,-3 # 0x0ffffffc0ffffffc
- and $in1,$tmp0
-
- sd $in0,24($ctx)
- dsrl $tmp0,$in1,2
- sd $in1,32($ctx)
- daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
- sd $tmp0,40($ctx)
-
-.Lno_key:
- li $v0,0 # return 0
- jr $ra
-.end poly1305_init
-___
-{
-my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
-
-my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
- ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
-my ($shr,$shl) = ($s6,$s7); # used on R6
-
-$code.=<<___;
-.align 5
-.globl poly1305_blocks
-.ent poly1305_blocks
-poly1305_blocks:
- .set noreorder
- dsrl $len,4 # number of complete blocks
- bnez $len,poly1305_blocks_internal
- nop
- jr $ra
- nop
-.end poly1305_blocks
-
-.align 5
-.ent poly1305_blocks_internal
-poly1305_blocks_internal:
- .set noreorder
-#if defined(_MIPS_ARCH_MIPS64R6)
- .frame $sp,8*8,$ra
- .mask $SAVED_REGS_MASK|0x000c0000,-8
- dsubu $sp,8*8
- sd $s7,56($sp)
- sd $s6,48($sp)
-#else
- .frame $sp,6*8,$ra
- .mask $SAVED_REGS_MASK,-8
- dsubu $sp,6*8
-#endif
- sd $s5,40($sp)
- sd $s4,32($sp)
-___
-$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
- sd $s3,24($sp)
- sd $s2,16($sp)
- sd $s1,8($sp)
- sd $s0,0($sp)
-___
-$code.=<<___;
- .set reorder
-
-#if defined(_MIPS_ARCH_MIPS64R6)
- andi $shr,$inp,7
- dsubu $inp,$inp,$shr # align $inp
- sll $shr,$shr,3 # byte to bit offset
- subu $shl,$zero,$shr
-#endif
-
- ld $h0,0($ctx) # load hash value
- ld $h1,8($ctx)
- ld $h2,16($ctx)
-
- ld $r0,24($ctx) # load key
- ld $r1,32($ctx)
- ld $rs1,40($ctx)
-
- dsll $len,4
- daddu $len,$inp # end of buffer
- b .Loop
-
-.align 4
-.Loop:
-#if defined(_MIPS_ARCH_MIPS64R6)
- ld $in0,0($inp) # load input
- ld $in1,8($inp)
- beqz $shr,.Laligned_inp
-
- ld $tmp2,16($inp)
-# ifdef MIPSEB
- dsllv $in0,$in0,$shr
- dsrlv $tmp3,$in1,$shl
- dsllv $in1,$in1,$shr
- dsrlv $tmp2,$tmp2,$shl
-# else
- dsrlv $in0,$in0,$shr
- dsllv $tmp3,$in1,$shl
- dsrlv $in1,$in1,$shr
- dsllv $tmp2,$tmp2,$shl
-# endif
- or $in0,$in0,$tmp3
- or $in1,$in1,$tmp2
-.Laligned_inp:
-#else
- ldl $in0,0+MSB($inp) # load input
- ldl $in1,8+MSB($inp)
- ldr $in0,0+LSB($inp)
- ldr $in1,8+LSB($inp)
-#endif
- daddiu $inp,16
-#ifdef MIPSEB
-# if defined(_MIPS_ARCH_MIPS64R2)
- dsbh $in0,$in0 # byte swap
- dsbh $in1,$in1
- dshd $in0,$in0
- dshd $in1,$in1
-# else
- ori $tmp0,$zero,0xFF
- dsll $tmp2,$tmp0,32
- or $tmp0,$tmp2 # 0x000000FF000000FF
-
- and $tmp1,$in0,$tmp0 # byte swap
- and $tmp3,$in1,$tmp0
- dsrl $tmp2,$in0,24
- dsrl $tmp4,$in1,24
- dsll $tmp1,24
- dsll $tmp3,24
- and $tmp2,$tmp0
- and $tmp4,$tmp0
- dsll $tmp0,8 # 0x0000FF000000FF00
- or $tmp1,$tmp2
- or $tmp3,$tmp4
- and $tmp2,$in0,$tmp0
- and $tmp4,$in1,$tmp0
- dsrl $in0,8
- dsrl $in1,8
- dsll $tmp2,8
- dsll $tmp4,8
- and $in0,$tmp0
- and $in1,$tmp0
- or $tmp1,$tmp2
- or $tmp3,$tmp4
- or $in0,$tmp1
- or $in1,$tmp3
- dsrl $tmp1,$in0,32
- dsrl $tmp3,$in1,32
- dsll $in0,32
- dsll $in1,32
- or $in0,$tmp1
- or $in1,$tmp3
-# endif
-#endif
- dsrl $tmp1,$h2,2 # modulo-scheduled reduction
- andi $h2,$h2,3
- dsll $tmp0,$tmp1,2
-
- daddu $d0,$h0,$in0 # accumulate input
- daddu $tmp1,$tmp0
- sltu $tmp0,$d0,$h0
- daddu $d0,$d0,$tmp1 # ... and residue
- sltu $tmp1,$d0,$tmp1
- daddu $d1,$h1,$in1
- daddu $tmp0,$tmp1
- sltu $tmp1,$d1,$h1
- daddu $d1,$tmp0
-
- dmultu ($r0,$d0) # h0*r0
- daddu $d2,$h2,$padbit
- sltu $tmp0,$d1,$tmp0
- mflo ($h0,$r0,$d0)
- mfhi ($h1,$r0,$d0)
-
- dmultu ($rs1,$d1) # h1*5*r1
- daddu $d2,$tmp1
- daddu $d2,$tmp0
- mflo ($tmp0,$rs1,$d1)
- mfhi ($tmp1,$rs1,$d1)
-
- dmultu ($r1,$d0) # h0*r1
- mflo ($tmp2,$r1,$d0)
- mfhi ($h2,$r1,$d0)
- daddu $h0,$tmp0
- daddu $h1,$tmp1
- sltu $tmp0,$h0,$tmp0
-
- dmultu ($r0,$d1) # h1*r0
- daddu $h1,$tmp0
- daddu $h1,$tmp2
- mflo ($tmp0,$r0,$d1)
- mfhi ($tmp1,$r0,$d1)
-
- dmultu ($rs1,$d2) # h2*5*r1
- sltu $tmp2,$h1,$tmp2
- daddu $h2,$tmp2
- mflo ($tmp2,$rs1,$d2)
-
- dmultu ($r0,$d2) # h2*r0
- daddu $h1,$tmp0
- daddu $h2,$tmp1
- mflo ($tmp3,$r0,$d2)
- sltu $tmp0,$h1,$tmp0
- daddu $h2,$tmp0
-
- daddu $h1,$tmp2
- sltu $tmp2,$h1,$tmp2
- daddu $h2,$tmp2
- daddu $h2,$tmp3
-
- bne $inp,$len,.Loop
-
- sd $h0,0($ctx) # store hash value
- sd $h1,8($ctx)
- sd $h2,16($ctx)
-
- .set noreorder
-#if defined(_MIPS_ARCH_MIPS64R6)
- ld $s7,56($sp)
- ld $s6,48($sp)
-#endif
- ld $s5,40($sp) # epilogue
- ld $s4,32($sp)
-___
-$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
- ld $s3,24($sp)
- ld $s2,16($sp)
- ld $s1,8($sp)
- ld $s0,0($sp)
-___
-$code.=<<___;
- jr $ra
-#if defined(_MIPS_ARCH_MIPS64R6)
- daddu $sp,8*8
-#else
- daddu $sp,6*8
-#endif
-.end poly1305_blocks_internal
-___
-}
-{
-my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
-
-$code.=<<___;
-.align 5
-.globl poly1305_emit
-.ent poly1305_emit
-poly1305_emit:
- .frame $sp,0,$ra
- .set reorder
-
- ld $tmp2,16($ctx)
- ld $tmp0,0($ctx)
- ld $tmp1,8($ctx)
-
- li $in0,-4 # final reduction
- dsrl $in1,$tmp2,2
- and $in0,$tmp2
- andi $tmp2,$tmp2,3
- daddu $in0,$in1
-
- daddu $tmp0,$tmp0,$in0
- sltu $in1,$tmp0,$in0
- daddiu $in0,$tmp0,5 # compare to modulus
- daddu $tmp1,$tmp1,$in1
- sltiu $tmp3,$in0,5
- sltu $tmp4,$tmp1,$in1
- daddu $in1,$tmp1,$tmp3
- daddu $tmp2,$tmp2,$tmp4
- sltu $tmp3,$in1,$tmp3
- daddu $tmp2,$tmp2,$tmp3
-
- dsrl $tmp2,2 # see if it carried/borrowed
- dsubu $tmp2,$zero,$tmp2
-
- xor $in0,$tmp0
- xor $in1,$tmp1
- and $in0,$tmp2
- and $in1,$tmp2
- xor $in0,$tmp0
- xor $in1,$tmp1
-
- lwu $tmp0,0($nonce) # load nonce
- lwu $tmp1,4($nonce)
- lwu $tmp2,8($nonce)
- lwu $tmp3,12($nonce)
- dsll $tmp1,32
- dsll $tmp3,32
- or $tmp0,$tmp1
- or $tmp2,$tmp3
-
- daddu $in0,$tmp0 # accumulate nonce
- daddu $in1,$tmp2
- sltu $tmp0,$in0,$tmp0
- daddu $in1,$tmp0
-
- dsrl $tmp0,$in0,8 # write mac value
- dsrl $tmp1,$in0,16
- dsrl $tmp2,$in0,24
- sb $in0,0($mac)
- dsrl $tmp3,$in0,32
- sb $tmp0,1($mac)
- dsrl $tmp0,$in0,40
- sb $tmp1,2($mac)
- dsrl $tmp1,$in0,48
- sb $tmp2,3($mac)
- dsrl $tmp2,$in0,56
- sb $tmp3,4($mac)
- dsrl $tmp3,$in1,8
- sb $tmp0,5($mac)
- dsrl $tmp0,$in1,16
- sb $tmp1,6($mac)
- dsrl $tmp1,$in1,24
- sb $tmp2,7($mac)
-
- sb $in1,8($mac)
- dsrl $tmp2,$in1,32
- sb $tmp3,9($mac)
- dsrl $tmp3,$in1,40
- sb $tmp0,10($mac)
- dsrl $tmp0,$in1,48
- sb $tmp1,11($mac)
- dsrl $tmp1,$in1,56
- sb $tmp2,12($mac)
- sb $tmp3,13($mac)
- sb $tmp0,14($mac)
- sb $tmp1,15($mac)
-
- jr $ra
-.end poly1305_emit
-.rdata
-.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
-.align 2
-___
-}
-}}} else {{{
-######################################################################
-# 32-bit code path
-#
-
-my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
-my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
- ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
-
-$code.=<<___;
-#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
- defined(_MIPS_ARCH_MIPS32R6)) \\
- && !defined(_MIPS_ARCH_MIPS32R2)
-# define _MIPS_ARCH_MIPS32R2
-#endif
-
-#if defined(_MIPS_ARCH_MIPS32R6)
-# define multu(rs,rt)
-# define mflo(rd,rs,rt) mulu rd,rs,rt
-# define mfhi(rd,rs,rt) muhu rd,rs,rt
-#else
-# define multu(rs,rt) multu rs,rt
-# define mflo(rd,rs,rt) mflo rd
-# define mfhi(rd,rs,rt) mfhi rd
-#endif
-
-#ifdef __KERNEL__
-# define poly1305_init poly1305_block_init_arch
-# define poly1305_blocks poly1305_blocks_arch
-# define poly1305_emit poly1305_emit_arch
-#endif
-
-#if defined(__MIPSEB__) && !defined(MIPSEB)
-# define MIPSEB
-#endif
-
-#ifdef MIPSEB
-# define MSB 0
-# define LSB 3
-#else
-# define MSB 3
-# define LSB 0
-#endif
-
-.text
-.set noat
-.set noreorder
-
-.align 5
-.globl poly1305_init
-.ent poly1305_init
-poly1305_init:
- .frame $sp,0,$ra
- .set reorder
-
- sw $zero,0($ctx)
- sw $zero,4($ctx)
- sw $zero,8($ctx)
- sw $zero,12($ctx)
- sw $zero,16($ctx)
-
- beqz $inp,.Lno_key
-
-#if defined(_MIPS_ARCH_MIPS32R6)
- andi $tmp0,$inp,3 # $inp % 4
- subu $inp,$inp,$tmp0 # align $inp
- sll $tmp0,$tmp0,3 # byte to bit offset
- lw $in0,0($inp)
- lw $in1,4($inp)
- lw $in2,8($inp)
- lw $in3,12($inp)
- beqz $tmp0,.Laligned_key
-
- lw $tmp2,16($inp)
- subu $tmp1,$zero,$tmp0
-# ifdef MIPSEB
- sllv $in0,$in0,$tmp0
- srlv $tmp3,$in1,$tmp1
- sllv $in1,$in1,$tmp0
- or $in0,$in0,$tmp3
- srlv $tmp3,$in2,$tmp1
- sllv $in2,$in2,$tmp0
- or $in1,$in1,$tmp3
- srlv $tmp3,$in3,$tmp1
- sllv $in3,$in3,$tmp0
- or $in2,$in2,$tmp3
- srlv $tmp2,$tmp2,$tmp1
- or $in3,$in3,$tmp2
-# else
- srlv $in0,$in0,$tmp0
- sllv $tmp3,$in1,$tmp1
- srlv $in1,$in1,$tmp0
- or $in0,$in0,$tmp3
- sllv $tmp3,$in2,$tmp1
- srlv $in2,$in2,$tmp0
- or $in1,$in1,$tmp3
- sllv $tmp3,$in3,$tmp1
- srlv $in3,$in3,$tmp0
- or $in2,$in2,$tmp3
- sllv $tmp2,$tmp2,$tmp1
- or $in3,$in3,$tmp2
-# endif
-.Laligned_key:
-#else
- lwl $in0,0+MSB($inp)
- lwl $in1,4+MSB($inp)
- lwl $in2,8+MSB($inp)
- lwl $in3,12+MSB($inp)
- lwr $in0,0+LSB($inp)
- lwr $in1,4+LSB($inp)
- lwr $in2,8+LSB($inp)
- lwr $in3,12+LSB($inp)
-#endif
-#ifdef MIPSEB
-# if defined(_MIPS_ARCH_MIPS32R2)
- wsbh $in0,$in0 # byte swap
- wsbh $in1,$in1
- wsbh $in2,$in2
- wsbh $in3,$in3
- rotr $in0,$in0,16
- rotr $in1,$in1,16
- rotr $in2,$in2,16
- rotr $in3,$in3,16
-# else
- srl $tmp0,$in0,24 # byte swap
- srl $tmp1,$in0,8
- andi $tmp2,$in0,0xFF00
- sll $in0,$in0,24
- andi $tmp1,0xFF00
- sll $tmp2,$tmp2,8
- or $in0,$tmp0
- srl $tmp0,$in1,24
- or $tmp1,$tmp2
- srl $tmp2,$in1,8
- or $in0,$tmp1
- andi $tmp1,$in1,0xFF00
- sll $in1,$in1,24
- andi $tmp2,0xFF00
- sll $tmp1,$tmp1,8
- or $in1,$tmp0
- srl $tmp0,$in2,24
- or $tmp2,$tmp1
- srl $tmp1,$in2,8
- or $in1,$tmp2
- andi $tmp2,$in2,0xFF00
- sll $in2,$in2,24
- andi $tmp1,0xFF00
- sll $tmp2,$tmp2,8
- or $in2,$tmp0
- srl $tmp0,$in3,24
- or $tmp1,$tmp2
- srl $tmp2,$in3,8
- or $in2,$tmp1
- andi $tmp1,$in3,0xFF00
- sll $in3,$in3,24
- andi $tmp2,0xFF00
- sll $tmp1,$tmp1,8
- or $in3,$tmp0
- or $tmp2,$tmp1
- or $in3,$tmp2
-# endif
-#endif
- lui $tmp0,0x0fff
- ori $tmp0,0xffff # 0x0fffffff
- and $in0,$in0,$tmp0
- subu $tmp0,3 # 0x0ffffffc
- and $in1,$in1,$tmp0
- and $in2,$in2,$tmp0
- and $in3,$in3,$tmp0
-
- sw $in0,20($ctx)
- sw $in1,24($ctx)
- sw $in2,28($ctx)
- sw $in3,32($ctx)
-
- srl $tmp1,$in1,2
- srl $tmp2,$in2,2
- srl $tmp3,$in3,2
- addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
- addu $in2,$in2,$tmp2
- addu $in3,$in3,$tmp3
- sw $in1,36($ctx)
- sw $in2,40($ctx)
- sw $in3,44($ctx)
-.Lno_key:
- li $v0,0
- jr $ra
-.end poly1305_init
-___
-{
-my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
-
-my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
- ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
-my ($d0,$d1,$d2,$d3) =
- ($a4,$a5,$a6,$a7);
-my $shr = $t2; # used on R6
-my $one = $t2; # used on R2
-
-$code.=<<___;
-.globl poly1305_blocks
-.align 5
-.ent poly1305_blocks
-poly1305_blocks:
- .frame $sp,16*4,$ra
- .mask $SAVED_REGS_MASK,-4
- .set noreorder
- subu $sp, $sp,4*12
- sw $s11,4*11($sp)
- sw $s10,4*10($sp)
- sw $s9, 4*9($sp)
- sw $s8, 4*8($sp)
- sw $s7, 4*7($sp)
- sw $s6, 4*6($sp)
- sw $s5, 4*5($sp)
- sw $s4, 4*4($sp)
-___
-$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
- sw $s3, 4*3($sp)
- sw $s2, 4*2($sp)
- sw $s1, 4*1($sp)
- sw $s0, 4*0($sp)
-___
-$code.=<<___;
- .set reorder
-
- srl $len,4 # number of complete blocks
- li $one,1
- beqz $len,.Labort
-
-#if defined(_MIPS_ARCH_MIPS32R6)
- andi $shr,$inp,3
- subu $inp,$inp,$shr # align $inp
- sll $shr,$shr,3 # byte to bit offset
-#endif
-
- lw $h0,0($ctx) # load hash value
- lw $h1,4($ctx)
- lw $h2,8($ctx)
- lw $h3,12($ctx)
- lw $h4,16($ctx)
-
- lw $r0,20($ctx) # load key
- lw $r1,24($ctx)
- lw $r2,28($ctx)
- lw $r3,32($ctx)
- lw $rs1,36($ctx)
- lw $rs2,40($ctx)
- lw $rs3,44($ctx)
-
- sll $len,4
- addu $len,$len,$inp # end of buffer
- b .Loop
-
-.align 4
-.Loop:
-#if defined(_MIPS_ARCH_MIPS32R6)
- lw $d0,0($inp) # load input
- lw $d1,4($inp)
- lw $d2,8($inp)
- lw $d3,12($inp)
- beqz $shr,.Laligned_inp
-
- lw $t0,16($inp)
- subu $t1,$zero,$shr
-# ifdef MIPSEB
- sllv $d0,$d0,$shr
- srlv $at,$d1,$t1
- sllv $d1,$d1,$shr
- or $d0,$d0,$at
- srlv $at,$d2,$t1
- sllv $d2,$d2,$shr
- or $d1,$d1,$at
- srlv $at,$d3,$t1
- sllv $d3,$d3,$shr
- or $d2,$d2,$at
- srlv $t0,$t0,$t1
- or $d3,$d3,$t0
-# else
- srlv $d0,$d0,$shr
- sllv $at,$d1,$t1
- srlv $d1,$d1,$shr
- or $d0,$d0,$at
- sllv $at,$d2,$t1
- srlv $d2,$d2,$shr
- or $d1,$d1,$at
- sllv $at,$d3,$t1
- srlv $d3,$d3,$shr
- or $d2,$d2,$at
- sllv $t0,$t0,$t1
- or $d3,$d3,$t0
-# endif
-.Laligned_inp:
-#else
- lwl $d0,0+MSB($inp) # load input
- lwl $d1,4+MSB($inp)
- lwl $d2,8+MSB($inp)
- lwl $d3,12+MSB($inp)
- lwr $d0,0+LSB($inp)
- lwr $d1,4+LSB($inp)
- lwr $d2,8+LSB($inp)
- lwr $d3,12+LSB($inp)
-#endif
-#ifdef MIPSEB
-# if defined(_MIPS_ARCH_MIPS32R2)
- wsbh $d0,$d0 # byte swap
- wsbh $d1,$d1
- wsbh $d2,$d2
- wsbh $d3,$d3
- rotr $d0,$d0,16
- rotr $d1,$d1,16
- rotr $d2,$d2,16
- rotr $d3,$d3,16
-# else
- srl $at,$d0,24 # byte swap
- srl $t0,$d0,8
- andi $t1,$d0,0xFF00
- sll $d0,$d0,24
- andi $t0,0xFF00
- sll $t1,$t1,8
- or $d0,$at
- srl $at,$d1,24
- or $t0,$t1
- srl $t1,$d1,8
- or $d0,$t0
- andi $t0,$d1,0xFF00
- sll $d1,$d1,24
- andi $t1,0xFF00
- sll $t0,$t0,8
- or $d1,$at
- srl $at,$d2,24
- or $t1,$t0
- srl $t0,$d2,8
- or $d1,$t1
- andi $t1,$d2,0xFF00
- sll $d2,$d2,24
- andi $t0,0xFF00
- sll $t1,$t1,8
- or $d2,$at
- srl $at,$d3,24
- or $t0,$t1
- srl $t1,$d3,8
- or $d2,$t0
- andi $t0,$d3,0xFF00
- sll $d3,$d3,24
- andi $t1,0xFF00
- sll $t0,$t0,8
- or $d3,$at
- or $t1,$t0
- or $d3,$t1
-# endif
-#endif
- srl $t0,$h4,2 # modulo-scheduled reduction
- andi $h4,$h4,3
- sll $at,$t0,2
-
- addu $d0,$d0,$h0 # accumulate input
- addu $t0,$t0,$at
- sltu $h0,$d0,$h0
- addu $d0,$d0,$t0 # ... and residue
- sltu $at,$d0,$t0
-
- addu $d1,$d1,$h1
- addu $h0,$h0,$at # carry
- sltu $h1,$d1,$h1
- addu $d1,$d1,$h0
- sltu $h0,$d1,$h0
-
- addu $d2,$d2,$h2
- addu $h1,$h1,$h0 # carry
- sltu $h2,$d2,$h2
- addu $d2,$d2,$h1
- sltu $h1,$d2,$h1
-
- addu $d3,$d3,$h3
- addu $h2,$h2,$h1 # carry
- sltu $h3,$d3,$h3
- addu $d3,$d3,$h2
-
-#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
- multu $r0,$d0 # d0*r0
- sltu $h2,$d3,$h2
- maddu $rs3,$d1 # d1*s3
- addu $h3,$h3,$h2 # carry
- maddu $rs2,$d2 # d2*s2
- addu $h4,$h4,$padbit
- maddu $rs1,$d3 # d3*s1
- addu $h4,$h4,$h3
- mfhi $at
- mflo $h0
-
- multu $r1,$d0 # d0*r1
- maddu $r0,$d1 # d1*r0
- maddu $rs3,$d2 # d2*s3
- maddu $rs2,$d3 # d3*s2
- maddu $rs1,$h4 # h4*s1
- maddu $at,$one # hi*1
- mfhi $at
- mflo $h1
-
- multu $r2,$d0 # d0*r2
- maddu $r1,$d1 # d1*r1
- maddu $r0,$d2 # d2*r0
- maddu $rs3,$d3 # d3*s3
- maddu $rs2,$h4 # h4*s2
- maddu $at,$one # hi*1
- mfhi $at
- mflo $h2
-
- mul $t0,$r0,$h4 # h4*r0
-
- multu $r3,$d0 # d0*r3
- maddu $r2,$d1 # d1*r2
- maddu $r1,$d2 # d2*r1
- maddu $r0,$d3 # d3*r0
- maddu $rs3,$h4 # h4*s3
- maddu $at,$one # hi*1
- mfhi $at
- mflo $h3
-
- addiu $inp,$inp,16
-
- addu $h4,$t0,$at
-#else
- multu ($r0,$d0) # d0*r0
- mflo ($h0,$r0,$d0)
- mfhi ($h1,$r0,$d0)
-
- sltu $h2,$d3,$h2
- addu $h3,$h3,$h2 # carry
-
- multu ($rs3,$d1) # d1*s3
- mflo ($at,$rs3,$d1)
- mfhi ($t0,$rs3,$d1)
-
- addu $h4,$h4,$padbit
- addiu $inp,$inp,16
- addu $h4,$h4,$h3
-
- multu ($rs2,$d2) # d2*s2
- mflo ($a3,$rs2,$d2)
- mfhi ($t1,$rs2,$d2)
- addu $h0,$h0,$at
- addu $h1,$h1,$t0
- multu ($rs1,$d3) # d3*s1
- sltu $at,$h0,$at
- addu $h1,$h1,$at
-
- mflo ($at,$rs1,$d3)
- mfhi ($t0,$rs1,$d3)
- addu $h0,$h0,$a3
- addu $h1,$h1,$t1
- multu ($r1,$d0) # d0*r1
- sltu $a3,$h0,$a3
- addu $h1,$h1,$a3
-
-
- mflo ($a3,$r1,$d0)
- mfhi ($h2,$r1,$d0)
- addu $h0,$h0,$at
- addu $h1,$h1,$t0
- multu ($r0,$d1) # d1*r0
- sltu $at,$h0,$at
- addu $h1,$h1,$at
-
- mflo ($at,$r0,$d1)
- mfhi ($t0,$r0,$d1)
- addu $h1,$h1,$a3
- sltu $a3,$h1,$a3
- multu ($rs3,$d2) # d2*s3
- addu $h2,$h2,$a3
-
- mflo ($a3,$rs3,$d2)
- mfhi ($t1,$rs3,$d2)
- addu $h1,$h1,$at
- addu $h2,$h2,$t0
- multu ($rs2,$d3) # d3*s2
- sltu $at,$h1,$at
- addu $h2,$h2,$at
-
- mflo ($at,$rs2,$d3)
- mfhi ($t0,$rs2,$d3)
- addu $h1,$h1,$a3
- addu $h2,$h2,$t1
- multu ($rs1,$h4) # h4*s1
- sltu $a3,$h1,$a3
- addu $h2,$h2,$a3
-
- mflo ($a3,$rs1,$h4)
- addu $h1,$h1,$at
- addu $h2,$h2,$t0
- multu ($r2,$d0) # d0*r2
- sltu $at,$h1,$at
- addu $h2,$h2,$at
-
-
- mflo ($at,$r2,$d0)
- mfhi ($h3,$r2,$d0)
- addu $h1,$h1,$a3
- sltu $a3,$h1,$a3
- multu ($r1,$d1) # d1*r1
- addu $h2,$h2,$a3
-
- mflo ($a3,$r1,$d1)
- mfhi ($t1,$r1,$d1)
- addu $h2,$h2,$at
- sltu $at,$h2,$at
- multu ($r0,$d2) # d2*r0
- addu $h3,$h3,$at
-
- mflo ($at,$r0,$d2)
- mfhi ($t0,$r0,$d2)
- addu $h2,$h2,$a3
- addu $h3,$h3,$t1
- multu ($rs3,$d3) # d3*s3
- sltu $a3,$h2,$a3
- addu $h3,$h3,$a3
-
- mflo ($a3,$rs3,$d3)
- mfhi ($t1,$rs3,$d3)
- addu $h2,$h2,$at
- addu $h3,$h3,$t0
- multu ($rs2,$h4) # h4*s2
- sltu $at,$h2,$at
- addu $h3,$h3,$at
-
- mflo ($at,$rs2,$h4)
- addu $h2,$h2,$a3
- addu $h3,$h3,$t1
- multu ($r3,$d0) # d0*r3
- sltu $a3,$h2,$a3
- addu $h3,$h3,$a3
-
-
- mflo ($a3,$r3,$d0)
- mfhi ($t1,$r3,$d0)
- addu $h2,$h2,$at
- sltu $at,$h2,$at
- multu ($r2,$d1) # d1*r2
- addu $h3,$h3,$at
-
- mflo ($at,$r2,$d1)
- mfhi ($t0,$r2,$d1)
- addu $h3,$h3,$a3
- sltu $a3,$h3,$a3
- multu ($r0,$d3) # d3*r0
- addu $t1,$t1,$a3
-
- mflo ($a3,$r0,$d3)
- mfhi ($d3,$r0,$d3)
- addu $h3,$h3,$at
- addu $t1,$t1,$t0
- multu ($r1,$d2) # d2*r1
- sltu $at,$h3,$at
- addu $t1,$t1,$at
-
- mflo ($at,$r1,$d2)
- mfhi ($t0,$r1,$d2)
- addu $h3,$h3,$a3
- addu $t1,$t1,$d3
- multu ($rs3,$h4) # h4*s3
- sltu $a3,$h3,$a3
- addu $t1,$t1,$a3
-
- mflo ($a3,$rs3,$h4)
- addu $h3,$h3,$at
- addu $t1,$t1,$t0
- multu ($r0,$h4) # h4*r0
- sltu $at,$h3,$at
- addu $t1,$t1,$at
-
-
- mflo ($h4,$r0,$h4)
- addu $h3,$h3,$a3
- sltu $a3,$h3,$a3
- addu $t1,$t1,$a3
- addu $h4,$h4,$t1
-
- li $padbit,1 # if we loop, padbit is 1
-#endif
- bne $inp,$len,.Loop
-
- sw $h0,0($ctx) # store hash value
- sw $h1,4($ctx)
- sw $h2,8($ctx)
- sw $h3,12($ctx)
- sw $h4,16($ctx)
-
- .set noreorder
-.Labort:
- lw $s11,4*11($sp)
- lw $s10,4*10($sp)
- lw $s9, 4*9($sp)
- lw $s8, 4*8($sp)
- lw $s7, 4*7($sp)
- lw $s6, 4*6($sp)
- lw $s5, 4*5($sp)
- lw $s4, 4*4($sp)
-___
-$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
- lw $s3, 4*3($sp)
- lw $s2, 4*2($sp)
- lw $s1, 4*1($sp)
- lw $s0, 4*0($sp)
-___
-$code.=<<___;
- jr $ra
- addu $sp,$sp,4*12
-.end poly1305_blocks
-___
-}
-{
-my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
-
-$code.=<<___;
-.align 5
-.globl poly1305_emit
-.ent poly1305_emit
-poly1305_emit:
- .frame $sp,0,$ra
- .set reorder
-
- lw $tmp4,16($ctx)
- lw $tmp0,0($ctx)
- lw $tmp1,4($ctx)
- lw $tmp2,8($ctx)
- lw $tmp3,12($ctx)
-
- li $in0,-4 # final reduction
- srl $ctx,$tmp4,2
- and $in0,$in0,$tmp4
- andi $tmp4,$tmp4,3
- addu $ctx,$ctx,$in0
-
- addu $tmp0,$tmp0,$ctx
- sltu $ctx,$tmp0,$ctx
- addiu $in0,$tmp0,5 # compare to modulus
- addu $tmp1,$tmp1,$ctx
- sltiu $in1,$in0,5
- sltu $ctx,$tmp1,$ctx
- addu $in1,$in1,$tmp1
- addu $tmp2,$tmp2,$ctx
- sltu $in2,$in1,$tmp1
- sltu $ctx,$tmp2,$ctx
- addu $in2,$in2,$tmp2
- addu $tmp3,$tmp3,$ctx
- sltu $in3,$in2,$tmp2
- sltu $ctx,$tmp3,$ctx
- addu $in3,$in3,$tmp3
- addu $tmp4,$tmp4,$ctx
- sltu $ctx,$in3,$tmp3
- addu $ctx,$tmp4
-
- srl $ctx,2 # see if it carried/borrowed
- subu $ctx,$zero,$ctx
-
- xor $in0,$tmp0
- xor $in1,$tmp1
- xor $in2,$tmp2
- xor $in3,$tmp3
- and $in0,$ctx
- and $in1,$ctx
- and $in2,$ctx
- and $in3,$ctx
- xor $in0,$tmp0
- xor $in1,$tmp1
- xor $in2,$tmp2
- xor $in3,$tmp3
-
- lw $tmp0,0($nonce) # load nonce
- lw $tmp1,4($nonce)
- lw $tmp2,8($nonce)
- lw $tmp3,12($nonce)
-
- addu $in0,$tmp0 # accumulate nonce
- sltu $ctx,$in0,$tmp0
-
- addu $in1,$tmp1
- sltu $tmp1,$in1,$tmp1
- addu $in1,$ctx
- sltu $ctx,$in1,$ctx
- addu $ctx,$tmp1
-
- addu $in2,$tmp2
- sltu $tmp2,$in2,$tmp2
- addu $in2,$ctx
- sltu $ctx,$in2,$ctx
- addu $ctx,$tmp2
-
- addu $in3,$tmp3
- addu $in3,$ctx
-
- srl $tmp0,$in0,8 # write mac value
- srl $tmp1,$in0,16
- srl $tmp2,$in0,24
- sb $in0, 0($mac)
- sb $tmp0,1($mac)
- srl $tmp0,$in1,8
- sb $tmp1,2($mac)
- srl $tmp1,$in1,16
- sb $tmp2,3($mac)
- srl $tmp2,$in1,24
- sb $in1, 4($mac)
- sb $tmp0,5($mac)
- srl $tmp0,$in2,8
- sb $tmp1,6($mac)
- srl $tmp1,$in2,16
- sb $tmp2,7($mac)
- srl $tmp2,$in2,24
- sb $in2, 8($mac)
- sb $tmp0,9($mac)
- srl $tmp0,$in3,8
- sb $tmp1,10($mac)
- srl $tmp1,$in3,16
- sb $tmp2,11($mac)
- srl $tmp2,$in3,24
- sb $in3, 12($mac)
- sb $tmp0,13($mac)
- sb $tmp1,14($mac)
- sb $tmp2,15($mac)
-
- jr $ra
-.end poly1305_emit
-.rdata
-.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
-.align 2
-___
-}
-}}}
-
-$output=pop and open STDOUT,">$output";
-print $code;
-close STDOUT;
diff --git a/arch/nios2/kernel/ptrace.c b/arch/nios2/kernel/ptrace.c
index 9221c15972e6..c88f5cabc0c1 100644
--- a/arch/nios2/kernel/ptrace.c
+++ b/arch/nios2/kernel/ptrace.c
@@ -95,7 +95,7 @@ enum nios2_regset {
static const struct user_regset nios2_regsets[] = {
[REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = NUM_PTRACE_REG,
.size = sizeof(unsigned long),
.align = sizeof(unsigned long),
diff --git a/arch/openrisc/kernel/ptrace.c b/arch/openrisc/kernel/ptrace.c
index 8430570d0620..552489b24855 100644
--- a/arch/openrisc/kernel/ptrace.c
+++ b/arch/openrisc/kernel/ptrace.c
@@ -124,7 +124,7 @@ enum or1k_regset {
static const struct user_regset or1k_regsets[] = {
[REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = ELF_NGREG,
.size = sizeof(long),
.align = sizeof(long),
@@ -133,7 +133,7 @@ static const struct user_regset or1k_regsets[] = {
},
#ifdef CONFIG_FPU
[REGSET_FPU] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct __or1k_fpu_state) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c
index ceb45f51d52e..8a17ab7e6e0b 100644
--- a/arch/parisc/kernel/ptrace.c
+++ b/arch/parisc/kernel/ptrace.c
@@ -562,12 +562,12 @@ static int gpr_set(struct task_struct *target,
static const struct user_regset native_regsets[] = {
[REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS, .n = ELF_NGREG,
+ USER_REGSET_NOTE_TYPE(PRSTATUS), .n = ELF_NGREG,
.size = sizeof(long), .align = sizeof(long),
.regset_get = gpr_get, .set = gpr_set
},
[REGSET_FP] = {
- .core_note_type = NT_PRFPREG, .n = ELF_NFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG), .n = ELF_NFPREG,
.size = sizeof(__u64), .align = sizeof(__u64),
.regset_get = fpr_get, .set = fpr_set
}
@@ -629,12 +629,12 @@ static int gpr32_set(struct task_struct *target,
*/
static const struct user_regset compat_regsets[] = {
[REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS, .n = ELF_NGREG,
+ USER_REGSET_NOTE_TYPE(PRSTATUS), .n = ELF_NGREG,
.size = sizeof(compat_long_t), .align = sizeof(compat_long_t),
.regset_get = gpr32_get, .set = gpr32_set
},
[REGSET_FP] = {
- .core_note_type = NT_PRFPREG, .n = ELF_NFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG), .n = ELF_NFPREG,
.size = sizeof(__u64), .align = sizeof(__u64),
.regset_get = fpr_get, .set = fpr_set
}
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 94df3cb957e9..88a788a7b18d 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -466,3 +466,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c3e0cc83f120..45b4fa7b9b02 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -127,8 +127,6 @@ config PPC
select ARCH_ENABLE_MEMORY_HOTPLUG
select ARCH_ENABLE_MEMORY_HOTREMOVE
select ARCH_HAS_COPY_MC if PPC64
- select ARCH_HAS_CRC32 if PPC64 && ALTIVEC
- select ARCH_HAS_CRC_T10DIF if PPC64 && ALTIVEC
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE
diff --git a/arch/powerpc/configs/44x/akebono_defconfig b/arch/powerpc/configs/44x/akebono_defconfig
index fde4824f235e..1882eb2da354 100644
--- a/arch/powerpc/configs/44x/akebono_defconfig
+++ b/arch/powerpc/configs/44x/akebono_defconfig
@@ -128,6 +128,5 @@ CONFIG_PPC_EARLY_DEBUG_44x_PHYSLOW=0x00010000
CONFIG_PPC_EARLY_DEBUG_44x_PHYSHIGH=0x33f
CONFIG_CRYPTO_PCBC=y
CONFIG_CRYPTO_MD5=y
-CONFIG_CRYPTO_SHA1_PPC=y
CONFIG_CRYPTO_DES=y
# CONFIG_CRYPTO_HW is not set
diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index 379229c982a4..98f56e63ad21 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -322,7 +322,6 @@ CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_HMAC=y
CONFIG_CRYPTO_MD5_PPC=m
CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_SHA1_PPC=m
CONFIG_CRYPTO_SHA256=y
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_ANUBIS=m
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index 3423c405cad4..dca67aae5da3 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -388,7 +388,6 @@ CONFIG_CRYPTO_SHA256=y
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_LZO=m
CONFIG_CRYPTO_MD5_PPC=m
-CONFIG_CRYPTO_SHA1_PPC=m
CONFIG_CRYPTO_AES_GCM_P10=m
CONFIG_CRYPTO_DEV_NX=y
CONFIG_CRYPTO_DEV_NX_ENCRYPT=m
diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index caaa359f4742..cfe39fc221cf 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -23,22 +23,6 @@ config CRYPTO_MD5_PPC
Architecture: powerpc
-config CRYPTO_SHA1_PPC
- tristate "Hash functions: SHA-1"
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: powerpc
-
-config CRYPTO_SHA1_PPC_SPE
- tristate "Hash functions: SHA-1 (SPE)"
- depends on SPE
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: powerpc using
- - SPE (Signal Processing Engine) extensions
-
config CRYPTO_AES_PPC_SPE
tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)"
depends on SPE
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index 8c2936ae466f..bc8fd27344b8 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -7,16 +7,12 @@
obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o
obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o
-obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o
-obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o
obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o
aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
md5-ppc-y := md5-asm.o md5-glue.o
-sha1-powerpc-y := sha1-powerpc-asm.o sha1.o
-sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o
aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o
vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o
diff --git a/arch/powerpc/crypto/sha1-powerpc-asm.S b/arch/powerpc/crypto/sha1-powerpc-asm.S
deleted file mode 100644
index f0d5ed557ab1..000000000000
--- a/arch/powerpc/crypto/sha1-powerpc-asm.S
+++ /dev/null
@@ -1,188 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * SHA-1 implementation for PowerPC.
- *
- * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
- */
-
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/asm-compat.h>
-
-#ifdef __BIG_ENDIAN__
-#define LWZ(rt, d, ra) \
- lwz rt,d(ra)
-#else
-#define LWZ(rt, d, ra) \
- li rt,d; \
- lwbrx rt,rt,ra
-#endif
-
-/*
- * We roll the registers for T, A, B, C, D, E around on each
- * iteration; T on iteration t is A on iteration t+1, and so on.
- * We use registers 7 - 12 for this.
- */
-#define RT(t) ((((t)+5)%6)+7)
-#define RA(t) ((((t)+4)%6)+7)
-#define RB(t) ((((t)+3)%6)+7)
-#define RC(t) ((((t)+2)%6)+7)
-#define RD(t) ((((t)+1)%6)+7)
-#define RE(t) ((((t)+0)%6)+7)
-
-/* We use registers 16 - 31 for the W values */
-#define W(t) (((t)%16)+16)
-
-#define LOADW(t) \
- LWZ(W(t),(t)*4,r4)
-
-#define STEPD0_LOAD(t) \
- andc r0,RD(t),RB(t); \
- and r6,RB(t),RC(t); \
- rotlwi RT(t),RA(t),5; \
- or r6,r6,r0; \
- add r0,RE(t),r15; \
- add RT(t),RT(t),r6; \
- add r14,r0,W(t); \
- LWZ(W((t)+4),((t)+4)*4,r4); \
- rotlwi RB(t),RB(t),30; \
- add RT(t),RT(t),r14
-
-#define STEPD0_UPDATE(t) \
- and r6,RB(t),RC(t); \
- andc r0,RD(t),RB(t); \
- rotlwi RT(t),RA(t),5; \
- rotlwi RB(t),RB(t),30; \
- or r6,r6,r0; \
- add r0,RE(t),r15; \
- xor r5,W((t)+4-3),W((t)+4-8); \
- add RT(t),RT(t),r6; \
- xor W((t)+4),W((t)+4-16),W((t)+4-14); \
- add r0,r0,W(t); \
- xor W((t)+4),W((t)+4),r5; \
- add RT(t),RT(t),r0; \
- rotlwi W((t)+4),W((t)+4),1
-
-#define STEPD1(t) \
- xor r6,RB(t),RC(t); \
- rotlwi RT(t),RA(t),5; \
- rotlwi RB(t),RB(t),30; \
- xor r6,r6,RD(t); \
- add r0,RE(t),r15; \
- add RT(t),RT(t),r6; \
- add r0,r0,W(t); \
- add RT(t),RT(t),r0
-
-#define STEPD1_UPDATE(t) \
- xor r6,RB(t),RC(t); \
- rotlwi RT(t),RA(t),5; \
- rotlwi RB(t),RB(t),30; \
- xor r6,r6,RD(t); \
- add r0,RE(t),r15; \
- xor r5,W((t)+4-3),W((t)+4-8); \
- add RT(t),RT(t),r6; \
- xor W((t)+4),W((t)+4-16),W((t)+4-14); \
- add r0,r0,W(t); \
- xor W((t)+4),W((t)+4),r5; \
- add RT(t),RT(t),r0; \
- rotlwi W((t)+4),W((t)+4),1
-
-#define STEPD2_UPDATE(t) \
- and r6,RB(t),RC(t); \
- and r0,RB(t),RD(t); \
- rotlwi RT(t),RA(t),5; \
- or r6,r6,r0; \
- rotlwi RB(t),RB(t),30; \
- and r0,RC(t),RD(t); \
- xor r5,W((t)+4-3),W((t)+4-8); \
- or r6,r6,r0; \
- xor W((t)+4),W((t)+4-16),W((t)+4-14); \
- add r0,RE(t),r15; \
- add RT(t),RT(t),r6; \
- add r0,r0,W(t); \
- xor W((t)+4),W((t)+4),r5; \
- add RT(t),RT(t),r0; \
- rotlwi W((t)+4),W((t)+4),1
-
-#define STEP0LD4(t) \
- STEPD0_LOAD(t); \
- STEPD0_LOAD((t)+1); \
- STEPD0_LOAD((t)+2); \
- STEPD0_LOAD((t)+3)
-
-#define STEPUP4(t, fn) \
- STEP##fn##_UPDATE(t); \
- STEP##fn##_UPDATE((t)+1); \
- STEP##fn##_UPDATE((t)+2); \
- STEP##fn##_UPDATE((t)+3)
-
-#define STEPUP20(t, fn) \
- STEPUP4(t, fn); \
- STEPUP4((t)+4, fn); \
- STEPUP4((t)+8, fn); \
- STEPUP4((t)+12, fn); \
- STEPUP4((t)+16, fn)
-
-_GLOBAL(powerpc_sha_transform)
- PPC_STLU r1,-INT_FRAME_SIZE(r1)
- SAVE_GPRS(14, 31, r1)
-
- /* Load up A - E */
- lwz RA(0),0(r3) /* A */
- lwz RB(0),4(r3) /* B */
- lwz RC(0),8(r3) /* C */
- lwz RD(0),12(r3) /* D */
- lwz RE(0),16(r3) /* E */
-
- LOADW(0)
- LOADW(1)
- LOADW(2)
- LOADW(3)
-
- lis r15,0x5a82 /* K0-19 */
- ori r15,r15,0x7999
- STEP0LD4(0)
- STEP0LD4(4)
- STEP0LD4(8)
- STEPUP4(12, D0)
- STEPUP4(16, D0)
-
- lis r15,0x6ed9 /* K20-39 */
- ori r15,r15,0xeba1
- STEPUP20(20, D1)
-
- lis r15,0x8f1b /* K40-59 */
- ori r15,r15,0xbcdc
- STEPUP20(40, D2)
-
- lis r15,0xca62 /* K60-79 */
- ori r15,r15,0xc1d6
- STEPUP4(60, D1)
- STEPUP4(64, D1)
- STEPUP4(68, D1)
- STEPUP4(72, D1)
- lwz r20,16(r3)
- STEPD1(76)
- lwz r19,12(r3)
- STEPD1(77)
- lwz r18,8(r3)
- STEPD1(78)
- lwz r17,4(r3)
- STEPD1(79)
-
- lwz r16,0(r3)
- add r20,RE(80),r20
- add RD(0),RD(80),r19
- add RC(0),RC(80),r18
- add RB(0),RB(80),r17
- add RA(0),RA(80),r16
- mr RE(0),r20
- stw RA(0),0(r3)
- stw RB(0),4(r3)
- stw RC(0),8(r3)
- stw RD(0),12(r3)
- stw RE(0),16(r3)
-
- REST_GPRS(14, 31, r1)
- addi r1,r1,INT_FRAME_SIZE
- blr
diff --git a/arch/powerpc/crypto/sha1-spe-asm.S b/arch/powerpc/crypto/sha1-spe-asm.S
deleted file mode 100644
index 0f447523be5e..000000000000
--- a/arch/powerpc/crypto/sha1-spe-asm.S
+++ /dev/null
@@ -1,294 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Fast SHA-1 implementation for SPE instruction set (PPC)
- *
- * This code makes use of the SPE SIMD instruction set as defined in
- * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
- * Implementation is based on optimization guide notes from
- * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- */
-
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-
-#define rHP r3 /* pointer to hash value */
-#define rWP r4 /* pointer to input */
-#define rKP r5 /* pointer to constants */
-
-#define rW0 r14 /* 64 bit round words */
-#define rW1 r15
-#define rW2 r16
-#define rW3 r17
-#define rW4 r18
-#define rW5 r19
-#define rW6 r20
-#define rW7 r21
-
-#define rH0 r6 /* 32 bit hash values */
-#define rH1 r7
-#define rH2 r8
-#define rH3 r9
-#define rH4 r10
-
-#define rT0 r22 /* 64 bit temporary */
-#define rT1 r0 /* 32 bit temporaries */
-#define rT2 r11
-#define rT3 r12
-
-#define rK r23 /* 64 bit constant in volatile register */
-
-#define LOAD_K01
-
-#define LOAD_K11 \
- evlwwsplat rK,0(rKP);
-
-#define LOAD_K21 \
- evlwwsplat rK,4(rKP);
-
-#define LOAD_K31 \
- evlwwsplat rK,8(rKP);
-
-#define LOAD_K41 \
- evlwwsplat rK,12(rKP);
-
-#define INITIALIZE \
- stwu r1,-128(r1); /* create stack frame */ \
- evstdw r14,8(r1); /* We must save non volatile */ \
- evstdw r15,16(r1); /* registers. Take the chance */ \
- evstdw r16,24(r1); /* and save the SPE part too */ \
- evstdw r17,32(r1); \
- evstdw r18,40(r1); \
- evstdw r19,48(r1); \
- evstdw r20,56(r1); \
- evstdw r21,64(r1); \
- evstdw r22,72(r1); \
- evstdw r23,80(r1);
-
-
-#define FINALIZE \
- evldw r14,8(r1); /* restore SPE registers */ \
- evldw r15,16(r1); \
- evldw r16,24(r1); \
- evldw r17,32(r1); \
- evldw r18,40(r1); \
- evldw r19,48(r1); \
- evldw r20,56(r1); \
- evldw r21,64(r1); \
- evldw r22,72(r1); \
- evldw r23,80(r1); \
- xor r0,r0,r0; \
- stw r0,8(r1); /* Delete sensitive data */ \
- stw r0,16(r1); /* that we might have pushed */ \
- stw r0,24(r1); /* from other context that runs */ \
- stw r0,32(r1); /* the same code. Assume that */ \
- stw r0,40(r1); /* the lower part of the GPRs */ \
- stw r0,48(r1); /* were already overwritten on */ \
- stw r0,56(r1); /* the way down to here */ \
- stw r0,64(r1); \
- stw r0,72(r1); \
- stw r0,80(r1); \
- addi r1,r1,128; /* cleanup stack frame */
-
-#ifdef __BIG_ENDIAN__
-#define LOAD_DATA(reg, off) \
- lwz reg,off(rWP); /* load data */
-#define NEXT_BLOCK \
- addi rWP,rWP,64; /* increment per block */
-#else
-#define LOAD_DATA(reg, off) \
- lwbrx reg,0,rWP; /* load data */ \
- addi rWP,rWP,4; /* increment per word */
-#define NEXT_BLOCK /* nothing to do */
-#endif
-
-#define R_00_15(a, b, c, d, e, w0, w1, k, off) \
- LOAD_DATA(w0, off) /* 1: W */ \
- and rT2,b,c; /* 1: F' = B and C */ \
- LOAD_K##k##1 \
- andc rT1,d,b; /* 1: F" = ~B and D */ \
- rotrwi rT0,a,27; /* 1: A' = A rotl 5 */ \
- or rT2,rT2,rT1; /* 1: F = F' or F" */ \
- add e,e,rT0; /* 1: E = E + A' */ \
- rotrwi b,b,2; /* 1: B = B rotl 30 */ \
- add e,e,w0; /* 1: E = E + W */ \
- LOAD_DATA(w1, off+4) /* 2: W */ \
- add e,e,rT2; /* 1: E = E + F */ \
- and rT1,a,b; /* 2: F' = B and C */ \
- add e,e,rK; /* 1: E = E + K */ \
- andc rT2,c,a; /* 2: F" = ~B and D */ \
- add d,d,rK; /* 2: E = E + K */ \
- or rT2,rT2,rT1; /* 2: F = F' or F" */ \
- rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
- add d,d,w1; /* 2: E = E + W */ \
- rotrwi a,a,2; /* 2: B = B rotl 30 */ \
- add d,d,rT0; /* 2: E = E + A' */ \
- evmergelo w1,w1,w0; /* mix W[0]/W[1] */ \
- add d,d,rT2 /* 2: E = E + F */
-
-#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
- and rT2,b,c; /* 1: F' = B and C */ \
- evmergelohi rT0,w7,w6; /* W[-3] */ \
- andc rT1,d,b; /* 1: F" = ~B and D */ \
- evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
- or rT1,rT1,rT2; /* 1: F = F' or F" */ \
- evxor w0,w0,w4; /* W = W xor W[-8] */ \
- add e,e,rT1; /* 1: E = E + F */ \
- evxor w0,w0,w1; /* W = W xor W[-14] */ \
- rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
- evrlwi w0,w0,1; /* W = W rotl 1 */ \
- add e,e,rT2; /* 1: E = E + A' */ \
- evaddw rT0,w0,rK; /* WK = W + K */ \
- rotrwi b,b,2; /* 1: B = B rotl 30 */ \
- LOAD_K##k##1 \
- evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
- add e,e,rT0; /* 1: E = E + WK */ \
- add d,d,rT1; /* 2: E = E + WK */ \
- and rT2,a,b; /* 2: F' = B and C */ \
- andc rT1,c,a; /* 2: F" = ~B and D */ \
- rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
- or rT1,rT1,rT2; /* 2: F = F' or F" */ \
- add d,d,rT0; /* 2: E = E + A' */ \
- rotrwi a,a,2; /* 2: B = B rotl 30 */ \
- add d,d,rT1 /* 2: E = E + F */
-
-#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
- evmergelohi rT0,w7,w6; /* W[-3] */ \
- xor rT2,b,c; /* 1: F' = B xor C */ \
- evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
- xor rT2,rT2,d; /* 1: F = F' xor D */ \
- evxor w0,w0,w4; /* W = W xor W[-8] */ \
- add e,e,rT2; /* 1: E = E + F */ \
- evxor w0,w0,w1; /* W = W xor W[-14] */ \
- rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
- evrlwi w0,w0,1; /* W = W rotl 1 */ \
- add e,e,rT2; /* 1: E = E + A' */ \
- evaddw rT0,w0,rK; /* WK = W + K */ \
- rotrwi b,b,2; /* 1: B = B rotl 30 */ \
- LOAD_K##k##1 \
- evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
- add e,e,rT0; /* 1: E = E + WK */ \
- xor rT2,a,b; /* 2: F' = B xor C */ \
- add d,d,rT1; /* 2: E = E + WK */ \
- xor rT2,rT2,c; /* 2: F = F' xor D */ \
- rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
- add d,d,rT2; /* 2: E = E + F */ \
- rotrwi a,a,2; /* 2: B = B rotl 30 */ \
- add d,d,rT0 /* 2: E = E + A' */
-
-#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
- and rT2,b,c; /* 1: F' = B and C */ \
- evmergelohi rT0,w7,w6; /* W[-3] */ \
- or rT1,b,c; /* 1: F" = B or C */ \
- evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
- and rT1,d,rT1; /* 1: F" = F" and D */ \
- evxor w0,w0,w4; /* W = W xor W[-8] */ \
- or rT2,rT2,rT1; /* 1: F = F' or F" */ \
- evxor w0,w0,w1; /* W = W xor W[-14] */ \
- add e,e,rT2; /* 1: E = E + F */ \
- evrlwi w0,w0,1; /* W = W rotl 1 */ \
- rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
- evaddw rT0,w0,rK; /* WK = W + K */ \
- add e,e,rT2; /* 1: E = E + A' */ \
- LOAD_K##k##1 \
- evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
- rotrwi b,b,2; /* 1: B = B rotl 30 */ \
- add e,e,rT0; /* 1: E = E + WK */ \
- and rT2,a,b; /* 2: F' = B and C */ \
- or rT0,a,b; /* 2: F" = B or C */ \
- add d,d,rT1; /* 2: E = E + WK */ \
- and rT0,c,rT0; /* 2: F" = F" and D */ \
- rotrwi a,a,2; /* 2: B = B rotl 30 */ \
- or rT2,rT2,rT0; /* 2: F = F' or F" */ \
- rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
- add d,d,rT2; /* 2: E = E + F */ \
- add d,d,rT0 /* 2: E = E + A' */
-
-#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
- R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
-
-_GLOBAL(ppc_spe_sha1_transform)
- INITIALIZE
-
- lwz rH0,0(rHP)
- lwz rH1,4(rHP)
- mtctr r5
- lwz rH2,8(rHP)
- lis rKP,PPC_SPE_SHA1_K@h
- lwz rH3,12(rHP)
- ori rKP,rKP,PPC_SPE_SHA1_K@l
- lwz rH4,16(rHP)
-
-ppc_spe_sha1_main:
- R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
- R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
- R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
- R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
- R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
- R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
- R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
- R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)
-
- R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
- R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)
-
- R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
- R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
- R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
- R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
- R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
- R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
- R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
- R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
- R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
- R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)
-
- R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
- R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
- R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
- R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
- R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
- R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
- R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
- R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
- R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
- R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)
-
- R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
- R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
- R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
- R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
- R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
- R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
- R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
- lwz rT3,0(rHP)
- R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
- lwz rW1,4(rHP)
- R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
- lwz rW2,8(rHP)
- R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
- lwz rW3,12(rHP)
- NEXT_BLOCK
- lwz rW4,16(rHP)
-
- add rH0,rH0,rT3
- stw rH0,0(rHP)
- add rH1,rH1,rW1
- stw rH1,4(rHP)
- add rH2,rH2,rW2
- stw rH2,8(rHP)
- add rH3,rH3,rW3
- stw rH3,12(rHP)
- add rH4,rH4,rW4
- stw rH4,16(rHP)
-
- bdnz ppc_spe_sha1_main
-
- FINALIZE
- blr
-
-.data
-.align 4
-PPC_SPE_SHA1_K:
- .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
diff --git a/arch/powerpc/crypto/sha1-spe-glue.c b/arch/powerpc/crypto/sha1-spe-glue.c
deleted file mode 100644
index 04c88e173ce1..000000000000
--- a/arch/powerpc/crypto/sha1-spe-glue.c
+++ /dev/null
@@ -1,107 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for SHA-1 implementation for SPE instructions (PPC)
- *
- * Based on generic implementation.
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- */
-
-#include <asm/switch_to.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/kernel.h>
-#include <linux/preempt.h>
-#include <linux/module.h>
-
-/*
- * MAX_BYTES defines the number of bytes that are allowed to be processed
- * between preempt_disable() and preempt_enable(). SHA1 takes ~1000
- * operations per 64 bytes. e500 cores can issue two arithmetic instructions
- * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
- * Thus 2KB of input data will need an estimated maximum of 18,000 cycles.
- * Headroom for cache misses included. Even with the low end model clocked
- * at 667 MHz this equals to a critical time window of less than 27us.
- *
- */
-#define MAX_BYTES 2048
-
-asmlinkage void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks);
-
-static void spe_begin(void)
-{
- /* We just start SPE operations and will save SPE registers later. */
- preempt_disable();
- enable_kernel_spe();
-}
-
-static void spe_end(void)
-{
- disable_kernel_spe();
- /* reenable preemption */
- preempt_enable();
-}
-
-static void ppc_spe_sha1_block(struct sha1_state *sctx, const u8 *src,
- int blocks)
-{
- do {
- int unit = min(blocks, MAX_BYTES / SHA1_BLOCK_SIZE);
-
- spe_begin();
- ppc_spe_sha1_transform(sctx->state, src, unit);
- spe_end();
-
- src += unit * SHA1_BLOCK_SIZE;
- blocks -= unit;
- } while (blocks);
-}
-
-static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_base_do_update_blocks(desc, data, len, ppc_spe_sha1_block);
-}
-
-static int ppc_spe_sha1_finup(struct shash_desc *desc, const u8 *src,
- unsigned int len, u8 *out)
-{
- sha1_base_do_finup(desc, src, len, ppc_spe_sha1_block);
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = ppc_spe_sha1_update,
- .finup = ppc_spe_sha1_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name= "sha1-ppc-spe",
- .cra_priority = 300,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init ppc_spe_sha1_mod_init(void)
-{
- return crypto_register_shash(&alg);
-}
-
-static void __exit ppc_spe_sha1_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_init(ppc_spe_sha1_mod_init);
-module_exit(ppc_spe_sha1_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, SPE optimized");
-
-MODULE_ALIAS_CRYPTO("sha1");
-MODULE_ALIAS_CRYPTO("sha1-ppc-spe");
diff --git a/arch/powerpc/crypto/sha1.c b/arch/powerpc/crypto/sha1.c
deleted file mode 100644
index 4593946aa9b3..000000000000
--- a/arch/powerpc/crypto/sha1.c
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API.
- *
- * powerpc implementation of the SHA1 Secure Hash Algorithm.
- *
- * Derived from cryptoapi implementation, adapted for in-place
- * scatterlist interface.
- *
- * Derived from "crypto/sha1.c"
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- */
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void powerpc_sha_transform(u32 *state, const u8 *src);
-
-static void powerpc_sha_block(struct sha1_state *sctx, const u8 *data,
- int blocks)
-{
- do {
- powerpc_sha_transform(sctx->state, data);
- data += 64;
- } while (--blocks);
-}
-
-static int powerpc_sha1_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_base_do_update_blocks(desc, data, len, powerpc_sha_block);
-}
-
-/* Add padding and return the message digest. */
-static int powerpc_sha1_finup(struct shash_desc *desc, const u8 *src,
- unsigned int len, u8 *out)
-{
- sha1_base_do_finup(desc, src, len, powerpc_sha_block);
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = powerpc_sha1_update,
- .finup = powerpc_sha1_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name= "sha1-powerpc",
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init sha1_powerpc_mod_init(void)
-{
- return crypto_register_shash(&alg);
-}
-
-static void __exit sha1_powerpc_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_init(sha1_powerpc_mod_init);
-module_exit(sha1_powerpc_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm");
-
-MODULE_ALIAS_CRYPTO("sha1");
-MODULE_ALIAS_CRYPTO("sha1-powerpc");
diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c
index c1819e0a6684..0310f9097e39 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-view.c
+++ b/arch/powerpc/kernel/ptrace/ptrace-view.c
@@ -568,114 +568,114 @@ static int pkey_set(struct task_struct *target, const struct user_regset *regset
static const struct user_regset native_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS, .n = ELF_NGREG,
+ USER_REGSET_NOTE_TYPE(PRSTATUS), .n = ELF_NGREG,
.size = sizeof(long), .align = sizeof(long),
.regset_get = gpr_get, .set = gpr_set
},
[REGSET_FPR] = {
- .core_note_type = NT_PRFPREG, .n = ELF_NFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG), .n = ELF_NFPREG,
.size = sizeof(double), .align = sizeof(double),
.regset_get = fpr_get, .set = fpr_set
},
#ifdef CONFIG_ALTIVEC
[REGSET_VMX] = {
- .core_note_type = NT_PPC_VMX, .n = 34,
+ USER_REGSET_NOTE_TYPE(PPC_VMX), .n = 34,
.size = sizeof(vector128), .align = sizeof(vector128),
.active = vr_active, .regset_get = vr_get, .set = vr_set
},
#endif
#ifdef CONFIG_VSX
[REGSET_VSX] = {
- .core_note_type = NT_PPC_VSX, .n = 32,
+ USER_REGSET_NOTE_TYPE(PPC_VSX), .n = 32,
.size = sizeof(double), .align = sizeof(double),
.active = vsr_active, .regset_get = vsr_get, .set = vsr_set
},
#endif
#ifdef CONFIG_SPE
[REGSET_SPE] = {
- .core_note_type = NT_PPC_SPE, .n = 35,
+ USER_REGSET_NOTE_TYPE(PPC_SPE), .n = 35,
.size = sizeof(u32), .align = sizeof(u32),
.active = evr_active, .regset_get = evr_get, .set = evr_set
},
#endif
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
[REGSET_TM_CGPR] = {
- .core_note_type = NT_PPC_TM_CGPR, .n = ELF_NGREG,
+ USER_REGSET_NOTE_TYPE(PPC_TM_CGPR), .n = ELF_NGREG,
.size = sizeof(long), .align = sizeof(long),
.active = tm_cgpr_active, .regset_get = tm_cgpr_get, .set = tm_cgpr_set
},
[REGSET_TM_CFPR] = {
- .core_note_type = NT_PPC_TM_CFPR, .n = ELF_NFPREG,
+ USER_REGSET_NOTE_TYPE(PPC_TM_CFPR), .n = ELF_NFPREG,
.size = sizeof(double), .align = sizeof(double),
.active = tm_cfpr_active, .regset_get = tm_cfpr_get, .set = tm_cfpr_set
},
[REGSET_TM_CVMX] = {
- .core_note_type = NT_PPC_TM_CVMX, .n = ELF_NVMX,
+ USER_REGSET_NOTE_TYPE(PPC_TM_CVMX), .n = ELF_NVMX,
.size = sizeof(vector128), .align = sizeof(vector128),
.active = tm_cvmx_active, .regset_get = tm_cvmx_get, .set = tm_cvmx_set
},
[REGSET_TM_CVSX] = {
- .core_note_type = NT_PPC_TM_CVSX, .n = ELF_NVSX,
+ USER_REGSET_NOTE_TYPE(PPC_TM_CVSX), .n = ELF_NVSX,
.size = sizeof(double), .align = sizeof(double),
.active = tm_cvsx_active, .regset_get = tm_cvsx_get, .set = tm_cvsx_set
},
[REGSET_TM_SPR] = {
- .core_note_type = NT_PPC_TM_SPR, .n = ELF_NTMSPRREG,
+ USER_REGSET_NOTE_TYPE(PPC_TM_SPR), .n = ELF_NTMSPRREG,
.size = sizeof(u64), .align = sizeof(u64),
.active = tm_spr_active, .regset_get = tm_spr_get, .set = tm_spr_set
},
[REGSET_TM_CTAR] = {
- .core_note_type = NT_PPC_TM_CTAR, .n = 1,
+ USER_REGSET_NOTE_TYPE(PPC_TM_CTAR), .n = 1,
.size = sizeof(u64), .align = sizeof(u64),
.active = tm_tar_active, .regset_get = tm_tar_get, .set = tm_tar_set
},
[REGSET_TM_CPPR] = {
- .core_note_type = NT_PPC_TM_CPPR, .n = 1,
+ USER_REGSET_NOTE_TYPE(PPC_TM_CPPR), .n = 1,
.size = sizeof(u64), .align = sizeof(u64),
.active = tm_ppr_active, .regset_get = tm_ppr_get, .set = tm_ppr_set
},
[REGSET_TM_CDSCR] = {
- .core_note_type = NT_PPC_TM_CDSCR, .n = 1,
+ USER_REGSET_NOTE_TYPE(PPC_TM_CDSCR), .n = 1,
.size = sizeof(u64), .align = sizeof(u64),
.active = tm_dscr_active, .regset_get = tm_dscr_get, .set = tm_dscr_set
},
#endif
#ifdef CONFIG_PPC64
[REGSET_PPR] = {
- .core_note_type = NT_PPC_PPR, .n = 1,
+ USER_REGSET_NOTE_TYPE(PPC_PPR), .n = 1,
.size = sizeof(u64), .align = sizeof(u64),
.regset_get = ppr_get, .set = ppr_set
},
[REGSET_DSCR] = {
- .core_note_type = NT_PPC_DSCR, .n = 1,
+ USER_REGSET_NOTE_TYPE(PPC_DSCR), .n = 1,
.size = sizeof(u64), .align = sizeof(u64),
.regset_get = dscr_get, .set = dscr_set
},
#endif
#ifdef CONFIG_PPC_BOOK3S_64
[REGSET_TAR] = {
- .core_note_type = NT_PPC_TAR, .n = 1,
+ USER_REGSET_NOTE_TYPE(PPC_TAR), .n = 1,
.size = sizeof(u64), .align = sizeof(u64),
.regset_get = tar_get, .set = tar_set
},
[REGSET_EBB] = {
- .core_note_type = NT_PPC_EBB, .n = ELF_NEBB,
+ USER_REGSET_NOTE_TYPE(PPC_EBB), .n = ELF_NEBB,
.size = sizeof(u64), .align = sizeof(u64),
.active = ebb_active, .regset_get = ebb_get, .set = ebb_set
},
[REGSET_PMR] = {
- .core_note_type = NT_PPC_PMU, .n = ELF_NPMU,
+ USER_REGSET_NOTE_TYPE(PPC_PMU), .n = ELF_NPMU,
.size = sizeof(u64), .align = sizeof(u64),
.active = pmu_active, .regset_get = pmu_get, .set = pmu_set
},
[REGSET_DEXCR] = {
- .core_note_type = NT_PPC_DEXCR, .n = ELF_NDEXCR,
+ USER_REGSET_NOTE_TYPE(PPC_DEXCR), .n = ELF_NDEXCR,
.size = sizeof(u64), .align = sizeof(u64),
.active = dexcr_active, .regset_get = dexcr_get
},
#ifdef CONFIG_CHECKPOINT_RESTORE
[REGSET_HASHKEYR] = {
- .core_note_type = NT_PPC_HASHKEYR, .n = ELF_NHASHKEYR,
+ USER_REGSET_NOTE_TYPE(PPC_HASHKEYR), .n = ELF_NHASHKEYR,
.size = sizeof(u64), .align = sizeof(u64),
.active = hashkeyr_active, .regset_get = hashkeyr_get, .set = hashkeyr_set
},
@@ -683,7 +683,7 @@ static const struct user_regset native_regsets[] = {
#endif
#ifdef CONFIG_PPC_MEM_KEYS
[REGSET_PKEY] = {
- .core_note_type = NT_PPC_PKEY, .n = ELF_NPKEY,
+ USER_REGSET_NOTE_TYPE(PPC_PKEY), .n = ELF_NPKEY,
.size = sizeof(u64), .align = sizeof(u64),
.active = pkey_active, .regset_get = pkey_get, .set = pkey_set
},
@@ -843,92 +843,92 @@ static int gpr32_set(struct task_struct *target,
*/
static const struct user_regset compat_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS, .n = ELF_NGREG,
+ USER_REGSET_NOTE_TYPE(PRSTATUS), .n = ELF_NGREG,
.size = sizeof(compat_long_t), .align = sizeof(compat_long_t),
.regset_get = gpr32_get, .set = gpr32_set
},
[REGSET_FPR] = {
- .core_note_type = NT_PRFPREG, .n = ELF_NFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG), .n = ELF_NFPREG,
.size = sizeof(double), .align = sizeof(double),
.regset_get = fpr_get, .set = fpr_set
},
#ifdef CONFIG_ALTIVEC
[REGSET_VMX] = {
- .core_note_type = NT_PPC_VMX, .n = 34,
+ USER_REGSET_NOTE_TYPE(PPC_VMX), .n = 34,
.size = sizeof(vector128), .align = sizeof(vector128),
.active = vr_active, .regset_get = vr_get, .set = vr_set
},
#endif
#ifdef CONFIG_SPE
[REGSET_SPE] = {
- .core_note_type = NT_PPC_SPE, .n = 35,
+ USER_REGSET_NOTE_TYPE(PPC_SPE), .n = 35,
.size = sizeof(u32), .align = sizeof(u32),
.active = evr_active, .regset_get = evr_get, .set = evr_set
},
#endif
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
[REGSET_TM_CGPR] = {
- .core_note_type = NT_PPC_TM_CGPR, .n = ELF_NGREG,
+ USER_REGSET_NOTE_TYPE(PPC_TM_CGPR), .n = ELF_NGREG,
.size = sizeof(long), .align = sizeof(long),
.active = tm_cgpr_active,
.regset_get = tm_cgpr32_get, .set = tm_cgpr32_set
},
[REGSET_TM_CFPR] = {
- .core_note_type = NT_PPC_TM_CFPR, .n = ELF_NFPREG,
+ USER_REGSET_NOTE_TYPE(PPC_TM_CFPR), .n = ELF_NFPREG,
.size = sizeof(double), .align = sizeof(double),
.active = tm_cfpr_active, .regset_get = tm_cfpr_get, .set = tm_cfpr_set
},
[REGSET_TM_CVMX] = {
- .core_note_type = NT_PPC_TM_CVMX, .n = ELF_NVMX,
+ USER_REGSET_NOTE_TYPE(PPC_TM_CVMX), .n = ELF_NVMX,
.size = sizeof(vector128), .align = sizeof(vector128),
.active = tm_cvmx_active, .regset_get = tm_cvmx_get, .set = tm_cvmx_set
},
[REGSET_TM_CVSX] = {
- .core_note_type = NT_PPC_TM_CVSX, .n = ELF_NVSX,
+ USER_REGSET_NOTE_TYPE(PPC_TM_CVSX), .n = ELF_NVSX,
.size = sizeof(double), .align = sizeof(double),
.active = tm_cvsx_active, .regset_get = tm_cvsx_get, .set = tm_cvsx_set
},
[REGSET_TM_SPR] = {
- .core_note_type = NT_PPC_TM_SPR, .n = ELF_NTMSPRREG,
+ USER_REGSET_NOTE_TYPE(PPC_TM_SPR), .n = ELF_NTMSPRREG,
.size = sizeof(u64), .align = sizeof(u64),
.active = tm_spr_active, .regset_get = tm_spr_get, .set = tm_spr_set
},
[REGSET_TM_CTAR] = {
- .core_note_type = NT_PPC_TM_CTAR, .n = 1,
+ USER_REGSET_NOTE_TYPE(PPC_TM_CTAR), .n = 1,
.size = sizeof(u64), .align = sizeof(u64),
.active = tm_tar_active, .regset_get = tm_tar_get, .set = tm_tar_set
},
[REGSET_TM_CPPR] = {
- .core_note_type = NT_PPC_TM_CPPR, .n = 1,
+ USER_REGSET_NOTE_TYPE(PPC_TM_CPPR), .n = 1,
.size = sizeof(u64), .align = sizeof(u64),
.active = tm_ppr_active, .regset_get = tm_ppr_get, .set = tm_ppr_set
},
[REGSET_TM_CDSCR] = {
- .core_note_type = NT_PPC_TM_CDSCR, .n = 1,
+ USER_REGSET_NOTE_TYPE(PPC_TM_CDSCR), .n = 1,
.size = sizeof(u64), .align = sizeof(u64),
.active = tm_dscr_active, .regset_get = tm_dscr_get, .set = tm_dscr_set
},
#endif
#ifdef CONFIG_PPC64
[REGSET_PPR] = {
- .core_note_type = NT_PPC_PPR, .n = 1,
+ USER_REGSET_NOTE_TYPE(PPC_PPR), .n = 1,
.size = sizeof(u64), .align = sizeof(u64),
.regset_get = ppr_get, .set = ppr_set
},
[REGSET_DSCR] = {
- .core_note_type = NT_PPC_DSCR, .n = 1,
+ USER_REGSET_NOTE_TYPE(PPC_DSCR), .n = 1,
.size = sizeof(u64), .align = sizeof(u64),
.regset_get = dscr_get, .set = dscr_set
},
#endif
#ifdef CONFIG_PPC_BOOK3S_64
[REGSET_TAR] = {
- .core_note_type = NT_PPC_TAR, .n = 1,
+ USER_REGSET_NOTE_TYPE(PPC_TAR), .n = 1,
.size = sizeof(u64), .align = sizeof(u64),
.regset_get = tar_get, .set = tar_set
},
[REGSET_EBB] = {
- .core_note_type = NT_PPC_EBB, .n = ELF_NEBB,
+ USER_REGSET_NOTE_TYPE(PPC_EBB), .n = ELF_NEBB,
.size = sizeof(u64), .align = sizeof(u64),
.active = ebb_active, .regset_get = ebb_get, .set = ebb_set
},
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 9a084bdb8926..b453e80dfc00 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -558,3 +558,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 481f968e42c7..f14ecab674a3 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -3,8 +3,6 @@
# Makefile for ppc-specific library files..
#
-obj-y += crypto/
-
CFLAGS_code-patching.o += -fno-stack-protector
CFLAGS_feature-fixups.o += -fno-stack-protector
@@ -80,10 +78,4 @@ CFLAGS_xor_vmx.o += -mhard-float -maltivec $(call cc-option,-mabi=altivec)
# Enable <altivec.h>
CFLAGS_xor_vmx.o += -isystem $(shell $(CC) -print-file-name=include)
-obj-$(CONFIG_CRC32_ARCH) += crc32-powerpc.o
-crc32-powerpc-y := crc32.o crc32c-vpmsum_asm.o
-
-obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-powerpc.o
-crc-t10dif-powerpc-y := crc-t10dif.o crct10dif-vpmsum_asm.o
-
obj-$(CONFIG_PPC64) += $(obj64-y)
diff --git a/arch/powerpc/lib/crc-t10dif.c b/arch/powerpc/lib/crc-t10dif.c
deleted file mode 100644
index be23ded3a9df..000000000000
--- a/arch/powerpc/lib/crc-t10dif.c
+++ /dev/null
@@ -1,83 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Calculate a CRC T10-DIF with vpmsum acceleration
- *
- * Copyright 2017, Daniel Axtens, IBM Corporation.
- * [based on crc32c-vpmsum_glue.c]
- */
-
-#include <asm/switch_to.h>
-#include <crypto/internal/simd.h>
-#include <linux/cpufeature.h>
-#include <linux/crc-t10dif.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/preempt.h>
-#include <linux/uaccess.h>
-
-#define VMX_ALIGN 16
-#define VMX_ALIGN_MASK (VMX_ALIGN-1)
-
-#define VECTOR_BREAKPOINT 64
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto);
-
-u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len);
-
-u16 crc_t10dif_arch(u16 crci, const u8 *p, size_t len)
-{
- unsigned int prealign;
- unsigned int tail;
- u32 crc = crci;
-
- if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) ||
- !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable())
- return crc_t10dif_generic(crc, p, len);
-
- if ((unsigned long)p & VMX_ALIGN_MASK) {
- prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
- crc = crc_t10dif_generic(crc, p, prealign);
- len -= prealign;
- p += prealign;
- }
-
- if (len & ~VMX_ALIGN_MASK) {
- crc <<= 16;
- preempt_disable();
- pagefault_disable();
- enable_kernel_altivec();
- crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
- disable_kernel_altivec();
- pagefault_enable();
- preempt_enable();
- crc >>= 16;
- }
-
- tail = len & VMX_ALIGN_MASK;
- if (tail) {
- p += len & ~VMX_ALIGN_MASK;
- crc = crc_t10dif_generic(crc, p, tail);
- }
-
- return crc & 0xffff;
-}
-EXPORT_SYMBOL(crc_t10dif_arch);
-
-static int __init crc_t10dif_powerpc_init(void)
-{
- if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
- (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO))
- static_branch_enable(&have_vec_crypto);
- return 0;
-}
-subsys_initcall(crc_t10dif_powerpc_init);
-
-static void __exit crc_t10dif_powerpc_exit(void)
-{
-}
-module_exit(crc_t10dif_powerpc_exit);
-
-MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
-MODULE_DESCRIPTION("CRCT10DIF using vector polynomial multiply-sum instructions");
-MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/lib/crc-vpmsum-template.S b/arch/powerpc/lib/crc-vpmsum-template.S
deleted file mode 100644
index b0f87f595b26..000000000000
--- a/arch/powerpc/lib/crc-vpmsum-template.S
+++ /dev/null
@@ -1,746 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Core of the accelerated CRC algorithm.
- * In your file, define the constants and CRC_FUNCTION_NAME
- * Then include this file.
- *
- * Calculate the checksum of data that is 16 byte aligned and a multiple of
- * 16 bytes.
- *
- * The first step is to reduce it to 1024 bits. We do this in 8 parallel
- * chunks in order to mask the latency of the vpmsum instructions. If we
- * have more than 32 kB of data to checksum we repeat this step multiple
- * times, passing in the previous 1024 bits.
- *
- * The next step is to reduce the 1024 bits to 64 bits. This step adds
- * 32 bits of 0s to the end - this matches what a CRC does. We just
- * calculate constants that land the data in this 32 bits.
- *
- * We then use fixed point Barrett reduction to compute a mod n over GF(2)
- * for n = CRC using POWER8 instructions. We use x = 32.
- *
- * https://en.wikipedia.org/wiki/Barrett_reduction
- *
- * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
-*/
-
-#include <asm/ppc_asm.h>
-#include <asm/ppc-opcode.h>
-
-#define MAX_SIZE 32768
-
- .text
-
-#if defined(__BIG_ENDIAN__) && defined(REFLECT)
-#define BYTESWAP_DATA
-#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
-#define BYTESWAP_DATA
-#else
-#undef BYTESWAP_DATA
-#endif
-
-#define off16 r25
-#define off32 r26
-#define off48 r27
-#define off64 r28
-#define off80 r29
-#define off96 r30
-#define off112 r31
-
-#define const1 v24
-#define const2 v25
-
-#define byteswap v26
-#define mask_32bit v27
-#define mask_64bit v28
-#define zeroes v29
-
-#ifdef BYTESWAP_DATA
-#define VPERM(A, B, C, D) vperm A, B, C, D
-#else
-#define VPERM(A, B, C, D)
-#endif
-
-/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
-FUNC_START(CRC_FUNCTION_NAME)
- std r31,-8(r1)
- std r30,-16(r1)
- std r29,-24(r1)
- std r28,-32(r1)
- std r27,-40(r1)
- std r26,-48(r1)
- std r25,-56(r1)
-
- li off16,16
- li off32,32
- li off48,48
- li off64,64
- li off80,80
- li off96,96
- li off112,112
- li r0,0
-
- /* Enough room for saving 10 non volatile VMX registers */
- subi r6,r1,56+10*16
- subi r7,r1,56+2*16
-
- stvx v20,0,r6
- stvx v21,off16,r6
- stvx v22,off32,r6
- stvx v23,off48,r6
- stvx v24,off64,r6
- stvx v25,off80,r6
- stvx v26,off96,r6
- stvx v27,off112,r6
- stvx v28,0,r7
- stvx v29,off16,r7
-
- mr r10,r3
-
- vxor zeroes,zeroes,zeroes
- vspltisw v0,-1
-
- vsldoi mask_32bit,zeroes,v0,4
- vsldoi mask_64bit,zeroes,v0,8
-
- /* Get the initial value into v8 */
- vxor v8,v8,v8
- MTVRD(v8, R3)
-#ifdef REFLECT
- vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
-#else
- vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */
-#endif
-
-#ifdef BYTESWAP_DATA
- LOAD_REG_ADDR(r3, .byteswap_constant)
- lvx byteswap,0,r3
- addi r3,r3,16
-#endif
-
- cmpdi r5,256
- blt .Lshort
-
- rldicr r6,r5,0,56
-
- /* Checksum in blocks of MAX_SIZE */
-1: lis r7,MAX_SIZE@h
- ori r7,r7,MAX_SIZE@l
- mr r9,r7
- cmpd r6,r7
- bgt 2f
- mr r7,r6
-2: subf r6,r7,r6
-
- /* our main loop does 128 bytes at a time */
- srdi r7,r7,7
-
- /*
- * Work out the offset into the constants table to start at. Each
- * constant is 16 bytes, and it is used against 128 bytes of input
- * data - 128 / 16 = 8
- */
- sldi r8,r7,4
- srdi r9,r9,3
- subf r8,r8,r9
-
- /* We reduce our final 128 bytes in a separate step */
- addi r7,r7,-1
- mtctr r7
-
- LOAD_REG_ADDR(r3, .constants)
-
- /* Find the start of our constants */
- add r3,r3,r8
-
- /* zero v0-v7 which will contain our checksums */
- vxor v0,v0,v0
- vxor v1,v1,v1
- vxor v2,v2,v2
- vxor v3,v3,v3
- vxor v4,v4,v4
- vxor v5,v5,v5
- vxor v6,v6,v6
- vxor v7,v7,v7
-
- lvx const1,0,r3
-
- /*
- * If we are looping back to consume more data we use the values
- * already in v16-v23.
- */
- cmpdi r0,1
- beq 2f
-
- /* First warm up pass */
- lvx v16,0,r4
- lvx v17,off16,r4
- VPERM(v16,v16,v16,byteswap)
- VPERM(v17,v17,v17,byteswap)
- lvx v18,off32,r4
- lvx v19,off48,r4
- VPERM(v18,v18,v18,byteswap)
- VPERM(v19,v19,v19,byteswap)
- lvx v20,off64,r4
- lvx v21,off80,r4
- VPERM(v20,v20,v20,byteswap)
- VPERM(v21,v21,v21,byteswap)
- lvx v22,off96,r4
- lvx v23,off112,r4
- VPERM(v22,v22,v22,byteswap)
- VPERM(v23,v23,v23,byteswap)
- addi r4,r4,8*16
-
- /* xor in initial value */
- vxor v16,v16,v8
-
-2: bdz .Lfirst_warm_up_done
-
- addi r3,r3,16
- lvx const2,0,r3
-
- /* Second warm up pass */
- VPMSUMD(v8,v16,const1)
- lvx v16,0,r4
- VPERM(v16,v16,v16,byteswap)
- ori r2,r2,0
-
- VPMSUMD(v9,v17,const1)
- lvx v17,off16,r4
- VPERM(v17,v17,v17,byteswap)
- ori r2,r2,0
-
- VPMSUMD(v10,v18,const1)
- lvx v18,off32,r4
- VPERM(v18,v18,v18,byteswap)
- ori r2,r2,0
-
- VPMSUMD(v11,v19,const1)
- lvx v19,off48,r4
- VPERM(v19,v19,v19,byteswap)
- ori r2,r2,0
-
- VPMSUMD(v12,v20,const1)
- lvx v20,off64,r4
- VPERM(v20,v20,v20,byteswap)
- ori r2,r2,0
-
- VPMSUMD(v13,v21,const1)
- lvx v21,off80,r4
- VPERM(v21,v21,v21,byteswap)
- ori r2,r2,0
-
- VPMSUMD(v14,v22,const1)
- lvx v22,off96,r4
- VPERM(v22,v22,v22,byteswap)
- ori r2,r2,0
-
- VPMSUMD(v15,v23,const1)
- lvx v23,off112,r4
- VPERM(v23,v23,v23,byteswap)
-
- addi r4,r4,8*16
-
- bdz .Lfirst_cool_down
-
- /*
- * main loop. We modulo schedule it such that it takes three iterations
- * to complete - first iteration load, second iteration vpmsum, third
- * iteration xor.
- */
- .balign 16
-4: lvx const1,0,r3
- addi r3,r3,16
- ori r2,r2,0
-
- vxor v0,v0,v8
- VPMSUMD(v8,v16,const2)
- lvx v16,0,r4
- VPERM(v16,v16,v16,byteswap)
- ori r2,r2,0
-
- vxor v1,v1,v9
- VPMSUMD(v9,v17,const2)
- lvx v17,off16,r4
- VPERM(v17,v17,v17,byteswap)
- ori r2,r2,0
-
- vxor v2,v2,v10
- VPMSUMD(v10,v18,const2)
- lvx v18,off32,r4
- VPERM(v18,v18,v18,byteswap)
- ori r2,r2,0
-
- vxor v3,v3,v11
- VPMSUMD(v11,v19,const2)
- lvx v19,off48,r4
- VPERM(v19,v19,v19,byteswap)
- lvx const2,0,r3
- ori r2,r2,0
-
- vxor v4,v4,v12
- VPMSUMD(v12,v20,const1)
- lvx v20,off64,r4
- VPERM(v20,v20,v20,byteswap)
- ori r2,r2,0
-
- vxor v5,v5,v13
- VPMSUMD(v13,v21,const1)
- lvx v21,off80,r4
- VPERM(v21,v21,v21,byteswap)
- ori r2,r2,0
-
- vxor v6,v6,v14
- VPMSUMD(v14,v22,const1)
- lvx v22,off96,r4
- VPERM(v22,v22,v22,byteswap)
- ori r2,r2,0
-
- vxor v7,v7,v15
- VPMSUMD(v15,v23,const1)
- lvx v23,off112,r4
- VPERM(v23,v23,v23,byteswap)
-
- addi r4,r4,8*16
-
- bdnz 4b
-
-.Lfirst_cool_down:
- /* First cool down pass */
- lvx const1,0,r3
- addi r3,r3,16
-
- vxor v0,v0,v8
- VPMSUMD(v8,v16,const1)
- ori r2,r2,0
-
- vxor v1,v1,v9
- VPMSUMD(v9,v17,const1)
- ori r2,r2,0
-
- vxor v2,v2,v10
- VPMSUMD(v10,v18,const1)
- ori r2,r2,0
-
- vxor v3,v3,v11
- VPMSUMD(v11,v19,const1)
- ori r2,r2,0
-
- vxor v4,v4,v12
- VPMSUMD(v12,v20,const1)
- ori r2,r2,0
-
- vxor v5,v5,v13
- VPMSUMD(v13,v21,const1)
- ori r2,r2,0
-
- vxor v6,v6,v14
- VPMSUMD(v14,v22,const1)
- ori r2,r2,0
-
- vxor v7,v7,v15
- VPMSUMD(v15,v23,const1)
- ori r2,r2,0
-
-.Lsecond_cool_down:
- /* Second cool down pass */
- vxor v0,v0,v8
- vxor v1,v1,v9
- vxor v2,v2,v10
- vxor v3,v3,v11
- vxor v4,v4,v12
- vxor v5,v5,v13
- vxor v6,v6,v14
- vxor v7,v7,v15
-
-#ifdef REFLECT
- /*
- * vpmsumd produces a 96 bit result in the least significant bits
- * of the register. Since we are bit reflected we have to shift it
- * left 32 bits so it occupies the least significant bits in the
- * bit reflected domain.
- */
- vsldoi v0,v0,zeroes,4
- vsldoi v1,v1,zeroes,4
- vsldoi v2,v2,zeroes,4
- vsldoi v3,v3,zeroes,4
- vsldoi v4,v4,zeroes,4
- vsldoi v5,v5,zeroes,4
- vsldoi v6,v6,zeroes,4
- vsldoi v7,v7,zeroes,4
-#endif
-
- /* xor with last 1024 bits */
- lvx v8,0,r4
- lvx v9,off16,r4
- VPERM(v8,v8,v8,byteswap)
- VPERM(v9,v9,v9,byteswap)
- lvx v10,off32,r4
- lvx v11,off48,r4
- VPERM(v10,v10,v10,byteswap)
- VPERM(v11,v11,v11,byteswap)
- lvx v12,off64,r4
- lvx v13,off80,r4
- VPERM(v12,v12,v12,byteswap)
- VPERM(v13,v13,v13,byteswap)
- lvx v14,off96,r4
- lvx v15,off112,r4
- VPERM(v14,v14,v14,byteswap)
- VPERM(v15,v15,v15,byteswap)
-
- addi r4,r4,8*16
-
- vxor v16,v0,v8
- vxor v17,v1,v9
- vxor v18,v2,v10
- vxor v19,v3,v11
- vxor v20,v4,v12
- vxor v21,v5,v13
- vxor v22,v6,v14
- vxor v23,v7,v15
-
- li r0,1
- cmpdi r6,0
- addi r6,r6,128
- bne 1b
-
- /* Work out how many bytes we have left */
- andi. r5,r5,127
-
- /* Calculate where in the constant table we need to start */
- subfic r6,r5,128
- add r3,r3,r6
-
- /* How many 16 byte chunks are in the tail */
- srdi r7,r5,4
- mtctr r7
-
- /*
- * Reduce the previously calculated 1024 bits to 64 bits, shifting
- * 32 bits to include the trailing 32 bits of zeros
- */
- lvx v0,0,r3
- lvx v1,off16,r3
- lvx v2,off32,r3
- lvx v3,off48,r3
- lvx v4,off64,r3
- lvx v5,off80,r3
- lvx v6,off96,r3
- lvx v7,off112,r3
- addi r3,r3,8*16
-
- VPMSUMW(v0,v16,v0)
- VPMSUMW(v1,v17,v1)
- VPMSUMW(v2,v18,v2)
- VPMSUMW(v3,v19,v3)
- VPMSUMW(v4,v20,v4)
- VPMSUMW(v5,v21,v5)
- VPMSUMW(v6,v22,v6)
- VPMSUMW(v7,v23,v7)
-
- /* Now reduce the tail (0 - 112 bytes) */
- cmpdi r7,0
- beq 1f
-
- lvx v16,0,r4
- lvx v17,0,r3
- VPERM(v16,v16,v16,byteswap)
- VPMSUMW(v16,v16,v17)
- vxor v0,v0,v16
- bdz 1f
-
- lvx v16,off16,r4
- lvx v17,off16,r3
- VPERM(v16,v16,v16,byteswap)
- VPMSUMW(v16,v16,v17)
- vxor v0,v0,v16
- bdz 1f
-
- lvx v16,off32,r4
- lvx v17,off32,r3
- VPERM(v16,v16,v16,byteswap)
- VPMSUMW(v16,v16,v17)
- vxor v0,v0,v16
- bdz 1f
-
- lvx v16,off48,r4
- lvx v17,off48,r3
- VPERM(v16,v16,v16,byteswap)
- VPMSUMW(v16,v16,v17)
- vxor v0,v0,v16
- bdz 1f
-
- lvx v16,off64,r4
- lvx v17,off64,r3
- VPERM(v16,v16,v16,byteswap)
- VPMSUMW(v16,v16,v17)
- vxor v0,v0,v16
- bdz 1f
-
- lvx v16,off80,r4
- lvx v17,off80,r3
- VPERM(v16,v16,v16,byteswap)
- VPMSUMW(v16,v16,v17)
- vxor v0,v0,v16
- bdz 1f
-
- lvx v16,off96,r4
- lvx v17,off96,r3
- VPERM(v16,v16,v16,byteswap)
- VPMSUMW(v16,v16,v17)
- vxor v0,v0,v16
-
- /* Now xor all the parallel chunks together */
-1: vxor v0,v0,v1
- vxor v2,v2,v3
- vxor v4,v4,v5
- vxor v6,v6,v7
-
- vxor v0,v0,v2
- vxor v4,v4,v6
-
- vxor v0,v0,v4
-
-.Lbarrett_reduction:
- /* Barrett constants */
- LOAD_REG_ADDR(r3, .barrett_constants)
-
- lvx const1,0,r3
- lvx const2,off16,r3
-
- vsldoi v1,v0,v0,8
- vxor v0,v0,v1 /* xor two 64 bit results together */
-
-#ifdef REFLECT
- /* shift left one bit */
- vspltisb v1,1
- vsl v0,v0,v1
-#endif
-
- vand v0,v0,mask_64bit
-#ifndef REFLECT
- /*
- * Now for the Barrett reduction algorithm. The idea is to calculate q,
- * the multiple of our polynomial that we need to subtract. By
- * doing the computation 2x bits higher (ie 64 bits) and shifting the
- * result back down 2x bits, we round down to the nearest multiple.
- */
- VPMSUMD(v1,v0,const1) /* ma */
- vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */
- VPMSUMD(v1,v1,const2) /* qn */
- vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
-
- /*
- * Get the result into r3. We need to shift it left 8 bytes:
- * V0 [ 0 1 2 X ]
- * V0 [ 0 X 2 3 ]
- */
- vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */
-#else
- /*
- * The reflected version of Barrett reduction. Instead of bit
- * reflecting our data (which is expensive to do), we bit reflect our
- * constants and our algorithm, which means the intermediate data in
- * our vector registers goes from 0-63 instead of 63-0. We can reflect
- * the algorithm because we don't carry in mod 2 arithmetic.
- */
- vand v1,v0,mask_32bit /* bottom 32 bits of a */
- VPMSUMD(v1,v1,const1) /* ma */
- vand v1,v1,mask_32bit /* bottom 32bits of ma */
- VPMSUMD(v1,v1,const2) /* qn */
- vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
-
- /*
- * Since we are bit reflected, the result (ie the low 32 bits) is in
- * the high 32 bits. We just need to shift it left 4 bytes
- * V0 [ 0 1 X 3 ]
- * V0 [ 0 X 2 3 ]
- */
- vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */
-#endif
-
- /* Get it into r3 */
- MFVRD(R3, v0)
-
-.Lout:
- subi r6,r1,56+10*16
- subi r7,r1,56+2*16
-
- lvx v20,0,r6
- lvx v21,off16,r6
- lvx v22,off32,r6
- lvx v23,off48,r6
- lvx v24,off64,r6
- lvx v25,off80,r6
- lvx v26,off96,r6
- lvx v27,off112,r6
- lvx v28,0,r7
- lvx v29,off16,r7
-
- ld r31,-8(r1)
- ld r30,-16(r1)
- ld r29,-24(r1)
- ld r28,-32(r1)
- ld r27,-40(r1)
- ld r26,-48(r1)
- ld r25,-56(r1)
-
- blr
-
-.Lfirst_warm_up_done:
- lvx const1,0,r3
- addi r3,r3,16
-
- VPMSUMD(v8,v16,const1)
- VPMSUMD(v9,v17,const1)
- VPMSUMD(v10,v18,const1)
- VPMSUMD(v11,v19,const1)
- VPMSUMD(v12,v20,const1)
- VPMSUMD(v13,v21,const1)
- VPMSUMD(v14,v22,const1)
- VPMSUMD(v15,v23,const1)
-
- b .Lsecond_cool_down
-
-.Lshort:
- cmpdi r5,0
- beq .Lzero
-
- LOAD_REG_ADDR(r3, .short_constants)
-
- /* Calculate where in the constant table we need to start */
- subfic r6,r5,256
- add r3,r3,r6
-
- /* How many 16 byte chunks? */
- srdi r7,r5,4
- mtctr r7
-
- vxor v19,v19,v19
- vxor v20,v20,v20
-
- lvx v0,0,r4
- lvx v16,0,r3
- VPERM(v0,v0,v16,byteswap)
- vxor v0,v0,v8 /* xor in initial value */
- VPMSUMW(v0,v0,v16)
- bdz .Lv0
-
- lvx v1,off16,r4
- lvx v17,off16,r3
- VPERM(v1,v1,v17,byteswap)
- VPMSUMW(v1,v1,v17)
- bdz .Lv1
-
- lvx v2,off32,r4
- lvx v16,off32,r3
- VPERM(v2,v2,v16,byteswap)
- VPMSUMW(v2,v2,v16)
- bdz .Lv2
-
- lvx v3,off48,r4
- lvx v17,off48,r3
- VPERM(v3,v3,v17,byteswap)
- VPMSUMW(v3,v3,v17)
- bdz .Lv3
-
- lvx v4,off64,r4
- lvx v16,off64,r3
- VPERM(v4,v4,v16,byteswap)
- VPMSUMW(v4,v4,v16)
- bdz .Lv4
-
- lvx v5,off80,r4
- lvx v17,off80,r3
- VPERM(v5,v5,v17,byteswap)
- VPMSUMW(v5,v5,v17)
- bdz .Lv5
-
- lvx v6,off96,r4
- lvx v16,off96,r3
- VPERM(v6,v6,v16,byteswap)
- VPMSUMW(v6,v6,v16)
- bdz .Lv6
-
- lvx v7,off112,r4
- lvx v17,off112,r3
- VPERM(v7,v7,v17,byteswap)
- VPMSUMW(v7,v7,v17)
- bdz .Lv7
-
- addi r3,r3,128
- addi r4,r4,128
-
- lvx v8,0,r4
- lvx v16,0,r3
- VPERM(v8,v8,v16,byteswap)
- VPMSUMW(v8,v8,v16)
- bdz .Lv8
-
- lvx v9,off16,r4
- lvx v17,off16,r3
- VPERM(v9,v9,v17,byteswap)
- VPMSUMW(v9,v9,v17)
- bdz .Lv9
-
- lvx v10,off32,r4
- lvx v16,off32,r3
- VPERM(v10,v10,v16,byteswap)
- VPMSUMW(v10,v10,v16)
- bdz .Lv10
-
- lvx v11,off48,r4
- lvx v17,off48,r3
- VPERM(v11,v11,v17,byteswap)
- VPMSUMW(v11,v11,v17)
- bdz .Lv11
-
- lvx v12,off64,r4
- lvx v16,off64,r3
- VPERM(v12,v12,v16,byteswap)
- VPMSUMW(v12,v12,v16)
- bdz .Lv12
-
- lvx v13,off80,r4
- lvx v17,off80,r3
- VPERM(v13,v13,v17,byteswap)
- VPMSUMW(v13,v13,v17)
- bdz .Lv13
-
- lvx v14,off96,r4
- lvx v16,off96,r3
- VPERM(v14,v14,v16,byteswap)
- VPMSUMW(v14,v14,v16)
- bdz .Lv14
-
- lvx v15,off112,r4
- lvx v17,off112,r3
- VPERM(v15,v15,v17,byteswap)
- VPMSUMW(v15,v15,v17)
-
-.Lv15: vxor v19,v19,v15
-.Lv14: vxor v20,v20,v14
-.Lv13: vxor v19,v19,v13
-.Lv12: vxor v20,v20,v12
-.Lv11: vxor v19,v19,v11
-.Lv10: vxor v20,v20,v10
-.Lv9: vxor v19,v19,v9
-.Lv8: vxor v20,v20,v8
-.Lv7: vxor v19,v19,v7
-.Lv6: vxor v20,v20,v6
-.Lv5: vxor v19,v19,v5
-.Lv4: vxor v20,v20,v4
-.Lv3: vxor v19,v19,v3
-.Lv2: vxor v20,v20,v2
-.Lv1: vxor v19,v19,v1
-.Lv0: vxor v20,v20,v0
-
- vxor v0,v19,v20
-
- b .Lbarrett_reduction
-
-.Lzero:
- mr r3,r10
- b .Lout
-
-FUNC_END(CRC_FUNCTION_NAME)
diff --git a/arch/powerpc/lib/crc32.c b/arch/powerpc/lib/crc32.c
deleted file mode 100644
index 0d9befb6e7b8..000000000000
--- a/arch/powerpc/lib/crc32.c
+++ /dev/null
@@ -1,93 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#include <asm/switch_to.h>
-#include <crypto/internal/simd.h>
-#include <linux/cpufeature.h>
-#include <linux/crc32.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/preempt.h>
-#include <linux/uaccess.h>
-
-#define VMX_ALIGN 16
-#define VMX_ALIGN_MASK (VMX_ALIGN-1)
-
-#define VECTOR_BREAKPOINT 512
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto);
-
-u32 __crc32c_vpmsum(u32 crc, const u8 *p, size_t len);
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- return crc32_le_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- unsigned int prealign;
- unsigned int tail;
-
- if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) ||
- !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable())
- return crc32c_base(crc, p, len);
-
- if ((unsigned long)p & VMX_ALIGN_MASK) {
- prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
- crc = crc32c_base(crc, p, prealign);
- len -= prealign;
- p += prealign;
- }
-
- if (len & ~VMX_ALIGN_MASK) {
- preempt_disable();
- pagefault_disable();
- enable_kernel_altivec();
- crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
- disable_kernel_altivec();
- pagefault_enable();
- preempt_enable();
- }
-
- tail = len & VMX_ALIGN_MASK;
- if (tail) {
- p += len & ~VMX_ALIGN_MASK;
- crc = crc32c_base(crc, p, tail);
- }
-
- return crc;
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- return crc32_be_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-static int __init crc32_powerpc_init(void)
-{
- if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
- (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO))
- static_branch_enable(&have_vec_crypto);
- return 0;
-}
-subsys_initcall(crc32_powerpc_init);
-
-static void __exit crc32_powerpc_exit(void)
-{
-}
-module_exit(crc32_powerpc_exit);
-
-u32 crc32_optimizations(void)
-{
- if (static_key_enabled(&have_vec_crypto))
- return CRC32C_OPTIMIZATION;
- return 0;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_AUTHOR("Anton Blanchard <anton@samba.org>");
-MODULE_DESCRIPTION("CRC32C using vector polynomial multiply-sum instructions");
-MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/lib/crc32c-vpmsum_asm.S b/arch/powerpc/lib/crc32c-vpmsum_asm.S
deleted file mode 100644
index 1b35c55cce0a..000000000000
--- a/arch/powerpc/lib/crc32c-vpmsum_asm.S
+++ /dev/null
@@ -1,842 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Calculate a crc32c with vpmsum acceleration
- *
- * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
- */
- .section .rodata
-.balign 16
-
-.byteswap_constant:
- /* byte reverse permute constant */
- .octa 0x0F0E0D0C0B0A09080706050403020100
-
-.constants:
-
- /* Reduce 262144 kbits to 1024 bits */
- /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
- .octa 0x00000000b6ca9e20000000009c37c408
-
- /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
- .octa 0x00000000350249a800000001b51df26c
-
- /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
- .octa 0x00000001862dac54000000000724b9d0
-
- /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
- .octa 0x00000001d87fb48c00000001c00532fe
-
- /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
- .octa 0x00000001f39b699e00000000f05a9362
-
- /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
- .octa 0x0000000101da11b400000001e1007970
-
- /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
- .octa 0x00000001cab571e000000000a57366ee
-
- /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
- .octa 0x00000000c7020cfe0000000192011284
-
- /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
- .octa 0x00000000cdaed1ae0000000162716d9a
-
- /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
- .octa 0x00000001e804effc00000000cd97ecde
-
- /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
- .octa 0x0000000077c3ea3a0000000058812bc0
-
- /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
- .octa 0x0000000068df31b40000000088b8c12e
-
- /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
- .octa 0x00000000b059b6c200000001230b234c
-
- /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
- .octa 0x0000000145fb8ed800000001120b416e
-
- /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
- .octa 0x00000000cbc0916800000001974aecb0
-
- /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
- .octa 0x000000005ceeedc2000000008ee3f226
-
- /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
- .octa 0x0000000047d74e8600000001089aba9a
-
- /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
- .octa 0x00000001407e9e220000000065113872
-
- /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
- .octa 0x00000001da967bda000000005c07ec10
-
- /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
- .octa 0x000000006c8983680000000187590924
-
- /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
- .octa 0x00000000f2d14c9800000000e35da7c6
-
- /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
- .octa 0x00000001993c6ad4000000000415855a
-
- /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
- .octa 0x000000014683d1ac0000000073617758
-
- /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
- .octa 0x00000001a7c93e6c0000000176021d28
-
- /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
- .octa 0x000000010211e90a00000001c358fd0a
-
- /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
- .octa 0x000000001119403e00000001ff7a2c18
-
- /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
- .octa 0x000000001c3261aa00000000f2d9f7e4
-
- /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
- .octa 0x000000014e37a634000000016cf1f9c8
-
- /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
- .octa 0x0000000073786c0c000000010af9279a
-
- /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
- .octa 0x000000011dc037f80000000004f101e8
-
- /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
- .octa 0x0000000031433dfc0000000070bcf184
-
- /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
- .octa 0x000000009cde8348000000000a8de642
-
- /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
- .octa 0x0000000038d3c2a60000000062ea130c
-
- /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
- .octa 0x000000011b25f26000000001eb31cbb2
-
- /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
- .octa 0x000000001629e6f00000000170783448
-
- /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
- .octa 0x0000000160838b4c00000001a684b4c6
-
- /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
- .octa 0x000000007a44011c00000000253ca5b4
-
- /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
- .octa 0x00000000226f417a0000000057b4b1e2
-
- /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
- .octa 0x0000000045eb2eb400000000b6bd084c
-
- /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
- .octa 0x000000014459d70c0000000123c2d592
-
- /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
- .octa 0x00000001d406ed8200000000159dafce
-
- /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
- .octa 0x0000000160c8e1a80000000127e1a64e
-
- /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
- .octa 0x0000000027ba80980000000056860754
-
- /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
- .octa 0x000000006d92d01800000001e661aae8
-
- /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
- .octa 0x000000012ed7e3f200000000f82c6166
-
- /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
- .octa 0x000000002dc8778800000000c4f9c7ae
-
- /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
- .octa 0x0000000018240bb80000000074203d20
-
- /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
- .octa 0x000000001ad381580000000198173052
-
- /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
- .octa 0x00000001396b78f200000001ce8aba54
-
- /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
- .octa 0x000000011a68133400000001850d5d94
-
- /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
- .octa 0x000000012104732e00000001d609239c
-
- /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
- .octa 0x00000000a140d90c000000001595f048
-
- /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
- .octa 0x00000001b7215eda0000000042ccee08
-
- /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
- .octa 0x00000001aaf1df3c000000010a389d74
-
- /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
- .octa 0x0000000029d15b8a000000012a840da6
-
- /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
- .octa 0x00000000f1a96922000000001d181c0c
-
- /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
- .octa 0x00000001ac80d03c0000000068b7d1f6
-
- /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
- .octa 0x000000000f11d56a000000005b0f14fc
-
- /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
- .octa 0x00000001f1c022a20000000179e9e730
-
- /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
- .octa 0x0000000173d00ae200000001ce1368d6
-
- /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
- .octa 0x00000001d4ffe4ac0000000112c3a84c
-
- /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
- .octa 0x000000016edc5ae400000000de940fee
-
- /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
- .octa 0x00000001f1a0214000000000fe896b7e
-
- /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
- .octa 0x00000000ca0b28a000000001f797431c
-
- /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
- .octa 0x00000001928e30a20000000053e989ba
-
- /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
- .octa 0x0000000097b1b002000000003920cd16
-
- /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
- .octa 0x00000000b15bf90600000001e6f579b8
-
- /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
- .octa 0x00000000411c5d52000000007493cb0a
-
- /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
- .octa 0x00000001c36f330000000001bdd376d8
-
- /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
- .octa 0x00000001119227e0000000016badfee6
-
- /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
- .octa 0x00000000114d47020000000071de5c58
-
- /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
- .octa 0x00000000458b5b9800000000453f317c
-
- /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
- .octa 0x000000012e31fb8e0000000121675cce
-
- /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
- .octa 0x000000005cf619d800000001f409ee92
-
- /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
- .octa 0x0000000063f4d8b200000000f36b9c88
-
- /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
- .octa 0x000000004138dc8a0000000036b398f4
-
- /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
- .octa 0x00000001d29ee8e000000001748f9adc
-
- /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
- .octa 0x000000006a08ace800000001be94ec00
-
- /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
- .octa 0x0000000127d4201000000000b74370d6
-
- /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
- .octa 0x0000000019d76b6200000001174d0b98
-
- /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
- .octa 0x00000001b1471f6e00000000befc06a4
-
- /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
- .octa 0x00000001f64c19cc00000001ae125288
-
- /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
- .octa 0x00000000003c0ea00000000095c19b34
-
- /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
- .octa 0x000000014d73abf600000001a78496f2
-
- /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
- .octa 0x00000001620eb84400000001ac5390a0
-
- /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
- .octa 0x0000000147655048000000002a80ed6e
-
- /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
- .octa 0x0000000067b5077e00000001fa9b0128
-
- /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
- .octa 0x0000000010ffe20600000001ea94929e
-
- /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
- .octa 0x000000000fee8f1e0000000125f4305c
-
- /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
- .octa 0x00000001da26fbae00000001471e2002
-
- /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
- .octa 0x00000001b3a8bd880000000132d2253a
-
- /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
- .octa 0x00000000e8f3898e00000000f26b3592
-
- /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
- .octa 0x00000000b0d0d28c00000000bc8b67b0
-
- /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
- .octa 0x0000000030f2a798000000013a826ef2
-
- /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
- .octa 0x000000000fba10020000000081482c84
-
- /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
- .octa 0x00000000bdb9bd7200000000e77307c2
-
- /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
- .octa 0x0000000075d3bf5a00000000d4a07ec8
-
- /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
- .octa 0x00000000ef1f98a00000000017102100
-
- /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
- .octa 0x00000000689c760200000000db406486
-
- /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
- .octa 0x000000016d5fa5fe0000000192db7f88
-
- /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
- .octa 0x00000001d0d2b9ca000000018bf67b1e
-
- /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
- .octa 0x0000000041e7b470000000007c09163e
-
- /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
- .octa 0x00000001cbb6495e000000000adac060
-
- /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
- .octa 0x000000010052a0b000000000bd8316ae
-
- /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
- .octa 0x00000001d8effb5c000000019f09ab54
-
- /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
- .octa 0x00000001d969853c0000000125155542
-
- /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
- .octa 0x00000000523ccce2000000018fdb5882
-
- /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
- .octa 0x000000001e2436bc00000000e794b3f4
-
- /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
- .octa 0x00000000ddd1c3a2000000016f9bb022
-
- /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
- .octa 0x0000000019fcfe3800000000290c9978
-
- /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
- .octa 0x00000001ce95db640000000083c0f350
-
- /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
- .octa 0x00000000af5828060000000173ea6628
-
- /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
- .octa 0x00000001006388f600000001c8b4e00a
-
- /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
- .octa 0x0000000179eca00a00000000de95d6aa
-
- /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
- .octa 0x0000000122410a6a000000010b7f7248
-
- /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
- .octa 0x000000004288e87c00000001326e3a06
-
- /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
- .octa 0x000000016c5490da00000000bb62c2e6
-
- /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
- .octa 0x00000000d1c71f6e0000000156a4b2c2
-
- /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
- .octa 0x00000001b4ce08a6000000011dfe763a
-
- /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
- .octa 0x00000001466ba60c000000007bcca8e2
-
- /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
- .octa 0x00000001f6c488a40000000186118faa
-
- /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
- .octa 0x000000013bfb06820000000111a65a88
-
- /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
- .octa 0x00000000690e9e54000000003565e1c4
-
- /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
- .octa 0x00000000281346b6000000012ed02a82
-
- /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
- .octa 0x000000015646402400000000c486ecfc
-
- /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
- .octa 0x000000016063a8dc0000000001b951b2
-
- /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
- .octa 0x0000000116a663620000000048143916
-
- /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
- .octa 0x000000017e8aa4d200000001dc2ae124
-
- /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
- .octa 0x00000001728eb10c00000001416c58d6
-
- /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
- .octa 0x00000001b08fd7fa00000000a479744a
-
- /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
- .octa 0x00000001092a16e80000000096ca3a26
-
- /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
- .octa 0x00000000a505637c00000000ff223d4e
-
- /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
- .octa 0x00000000d94869b2000000010e84da42
-
- /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
- .octa 0x00000001c8b203ae00000001b61ba3d0
-
- /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
- .octa 0x000000005704aea000000000680f2de8
-
- /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
- .octa 0x000000012e295fa2000000008772a9a8
-
- /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
- .octa 0x000000011d0908bc0000000155f295bc
-
- /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
- .octa 0x0000000193ed97ea00000000595f9282
-
- /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
- .octa 0x000000013a0f1c520000000164b1c25a
-
- /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
- .octa 0x000000010c2c40c000000000fbd67c50
-
- /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
- .octa 0x00000000ff6fac3e0000000096076268
-
- /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
- .octa 0x000000017b3609c000000001d288e4cc
-
- /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
- .octa 0x0000000088c8c92200000001eaac1bdc
-
- /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
- .octa 0x00000001751baae600000001f1ea39e2
-
- /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
- .octa 0x000000010795297200000001eb6506fc
-
- /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
- .octa 0x0000000162b00abe000000010f806ffe
-
- /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
- .octa 0x000000000d7b404c000000010408481e
-
- /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
- .octa 0x00000000763b13d40000000188260534
-
- /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
- .octa 0x00000000f6dc22d80000000058fc73e0
-
- /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
- .octa 0x000000007daae06000000000391c59b8
-
- /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
- .octa 0x000000013359ab7c000000018b638400
-
- /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
- .octa 0x000000008add438a000000011738f5c4
-
- /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
- .octa 0x00000001edbefdea000000008cf7c6da
-
- /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
- .octa 0x000000004104e0f800000001ef97fb16
-
- /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
- .octa 0x00000000b48a82220000000102130e20
-
- /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
- .octa 0x00000001bcb4684400000000db968898
-
- /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
- .octa 0x000000013293ce0a00000000b5047b5e
-
- /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
- .octa 0x00000001710d0844000000010b90fdb2
-
- /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
- .octa 0x0000000117907f6e000000004834a32e
-
- /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
- .octa 0x0000000087ddf93e0000000059c8f2b0
-
- /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
- .octa 0x000000005970e9b00000000122cec508
-
- /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
- .octa 0x0000000185b2b7d0000000000a330cda
-
- /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
- .octa 0x00000001dcee0efc000000014a47148c
-
- /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
- .octa 0x0000000030da27220000000042c61cb8
-
- /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
- .octa 0x000000012f925a180000000012fe6960
-
- /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
- .octa 0x00000000dd2e357c00000000dbda2c20
-
- /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
- .octa 0x00000000071c80de000000011122410c
-
- /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
- .octa 0x000000011513140a00000000977b2070
-
- /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
- .octa 0x00000001df876e8e000000014050438e
-
- /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
- .octa 0x000000015f81d6ce0000000147c840e8
-
- /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
- .octa 0x000000019dd94dbe00000001cc7c88ce
-
- /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
- .octa 0x00000001373d206e00000001476b35a4
-
- /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
- .octa 0x00000000668ccade000000013d52d508
-
- /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
- .octa 0x00000001b192d268000000008e4be32e
-
- /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
- .octa 0x00000000e30f3a7800000000024120fe
-
- /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
- .octa 0x000000010ef1f7bc00000000ddecddb4
-
- /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
- .octa 0x00000001f5ac738000000000d4d403bc
-
- /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
- .octa 0x000000011822ea7000000001734b89aa
-
- /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
- .octa 0x00000000c3a33848000000010e7a58d6
-
- /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
- .octa 0x00000001bd151c2400000001f9f04e9c
-
- /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
- .octa 0x0000000056002d7600000000b692225e
-
- /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
- .octa 0x000000014657c4f4000000019b8d3f3e
-
- /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
- .octa 0x0000000113742d7c00000001a874f11e
-
- /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
- .octa 0x000000019c5920ba000000010d5a4254
-
- /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
- .octa 0x000000005216d2d600000000bbb2f5d6
-
- /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
- .octa 0x0000000136f5ad8a0000000179cc0e36
-
- /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
- .octa 0x000000018b07beb600000001dca1da4a
-
- /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
- .octa 0x00000000db1e93b000000000feb1a192
-
- /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
- .octa 0x000000000b96fa3a00000000d1eeedd6
-
- /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
- .octa 0x00000001d9968af0000000008fad9bb4
-
- /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
- .octa 0x000000000e4a77a200000001884938e4
-
- /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
- .octa 0x00000000508c2ac800000001bc2e9bc0
-
- /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
- .octa 0x0000000021572a8000000001f9658a68
-
- /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
- .octa 0x00000001b859daf2000000001b9224fc
-
- /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
- .octa 0x000000016f7884740000000055b2fb84
-
- /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
- .octa 0x00000001b438810e000000018b090348
-
- /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
- .octa 0x0000000095ddc6f2000000011ccbd5ea
-
- /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
- .octa 0x00000001d977c20c0000000007ae47f8
-
- /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
- .octa 0x00000000ebedb99a0000000172acbec0
-
- /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
- .octa 0x00000001df9e9e9200000001c6e3ff20
-
- /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
- .octa 0x00000001a4a3f95200000000e1b38744
-
- /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
- .octa 0x00000000e2f5122000000000791585b2
-
- /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
- .octa 0x000000004aa01f3e00000000ac53b894
-
- /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
- .octa 0x00000000b3e90a5800000001ed5f2cf4
-
- /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
- .octa 0x000000000c9ca2aa00000001df48b2e0
-
- /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
- .octa 0x000000015168231600000000049c1c62
-
- /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
- .octa 0x0000000036fce78c000000017c460c12
-
- /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
- .octa 0x000000009037dc10000000015be4da7e
-
- /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
- .octa 0x00000000d3298582000000010f38f668
-
- /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
- .octa 0x00000001b42e8ad60000000039f40a00
-
- /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
- .octa 0x00000000142a983800000000bd4c10c4
-
- /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
- .octa 0x0000000109c7f1900000000042db1d98
-
- /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
- .octa 0x0000000056ff931000000001c905bae6
-
- /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
- .octa 0x00000001594513aa00000000069d40ea
-
- /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
- .octa 0x00000001e3b5b1e8000000008e4fbad0
-
- /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
- .octa 0x000000011dd5fc080000000047bedd46
-
- /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
- .octa 0x00000001675f0cc20000000026396bf8
-
- /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
- .octa 0x00000000d1c8dd4400000000379beb92
-
- /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
- .octa 0x0000000115ebd3d8000000000abae54a
-
- /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
- .octa 0x00000001ecbd0dac0000000007e6a128
-
- /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
- .octa 0x00000000cdf67af2000000000ade29d2
-
- /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
- .octa 0x000000004c01ff4c00000000f974c45c
-
- /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
- .octa 0x00000000f2d8657e00000000e77ac60a
-
- /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
- .octa 0x000000006bae74c40000000145895816
-
- /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
- .octa 0x0000000152af8aa00000000038e362be
-
- /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
- .octa 0x0000000004663802000000007f991a64
-
- /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
- .octa 0x00000001ab2f5afc00000000fa366d3a
-
- /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
- .octa 0x0000000074a4ebd400000001a2bb34f0
-
- /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
- .octa 0x00000001d7ab3a4c0000000028a9981e
-
- /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
- .octa 0x00000001a8da60c600000001dbc672be
-
- /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
- .octa 0x000000013cf6382000000000b04d77f6
-
- /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
- .octa 0x00000000bec12e1e0000000124400d96
-
- /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
- .octa 0x00000001c6368010000000014ca4b414
-
- /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
- .octa 0x00000001e6e78758000000012fe2c938
-
- /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
- .octa 0x000000008d7f2b3c00000001faed01e6
-
- /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
- .octa 0x000000016b4a156e000000007e80ecfe
-
- /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
- .octa 0x00000001c63cfeb60000000098daee94
-
- /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
- .octa 0x000000015f902670000000010a04edea
-
- /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
- .octa 0x00000001cd5de11e00000001c00b4524
-
- /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
- .octa 0x000000001acaec540000000170296550
-
- /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
- .octa 0x000000002bd0ca780000000181afaa48
-
- /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
- .octa 0x0000000032d63d5c0000000185a31ffa
-
- /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
- .octa 0x000000001c6d4e4c000000002469f608
-
- /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
- .octa 0x0000000106a60b92000000006980102a
-
- /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
- .octa 0x00000000d3855e120000000111ea9ca8
-
- /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
- .octa 0x00000000e312563600000001bd1d29ce
-
- /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
- .octa 0x000000009e8f7ea400000001b34b9580
-
- /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
- .octa 0x00000001c82e562c000000003076054e
-
- /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
- .octa 0x00000000ca9f09ce000000012a608ea4
-
- /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
- .octa 0x00000000c63764e600000000784d05fe
-
- /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
- .octa 0x0000000168d2e49e000000016ef0d82a
-
- /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
- .octa 0x00000000e986c1480000000075bda454
-
- /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
- .octa 0x00000000cfb65894000000003dc0a1c4
-
- /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
- .octa 0x0000000111cadee400000000e9a5d8be
-
- /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
- .octa 0x0000000171fb63ce00000001609bc4b4
-
-.short_constants:
-
- /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
- /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */
- .octa 0x7fec2963e5bf80485cf015c388e56f72
-
- /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */
- .octa 0x38e888d4844752a9963a18920246e2e6
-
- /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */
- .octa 0x42316c00730206ad419a441956993a31
-
- /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */
- .octa 0x543d5c543e65ddf9924752ba2b830011
-
- /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */
- .octa 0x78e87aaf56767c9255bd7f9518e4a304
-
- /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
- .octa 0x8f68fcec1903da7f6d76739fe0553f1e
-
- /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
- .octa 0x3f4840246791d588c133722b1fe0b5c3
-
- /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
- .octa 0x34c96751b04de25a64b67ee0e55ef1f3
-
- /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
- .octa 0x156c8e180b4a395b069db049b8fdb1e7
-
- /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
- .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
-
- /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
- .octa 0x041d37768cd75659817cdc5119b29a35
-
- /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
- .octa 0x3a0777818cfaa9651ce9d94b36c41f1c
-
- /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
- .octa 0x0e148e8252377a554f256efcb82be955
-
- /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
- .octa 0x9c25531d19e65ddeec1631edb2dea967
-
- /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
- .octa 0x790606ff9957c0a65d27e147510ac59a
-
- /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
- .octa 0x82f63b786ea2d55ca66805eb18b8ea18
-
-
-.barrett_constants:
- /* 33 bit reflected Barrett constant m - (4^32)/n */
- .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */
- /* 33 bit reflected Barrett constant n */
- .octa 0x00000000000000000000000105ec76f1
-
-#define CRC_FUNCTION_NAME __crc32c_vpmsum
-#define REFLECT
-#include "crc-vpmsum-template.S"
diff --git a/arch/powerpc/lib/crct10dif-vpmsum_asm.S b/arch/powerpc/lib/crct10dif-vpmsum_asm.S
deleted file mode 100644
index 47a6266d89a8..000000000000
--- a/arch/powerpc/lib/crct10dif-vpmsum_asm.S
+++ /dev/null
@@ -1,845 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Calculate a CRC T10DIF with vpmsum acceleration
- *
- * Constants generated by crc32-vpmsum, available at
- * https://github.com/antonblanchard/crc32-vpmsum
- *
- * crc32-vpmsum is
- * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
- */
- .section .rodata
-.balign 16
-
-.byteswap_constant:
- /* byte reverse permute constant */
- .octa 0x0F0E0D0C0B0A09080706050403020100
-
-.constants:
-
- /* Reduce 262144 kbits to 1024 bits */
- /* x^261184 mod p(x), x^261120 mod p(x) */
- .octa 0x0000000056d300000000000052550000
-
- /* x^260160 mod p(x), x^260096 mod p(x) */
- .octa 0x00000000ee67000000000000a1e40000
-
- /* x^259136 mod p(x), x^259072 mod p(x) */
- .octa 0x0000000060830000000000004ad10000
-
- /* x^258112 mod p(x), x^258048 mod p(x) */
- .octa 0x000000008cfe0000000000009ab40000
-
- /* x^257088 mod p(x), x^257024 mod p(x) */
- .octa 0x000000003e93000000000000fdb50000
-
- /* x^256064 mod p(x), x^256000 mod p(x) */
- .octa 0x000000003c2000000000000045480000
-
- /* x^255040 mod p(x), x^254976 mod p(x) */
- .octa 0x00000000b1fc0000000000008d690000
-
- /* x^254016 mod p(x), x^253952 mod p(x) */
- .octa 0x00000000f82b00000000000024ad0000
-
- /* x^252992 mod p(x), x^252928 mod p(x) */
- .octa 0x0000000044420000000000009f1a0000
-
- /* x^251968 mod p(x), x^251904 mod p(x) */
- .octa 0x00000000e88c00000000000066ec0000
-
- /* x^250944 mod p(x), x^250880 mod p(x) */
- .octa 0x00000000385c000000000000c87d0000
-
- /* x^249920 mod p(x), x^249856 mod p(x) */
- .octa 0x000000003227000000000000c8ff0000
-
- /* x^248896 mod p(x), x^248832 mod p(x) */
- .octa 0x00000000a9a900000000000033440000
-
- /* x^247872 mod p(x), x^247808 mod p(x) */
- .octa 0x00000000abaa00000000000066eb0000
-
- /* x^246848 mod p(x), x^246784 mod p(x) */
- .octa 0x000000001ac3000000000000c4ef0000
-
- /* x^245824 mod p(x), x^245760 mod p(x) */
- .octa 0x0000000063f000000000000056f30000
-
- /* x^244800 mod p(x), x^244736 mod p(x) */
- .octa 0x0000000032cc00000000000002050000
-
- /* x^243776 mod p(x), x^243712 mod p(x) */
- .octa 0x00000000f8b5000000000000568e0000
-
- /* x^242752 mod p(x), x^242688 mod p(x) */
- .octa 0x000000008db100000000000064290000
-
- /* x^241728 mod p(x), x^241664 mod p(x) */
- .octa 0x0000000059ca0000000000006b660000
-
- /* x^240704 mod p(x), x^240640 mod p(x) */
- .octa 0x000000005f5c00000000000018f80000
-
- /* x^239680 mod p(x), x^239616 mod p(x) */
- .octa 0x0000000061af000000000000b6090000
-
- /* x^238656 mod p(x), x^238592 mod p(x) */
- .octa 0x00000000e29e000000000000099a0000
-
- /* x^237632 mod p(x), x^237568 mod p(x) */
- .octa 0x000000000975000000000000a8360000
-
- /* x^236608 mod p(x), x^236544 mod p(x) */
- .octa 0x0000000043900000000000004f570000
-
- /* x^235584 mod p(x), x^235520 mod p(x) */
- .octa 0x00000000f9cd000000000000134c0000
-
- /* x^234560 mod p(x), x^234496 mod p(x) */
- .octa 0x000000007c29000000000000ec380000
-
- /* x^233536 mod p(x), x^233472 mod p(x) */
- .octa 0x000000004c6a000000000000b0d10000
-
- /* x^232512 mod p(x), x^232448 mod p(x) */
- .octa 0x00000000e7290000000000007d3e0000
-
- /* x^231488 mod p(x), x^231424 mod p(x) */
- .octa 0x00000000f1ab000000000000f0b20000
-
- /* x^230464 mod p(x), x^230400 mod p(x) */
- .octa 0x0000000039db0000000000009c270000
-
- /* x^229440 mod p(x), x^229376 mod p(x) */
- .octa 0x000000005e2800000000000092890000
-
- /* x^228416 mod p(x), x^228352 mod p(x) */
- .octa 0x00000000d44e000000000000d5ee0000
-
- /* x^227392 mod p(x), x^227328 mod p(x) */
- .octa 0x00000000cd0a00000000000041f50000
-
- /* x^226368 mod p(x), x^226304 mod p(x) */
- .octa 0x00000000c5b400000000000010520000
-
- /* x^225344 mod p(x), x^225280 mod p(x) */
- .octa 0x00000000fd2100000000000042170000
-
- /* x^224320 mod p(x), x^224256 mod p(x) */
- .octa 0x000000002f2500000000000095c20000
-
- /* x^223296 mod p(x), x^223232 mod p(x) */
- .octa 0x000000001b0100000000000001ce0000
-
- /* x^222272 mod p(x), x^222208 mod p(x) */
- .octa 0x000000000d430000000000002aca0000
-
- /* x^221248 mod p(x), x^221184 mod p(x) */
- .octa 0x0000000030a6000000000000385e0000
-
- /* x^220224 mod p(x), x^220160 mod p(x) */
- .octa 0x00000000e37b0000000000006f7a0000
-
- /* x^219200 mod p(x), x^219136 mod p(x) */
- .octa 0x00000000873600000000000024320000
-
- /* x^218176 mod p(x), x^218112 mod p(x) */
- .octa 0x00000000e9fb000000000000bd9c0000
-
- /* x^217152 mod p(x), x^217088 mod p(x) */
- .octa 0x000000003b9500000000000054bc0000
-
- /* x^216128 mod p(x), x^216064 mod p(x) */
- .octa 0x00000000133e000000000000a4660000
-
- /* x^215104 mod p(x), x^215040 mod p(x) */
- .octa 0x00000000784500000000000079930000
-
- /* x^214080 mod p(x), x^214016 mod p(x) */
- .octa 0x00000000b9800000000000001bb80000
-
- /* x^213056 mod p(x), x^212992 mod p(x) */
- .octa 0x00000000687600000000000024400000
-
- /* x^212032 mod p(x), x^211968 mod p(x) */
- .octa 0x00000000aff300000000000029e10000
-
- /* x^211008 mod p(x), x^210944 mod p(x) */
- .octa 0x0000000024b50000000000005ded0000
-
- /* x^209984 mod p(x), x^209920 mod p(x) */
- .octa 0x0000000017e8000000000000b12e0000
-
- /* x^208960 mod p(x), x^208896 mod p(x) */
- .octa 0x00000000128400000000000026d20000
-
- /* x^207936 mod p(x), x^207872 mod p(x) */
- .octa 0x000000002115000000000000a32a0000
-
- /* x^206912 mod p(x), x^206848 mod p(x) */
- .octa 0x000000009595000000000000a1210000
-
- /* x^205888 mod p(x), x^205824 mod p(x) */
- .octa 0x00000000281e000000000000ee8b0000
-
- /* x^204864 mod p(x), x^204800 mod p(x) */
- .octa 0x0000000006010000000000003d0d0000
-
- /* x^203840 mod p(x), x^203776 mod p(x) */
- .octa 0x00000000e2b600000000000034e90000
-
- /* x^202816 mod p(x), x^202752 mod p(x) */
- .octa 0x000000001bd40000000000004cdb0000
-
- /* x^201792 mod p(x), x^201728 mod p(x) */
- .octa 0x00000000df2800000000000030e90000
-
- /* x^200768 mod p(x), x^200704 mod p(x) */
- .octa 0x0000000049c200000000000042590000
-
- /* x^199744 mod p(x), x^199680 mod p(x) */
- .octa 0x000000009b97000000000000df950000
-
- /* x^198720 mod p(x), x^198656 mod p(x) */
- .octa 0x000000006184000000000000da7b0000
-
- /* x^197696 mod p(x), x^197632 mod p(x) */
- .octa 0x00000000461700000000000012510000
-
- /* x^196672 mod p(x), x^196608 mod p(x) */
- .octa 0x000000009b40000000000000f37e0000
-
- /* x^195648 mod p(x), x^195584 mod p(x) */
- .octa 0x00000000eeb2000000000000ecf10000
-
- /* x^194624 mod p(x), x^194560 mod p(x) */
- .octa 0x00000000b2e800000000000050f20000
-
- /* x^193600 mod p(x), x^193536 mod p(x) */
- .octa 0x00000000f59a000000000000e0b30000
-
- /* x^192576 mod p(x), x^192512 mod p(x) */
- .octa 0x00000000467f0000000000004d5a0000
-
- /* x^191552 mod p(x), x^191488 mod p(x) */
- .octa 0x00000000da92000000000000bb010000
-
- /* x^190528 mod p(x), x^190464 mod p(x) */
- .octa 0x000000001e1000000000000022a40000
-
- /* x^189504 mod p(x), x^189440 mod p(x) */
- .octa 0x0000000058fe000000000000836f0000
-
- /* x^188480 mod p(x), x^188416 mod p(x) */
- .octa 0x00000000b9ce000000000000d78d0000
-
- /* x^187456 mod p(x), x^187392 mod p(x) */
- .octa 0x0000000022210000000000004f8d0000
-
- /* x^186432 mod p(x), x^186368 mod p(x) */
- .octa 0x00000000744600000000000033760000
-
- /* x^185408 mod p(x), x^185344 mod p(x) */
- .octa 0x000000001c2e000000000000a1e50000
-
- /* x^184384 mod p(x), x^184320 mod p(x) */
- .octa 0x00000000dcc8000000000000a1a40000
-
- /* x^183360 mod p(x), x^183296 mod p(x) */
- .octa 0x00000000910f00000000000019a20000
-
- /* x^182336 mod p(x), x^182272 mod p(x) */
- .octa 0x0000000055d5000000000000f6ae0000
-
- /* x^181312 mod p(x), x^181248 mod p(x) */
- .octa 0x00000000c8ba000000000000a7ac0000
-
- /* x^180288 mod p(x), x^180224 mod p(x) */
- .octa 0x0000000031f8000000000000eea20000
-
- /* x^179264 mod p(x), x^179200 mod p(x) */
- .octa 0x000000001966000000000000c4d90000
-
- /* x^178240 mod p(x), x^178176 mod p(x) */
- .octa 0x00000000b9810000000000002b470000
-
- /* x^177216 mod p(x), x^177152 mod p(x) */
- .octa 0x000000008303000000000000f7cf0000
-
- /* x^176192 mod p(x), x^176128 mod p(x) */
- .octa 0x000000002ce500000000000035b30000
-
- /* x^175168 mod p(x), x^175104 mod p(x) */
- .octa 0x000000002fae0000000000000c7c0000
-
- /* x^174144 mod p(x), x^174080 mod p(x) */
- .octa 0x00000000f50c0000000000009edf0000
-
- /* x^173120 mod p(x), x^173056 mod p(x) */
- .octa 0x00000000714f00000000000004cd0000
-
- /* x^172096 mod p(x), x^172032 mod p(x) */
- .octa 0x00000000c161000000000000541b0000
-
- /* x^171072 mod p(x), x^171008 mod p(x) */
- .octa 0x0000000021c8000000000000e2700000
-
- /* x^170048 mod p(x), x^169984 mod p(x) */
- .octa 0x00000000b93d00000000000009a60000
-
- /* x^169024 mod p(x), x^168960 mod p(x) */
- .octa 0x00000000fbcf000000000000761c0000
-
- /* x^168000 mod p(x), x^167936 mod p(x) */
- .octa 0x0000000026350000000000009db30000
-
- /* x^166976 mod p(x), x^166912 mod p(x) */
- .octa 0x00000000b64f0000000000003e9f0000
-
- /* x^165952 mod p(x), x^165888 mod p(x) */
- .octa 0x00000000bd0e00000000000078590000
-
- /* x^164928 mod p(x), x^164864 mod p(x) */
- .octa 0x00000000d9360000000000008bc80000
-
- /* x^163904 mod p(x), x^163840 mod p(x) */
- .octa 0x000000002f140000000000008c9f0000
-
- /* x^162880 mod p(x), x^162816 mod p(x) */
- .octa 0x000000006a270000000000006af70000
-
- /* x^161856 mod p(x), x^161792 mod p(x) */
- .octa 0x000000006685000000000000e5210000
-
- /* x^160832 mod p(x), x^160768 mod p(x) */
- .octa 0x0000000062da00000000000008290000
-
- /* x^159808 mod p(x), x^159744 mod p(x) */
- .octa 0x00000000bb4b000000000000e4d00000
-
- /* x^158784 mod p(x), x^158720 mod p(x) */
- .octa 0x00000000d2490000000000004ae10000
-
- /* x^157760 mod p(x), x^157696 mod p(x) */
- .octa 0x00000000c85b00000000000000e70000
-
- /* x^156736 mod p(x), x^156672 mod p(x) */
- .octa 0x00000000c37a00000000000015650000
-
- /* x^155712 mod p(x), x^155648 mod p(x) */
- .octa 0x0000000018530000000000001c2f0000
-
- /* x^154688 mod p(x), x^154624 mod p(x) */
- .octa 0x00000000b46600000000000037bd0000
-
- /* x^153664 mod p(x), x^153600 mod p(x) */
- .octa 0x00000000439b00000000000012190000
-
- /* x^152640 mod p(x), x^152576 mod p(x) */
- .octa 0x00000000b1260000000000005ece0000
-
- /* x^151616 mod p(x), x^151552 mod p(x) */
- .octa 0x00000000d8110000000000002a5e0000
-
- /* x^150592 mod p(x), x^150528 mod p(x) */
- .octa 0x00000000099f00000000000052330000
-
- /* x^149568 mod p(x), x^149504 mod p(x) */
- .octa 0x00000000f9f9000000000000f9120000
-
- /* x^148544 mod p(x), x^148480 mod p(x) */
- .octa 0x000000005cc00000000000000ddc0000
-
- /* x^147520 mod p(x), x^147456 mod p(x) */
- .octa 0x00000000343b00000000000012200000
-
- /* x^146496 mod p(x), x^146432 mod p(x) */
- .octa 0x000000009222000000000000d12b0000
-
- /* x^145472 mod p(x), x^145408 mod p(x) */
- .octa 0x00000000d781000000000000eb2d0000
-
- /* x^144448 mod p(x), x^144384 mod p(x) */
- .octa 0x000000000bf400000000000058970000
-
- /* x^143424 mod p(x), x^143360 mod p(x) */
- .octa 0x00000000094200000000000013690000
-
- /* x^142400 mod p(x), x^142336 mod p(x) */
- .octa 0x00000000d55100000000000051950000
-
- /* x^141376 mod p(x), x^141312 mod p(x) */
- .octa 0x000000008f11000000000000954b0000
-
- /* x^140352 mod p(x), x^140288 mod p(x) */
- .octa 0x00000000140f000000000000b29e0000
-
- /* x^139328 mod p(x), x^139264 mod p(x) */
- .octa 0x00000000c6db000000000000db5d0000
-
- /* x^138304 mod p(x), x^138240 mod p(x) */
- .octa 0x00000000715b000000000000dfaf0000
-
- /* x^137280 mod p(x), x^137216 mod p(x) */
- .octa 0x000000000dea000000000000e3b60000
-
- /* x^136256 mod p(x), x^136192 mod p(x) */
- .octa 0x000000006f94000000000000ddaf0000
-
- /* x^135232 mod p(x), x^135168 mod p(x) */
- .octa 0x0000000024e1000000000000e4f70000
-
- /* x^134208 mod p(x), x^134144 mod p(x) */
- .octa 0x000000008810000000000000aa110000
-
- /* x^133184 mod p(x), x^133120 mod p(x) */
- .octa 0x0000000030c2000000000000a8e60000
-
- /* x^132160 mod p(x), x^132096 mod p(x) */
- .octa 0x00000000e6d0000000000000ccf30000
-
- /* x^131136 mod p(x), x^131072 mod p(x) */
- .octa 0x000000004da000000000000079bf0000
-
- /* x^130112 mod p(x), x^130048 mod p(x) */
- .octa 0x000000007759000000000000b3a30000
-
- /* x^129088 mod p(x), x^129024 mod p(x) */
- .octa 0x00000000597400000000000028790000
-
- /* x^128064 mod p(x), x^128000 mod p(x) */
- .octa 0x000000007acd000000000000b5820000
-
- /* x^127040 mod p(x), x^126976 mod p(x) */
- .octa 0x00000000e6e400000000000026ad0000
-
- /* x^126016 mod p(x), x^125952 mod p(x) */
- .octa 0x000000006d49000000000000985b0000
-
- /* x^124992 mod p(x), x^124928 mod p(x) */
- .octa 0x000000000f0800000000000011520000
-
- /* x^123968 mod p(x), x^123904 mod p(x) */
- .octa 0x000000002c7f000000000000846c0000
-
- /* x^122944 mod p(x), x^122880 mod p(x) */
- .octa 0x000000005ce7000000000000ae1d0000
-
- /* x^121920 mod p(x), x^121856 mod p(x) */
- .octa 0x00000000d4cb000000000000e21d0000
-
- /* x^120896 mod p(x), x^120832 mod p(x) */
- .octa 0x000000003a2300000000000019bb0000
-
- /* x^119872 mod p(x), x^119808 mod p(x) */
- .octa 0x000000000e1700000000000095290000
-
- /* x^118848 mod p(x), x^118784 mod p(x) */
- .octa 0x000000006e6400000000000050d20000
-
- /* x^117824 mod p(x), x^117760 mod p(x) */
- .octa 0x000000008d5c0000000000000cd10000
-
- /* x^116800 mod p(x), x^116736 mod p(x) */
- .octa 0x00000000ef310000000000007b570000
-
- /* x^115776 mod p(x), x^115712 mod p(x) */
- .octa 0x00000000645d00000000000053d60000
-
- /* x^114752 mod p(x), x^114688 mod p(x) */
- .octa 0x0000000018fc00000000000077510000
-
- /* x^113728 mod p(x), x^113664 mod p(x) */
- .octa 0x000000000cb3000000000000a7b70000
-
- /* x^112704 mod p(x), x^112640 mod p(x) */
- .octa 0x00000000991b000000000000d0780000
-
- /* x^111680 mod p(x), x^111616 mod p(x) */
- .octa 0x00000000845a000000000000be3c0000
-
- /* x^110656 mod p(x), x^110592 mod p(x) */
- .octa 0x00000000d3a9000000000000df020000
-
- /* x^109632 mod p(x), x^109568 mod p(x) */
- .octa 0x0000000017d7000000000000063e0000
-
- /* x^108608 mod p(x), x^108544 mod p(x) */
- .octa 0x000000007a860000000000008ab40000
-
- /* x^107584 mod p(x), x^107520 mod p(x) */
- .octa 0x00000000fd7c000000000000c7bd0000
-
- /* x^106560 mod p(x), x^106496 mod p(x) */
- .octa 0x00000000a56b000000000000efd60000
-
- /* x^105536 mod p(x), x^105472 mod p(x) */
- .octa 0x0000000010e400000000000071380000
-
- /* x^104512 mod p(x), x^104448 mod p(x) */
- .octa 0x00000000994500000000000004d30000
-
- /* x^103488 mod p(x), x^103424 mod p(x) */
- .octa 0x00000000b83c0000000000003b0e0000
-
- /* x^102464 mod p(x), x^102400 mod p(x) */
- .octa 0x00000000d6c10000000000008b020000
-
- /* x^101440 mod p(x), x^101376 mod p(x) */
- .octa 0x000000009efc000000000000da940000
-
- /* x^100416 mod p(x), x^100352 mod p(x) */
- .octa 0x000000005e87000000000000f9f70000
-
- /* x^99392 mod p(x), x^99328 mod p(x) */
- .octa 0x000000006c9b00000000000045e40000
-
- /* x^98368 mod p(x), x^98304 mod p(x) */
- .octa 0x00000000178a00000000000083940000
-
- /* x^97344 mod p(x), x^97280 mod p(x) */
- .octa 0x00000000f0c8000000000000f0a00000
-
- /* x^96320 mod p(x), x^96256 mod p(x) */
- .octa 0x00000000f699000000000000b74b0000
-
- /* x^95296 mod p(x), x^95232 mod p(x) */
- .octa 0x00000000316d000000000000c1cf0000
-
- /* x^94272 mod p(x), x^94208 mod p(x) */
- .octa 0x00000000987e00000000000072680000
-
- /* x^93248 mod p(x), x^93184 mod p(x) */
- .octa 0x00000000acff000000000000e0ab0000
-
- /* x^92224 mod p(x), x^92160 mod p(x) */
- .octa 0x00000000a1f6000000000000c5a80000
-
- /* x^91200 mod p(x), x^91136 mod p(x) */
- .octa 0x0000000061bd000000000000cf690000
-
- /* x^90176 mod p(x), x^90112 mod p(x) */
- .octa 0x00000000c9f2000000000000cbcc0000
-
- /* x^89152 mod p(x), x^89088 mod p(x) */
- .octa 0x000000005a33000000000000de050000
-
- /* x^88128 mod p(x), x^88064 mod p(x) */
- .octa 0x00000000e416000000000000ccd70000
-
- /* x^87104 mod p(x), x^87040 mod p(x) */
- .octa 0x0000000058930000000000002f670000
-
- /* x^86080 mod p(x), x^86016 mod p(x) */
- .octa 0x00000000a9d3000000000000152f0000
-
- /* x^85056 mod p(x), x^84992 mod p(x) */
- .octa 0x00000000c114000000000000ecc20000
-
- /* x^84032 mod p(x), x^83968 mod p(x) */
- .octa 0x00000000b9270000000000007c890000
-
- /* x^83008 mod p(x), x^82944 mod p(x) */
- .octa 0x000000002e6000000000000006ee0000
-
- /* x^81984 mod p(x), x^81920 mod p(x) */
- .octa 0x00000000dfc600000000000009100000
-
- /* x^80960 mod p(x), x^80896 mod p(x) */
- .octa 0x000000004911000000000000ad4e0000
-
- /* x^79936 mod p(x), x^79872 mod p(x) */
- .octa 0x00000000ae1b000000000000b04d0000
-
- /* x^78912 mod p(x), x^78848 mod p(x) */
- .octa 0x0000000005fa000000000000e9900000
-
- /* x^77888 mod p(x), x^77824 mod p(x) */
- .octa 0x0000000004a1000000000000cc6f0000
-
- /* x^76864 mod p(x), x^76800 mod p(x) */
- .octa 0x00000000af73000000000000ed110000
-
- /* x^75840 mod p(x), x^75776 mod p(x) */
- .octa 0x0000000082530000000000008f7e0000
-
- /* x^74816 mod p(x), x^74752 mod p(x) */
- .octa 0x00000000cfdc000000000000594f0000
-
- /* x^73792 mod p(x), x^73728 mod p(x) */
- .octa 0x00000000a6b6000000000000a8750000
-
- /* x^72768 mod p(x), x^72704 mod p(x) */
- .octa 0x00000000fd76000000000000aa0c0000
-
- /* x^71744 mod p(x), x^71680 mod p(x) */
- .octa 0x0000000006f500000000000071db0000
-
- /* x^70720 mod p(x), x^70656 mod p(x) */
- .octa 0x0000000037ca000000000000ab0c0000
-
- /* x^69696 mod p(x), x^69632 mod p(x) */
- .octa 0x00000000d7ab000000000000b7a00000
-
- /* x^68672 mod p(x), x^68608 mod p(x) */
- .octa 0x00000000440800000000000090d30000
-
- /* x^67648 mod p(x), x^67584 mod p(x) */
- .octa 0x00000000186100000000000054730000
-
- /* x^66624 mod p(x), x^66560 mod p(x) */
- .octa 0x000000007368000000000000a3a20000
-
- /* x^65600 mod p(x), x^65536 mod p(x) */
- .octa 0x0000000026d0000000000000f9040000
-
- /* x^64576 mod p(x), x^64512 mod p(x) */
- .octa 0x00000000fe770000000000009c0a0000
-
- /* x^63552 mod p(x), x^63488 mod p(x) */
- .octa 0x000000002cba000000000000d1e70000
-
- /* x^62528 mod p(x), x^62464 mod p(x) */
- .octa 0x00000000f8bd0000000000005ac10000
-
- /* x^61504 mod p(x), x^61440 mod p(x) */
- .octa 0x000000007372000000000000d68d0000
-
- /* x^60480 mod p(x), x^60416 mod p(x) */
- .octa 0x00000000f37f00000000000089f60000
-
- /* x^59456 mod p(x), x^59392 mod p(x) */
- .octa 0x00000000078400000000000008a90000
-
- /* x^58432 mod p(x), x^58368 mod p(x) */
- .octa 0x00000000d3e400000000000042360000
-
- /* x^57408 mod p(x), x^57344 mod p(x) */
- .octa 0x00000000eba800000000000092d50000
-
- /* x^56384 mod p(x), x^56320 mod p(x) */
- .octa 0x00000000afbe000000000000b4d50000
-
- /* x^55360 mod p(x), x^55296 mod p(x) */
- .octa 0x00000000d8ca000000000000c9060000
-
- /* x^54336 mod p(x), x^54272 mod p(x) */
- .octa 0x00000000c2d00000000000008f4f0000
-
- /* x^53312 mod p(x), x^53248 mod p(x) */
- .octa 0x00000000373200000000000028690000
-
- /* x^52288 mod p(x), x^52224 mod p(x) */
- .octa 0x0000000046ae000000000000c3b30000
-
- /* x^51264 mod p(x), x^51200 mod p(x) */
- .octa 0x00000000b243000000000000f8700000
-
- /* x^50240 mod p(x), x^50176 mod p(x) */
- .octa 0x00000000f7f500000000000029eb0000
-
- /* x^49216 mod p(x), x^49152 mod p(x) */
- .octa 0x000000000c7e000000000000fe730000
-
- /* x^48192 mod p(x), x^48128 mod p(x) */
- .octa 0x00000000c38200000000000096000000
-
- /* x^47168 mod p(x), x^47104 mod p(x) */
- .octa 0x000000008956000000000000683c0000
-
- /* x^46144 mod p(x), x^46080 mod p(x) */
- .octa 0x00000000422d0000000000005f1e0000
-
- /* x^45120 mod p(x), x^45056 mod p(x) */
- .octa 0x00000000ac0f0000000000006f810000
-
- /* x^44096 mod p(x), x^44032 mod p(x) */
- .octa 0x00000000ce30000000000000031f0000
-
- /* x^43072 mod p(x), x^43008 mod p(x) */
- .octa 0x000000003d43000000000000455a0000
-
- /* x^42048 mod p(x), x^41984 mod p(x) */
- .octa 0x000000007ebe000000000000a6050000
-
- /* x^41024 mod p(x), x^40960 mod p(x) */
- .octa 0x00000000976e00000000000077eb0000
-
- /* x^40000 mod p(x), x^39936 mod p(x) */
- .octa 0x000000000872000000000000389c0000
-
- /* x^38976 mod p(x), x^38912 mod p(x) */
- .octa 0x000000008979000000000000c7b20000
-
- /* x^37952 mod p(x), x^37888 mod p(x) */
- .octa 0x000000005c1e0000000000001d870000
-
- /* x^36928 mod p(x), x^36864 mod p(x) */
- .octa 0x00000000aebb00000000000045810000
-
- /* x^35904 mod p(x), x^35840 mod p(x) */
- .octa 0x000000004f7e0000000000006d4a0000
-
- /* x^34880 mod p(x), x^34816 mod p(x) */
- .octa 0x00000000ea98000000000000b9200000
-
- /* x^33856 mod p(x), x^33792 mod p(x) */
- .octa 0x00000000f39600000000000022f20000
-
- /* x^32832 mod p(x), x^32768 mod p(x) */
- .octa 0x000000000bc500000000000041ca0000
-
- /* x^31808 mod p(x), x^31744 mod p(x) */
- .octa 0x00000000786400000000000078500000
-
- /* x^30784 mod p(x), x^30720 mod p(x) */
- .octa 0x00000000be970000000000009e7e0000
-
- /* x^29760 mod p(x), x^29696 mod p(x) */
- .octa 0x00000000dd6d000000000000a53c0000
-
- /* x^28736 mod p(x), x^28672 mod p(x) */
- .octa 0x000000004c3f00000000000039340000
-
- /* x^27712 mod p(x), x^27648 mod p(x) */
- .octa 0x0000000093a4000000000000b58e0000
-
- /* x^26688 mod p(x), x^26624 mod p(x) */
- .octa 0x0000000050fb00000000000062d40000
-
- /* x^25664 mod p(x), x^25600 mod p(x) */
- .octa 0x00000000f505000000000000a26f0000
-
- /* x^24640 mod p(x), x^24576 mod p(x) */
- .octa 0x0000000064f900000000000065e60000
-
- /* x^23616 mod p(x), x^23552 mod p(x) */
- .octa 0x00000000e8c2000000000000aad90000
-
- /* x^22592 mod p(x), x^22528 mod p(x) */
- .octa 0x00000000720b000000000000a3b00000
-
- /* x^21568 mod p(x), x^21504 mod p(x) */
- .octa 0x00000000e992000000000000d2680000
-
- /* x^20544 mod p(x), x^20480 mod p(x) */
- .octa 0x000000009132000000000000cf4c0000
-
- /* x^19520 mod p(x), x^19456 mod p(x) */
- .octa 0x00000000608a00000000000076610000
-
- /* x^18496 mod p(x), x^18432 mod p(x) */
- .octa 0x000000009948000000000000fb9f0000
-
- /* x^17472 mod p(x), x^17408 mod p(x) */
- .octa 0x00000000173000000000000003770000
-
- /* x^16448 mod p(x), x^16384 mod p(x) */
- .octa 0x000000006fe300000000000004880000
-
- /* x^15424 mod p(x), x^15360 mod p(x) */
- .octa 0x00000000e15300000000000056a70000
-
- /* x^14400 mod p(x), x^14336 mod p(x) */
- .octa 0x0000000092d60000000000009dfd0000
-
- /* x^13376 mod p(x), x^13312 mod p(x) */
- .octa 0x0000000002fd00000000000074c80000
-
- /* x^12352 mod p(x), x^12288 mod p(x) */
- .octa 0x00000000c78b000000000000a3ec0000
-
- /* x^11328 mod p(x), x^11264 mod p(x) */
- .octa 0x000000009262000000000000b3530000
-
- /* x^10304 mod p(x), x^10240 mod p(x) */
- .octa 0x0000000084f200000000000047bf0000
-
- /* x^9280 mod p(x), x^9216 mod p(x) */
- .octa 0x0000000067ee000000000000e97c0000
-
- /* x^8256 mod p(x), x^8192 mod p(x) */
- .octa 0x00000000535b00000000000091e10000
-
- /* x^7232 mod p(x), x^7168 mod p(x) */
- .octa 0x000000007ebb00000000000055060000
-
- /* x^6208 mod p(x), x^6144 mod p(x) */
- .octa 0x00000000c6a1000000000000fd360000
-
- /* x^5184 mod p(x), x^5120 mod p(x) */
- .octa 0x000000001be500000000000055860000
-
- /* x^4160 mod p(x), x^4096 mod p(x) */
- .octa 0x00000000ae0e0000000000005bd00000
-
- /* x^3136 mod p(x), x^3072 mod p(x) */
- .octa 0x0000000022040000000000008db20000
-
- /* x^2112 mod p(x), x^2048 mod p(x) */
- .octa 0x00000000c9eb000000000000efe20000
-
- /* x^1088 mod p(x), x^1024 mod p(x) */
- .octa 0x0000000039b400000000000051d10000
-
-.short_constants:
-
- /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
- /* x^2048 mod p(x), x^2016 mod p(x), x^1984 mod p(x), x^1952 mod p(x) */
- .octa 0xefe20000dccf00009440000033590000
-
- /* x^1920 mod p(x), x^1888 mod p(x), x^1856 mod p(x), x^1824 mod p(x) */
- .octa 0xee6300002f3f000062180000e0ed0000
-
- /* x^1792 mod p(x), x^1760 mod p(x), x^1728 mod p(x), x^1696 mod p(x) */
- .octa 0xcf5f000017ef0000ccbe000023d30000
-
- /* x^1664 mod p(x), x^1632 mod p(x), x^1600 mod p(x), x^1568 mod p(x) */
- .octa 0x6d0c0000a30e00000920000042630000
-
- /* x^1536 mod p(x), x^1504 mod p(x), x^1472 mod p(x), x^1440 mod p(x) */
- .octa 0x21d30000932b0000a7a00000efcc0000
-
- /* x^1408 mod p(x), x^1376 mod p(x), x^1344 mod p(x), x^1312 mod p(x) */
- .octa 0x10be00000b310000666f00000d1c0000
-
- /* x^1280 mod p(x), x^1248 mod p(x), x^1216 mod p(x), x^1184 mod p(x) */
- .octa 0x1f240000ce9e0000caad0000589e0000
-
- /* x^1152 mod p(x), x^1120 mod p(x), x^1088 mod p(x), x^1056 mod p(x) */
- .octa 0x29610000d02b000039b400007cf50000
-
- /* x^1024 mod p(x), x^992 mod p(x), x^960 mod p(x), x^928 mod p(x) */
- .octa 0x51d100009d9d00003c0e0000bfd60000
-
- /* x^896 mod p(x), x^864 mod p(x), x^832 mod p(x), x^800 mod p(x) */
- .octa 0xda390000ceae000013830000713c0000
-
- /* x^768 mod p(x), x^736 mod p(x), x^704 mod p(x), x^672 mod p(x) */
- .octa 0xb67800001e16000085c0000080a60000
-
- /* x^640 mod p(x), x^608 mod p(x), x^576 mod p(x), x^544 mod p(x) */
- .octa 0x0db40000f7f90000371d0000e6580000
-
- /* x^512 mod p(x), x^480 mod p(x), x^448 mod p(x), x^416 mod p(x) */
- .octa 0x87e70000044c0000aadb0000a4970000
-
- /* x^384 mod p(x), x^352 mod p(x), x^320 mod p(x), x^288 mod p(x) */
- .octa 0x1f990000ad180000d8b30000e7b50000
-
- /* x^256 mod p(x), x^224 mod p(x), x^192 mod p(x), x^160 mod p(x) */
- .octa 0xbe6c00006ee300004c1a000006df0000
-
- /* x^128 mod p(x), x^96 mod p(x), x^64 mod p(x), x^32 mod p(x) */
- .octa 0xfb0b00002d560000136800008bb70000
-
-
-.barrett_constants:
- /* Barrett constant m - (4^32)/n */
- .octa 0x000000000000000000000001f65a57f8 /* x^64 div p(x) */
- /* Barrett constant n */
- .octa 0x0000000000000000000000018bb70000
-
-#define CRC_FUNCTION_NAME __crct10dif_vpmsum
-#include "crc-vpmsum-template.S"
diff --git a/arch/powerpc/lib/crypto/Kconfig b/arch/powerpc/lib/crypto/Kconfig
deleted file mode 100644
index 3f9e1bbd9905..000000000000
--- a/arch/powerpc/lib/crypto/Kconfig
+++ /dev/null
@@ -1,22 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_CHACHA20_P10
- tristate
- depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
- default CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_P10
- tristate
- depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
- depends on BROKEN # Needs to be fixed to work in softirq context
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
- select CRYPTO_LIB_POLY1305_GENERIC
-
-config CRYPTO_SHA256_PPC_SPE
- tristate
- depends on SPE
- default CRYPTO_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256
diff --git a/arch/powerpc/lib/crypto/Makefile b/arch/powerpc/lib/crypto/Makefile
deleted file mode 100644
index 27f231f8e334..000000000000
--- a/arch/powerpc/lib/crypto/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
-chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
-poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
-
-obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
-sha256-ppc-spe-y := sha256.o sha256-spe-asm.o
diff --git a/arch/powerpc/lib/crypto/chacha-p10-glue.c b/arch/powerpc/lib/crypto/chacha-p10-glue.c
deleted file mode 100644
index fcd23c6f1590..000000000000
--- a/arch/powerpc/lib/crypto/chacha-p10-glue.c
+++ /dev/null
@@ -1,100 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * ChaCha stream cipher (P10 accelerated)
- *
- * Copyright 2023- IBM Corp. All rights reserved.
- */
-
-#include <crypto/chacha.h>
-#include <crypto/internal/simd.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/cpufeature.h>
-#include <linux/sizes.h>
-#include <asm/simd.h>
-#include <asm/switch_to.h>
-
-asmlinkage void chacha_p10le_8x(const struct chacha_state *state, u8 *dst,
- const u8 *src, unsigned int len, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
-
-static void vsx_begin(void)
-{
- preempt_disable();
- enable_kernel_vsx();
-}
-
-static void vsx_end(void)
-{
- disable_kernel_vsx();
- preempt_enable();
-}
-
-static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- unsigned int l = bytes & ~0x0FF;
-
- if (l > 0) {
- chacha_p10le_8x(state, dst, src, l, nrounds);
- bytes -= l;
- src += l;
- dst += l;
- state->x[12] += l / CHACHA_BLOCK_SIZE;
- }
-
- if (bytes > 0)
- chacha_crypt_generic(state, dst, src, bytes, nrounds);
-}
-
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- hchacha_block_generic(state, out, nrounds);
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE ||
- !crypto_simd_usable())
- return chacha_crypt_generic(state, dst, src, bytes, nrounds);
-
- do {
- unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
- vsx_begin();
- chacha_p10_do_8x(state, dst, src, todo, nrounds);
- vsx_end();
-
- bytes -= todo;
- src += todo;
- dst += todo;
- } while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- return static_key_enabled(&have_p10);
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_p10_init(void)
-{
- if (cpu_has_feature(CPU_FTR_ARCH_31))
- static_branch_enable(&have_p10);
- return 0;
-}
-subsys_initcall(chacha_p10_init);
-
-static void __exit chacha_p10_exit(void)
-{
-}
-module_exit(chacha_p10_exit);
-
-MODULE_DESCRIPTION("ChaCha stream cipher (P10 accelerated)");
-MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/powerpc/lib/crypto/chacha-p10le-8x.S b/arch/powerpc/lib/crypto/chacha-p10le-8x.S
deleted file mode 100644
index b29562bd5d40..000000000000
--- a/arch/powerpc/lib/crypto/chacha-p10le-8x.S
+++ /dev/null
@@ -1,840 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#
-# Accelerated chacha20 implementation for ppc64le.
-#
-# Copyright 2023- IBM Corp. All rights reserved
-#
-#===================================================================================
-# Written by Danny Tsen <dtsen@us.ibm.com>
-#
-# do rounds, 8 quarter rounds
-# 1. a += b; d ^= a; d <<<= 16;
-# 2. c += d; b ^= c; b <<<= 12;
-# 3. a += b; d ^= a; d <<<= 8;
-# 4. c += d; b ^= c; b <<<= 7
-#
-# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
-# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
-# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
-# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
-#
-# 4 blocks (a b c d)
-#
-# a0 b0 c0 d0
-# a1 b1 c1 d1
-# ...
-# a4 b4 c4 d4
-# ...
-# a8 b8 c8 d8
-# ...
-# a12 b12 c12 d12
-# a13 ...
-# a14 ...
-# a15 b15 c15 d15
-#
-# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
-# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
-#
-
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/asm-compat.h>
-#include <linux/linkage.h>
-
-.machine "any"
-.text
-
-.macro SAVE_GPR GPR OFFSET FRAME
- std \GPR,\OFFSET(\FRAME)
-.endm
-
-.macro SAVE_VRS VRS OFFSET FRAME
- li 16, \OFFSET
- stvx \VRS, 16, \FRAME
-.endm
-
-.macro SAVE_VSX VSX OFFSET FRAME
- li 16, \OFFSET
- stxvx \VSX, 16, \FRAME
-.endm
-
-.macro RESTORE_GPR GPR OFFSET FRAME
- ld \GPR,\OFFSET(\FRAME)
-.endm
-
-.macro RESTORE_VRS VRS OFFSET FRAME
- li 16, \OFFSET
- lvx \VRS, 16, \FRAME
-.endm
-
-.macro RESTORE_VSX VSX OFFSET FRAME
- li 16, \OFFSET
- lxvx \VSX, 16, \FRAME
-.endm
-
-.macro SAVE_REGS
- mflr 0
- std 0, 16(1)
- stdu 1,-752(1)
-
- SAVE_GPR 14, 112, 1
- SAVE_GPR 15, 120, 1
- SAVE_GPR 16, 128, 1
- SAVE_GPR 17, 136, 1
- SAVE_GPR 18, 144, 1
- SAVE_GPR 19, 152, 1
- SAVE_GPR 20, 160, 1
- SAVE_GPR 21, 168, 1
- SAVE_GPR 22, 176, 1
- SAVE_GPR 23, 184, 1
- SAVE_GPR 24, 192, 1
- SAVE_GPR 25, 200, 1
- SAVE_GPR 26, 208, 1
- SAVE_GPR 27, 216, 1
- SAVE_GPR 28, 224, 1
- SAVE_GPR 29, 232, 1
- SAVE_GPR 30, 240, 1
- SAVE_GPR 31, 248, 1
-
- addi 9, 1, 256
- SAVE_VRS 20, 0, 9
- SAVE_VRS 21, 16, 9
- SAVE_VRS 22, 32, 9
- SAVE_VRS 23, 48, 9
- SAVE_VRS 24, 64, 9
- SAVE_VRS 25, 80, 9
- SAVE_VRS 26, 96, 9
- SAVE_VRS 27, 112, 9
- SAVE_VRS 28, 128, 9
- SAVE_VRS 29, 144, 9
- SAVE_VRS 30, 160, 9
- SAVE_VRS 31, 176, 9
-
- SAVE_VSX 14, 192, 9
- SAVE_VSX 15, 208, 9
- SAVE_VSX 16, 224, 9
- SAVE_VSX 17, 240, 9
- SAVE_VSX 18, 256, 9
- SAVE_VSX 19, 272, 9
- SAVE_VSX 20, 288, 9
- SAVE_VSX 21, 304, 9
- SAVE_VSX 22, 320, 9
- SAVE_VSX 23, 336, 9
- SAVE_VSX 24, 352, 9
- SAVE_VSX 25, 368, 9
- SAVE_VSX 26, 384, 9
- SAVE_VSX 27, 400, 9
- SAVE_VSX 28, 416, 9
- SAVE_VSX 29, 432, 9
- SAVE_VSX 30, 448, 9
- SAVE_VSX 31, 464, 9
-.endm # SAVE_REGS
-
-.macro RESTORE_REGS
- addi 9, 1, 256
- RESTORE_VRS 20, 0, 9
- RESTORE_VRS 21, 16, 9
- RESTORE_VRS 22, 32, 9
- RESTORE_VRS 23, 48, 9
- RESTORE_VRS 24, 64, 9
- RESTORE_VRS 25, 80, 9
- RESTORE_VRS 26, 96, 9
- RESTORE_VRS 27, 112, 9
- RESTORE_VRS 28, 128, 9
- RESTORE_VRS 29, 144, 9
- RESTORE_VRS 30, 160, 9
- RESTORE_VRS 31, 176, 9
-
- RESTORE_VSX 14, 192, 9
- RESTORE_VSX 15, 208, 9
- RESTORE_VSX 16, 224, 9
- RESTORE_VSX 17, 240, 9
- RESTORE_VSX 18, 256, 9
- RESTORE_VSX 19, 272, 9
- RESTORE_VSX 20, 288, 9
- RESTORE_VSX 21, 304, 9
- RESTORE_VSX 22, 320, 9
- RESTORE_VSX 23, 336, 9
- RESTORE_VSX 24, 352, 9
- RESTORE_VSX 25, 368, 9
- RESTORE_VSX 26, 384, 9
- RESTORE_VSX 27, 400, 9
- RESTORE_VSX 28, 416, 9
- RESTORE_VSX 29, 432, 9
- RESTORE_VSX 30, 448, 9
- RESTORE_VSX 31, 464, 9
-
- RESTORE_GPR 14, 112, 1
- RESTORE_GPR 15, 120, 1
- RESTORE_GPR 16, 128, 1
- RESTORE_GPR 17, 136, 1
- RESTORE_GPR 18, 144, 1
- RESTORE_GPR 19, 152, 1
- RESTORE_GPR 20, 160, 1
- RESTORE_GPR 21, 168, 1
- RESTORE_GPR 22, 176, 1
- RESTORE_GPR 23, 184, 1
- RESTORE_GPR 24, 192, 1
- RESTORE_GPR 25, 200, 1
- RESTORE_GPR 26, 208, 1
- RESTORE_GPR 27, 216, 1
- RESTORE_GPR 28, 224, 1
- RESTORE_GPR 29, 232, 1
- RESTORE_GPR 30, 240, 1
- RESTORE_GPR 31, 248, 1
-
- addi 1, 1, 752
- ld 0, 16(1)
- mtlr 0
-.endm # RESTORE_REGS
-
-.macro QT_loop_8x
- # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 20, 20
- vadduwm 0, 0, 4
- vadduwm 1, 1, 5
- vadduwm 2, 2, 6
- vadduwm 3, 3, 7
- vadduwm 16, 16, 20
- vadduwm 17, 17, 21
- vadduwm 18, 18, 22
- vadduwm 19, 19, 23
-
- vpermxor 12, 12, 0, 25
- vpermxor 13, 13, 1, 25
- vpermxor 14, 14, 2, 25
- vpermxor 15, 15, 3, 25
- vpermxor 28, 28, 16, 25
- vpermxor 29, 29, 17, 25
- vpermxor 30, 30, 18, 25
- vpermxor 31, 31, 19, 25
- xxlor 32+25, 0, 0
- vadduwm 8, 8, 12
- vadduwm 9, 9, 13
- vadduwm 10, 10, 14
- vadduwm 11, 11, 15
- vadduwm 24, 24, 28
- vadduwm 25, 25, 29
- vadduwm 26, 26, 30
- vadduwm 27, 27, 31
- vxor 4, 4, 8
- vxor 5, 5, 9
- vxor 6, 6, 10
- vxor 7, 7, 11
- vxor 20, 20, 24
- vxor 21, 21, 25
- vxor 22, 22, 26
- vxor 23, 23, 27
-
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 21, 21
- vrlw 4, 4, 25 #
- vrlw 5, 5, 25
- vrlw 6, 6, 25
- vrlw 7, 7, 25
- vrlw 20, 20, 25 #
- vrlw 21, 21, 25
- vrlw 22, 22, 25
- vrlw 23, 23, 25
- xxlor 32+25, 0, 0
- vadduwm 0, 0, 4
- vadduwm 1, 1, 5
- vadduwm 2, 2, 6
- vadduwm 3, 3, 7
- vadduwm 16, 16, 20
- vadduwm 17, 17, 21
- vadduwm 18, 18, 22
- vadduwm 19, 19, 23
-
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 22, 22
- vpermxor 12, 12, 0, 25
- vpermxor 13, 13, 1, 25
- vpermxor 14, 14, 2, 25
- vpermxor 15, 15, 3, 25
- vpermxor 28, 28, 16, 25
- vpermxor 29, 29, 17, 25
- vpermxor 30, 30, 18, 25
- vpermxor 31, 31, 19, 25
- xxlor 32+25, 0, 0
- vadduwm 8, 8, 12
- vadduwm 9, 9, 13
- vadduwm 10, 10, 14
- vadduwm 11, 11, 15
- vadduwm 24, 24, 28
- vadduwm 25, 25, 29
- vadduwm 26, 26, 30
- vadduwm 27, 27, 31
- xxlor 0, 32+28, 32+28
- xxlor 32+28, 23, 23
- vxor 4, 4, 8
- vxor 5, 5, 9
- vxor 6, 6, 10
- vxor 7, 7, 11
- vxor 20, 20, 24
- vxor 21, 21, 25
- vxor 22, 22, 26
- vxor 23, 23, 27
- vrlw 4, 4, 28 #
- vrlw 5, 5, 28
- vrlw 6, 6, 28
- vrlw 7, 7, 28
- vrlw 20, 20, 28 #
- vrlw 21, 21, 28
- vrlw 22, 22, 28
- vrlw 23, 23, 28
- xxlor 32+28, 0, 0
-
- # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 20, 20
- vadduwm 0, 0, 5
- vadduwm 1, 1, 6
- vadduwm 2, 2, 7
- vadduwm 3, 3, 4
- vadduwm 16, 16, 21
- vadduwm 17, 17, 22
- vadduwm 18, 18, 23
- vadduwm 19, 19, 20
-
- vpermxor 15, 15, 0, 25
- vpermxor 12, 12, 1, 25
- vpermxor 13, 13, 2, 25
- vpermxor 14, 14, 3, 25
- vpermxor 31, 31, 16, 25
- vpermxor 28, 28, 17, 25
- vpermxor 29, 29, 18, 25
- vpermxor 30, 30, 19, 25
-
- xxlor 32+25, 0, 0
- vadduwm 10, 10, 15
- vadduwm 11, 11, 12
- vadduwm 8, 8, 13
- vadduwm 9, 9, 14
- vadduwm 26, 26, 31
- vadduwm 27, 27, 28
- vadduwm 24, 24, 29
- vadduwm 25, 25, 30
- vxor 5, 5, 10
- vxor 6, 6, 11
- vxor 7, 7, 8
- vxor 4, 4, 9
- vxor 21, 21, 26
- vxor 22, 22, 27
- vxor 23, 23, 24
- vxor 20, 20, 25
-
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 21, 21
- vrlw 5, 5, 25
- vrlw 6, 6, 25
- vrlw 7, 7, 25
- vrlw 4, 4, 25
- vrlw 21, 21, 25
- vrlw 22, 22, 25
- vrlw 23, 23, 25
- vrlw 20, 20, 25
- xxlor 32+25, 0, 0
-
- vadduwm 0, 0, 5
- vadduwm 1, 1, 6
- vadduwm 2, 2, 7
- vadduwm 3, 3, 4
- vadduwm 16, 16, 21
- vadduwm 17, 17, 22
- vadduwm 18, 18, 23
- vadduwm 19, 19, 20
-
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 22, 22
- vpermxor 15, 15, 0, 25
- vpermxor 12, 12, 1, 25
- vpermxor 13, 13, 2, 25
- vpermxor 14, 14, 3, 25
- vpermxor 31, 31, 16, 25
- vpermxor 28, 28, 17, 25
- vpermxor 29, 29, 18, 25
- vpermxor 30, 30, 19, 25
- xxlor 32+25, 0, 0
-
- vadduwm 10, 10, 15
- vadduwm 11, 11, 12
- vadduwm 8, 8, 13
- vadduwm 9, 9, 14
- vadduwm 26, 26, 31
- vadduwm 27, 27, 28
- vadduwm 24, 24, 29
- vadduwm 25, 25, 30
-
- xxlor 0, 32+28, 32+28
- xxlor 32+28, 23, 23
- vxor 5, 5, 10
- vxor 6, 6, 11
- vxor 7, 7, 8
- vxor 4, 4, 9
- vxor 21, 21, 26
- vxor 22, 22, 27
- vxor 23, 23, 24
- vxor 20, 20, 25
- vrlw 5, 5, 28
- vrlw 6, 6, 28
- vrlw 7, 7, 28
- vrlw 4, 4, 28
- vrlw 21, 21, 28
- vrlw 22, 22, 28
- vrlw 23, 23, 28
- vrlw 20, 20, 28
- xxlor 32+28, 0, 0
-.endm
-
-.macro QT_loop_4x
- # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
- vadduwm 0, 0, 4
- vadduwm 1, 1, 5
- vadduwm 2, 2, 6
- vadduwm 3, 3, 7
- vpermxor 12, 12, 0, 20
- vpermxor 13, 13, 1, 20
- vpermxor 14, 14, 2, 20
- vpermxor 15, 15, 3, 20
- vadduwm 8, 8, 12
- vadduwm 9, 9, 13
- vadduwm 10, 10, 14
- vadduwm 11, 11, 15
- vxor 4, 4, 8
- vxor 5, 5, 9
- vxor 6, 6, 10
- vxor 7, 7, 11
- vrlw 4, 4, 21
- vrlw 5, 5, 21
- vrlw 6, 6, 21
- vrlw 7, 7, 21
- vadduwm 0, 0, 4
- vadduwm 1, 1, 5
- vadduwm 2, 2, 6
- vadduwm 3, 3, 7
- vpermxor 12, 12, 0, 22
- vpermxor 13, 13, 1, 22
- vpermxor 14, 14, 2, 22
- vpermxor 15, 15, 3, 22
- vadduwm 8, 8, 12
- vadduwm 9, 9, 13
- vadduwm 10, 10, 14
- vadduwm 11, 11, 15
- vxor 4, 4, 8
- vxor 5, 5, 9
- vxor 6, 6, 10
- vxor 7, 7, 11
- vrlw 4, 4, 23
- vrlw 5, 5, 23
- vrlw 6, 6, 23
- vrlw 7, 7, 23
-
- # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
- vadduwm 0, 0, 5
- vadduwm 1, 1, 6
- vadduwm 2, 2, 7
- vadduwm 3, 3, 4
- vpermxor 15, 15, 0, 20
- vpermxor 12, 12, 1, 20
- vpermxor 13, 13, 2, 20
- vpermxor 14, 14, 3, 20
- vadduwm 10, 10, 15
- vadduwm 11, 11, 12
- vadduwm 8, 8, 13
- vadduwm 9, 9, 14
- vxor 5, 5, 10
- vxor 6, 6, 11
- vxor 7, 7, 8
- vxor 4, 4, 9
- vrlw 5, 5, 21
- vrlw 6, 6, 21
- vrlw 7, 7, 21
- vrlw 4, 4, 21
- vadduwm 0, 0, 5
- vadduwm 1, 1, 6
- vadduwm 2, 2, 7
- vadduwm 3, 3, 4
- vpermxor 15, 15, 0, 22
- vpermxor 12, 12, 1, 22
- vpermxor 13, 13, 2, 22
- vpermxor 14, 14, 3, 22
- vadduwm 10, 10, 15
- vadduwm 11, 11, 12
- vadduwm 8, 8, 13
- vadduwm 9, 9, 14
- vxor 5, 5, 10
- vxor 6, 6, 11
- vxor 7, 7, 8
- vxor 4, 4, 9
- vrlw 5, 5, 23
- vrlw 6, 6, 23
- vrlw 7, 7, 23
- vrlw 4, 4, 23
-.endm
-
-# Transpose
-.macro TP_4x a0 a1 a2 a3
- xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1
- xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3
- xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1
- xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3
- xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3
- xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3
- xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3
- xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3
-.endm
-
-# key stream = working state + state
-.macro Add_state S
- vadduwm \S+0, \S+0, 16-\S
- vadduwm \S+4, \S+4, 17-\S
- vadduwm \S+8, \S+8, 18-\S
- vadduwm \S+12, \S+12, 19-\S
-
- vadduwm \S+1, \S+1, 16-\S
- vadduwm \S+5, \S+5, 17-\S
- vadduwm \S+9, \S+9, 18-\S
- vadduwm \S+13, \S+13, 19-\S
-
- vadduwm \S+2, \S+2, 16-\S
- vadduwm \S+6, \S+6, 17-\S
- vadduwm \S+10, \S+10, 18-\S
- vadduwm \S+14, \S+14, 19-\S
-
- vadduwm \S+3, \S+3, 16-\S
- vadduwm \S+7, \S+7, 17-\S
- vadduwm \S+11, \S+11, 18-\S
- vadduwm \S+15, \S+15, 19-\S
-.endm
-
-#
-# write 256 bytes
-#
-.macro Write_256 S
- add 9, 14, 5
- add 16, 14, 4
- lxvw4x 0, 0, 9
- lxvw4x 1, 17, 9
- lxvw4x 2, 18, 9
- lxvw4x 3, 19, 9
- lxvw4x 4, 20, 9
- lxvw4x 5, 21, 9
- lxvw4x 6, 22, 9
- lxvw4x 7, 23, 9
- lxvw4x 8, 24, 9
- lxvw4x 9, 25, 9
- lxvw4x 10, 26, 9
- lxvw4x 11, 27, 9
- lxvw4x 12, 28, 9
- lxvw4x 13, 29, 9
- lxvw4x 14, 30, 9
- lxvw4x 15, 31, 9
-
- xxlxor \S+32, \S+32, 0
- xxlxor \S+36, \S+36, 1
- xxlxor \S+40, \S+40, 2
- xxlxor \S+44, \S+44, 3
- xxlxor \S+33, \S+33, 4
- xxlxor \S+37, \S+37, 5
- xxlxor \S+41, \S+41, 6
- xxlxor \S+45, \S+45, 7
- xxlxor \S+34, \S+34, 8
- xxlxor \S+38, \S+38, 9
- xxlxor \S+42, \S+42, 10
- xxlxor \S+46, \S+46, 11
- xxlxor \S+35, \S+35, 12
- xxlxor \S+39, \S+39, 13
- xxlxor \S+43, \S+43, 14
- xxlxor \S+47, \S+47, 15
-
- stxvw4x \S+32, 0, 16
- stxvw4x \S+36, 17, 16
- stxvw4x \S+40, 18, 16
- stxvw4x \S+44, 19, 16
-
- stxvw4x \S+33, 20, 16
- stxvw4x \S+37, 21, 16
- stxvw4x \S+41, 22, 16
- stxvw4x \S+45, 23, 16
-
- stxvw4x \S+34, 24, 16
- stxvw4x \S+38, 25, 16
- stxvw4x \S+42, 26, 16
- stxvw4x \S+46, 27, 16
-
- stxvw4x \S+35, 28, 16
- stxvw4x \S+39, 29, 16
- stxvw4x \S+43, 30, 16
- stxvw4x \S+47, 31, 16
-
-.endm
-
-#
-# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src,
-# unsigned int len, int nrounds);
-#
-SYM_FUNC_START(chacha_p10le_8x)
-.align 5
- cmpdi 6, 0
- ble Out_no_chacha
-
- SAVE_REGS
-
- # r17 - r31 mainly for Write_256 macro.
- li 17, 16
- li 18, 32
- li 19, 48
- li 20, 64
- li 21, 80
- li 22, 96
- li 23, 112
- li 24, 128
- li 25, 144
- li 26, 160
- li 27, 176
- li 28, 192
- li 29, 208
- li 30, 224
- li 31, 240
-
- mr 15, 6 # len
- li 14, 0 # offset to inp and outp
-
- lxvw4x 48, 0, 3 # vr16, constants
- lxvw4x 49, 17, 3 # vr17, key 1
- lxvw4x 50, 18, 3 # vr18, key 2
- lxvw4x 51, 19, 3 # vr19, counter, nonce
-
- # create (0, 1, 2, 3) counters
- vspltisw 0, 0
- vspltisw 1, 1
- vspltisw 2, 2
- vspltisw 3, 3
- vmrghw 4, 0, 1
- vmrglw 5, 2, 3
- vsldoi 30, 4, 5, 8 # vr30 counter, 4 (0, 1, 2, 3)
-
- vspltisw 21, 12
- vspltisw 23, 7
-
- addis 11, 2, permx@toc@ha
- addi 11, 11, permx@toc@l
- lxvw4x 32+20, 0, 11
- lxvw4x 32+22, 17, 11
-
- sradi 8, 7, 1
-
- mtctr 8
-
- # save constants to vsx
- xxlor 16, 48, 48
- xxlor 17, 49, 49
- xxlor 18, 50, 50
- xxlor 19, 51, 51
-
- vspltisw 25, 4
- vspltisw 26, 8
-
- xxlor 25, 32+26, 32+26
- xxlor 24, 32+25, 32+25
-
- vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4)
- xxlor 30, 32+30, 32+30
- xxlor 31, 32+31, 32+31
-
- xxlor 20, 32+20, 32+20
- xxlor 21, 32+21, 32+21
- xxlor 22, 32+22, 32+22
- xxlor 23, 32+23, 32+23
-
- cmpdi 6, 512
- blt Loop_last
-
-Loop_8x:
- xxspltw 32+0, 16, 0
- xxspltw 32+1, 16, 1
- xxspltw 32+2, 16, 2
- xxspltw 32+3, 16, 3
-
- xxspltw 32+4, 17, 0
- xxspltw 32+5, 17, 1
- xxspltw 32+6, 17, 2
- xxspltw 32+7, 17, 3
- xxspltw 32+8, 18, 0
- xxspltw 32+9, 18, 1
- xxspltw 32+10, 18, 2
- xxspltw 32+11, 18, 3
- xxspltw 32+12, 19, 0
- xxspltw 32+13, 19, 1
- xxspltw 32+14, 19, 2
- xxspltw 32+15, 19, 3
- vadduwm 12, 12, 30 # increase counter
-
- xxspltw 32+16, 16, 0
- xxspltw 32+17, 16, 1
- xxspltw 32+18, 16, 2
- xxspltw 32+19, 16, 3
-
- xxspltw 32+20, 17, 0
- xxspltw 32+21, 17, 1
- xxspltw 32+22, 17, 2
- xxspltw 32+23, 17, 3
- xxspltw 32+24, 18, 0
- xxspltw 32+25, 18, 1
- xxspltw 32+26, 18, 2
- xxspltw 32+27, 18, 3
- xxspltw 32+28, 19, 0
- xxspltw 32+29, 19, 1
- vadduwm 28, 28, 31 # increase counter
- xxspltw 32+30, 19, 2
- xxspltw 32+31, 19, 3
-
-.align 5
-quarter_loop_8x:
- QT_loop_8x
-
- bdnz quarter_loop_8x
-
- xxlor 0, 32+30, 32+30
- xxlor 32+30, 30, 30
- vadduwm 12, 12, 30
- xxlor 32+30, 0, 0
- TP_4x 0, 1, 2, 3
- TP_4x 4, 5, 6, 7
- TP_4x 8, 9, 10, 11
- TP_4x 12, 13, 14, 15
-
- xxlor 0, 48, 48
- xxlor 1, 49, 49
- xxlor 2, 50, 50
- xxlor 3, 51, 51
- xxlor 48, 16, 16
- xxlor 49, 17, 17
- xxlor 50, 18, 18
- xxlor 51, 19, 19
- Add_state 0
- xxlor 48, 0, 0
- xxlor 49, 1, 1
- xxlor 50, 2, 2
- xxlor 51, 3, 3
- Write_256 0
- addi 14, 14, 256 # offset +=256
- addi 15, 15, -256 # len -=256
-
- xxlor 5, 32+31, 32+31
- xxlor 32+31, 31, 31
- vadduwm 28, 28, 31
- xxlor 32+31, 5, 5
- TP_4x 16+0, 16+1, 16+2, 16+3
- TP_4x 16+4, 16+5, 16+6, 16+7
- TP_4x 16+8, 16+9, 16+10, 16+11
- TP_4x 16+12, 16+13, 16+14, 16+15
-
- xxlor 32, 16, 16
- xxlor 33, 17, 17
- xxlor 34, 18, 18
- xxlor 35, 19, 19
- Add_state 16
- Write_256 16
- addi 14, 14, 256 # offset +=256
- addi 15, 15, -256 # len +=256
-
- xxlor 32+24, 24, 24
- xxlor 32+25, 25, 25
- xxlor 32+30, 30, 30
- vadduwm 30, 30, 25
- vadduwm 31, 30, 24
- xxlor 30, 32+30, 32+30
- xxlor 31, 32+31, 32+31
-
- cmpdi 15, 0
- beq Out_loop
-
- cmpdi 15, 512
- blt Loop_last
-
- mtctr 8
- b Loop_8x
-
-Loop_last:
- lxvw4x 48, 0, 3 # vr16, constants
- lxvw4x 49, 17, 3 # vr17, key 1
- lxvw4x 50, 18, 3 # vr18, key 2
- lxvw4x 51, 19, 3 # vr19, counter, nonce
-
- vspltisw 21, 12
- vspltisw 23, 7
- addis 11, 2, permx@toc@ha
- addi 11, 11, permx@toc@l
- lxvw4x 32+20, 0, 11
- lxvw4x 32+22, 17, 11
-
- sradi 8, 7, 1
- mtctr 8
-
-Loop_4x:
- vspltw 0, 16, 0
- vspltw 1, 16, 1
- vspltw 2, 16, 2
- vspltw 3, 16, 3
-
- vspltw 4, 17, 0
- vspltw 5, 17, 1
- vspltw 6, 17, 2
- vspltw 7, 17, 3
- vspltw 8, 18, 0
- vspltw 9, 18, 1
- vspltw 10, 18, 2
- vspltw 11, 18, 3
- vspltw 12, 19, 0
- vadduwm 12, 12, 30 # increase counter
- vspltw 13, 19, 1
- vspltw 14, 19, 2
- vspltw 15, 19, 3
-
-.align 5
-quarter_loop:
- QT_loop_4x
-
- bdnz quarter_loop
-
- vadduwm 12, 12, 30
- TP_4x 0, 1, 2, 3
- TP_4x 4, 5, 6, 7
- TP_4x 8, 9, 10, 11
- TP_4x 12, 13, 14, 15
-
- Add_state 0
- Write_256 0
- addi 14, 14, 256 # offset += 256
- addi 15, 15, -256 # len += 256
-
- # Update state counter
- vspltisw 25, 4
- vadduwm 30, 30, 25
-
- cmpdi 15, 0
- beq Out_loop
- cmpdi 15, 256
- blt Out_loop
-
- mtctr 8
- b Loop_4x
-
-Out_loop:
- RESTORE_REGS
- blr
-
-Out_no_chacha:
- li 3, 0
- blr
-SYM_FUNC_END(chacha_p10le_8x)
-
-SYM_DATA_START_LOCAL(PERMX)
-.align 5
-permx:
-.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
-.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
-SYM_DATA_END(PERMX)
diff --git a/arch/powerpc/lib/crypto/poly1305-p10-glue.c b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
deleted file mode 100644
index 3f1664a724b6..000000000000
--- a/arch/powerpc/lib/crypto/poly1305-p10-glue.c
+++ /dev/null
@@ -1,96 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Poly1305 authenticator algorithm, RFC7539.
- *
- * Copyright 2023- IBM Corp. All rights reserved.
- */
-#include <asm/switch_to.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/cpufeature.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen);
-asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit);
-asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
-
-static void vsx_begin(void)
-{
- preempt_disable();
- enable_kernel_vsx();
-}
-
-static void vsx_end(void)
-{
- disable_kernel_vsx();
- preempt_enable();
-}
-
-void poly1305_block_init_arch(struct poly1305_block_state *dctx,
- const u8 raw_key[POLY1305_BLOCK_SIZE])
-{
- if (!static_key_enabled(&have_p10))
- return poly1305_block_init_generic(dctx, raw_key);
-
- dctx->h = (struct poly1305_state){};
- dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0);
- dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8);
-}
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
- unsigned int len, u32 padbit)
-{
- if (!static_key_enabled(&have_p10))
- return poly1305_blocks_generic(state, src, len, padbit);
- vsx_begin();
- if (len >= POLY1305_BLOCK_SIZE * 4) {
- poly1305_p10le_4blocks(state, src, len);
- src += len - (len % (POLY1305_BLOCK_SIZE * 4));
- len %= POLY1305_BLOCK_SIZE * 4;
- }
- while (len >= POLY1305_BLOCK_SIZE) {
- poly1305_64s(state, src, POLY1305_BLOCK_SIZE, padbit);
- len -= POLY1305_BLOCK_SIZE;
- src += POLY1305_BLOCK_SIZE;
- }
- vsx_end();
-}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-
-void poly1305_emit_arch(const struct poly1305_state *state,
- u8 digest[POLY1305_DIGEST_SIZE],
- const u32 nonce[4])
-{
- if (!static_key_enabled(&have_p10))
- return poly1305_emit_generic(state, digest, nonce);
- poly1305_emit_64(state, nonce, digest);
-}
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
- return static_key_enabled(&have_p10);
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init poly1305_p10_init(void)
-{
- if (cpu_has_feature(CPU_FTR_ARCH_31))
- static_branch_enable(&have_p10);
- return 0;
-}
-subsys_initcall(poly1305_p10_init);
-
-static void __exit poly1305_p10_exit(void)
-{
-}
-module_exit(poly1305_p10_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
-MODULE_DESCRIPTION("Optimized Poly1305 for P10");
diff --git a/arch/powerpc/lib/crypto/poly1305-p10le_64.S b/arch/powerpc/lib/crypto/poly1305-p10le_64.S
deleted file mode 100644
index a3c1987f1ecd..000000000000
--- a/arch/powerpc/lib/crypto/poly1305-p10le_64.S
+++ /dev/null
@@ -1,1075 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#
-# Accelerated poly1305 implementation for ppc64le.
-#
-# Copyright 2023- IBM Corp. All rights reserved
-#
-#===================================================================================
-# Written by Danny Tsen <dtsen@us.ibm.com>
-#
-# Poly1305 - this version mainly using vector/VSX/Scalar
-# - 26 bits limbs
-# - Handle multiple 64 byte blcok.
-#
-# Block size 16 bytes
-# key = (r, s)
-# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
-# p = 2^130 - 5
-# a += m
-# a = (r + a) % p
-# a += s
-#
-# Improve performance by breaking down polynominal to the sum of products with
-# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
-#
-# 07/22/21 - this revison based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0
-# to 9 vectors for multiplications.
-#
-# setup r^4, r^3, r^2, r vectors
-# vs [r^1, r^3, r^2, r^4]
-# vs0 = [r0,.....]
-# vs1 = [r1,.....]
-# vs2 = [r2,.....]
-# vs3 = [r3,.....]
-# vs4 = [r4,.....]
-# vs5 = [r1*5,...]
-# vs6 = [r2*5,...]
-# vs7 = [r2*5,...]
-# vs8 = [r4*5,...]
-#
-# Each word in a vector consists a member of a "r/s" in [a * r/s].
-#
-# r0, r4*5, r3*5, r2*5, r1*5;
-# r1, r0, r4*5, r3*5, r2*5;
-# r2, r1, r0, r4*5, r3*5;
-# r3, r2, r1, r0, r4*5;
-# r4, r3, r2, r1, r0 ;
-#
-#
-# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
-# k = 32 bytes key
-# r3 = k (r, s)
-# r4 = mlen
-# r5 = m
-#
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/asm-compat.h>
-#include <linux/linkage.h>
-
-.machine "any"
-
-.text
-
-.macro SAVE_GPR GPR OFFSET FRAME
- std \GPR,\OFFSET(\FRAME)
-.endm
-
-.macro SAVE_VRS VRS OFFSET FRAME
- li 16, \OFFSET
- stvx \VRS, 16, \FRAME
-.endm
-
-.macro SAVE_VSX VSX OFFSET FRAME
- li 16, \OFFSET
- stxvx \VSX, 16, \FRAME
-.endm
-
-.macro RESTORE_GPR GPR OFFSET FRAME
- ld \GPR,\OFFSET(\FRAME)
-.endm
-
-.macro RESTORE_VRS VRS OFFSET FRAME
- li 16, \OFFSET
- lvx \VRS, 16, \FRAME
-.endm
-
-.macro RESTORE_VSX VSX OFFSET FRAME
- li 16, \OFFSET
- lxvx \VSX, 16, \FRAME
-.endm
-
-.macro SAVE_REGS
- mflr 0
- std 0, 16(1)
- stdu 1,-752(1)
-
- SAVE_GPR 14, 112, 1
- SAVE_GPR 15, 120, 1
- SAVE_GPR 16, 128, 1
- SAVE_GPR 17, 136, 1
- SAVE_GPR 18, 144, 1
- SAVE_GPR 19, 152, 1
- SAVE_GPR 20, 160, 1
- SAVE_GPR 21, 168, 1
- SAVE_GPR 22, 176, 1
- SAVE_GPR 23, 184, 1
- SAVE_GPR 24, 192, 1
- SAVE_GPR 25, 200, 1
- SAVE_GPR 26, 208, 1
- SAVE_GPR 27, 216, 1
- SAVE_GPR 28, 224, 1
- SAVE_GPR 29, 232, 1
- SAVE_GPR 30, 240, 1
- SAVE_GPR 31, 248, 1
-
- addi 9, 1, 256
- SAVE_VRS 20, 0, 9
- SAVE_VRS 21, 16, 9
- SAVE_VRS 22, 32, 9
- SAVE_VRS 23, 48, 9
- SAVE_VRS 24, 64, 9
- SAVE_VRS 25, 80, 9
- SAVE_VRS 26, 96, 9
- SAVE_VRS 27, 112, 9
- SAVE_VRS 28, 128, 9
- SAVE_VRS 29, 144, 9
- SAVE_VRS 30, 160, 9
- SAVE_VRS 31, 176, 9
-
- SAVE_VSX 14, 192, 9
- SAVE_VSX 15, 208, 9
- SAVE_VSX 16, 224, 9
- SAVE_VSX 17, 240, 9
- SAVE_VSX 18, 256, 9
- SAVE_VSX 19, 272, 9
- SAVE_VSX 20, 288, 9
- SAVE_VSX 21, 304, 9
- SAVE_VSX 22, 320, 9
- SAVE_VSX 23, 336, 9
- SAVE_VSX 24, 352, 9
- SAVE_VSX 25, 368, 9
- SAVE_VSX 26, 384, 9
- SAVE_VSX 27, 400, 9
- SAVE_VSX 28, 416, 9
- SAVE_VSX 29, 432, 9
- SAVE_VSX 30, 448, 9
- SAVE_VSX 31, 464, 9
-.endm # SAVE_REGS
-
-.macro RESTORE_REGS
- addi 9, 1, 256
- RESTORE_VRS 20, 0, 9
- RESTORE_VRS 21, 16, 9
- RESTORE_VRS 22, 32, 9
- RESTORE_VRS 23, 48, 9
- RESTORE_VRS 24, 64, 9
- RESTORE_VRS 25, 80, 9
- RESTORE_VRS 26, 96, 9
- RESTORE_VRS 27, 112, 9
- RESTORE_VRS 28, 128, 9
- RESTORE_VRS 29, 144, 9
- RESTORE_VRS 30, 160, 9
- RESTORE_VRS 31, 176, 9
-
- RESTORE_VSX 14, 192, 9
- RESTORE_VSX 15, 208, 9
- RESTORE_VSX 16, 224, 9
- RESTORE_VSX 17, 240, 9
- RESTORE_VSX 18, 256, 9
- RESTORE_VSX 19, 272, 9
- RESTORE_VSX 20, 288, 9
- RESTORE_VSX 21, 304, 9
- RESTORE_VSX 22, 320, 9
- RESTORE_VSX 23, 336, 9
- RESTORE_VSX 24, 352, 9
- RESTORE_VSX 25, 368, 9
- RESTORE_VSX 26, 384, 9
- RESTORE_VSX 27, 400, 9
- RESTORE_VSX 28, 416, 9
- RESTORE_VSX 29, 432, 9
- RESTORE_VSX 30, 448, 9
- RESTORE_VSX 31, 464, 9
-
- RESTORE_GPR 14, 112, 1
- RESTORE_GPR 15, 120, 1
- RESTORE_GPR 16, 128, 1
- RESTORE_GPR 17, 136, 1
- RESTORE_GPR 18, 144, 1
- RESTORE_GPR 19, 152, 1
- RESTORE_GPR 20, 160, 1
- RESTORE_GPR 21, 168, 1
- RESTORE_GPR 22, 176, 1
- RESTORE_GPR 23, 184, 1
- RESTORE_GPR 24, 192, 1
- RESTORE_GPR 25, 200, 1
- RESTORE_GPR 26, 208, 1
- RESTORE_GPR 27, 216, 1
- RESTORE_GPR 28, 224, 1
- RESTORE_GPR 29, 232, 1
- RESTORE_GPR 30, 240, 1
- RESTORE_GPR 31, 248, 1
-
- addi 1, 1, 752
- ld 0, 16(1)
- mtlr 0
-.endm # RESTORE_REGS
-
-#
-# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
-# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5;
-# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5;
-# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5;
-# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ;
-#
-# [r^2, r^3, r^1, r^4]
-# [m3, m2, m4, m1]
-#
-# multiply odd and even words
-.macro mul_odd
- vmulouw 14, 4, 26
- vmulouw 10, 5, 3
- vmulouw 11, 6, 2
- vmulouw 12, 7, 1
- vmulouw 13, 8, 0
- vmulouw 15, 4, 27
- vaddudm 14, 14, 10
- vaddudm 14, 14, 11
- vmulouw 10, 5, 26
- vmulouw 11, 6, 3
- vaddudm 14, 14, 12
- vaddudm 14, 14, 13 # x0
- vaddudm 15, 15, 10
- vaddudm 15, 15, 11
- vmulouw 12, 7, 2
- vmulouw 13, 8, 1
- vaddudm 15, 15, 12
- vaddudm 15, 15, 13 # x1
- vmulouw 16, 4, 28
- vmulouw 10, 5, 27
- vmulouw 11, 6, 26
- vaddudm 16, 16, 10
- vaddudm 16, 16, 11
- vmulouw 12, 7, 3
- vmulouw 13, 8, 2
- vaddudm 16, 16, 12
- vaddudm 16, 16, 13 # x2
- vmulouw 17, 4, 29
- vmulouw 10, 5, 28
- vmulouw 11, 6, 27
- vaddudm 17, 17, 10
- vaddudm 17, 17, 11
- vmulouw 12, 7, 26
- vmulouw 13, 8, 3
- vaddudm 17, 17, 12
- vaddudm 17, 17, 13 # x3
- vmulouw 18, 4, 30
- vmulouw 10, 5, 29
- vmulouw 11, 6, 28
- vaddudm 18, 18, 10
- vaddudm 18, 18, 11
- vmulouw 12, 7, 27
- vmulouw 13, 8, 26
- vaddudm 18, 18, 12
- vaddudm 18, 18, 13 # x4
-.endm
-
-.macro mul_even
- vmuleuw 9, 4, 26
- vmuleuw 10, 5, 3
- vmuleuw 11, 6, 2
- vmuleuw 12, 7, 1
- vmuleuw 13, 8, 0
- vaddudm 14, 14, 9
- vaddudm 14, 14, 10
- vaddudm 14, 14, 11
- vaddudm 14, 14, 12
- vaddudm 14, 14, 13 # x0
-
- vmuleuw 9, 4, 27
- vmuleuw 10, 5, 26
- vmuleuw 11, 6, 3
- vmuleuw 12, 7, 2
- vmuleuw 13, 8, 1
- vaddudm 15, 15, 9
- vaddudm 15, 15, 10
- vaddudm 15, 15, 11
- vaddudm 15, 15, 12
- vaddudm 15, 15, 13 # x1
-
- vmuleuw 9, 4, 28
- vmuleuw 10, 5, 27
- vmuleuw 11, 6, 26
- vmuleuw 12, 7, 3
- vmuleuw 13, 8, 2
- vaddudm 16, 16, 9
- vaddudm 16, 16, 10
- vaddudm 16, 16, 11
- vaddudm 16, 16, 12
- vaddudm 16, 16, 13 # x2
-
- vmuleuw 9, 4, 29
- vmuleuw 10, 5, 28
- vmuleuw 11, 6, 27
- vmuleuw 12, 7, 26
- vmuleuw 13, 8, 3
- vaddudm 17, 17, 9
- vaddudm 17, 17, 10
- vaddudm 17, 17, 11
- vaddudm 17, 17, 12
- vaddudm 17, 17, 13 # x3
-
- vmuleuw 9, 4, 30
- vmuleuw 10, 5, 29
- vmuleuw 11, 6, 28
- vmuleuw 12, 7, 27
- vmuleuw 13, 8, 26
- vaddudm 18, 18, 9
- vaddudm 18, 18, 10
- vaddudm 18, 18, 11
- vaddudm 18, 18, 12
- vaddudm 18, 18, 13 # x4
-.endm
-
-#
-# poly1305_setup_r
-#
-# setup r^4, r^3, r^2, r vectors
-# [r, r^3, r^2, r^4]
-# vs0 = [r0,...]
-# vs1 = [r1,...]
-# vs2 = [r2,...]
-# vs3 = [r3,...]
-# vs4 = [r4,...]
-# vs5 = [r4*5,...]
-# vs6 = [r3*5,...]
-# vs7 = [r2*5,...]
-# vs8 = [r1*5,...]
-#
-# r0, r4*5, r3*5, r2*5, r1*5;
-# r1, r0, r4*5, r3*5, r2*5;
-# r2, r1, r0, r4*5, r3*5;
-# r3, r2, r1, r0, r4*5;
-# r4, r3, r2, r1, r0 ;
-#
-.macro poly1305_setup_r
-
- # save r
- xxlor 26, 58, 58
- xxlor 27, 59, 59
- xxlor 28, 60, 60
- xxlor 29, 61, 61
- xxlor 30, 62, 62
-
- xxlxor 31, 31, 31
-
-# [r, r^3, r^2, r^4]
- # compute r^2
- vmr 4, 26
- vmr 5, 27
- vmr 6, 28
- vmr 7, 29
- vmr 8, 30
- bl do_mul # r^2 r^1
- xxpermdi 58, 58, 36, 0x3 # r0
- xxpermdi 59, 59, 37, 0x3 # r1
- xxpermdi 60, 60, 38, 0x3 # r2
- xxpermdi 61, 61, 39, 0x3 # r3
- xxpermdi 62, 62, 40, 0x3 # r4
- xxpermdi 36, 36, 36, 0x3
- xxpermdi 37, 37, 37, 0x3
- xxpermdi 38, 38, 38, 0x3
- xxpermdi 39, 39, 39, 0x3
- xxpermdi 40, 40, 40, 0x3
- vspltisb 13, 2
- vsld 9, 27, 13
- vsld 10, 28, 13
- vsld 11, 29, 13
- vsld 12, 30, 13
- vaddudm 0, 9, 27
- vaddudm 1, 10, 28
- vaddudm 2, 11, 29
- vaddudm 3, 12, 30
-
- bl do_mul # r^4 r^3
- vmrgow 26, 26, 4
- vmrgow 27, 27, 5
- vmrgow 28, 28, 6
- vmrgow 29, 29, 7
- vmrgow 30, 30, 8
- vspltisb 13, 2
- vsld 9, 27, 13
- vsld 10, 28, 13
- vsld 11, 29, 13
- vsld 12, 30, 13
- vaddudm 0, 9, 27
- vaddudm 1, 10, 28
- vaddudm 2, 11, 29
- vaddudm 3, 12, 30
-
- # r^2 r^4
- xxlor 0, 58, 58
- xxlor 1, 59, 59
- xxlor 2, 60, 60
- xxlor 3, 61, 61
- xxlor 4, 62, 62
- xxlor 5, 32, 32
- xxlor 6, 33, 33
- xxlor 7, 34, 34
- xxlor 8, 35, 35
-
- vspltw 9, 26, 3
- vspltw 10, 26, 2
- vmrgow 26, 10, 9
- vspltw 9, 27, 3
- vspltw 10, 27, 2
- vmrgow 27, 10, 9
- vspltw 9, 28, 3
- vspltw 10, 28, 2
- vmrgow 28, 10, 9
- vspltw 9, 29, 3
- vspltw 10, 29, 2
- vmrgow 29, 10, 9
- vspltw 9, 30, 3
- vspltw 10, 30, 2
- vmrgow 30, 10, 9
-
- vsld 9, 27, 13
- vsld 10, 28, 13
- vsld 11, 29, 13
- vsld 12, 30, 13
- vaddudm 0, 9, 27
- vaddudm 1, 10, 28
- vaddudm 2, 11, 29
- vaddudm 3, 12, 30
-.endm
-
-SYM_FUNC_START_LOCAL(do_mul)
- mul_odd
-
- # do reduction ( h %= p )
- # carry reduction
- vspltisb 9, 2
- vsrd 10, 14, 31
- vsrd 11, 17, 31
- vand 7, 17, 25
- vand 4, 14, 25
- vaddudm 18, 18, 11
- vsrd 12, 18, 31
- vaddudm 15, 15, 10
-
- vsrd 11, 15, 31
- vand 8, 18, 25
- vand 5, 15, 25
- vaddudm 4, 4, 12
- vsld 10, 12, 9
- vaddudm 6, 16, 11
-
- vsrd 13, 6, 31
- vand 6, 6, 25
- vaddudm 4, 4, 10
- vsrd 10, 4, 31
- vaddudm 7, 7, 13
-
- vsrd 11, 7, 31
- vand 7, 7, 25
- vand 4, 4, 25
- vaddudm 5, 5, 10
- vaddudm 8, 8, 11
- blr
-SYM_FUNC_END(do_mul)
-
-#
-# init key
-#
-.macro do_poly1305_init
- addis 10, 2, rmask@toc@ha
- addi 10, 10, rmask@toc@l
-
- ld 11, 0(10)
- ld 12, 8(10)
-
- li 14, 16
- li 15, 32
- addis 10, 2, cnum@toc@ha
- addi 10, 10, cnum@toc@l
- lvx 25, 0, 10 # v25 - mask
- lvx 31, 14, 10 # v31 = 1a
- lvx 19, 15, 10 # v19 = 1 << 24
- lxv 24, 48(10) # vs24
- lxv 25, 64(10) # vs25
-
- # initialize
- # load key from r3 to vectors
- ld 9, 24(3)
- ld 10, 32(3)
- and. 9, 9, 11
- and. 10, 10, 12
-
- # break 26 bits
- extrdi 14, 9, 26, 38
- extrdi 15, 9, 26, 12
- extrdi 16, 9, 12, 0
- mtvsrdd 58, 0, 14
- insrdi 16, 10, 14, 38
- mtvsrdd 59, 0, 15
- extrdi 17, 10, 26, 24
- mtvsrdd 60, 0, 16
- extrdi 18, 10, 24, 0
- mtvsrdd 61, 0, 17
- mtvsrdd 62, 0, 18
-
- # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
- li 9, 5
- mtvsrdd 36, 0, 9
- vmulouw 0, 27, 4 # v0 = rr0
- vmulouw 1, 28, 4 # v1 = rr1
- vmulouw 2, 29, 4 # v2 = rr2
- vmulouw 3, 30, 4 # v3 = rr3
-.endm
-
-#
-# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
-# k = 32 bytes key
-# r3 = k (r, s)
-# r4 = mlen
-# r5 = m
-#
-SYM_FUNC_START(poly1305_p10le_4blocks)
-.align 5
- cmpdi 5, 64
- blt Out_no_poly1305
-
- SAVE_REGS
-
- do_poly1305_init
-
- li 21, 0 # counter to message
-
- poly1305_setup_r
-
- # load previous H state
- # break/convert r6 to 26 bits
- ld 9, 0(3)
- ld 10, 8(3)
- ld 19, 16(3)
- sldi 19, 19, 24
- mtvsrdd 41, 0, 19
- extrdi 14, 9, 26, 38
- extrdi 15, 9, 26, 12
- extrdi 16, 9, 12, 0
- mtvsrdd 36, 0, 14
- insrdi 16, 10, 14, 38
- mtvsrdd 37, 0, 15
- extrdi 17, 10, 26, 24
- mtvsrdd 38, 0, 16
- extrdi 18, 10, 24, 0
- mtvsrdd 39, 0, 17
- mtvsrdd 40, 0, 18
- vor 8, 8, 9
-
- # input m1 m2
- add 20, 4, 21
- xxlor 49, 24, 24
- xxlor 50, 25, 25
- lxvw4x 43, 0, 20
- addi 17, 20, 16
- lxvw4x 44, 0, 17
- vperm 14, 11, 12, 17
- vperm 15, 11, 12, 18
- vand 9, 14, 25 # a0
- vsrd 10, 14, 31 # >> 26
- vsrd 11, 10, 31 # 12 bits left
- vand 10, 10, 25 # a1
- vspltisb 13, 12
- vand 16, 15, 25
- vsld 12, 16, 13
- vor 11, 11, 12
- vand 11, 11, 25 # a2
- vspltisb 13, 14
- vsrd 12, 15, 13 # >> 14
- vsrd 13, 12, 31 # >> 26, a4
- vand 12, 12, 25 # a3
-
- vaddudm 20, 4, 9
- vaddudm 21, 5, 10
- vaddudm 22, 6, 11
- vaddudm 23, 7, 12
- vaddudm 24, 8, 13
-
- # m3 m4
- addi 17, 17, 16
- lxvw4x 43, 0, 17
- addi 17, 17, 16
- lxvw4x 44, 0, 17
- vperm 14, 11, 12, 17
- vperm 15, 11, 12, 18
- vand 9, 14, 25 # a0
- vsrd 10, 14, 31 # >> 26
- vsrd 11, 10, 31 # 12 bits left
- vand 10, 10, 25 # a1
- vspltisb 13, 12
- vand 16, 15, 25
- vsld 12, 16, 13
- vspltisb 13, 14
- vor 11, 11, 12
- vand 11, 11, 25 # a2
- vsrd 12, 15, 13 # >> 14
- vsrd 13, 12, 31 # >> 26, a4
- vand 12, 12, 25 # a3
-
- # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
- vmrgow 4, 9, 20
- vmrgow 5, 10, 21
- vmrgow 6, 11, 22
- vmrgow 7, 12, 23
- vmrgow 8, 13, 24
- vaddudm 8, 8, 19
-
- addi 5, 5, -64 # len -= 64
- addi 21, 21, 64 # offset += 64
-
- li 9, 64
- divdu 31, 5, 9
-
- cmpdi 31, 0
- ble Skip_block_loop
-
- mtctr 31
-
-# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
-# Rewrite the polynominal sum of product as follows,
-# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2
-# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r*4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h0 + m4) r^2
-# .... Repeat
-# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 -->
-# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
-#
-loop_4blocks:
-
- # Multiply odd words and even words
- mul_odd
- mul_even
- # carry reduction
- vspltisb 9, 2
- vsrd 10, 14, 31
- vsrd 11, 17, 31
- vand 7, 17, 25
- vand 4, 14, 25
- vaddudm 18, 18, 11
- vsrd 12, 18, 31
- vaddudm 15, 15, 10
-
- vsrd 11, 15, 31
- vand 8, 18, 25
- vand 5, 15, 25
- vaddudm 4, 4, 12
- vsld 10, 12, 9
- vaddudm 6, 16, 11
-
- vsrd 13, 6, 31
- vand 6, 6, 25
- vaddudm 4, 4, 10
- vsrd 10, 4, 31
- vaddudm 7, 7, 13
-
- vsrd 11, 7, 31
- vand 7, 7, 25
- vand 4, 4, 25
- vaddudm 5, 5, 10
- vaddudm 8, 8, 11
-
- # input m1 m2 m3 m4
- add 20, 4, 21
- xxlor 49, 24, 24
- xxlor 50, 25, 25
- lxvw4x 43, 0, 20
- addi 17, 20, 16
- lxvw4x 44, 0, 17
- vperm 14, 11, 12, 17
- vperm 15, 11, 12, 18
- addi 17, 17, 16
- lxvw4x 43, 0, 17
- addi 17, 17, 16
- lxvw4x 44, 0, 17
- vperm 17, 11, 12, 17
- vperm 18, 11, 12, 18
-
- vand 20, 14, 25 # a0
- vand 9, 17, 25 # a0
- vsrd 21, 14, 31 # >> 26
- vsrd 22, 21, 31 # 12 bits left
- vsrd 10, 17, 31 # >> 26
- vsrd 11, 10, 31 # 12 bits left
-
- vand 21, 21, 25 # a1
- vand 10, 10, 25 # a1
-
- vspltisb 13, 12
- vand 16, 15, 25
- vsld 23, 16, 13
- vor 22, 22, 23
- vand 22, 22, 25 # a2
- vand 16, 18, 25
- vsld 12, 16, 13
- vor 11, 11, 12
- vand 11, 11, 25 # a2
- vspltisb 13, 14
- vsrd 23, 15, 13 # >> 14
- vsrd 24, 23, 31 # >> 26, a4
- vand 23, 23, 25 # a3
- vsrd 12, 18, 13 # >> 14
- vsrd 13, 12, 31 # >> 26, a4
- vand 12, 12, 25 # a3
-
- vaddudm 4, 4, 20
- vaddudm 5, 5, 21
- vaddudm 6, 6, 22
- vaddudm 7, 7, 23
- vaddudm 8, 8, 24
-
- # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
- vmrgow 4, 9, 4
- vmrgow 5, 10, 5
- vmrgow 6, 11, 6
- vmrgow 7, 12, 7
- vmrgow 8, 13, 8
- vaddudm 8, 8, 19
-
- addi 5, 5, -64 # len -= 64
- addi 21, 21, 64 # offset += 64
-
- bdnz loop_4blocks
-
-Skip_block_loop:
- xxlor 58, 0, 0
- xxlor 59, 1, 1
- xxlor 60, 2, 2
- xxlor 61, 3, 3
- xxlor 62, 4, 4
- xxlor 32, 5, 5
- xxlor 33, 6, 6
- xxlor 34, 7, 7
- xxlor 35, 8, 8
-
- # Multiply odd words and even words
- mul_odd
- mul_even
-
- # Sum the products.
- xxpermdi 41, 31, 46, 0
- xxpermdi 42, 31, 47, 0
- vaddudm 4, 14, 9
- xxpermdi 36, 31, 36, 3
- vaddudm 5, 15, 10
- xxpermdi 37, 31, 37, 3
- xxpermdi 43, 31, 48, 0
- vaddudm 6, 16, 11
- xxpermdi 38, 31, 38, 3
- xxpermdi 44, 31, 49, 0
- vaddudm 7, 17, 12
- xxpermdi 39, 31, 39, 3
- xxpermdi 45, 31, 50, 0
- vaddudm 8, 18, 13
- xxpermdi 40, 31, 40, 3
-
- # carry reduction
- vspltisb 9, 2
- vsrd 10, 4, 31
- vsrd 11, 7, 31
- vand 7, 7, 25
- vand 4, 4, 25
- vaddudm 8, 8, 11
- vsrd 12, 8, 31
- vaddudm 5, 5, 10
-
- vsrd 11, 5, 31
- vand 8, 8, 25
- vand 5, 5, 25
- vaddudm 4, 4, 12
- vsld 10, 12, 9
- vaddudm 6, 6, 11
-
- vsrd 13, 6, 31
- vand 6, 6, 25
- vaddudm 4, 4, 10
- vsrd 10, 4, 31
- vaddudm 7, 7, 13
-
- vsrd 11, 7, 31
- vand 7, 7, 25
- vand 4, 4, 25
- vaddudm 5, 5, 10
- vsrd 10, 5, 31
- vand 5, 5, 25
- vaddudm 6, 6, 10
- vaddudm 8, 8, 11
-
- b do_final_update
-
-do_final_update:
- # combine 26 bit limbs
- # v4, v5, v6, v7 and v8 are 26 bit vectors
- vsld 5, 5, 31
- vor 20, 4, 5
- vspltisb 11, 12
- vsrd 12, 6, 11
- vsld 6, 6, 31
- vsld 6, 6, 31
- vor 20, 20, 6
- vspltisb 11, 14
- vsld 7, 7, 11
- vor 21, 7, 12
- mfvsrld 16, 40 # save last 2 bytes
- vsld 8, 8, 11
- vsld 8, 8, 31
- vor 21, 21, 8
- mfvsrld 17, 52
- mfvsrld 19, 53
- srdi 16, 16, 24
-
- std 17, 0(3)
- std 19, 8(3)
- stw 16, 16(3)
-
-Out_loop:
- li 3, 0
-
- RESTORE_REGS
-
- blr
-
-Out_no_poly1305:
- li 3, 0
- blr
-SYM_FUNC_END(poly1305_p10le_4blocks)
-
-#
-# =======================================================================
-# The following functions implement 64 x 64 bits multiplication poly1305.
-#
-SYM_FUNC_START_LOCAL(Poly1305_init_64)
- # mask 0x0FFFFFFC0FFFFFFC
- # mask 0x0FFFFFFC0FFFFFFF
- addis 10, 2, rmask@toc@ha
- addi 10, 10, rmask@toc@l
- ld 11, 0(10)
- ld 12, 8(10)
-
- # initialize
- # load key from r3
- ld 9, 24(3)
- ld 10, 32(3)
- and. 9, 9, 11 # cramp mask r0
- and. 10, 10, 12 # cramp mask r1
-
- srdi 21, 10, 2
- add 19, 21, 10 # s1: r19 - (r1 >> 2) *5
-
- # setup r and s
- li 25, 0
- mtvsrdd 32+0, 9, 19 # r0, s1
- mtvsrdd 32+1, 10, 9 # r1, r0
- mtvsrdd 32+2, 19, 25 # s1
- mtvsrdd 32+3, 9, 25 # r0
-
- blr
-SYM_FUNC_END(Poly1305_init_64)
-
-# Poly1305_mult
-# v6 = (h0, h1), v8 = h2
-# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
-#
-# Output: v7, v10, v11
-#
-SYM_FUNC_START_LOCAL(Poly1305_mult)
- #
- # d0 = h0 * r0 + h1 * s1
- vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1
-
- # d1 = h0 * r1 + h1 * r0 + h2 * s1
- vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0
- vmsumudm 10, 8, 2, 11 # d1 += h2 * s1
-
- # d2 = r0
- vmsumudm 11, 8, 3, 9 # d2 = h2 * r0
- blr
-SYM_FUNC_END(Poly1305_mult)
-
-#
-# carry reduction
-# h %=p
-#
-# Input: v7, v10, v11
-# Output: r27, r28, r29
-#
-SYM_FUNC_START_LOCAL(Carry_reduction)
- mfvsrld 27, 32+7
- mfvsrld 28, 32+10
- mfvsrld 29, 32+11
- mfvsrd 20, 32+7 # h0.h
- mfvsrd 21, 32+10 # h1.h
-
- addc 28, 28, 20
- adde 29, 29, 21
- srdi 22, 29, 0x2
- sldi 23, 22, 0x2
- add 23, 23, 22 # (h2 & 3) * 5
- addc 27, 27, 23 # h0
- addze 28, 28 # h1
- andi. 29, 29, 0x3 # h2
- blr
-SYM_FUNC_END(Carry_reduction)
-
-#
-# poly1305 multiplication
-# h *= r, h %= p
-# d0 = h0 * r0 + h1 * s1
-# d1 = h0 * r1 + h1 * r0 + h2 * s1
-# d2 = h0 * r0
-#
-#
-# unsigned int poly1305_test_64s(unisgned char *state, const byte *src, size_t len, highbit)
-# - no highbit if final leftover block (highbit = 0)
-#
-SYM_FUNC_START(poly1305_64s)
- cmpdi 5, 0
- ble Out_no_poly1305_64
-
- mflr 0
- std 0, 16(1)
- stdu 1,-400(1)
-
- SAVE_GPR 14, 112, 1
- SAVE_GPR 15, 120, 1
- SAVE_GPR 16, 128, 1
- SAVE_GPR 17, 136, 1
- SAVE_GPR 18, 144, 1
- SAVE_GPR 19, 152, 1
- SAVE_GPR 20, 160, 1
- SAVE_GPR 21, 168, 1
- SAVE_GPR 22, 176, 1
- SAVE_GPR 23, 184, 1
- SAVE_GPR 24, 192, 1
- SAVE_GPR 25, 200, 1
- SAVE_GPR 26, 208, 1
- SAVE_GPR 27, 216, 1
- SAVE_GPR 28, 224, 1
- SAVE_GPR 29, 232, 1
- SAVE_GPR 30, 240, 1
- SAVE_GPR 31, 248, 1
-
- # Init poly1305
- bl Poly1305_init_64
-
- li 25, 0 # offset to inp and outp
-
- add 11, 25, 4
-
- # load h
- # h0, h1, h2?
- ld 27, 0(3)
- ld 28, 8(3)
- lwz 29, 16(3)
-
- li 30, 16
- divdu 31, 5, 30
-
- mtctr 31
-
- mr 24, 6 # highbit
-
-Loop_block_64:
- vxor 9, 9, 9
-
- ld 20, 0(11)
- ld 21, 8(11)
- addi 11, 11, 16
-
- addc 27, 27, 20
- adde 28, 28, 21
- adde 29, 29, 24
-
- li 22, 0
- mtvsrdd 32+6, 27, 28 # h0, h1
- mtvsrdd 32+8, 29, 22 # h2
-
- bl Poly1305_mult
-
- bl Carry_reduction
-
- bdnz Loop_block_64
-
- std 27, 0(3)
- std 28, 8(3)
- stw 29, 16(3)
-
- li 3, 0
-
- RESTORE_GPR 14, 112, 1
- RESTORE_GPR 15, 120, 1
- RESTORE_GPR 16, 128, 1
- RESTORE_GPR 17, 136, 1
- RESTORE_GPR 18, 144, 1
- RESTORE_GPR 19, 152, 1
- RESTORE_GPR 20, 160, 1
- RESTORE_GPR 21, 168, 1
- RESTORE_GPR 22, 176, 1
- RESTORE_GPR 23, 184, 1
- RESTORE_GPR 24, 192, 1
- RESTORE_GPR 25, 200, 1
- RESTORE_GPR 26, 208, 1
- RESTORE_GPR 27, 216, 1
- RESTORE_GPR 28, 224, 1
- RESTORE_GPR 29, 232, 1
- RESTORE_GPR 30, 240, 1
- RESTORE_GPR 31, 248, 1
-
- addi 1, 1, 400
- ld 0, 16(1)
- mtlr 0
-
- blr
-
-Out_no_poly1305_64:
- li 3, 0
- blr
-SYM_FUNC_END(poly1305_64s)
-
-#
-# Input: r3 = h, r4 = s, r5 = mac
-# mac = h + s
-#
-SYM_FUNC_START(poly1305_emit_64)
- ld 10, 0(3)
- ld 11, 8(3)
- ld 12, 16(3)
-
- # compare modulus
- # h + 5 + (-p)
- mr 6, 10
- mr 7, 11
- mr 8, 12
- addic. 6, 6, 5
- addze 7, 7
- addze 8, 8
- srdi 9, 8, 2 # overflow?
- cmpdi 9, 0
- beq Skip_h64
- mr 10, 6
- mr 11, 7
- mr 12, 8
-
-Skip_h64:
- ld 6, 0(4)
- ld 7, 8(4)
- addc 10, 10, 6
- adde 11, 11, 7
- addze 12, 12
-
- std 10, 0(5)
- std 11, 8(5)
- blr
-SYM_FUNC_END(poly1305_emit_64)
-
-SYM_DATA_START_LOCAL(RMASK)
-.align 5
-rmask:
-.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
-cnum:
-.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
-.long 0x1a, 0x00, 0x1a, 0x00
-.long 0x01000000, 0x01000000, 0x01000000, 0x01000000
-.long 0x00010203, 0x04050607, 0x10111213, 0x14151617
-.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
-SYM_DATA_END(RMASK)
diff --git a/arch/powerpc/lib/crypto/sha256-spe-asm.S b/arch/powerpc/lib/crypto/sha256-spe-asm.S
deleted file mode 100644
index cd99d71dae34..000000000000
--- a/arch/powerpc/lib/crypto/sha256-spe-asm.S
+++ /dev/null
@@ -1,318 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Fast SHA-256 implementation for SPE instruction set (PPC)
- *
- * This code makes use of the SPE SIMD instruction set as defined in
- * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
- * Implementation is based on optimization guide notes from
- * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- */
-
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-
-#define rHP r3 /* pointer to hash values in memory */
-#define rKP r24 /* pointer to round constants */
-#define rWP r4 /* pointer to input data */
-
-#define rH0 r5 /* 8 32 bit hash values in 8 registers */
-#define rH1 r6
-#define rH2 r7
-#define rH3 r8
-#define rH4 r9
-#define rH5 r10
-#define rH6 r11
-#define rH7 r12
-
-#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */
-#define rW1 r15
-#define rW2 r16
-#define rW3 r17
-#define rW4 r18
-#define rW5 r19
-#define rW6 r20
-#define rW7 r21
-
-#define rT0 r22 /* 64 bit temporaries */
-#define rT1 r23
-#define rT2 r0 /* 32 bit temporaries */
-#define rT3 r25
-
-#define CMP_KN_LOOP
-#define CMP_KC_LOOP \
- cmpwi rT1,0;
-
-#define INITIALIZE \
- stwu r1,-128(r1); /* create stack frame */ \
- evstdw r14,8(r1); /* We must save non volatile */ \
- evstdw r15,16(r1); /* registers. Take the chance */ \
- evstdw r16,24(r1); /* and save the SPE part too */ \
- evstdw r17,32(r1); \
- evstdw r18,40(r1); \
- evstdw r19,48(r1); \
- evstdw r20,56(r1); \
- evstdw r21,64(r1); \
- evstdw r22,72(r1); \
- evstdw r23,80(r1); \
- stw r24,88(r1); /* save normal registers */ \
- stw r25,92(r1);
-
-
-#define FINALIZE \
- evldw r14,8(r1); /* restore SPE registers */ \
- evldw r15,16(r1); \
- evldw r16,24(r1); \
- evldw r17,32(r1); \
- evldw r18,40(r1); \
- evldw r19,48(r1); \
- evldw r20,56(r1); \
- evldw r21,64(r1); \
- evldw r22,72(r1); \
- evldw r23,80(r1); \
- lwz r24,88(r1); /* restore normal registers */ \
- lwz r25,92(r1); \
- xor r0,r0,r0; \
- stw r0,8(r1); /* Delete sensitive data */ \
- stw r0,16(r1); /* that we might have pushed */ \
- stw r0,24(r1); /* from other context that runs */ \
- stw r0,32(r1); /* the same code. Assume that */ \
- stw r0,40(r1); /* the lower part of the GPRs */ \
- stw r0,48(r1); /* was already overwritten on */ \
- stw r0,56(r1); /* the way down to here */ \
- stw r0,64(r1); \
- stw r0,72(r1); \
- stw r0,80(r1); \
- addi r1,r1,128; /* cleanup stack frame */
-
-#ifdef __BIG_ENDIAN__
-#define LOAD_DATA(reg, off) \
- lwz reg,off(rWP); /* load data */
-#define NEXT_BLOCK \
- addi rWP,rWP,64; /* increment per block */
-#else
-#define LOAD_DATA(reg, off) \
- lwbrx reg,0,rWP; /* load data */ \
- addi rWP,rWP,4; /* increment per word */
-#define NEXT_BLOCK /* nothing to do */
-#endif
-
-#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
- LOAD_DATA(w, off) /* 1: W */ \
- rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \
- rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \
- rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \
- xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \
- and rT3,e,f; /* 1: ch = e and f */ \
- xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \
- andc rT1,g,e; /* 1: ch' = ~e and g */ \
- lwz rT2,off(rKP); /* 1: K */ \
- xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \
- add h,h,rT0; /* 1: temp1 = h + S1 */ \
- add rT3,rT3,w; /* 1: temp1' = ch + w */ \
- rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \
- add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \
- rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \
- add h,h,rT2; /* 1: temp1 = temp1 + K */ \
- rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \
- xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \
- add d,d,h; /* 1: d = d + temp1 */ \
- xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \
- evmergelo w,w,w; /* shift W */ \
- or rT2,a,b; /* 1: maj = a or b */ \
- and rT1,a,b; /* 1: maj' = a and b */ \
- and rT2,rT2,c; /* 1: maj = maj and c */ \
- LOAD_DATA(w, off+4) /* 2: W */ \
- or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \
- rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \
- add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \
- rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \
- add h,h,rT3; /* 1: h = temp1 + temp2 */ \
- rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \
- xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \
- and rT3,d,e; /* 2: ch = e and f */ \
- xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \
- andc rT1,f,d; /* 2: ch' = ~e and g */ \
- lwz rT2,off+4(rKP); /* 2: K */ \
- xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \
- add g,g,rT0; /* 2: temp1 = h + S1 */ \
- add rT3,rT3,w; /* 2: temp1' = ch + w */ \
- rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \
- add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \
- rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \
- add g,g,rT2; /* 2: temp1 = temp1 + K */ \
- rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \
- xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \
- or rT2,h,a; /* 2: maj = a or b */ \
- xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \
- and rT1,h,a; /* 2: maj' = a and b */ \
- and rT2,rT2,b; /* 2: maj = maj and c */ \
- add c,c,g; /* 2: d = d + temp1 */ \
- or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \
- add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \
- add g,g,rT3 /* 2: h = temp1 + temp2 */
-
-#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
- rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \
- evmergelohi rT0,w0,w1; /* w[-15] */ \
- rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \
- evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \
- xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \
- evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \
- rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \
- evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \
- xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \
- evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \
- add h,h,rT2; /* 1: temp1 = h + S1 */ \
- evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \
- and rT2,e,f; /* 1: ch = e and f */ \
- evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \
- andc rT3,g,e; /* 1: ch' = ~e and g */ \
- evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \
- xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \
- evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \
- add h,h,rT2; /* 1: temp1 = temp1 + ch */ \
- evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \
- rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \
- evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \
- rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \
- evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \
- xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \
- evldw rT1,off(rKP); /* k */ \
- rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \
- evaddw w0,w0,rT0; /* w = w + s1 */ \
- xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \
- evmergelohi rT0,w4,w5; /* w[-7] */ \
- and rT3,a,b; /* 1: maj = a and b */ \
- evaddw w0,w0,rT0; /* w = w + w[-7] */ \
- CMP_K##k##_LOOP \
- add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \
- evaddw rT1,rT1,w0; /* wk = w + k */ \
- xor rT3,a,b; /* 1: maj = a xor b */ \
- evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \
- and rT3,rT3,c; /* 1: maj = maj and c */ \
- add h,h,rT0; /* 1: temp1 = temp1 + wk */ \
- add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \
- add g,g,rT1; /* 2: temp1 = temp1 + wk */ \
- add d,d,h; /* 1: d = d + temp1 */ \
- rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \
- add h,h,rT2; /* 1: h = temp1 + temp2 */ \
- rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \
- rotrwi rT2,d,25; /* 2: S" = e rotr 25 */ \
- xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \
- and rT3,d,e; /* 2: ch = e and f */ \
- xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \
- andc rT1,f,d; /* 2: ch' = ~e and g */ \
- add g,g,rT0; /* 2: temp1 = h + S1 */ \
- xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \
- rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \
- add g,g,rT3; /* 2: temp1 = temp1 + ch */ \
- rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \
- rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \
- xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \
- or rT2,h,a; /* 2: maj = a or b */ \
- and rT1,h,a; /* 2: maj' = a and b */ \
- and rT2,rT2,b; /* 2: maj = maj and c */ \
- xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \
- or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \
- add c,c,g; /* 2: d = d + temp1 */ \
- add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \
- add g,g,rT3 /* 2: h = temp1 + temp2 */
-
-_GLOBAL(ppc_spe_sha256_transform)
- INITIALIZE
-
- mtctr r5
- lwz rH0,0(rHP)
- lwz rH1,4(rHP)
- lwz rH2,8(rHP)
- lwz rH3,12(rHP)
- lwz rH4,16(rHP)
- lwz rH5,20(rHP)
- lwz rH6,24(rHP)
- lwz rH7,28(rHP)
-
-ppc_spe_sha256_main:
- lis rKP,PPC_SPE_SHA256_K@ha
- addi rKP,rKP,PPC_SPE_SHA256_K@l
-
- R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
- R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
- R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
- R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
- R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
- R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
- R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
- R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
-ppc_spe_sha256_16_rounds:
- addi rKP,rKP,64
- R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
- rW0, rW1, rW4, rW5, rW7, N, 0)
- R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
- rW1, rW2, rW5, rW6, rW0, N, 8)
- R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
- rW2, rW3, rW6, rW7, rW1, N, 16)
- R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
- rW3, rW4, rW7, rW0, rW2, N, 24)
- R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
- rW4, rW5, rW0, rW1, rW3, N, 32)
- R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
- rW5, rW6, rW1, rW2, rW4, N, 40)
- R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
- rW6, rW7, rW2, rW3, rW5, N, 48)
- R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
- rW7, rW0, rW3, rW4, rW6, C, 56)
- bt gt,ppc_spe_sha256_16_rounds
-
- lwz rW0,0(rHP)
- NEXT_BLOCK
- lwz rW1,4(rHP)
- lwz rW2,8(rHP)
- lwz rW3,12(rHP)
- lwz rW4,16(rHP)
- lwz rW5,20(rHP)
- lwz rW6,24(rHP)
- lwz rW7,28(rHP)
-
- add rH0,rH0,rW0
- stw rH0,0(rHP)
- add rH1,rH1,rW1
- stw rH1,4(rHP)
- add rH2,rH2,rW2
- stw rH2,8(rHP)
- add rH3,rH3,rW3
- stw rH3,12(rHP)
- add rH4,rH4,rW4
- stw rH4,16(rHP)
- add rH5,rH5,rW5
- stw rH5,20(rHP)
- add rH6,rH6,rW6
- stw rH6,24(rHP)
- add rH7,rH7,rW7
- stw rH7,28(rHP)
-
- bdnz ppc_spe_sha256_main
-
- FINALIZE
- blr
-
-.data
-.align 5
-PPC_SPE_SHA256_K:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
diff --git a/arch/powerpc/lib/crypto/sha256.c b/arch/powerpc/lib/crypto/sha256.c
deleted file mode 100644
index 6b0f079587eb..000000000000
--- a/arch/powerpc/lib/crypto/sha256.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 Secure Hash Algorithm, SPE optimized
- *
- * Based on generic implementation. The assembler module takes care
- * about the SPE registers so it can run from interrupt context.
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- */
-
-#include <asm/switch_to.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/preempt.h>
-
-/*
- * MAX_BYTES defines the number of bytes that are allowed to be processed
- * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000
- * operations per 64 bytes. e500 cores can issue two arithmetic instructions
- * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
- * Thus 1KB of input data will need an estimated maximum of 18,000 cycles.
- * Headroom for cache misses included. Even with the low end model clocked
- * at 667 MHz this equals to a critical time window of less than 27us.
- *
- */
-#define MAX_BYTES 1024
-
-extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks);
-
-static void spe_begin(void)
-{
- /* We just start SPE operations and will save SPE registers later. */
- preempt_disable();
- enable_kernel_spe();
-}
-
-static void spe_end(void)
-{
- disable_kernel_spe();
- /* reenable preemption */
- preempt_enable();
-}
-
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- do {
- /* cut input data into smaller blocks */
- u32 unit = min_t(size_t, nblocks,
- MAX_BYTES / SHA256_BLOCK_SIZE);
-
- spe_begin();
- ppc_spe_sha256_transform(state, data, unit);
- spe_end();
-
- data += unit * SHA256_BLOCK_SIZE;
- nblocks -= unit;
- } while (nblocks);
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-bool sha256_is_arch_optimized(void)
-{
- return true;
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm, SPE optimized");
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 5158aefe4873..4693c464fc5a 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -343,7 +343,7 @@ static inline bool hash_supports_debug_pagealloc(void)
static u8 *linear_map_hash_slots;
static unsigned long linear_map_hash_count;
static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
-static void hash_debug_pagealloc_alloc_slots(void)
+static __init void hash_debug_pagealloc_alloc_slots(void)
{
if (!hash_supports_debug_pagealloc())
return;
@@ -409,7 +409,7 @@ static DEFINE_RAW_SPINLOCK(linear_map_kf_hash_lock);
static phys_addr_t kfence_pool;
-static inline void hash_kfence_alloc_pool(void)
+static __init void hash_kfence_alloc_pool(void)
{
if (!kfence_early_init_enabled())
goto err;
@@ -445,7 +445,7 @@ err:
disable_kfence();
}
-static inline void hash_kfence_map_pool(void)
+static __init void hash_kfence_map_pool(void)
{
unsigned long kfence_pool_start, kfence_pool_end;
unsigned long prot = pgprot_val(PAGE_KERNEL);
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 9f764bc42b8c..376dba5992f2 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -363,7 +363,7 @@ static int __meminit create_physical_mapping(unsigned long start,
}
#ifdef CONFIG_KFENCE
-static inline phys_addr_t alloc_kfence_pool(void)
+static __init phys_addr_t alloc_kfence_pool(void)
{
phys_addr_t kfence_pool;
@@ -393,7 +393,7 @@ no_kfence:
return 0;
}
-static inline void map_kfence_pool(phys_addr_t kfence_pool)
+static __init void map_kfence_pool(phys_addr_t kfence_pool)
{
if (!kfence_pool)
return;
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 9f9e4b871627..7ec60290abe6 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -143,42 +143,13 @@ spufs_evict_inode(struct inode *inode)
put_spu_gang(ei->i_gang);
}
-static void spufs_prune_dir(struct dentry *dir)
-{
- struct dentry *dentry;
- struct hlist_node *n;
-
- inode_lock(d_inode(dir));
- hlist_for_each_entry_safe(dentry, n, &dir->d_children, d_sib) {
- spin_lock(&dentry->d_lock);
- if (simple_positive(dentry)) {
- dget_dlock(dentry);
- __d_drop(dentry);
- spin_unlock(&dentry->d_lock);
- simple_unlink(d_inode(dir), dentry);
- /* XXX: what was dcache_lock protecting here? Other
- * filesystems (IB, configfs) release dcache_lock
- * before unlink */
- dput(dentry);
- } else {
- spin_unlock(&dentry->d_lock);
- }
- }
- shrink_dcache_parent(dir);
- inode_unlock(d_inode(dir));
-}
-
/* Caller must hold parent->i_mutex */
-static int spufs_rmdir(struct inode *parent, struct dentry *dir)
+static void spufs_rmdir(struct inode *parent, struct dentry *dir)
{
- /* remove all entries */
- int res;
- spufs_prune_dir(dir);
- d_drop(dir);
- res = simple_rmdir(parent, dir);
- /* We have to give up the mm_struct */
- spu_forget(SPUFS_I(d_inode(dir))->i_ctx);
- return res;
+ struct spu_context *ctx = SPUFS_I(d_inode(dir))->i_ctx;
+
+ locked_recursive_removal(dir, NULL);
+ spu_forget(ctx);
}
static int spufs_fill_dir(struct dentry *dir,
@@ -222,15 +193,13 @@ static int spufs_dir_close(struct inode *inode, struct file *file)
{
struct inode *parent;
struct dentry *dir;
- int ret;
dir = file->f_path.dentry;
parent = d_inode(dir->d_parent);
inode_lock_nested(parent, I_MUTEX_PARENT);
- ret = spufs_rmdir(parent, dir);
+ spufs_rmdir(parent, dir);
inode_unlock(parent);
- WARN_ON(ret);
unuse_gang(dir->d_parent);
return dcache_dir_close(inode, file);
@@ -288,11 +257,11 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags,
ret = spufs_fill_dir(dentry, spufs_dir_debug_contents,
mode, ctx);
+ inode_unlock(inode);
+
if (ret)
spufs_rmdir(dir, dentry);
- inode_unlock(inode);
-
return ret;
}
@@ -475,7 +444,7 @@ spufs_create_context(struct inode *inode, struct dentry *dentry,
ret = spufs_context_open(&path);
if (ret < 0)
- WARN_ON(spufs_rmdir(inode, dentry));
+ spufs_rmdir(inode, dentry);
out_aff_unlock:
if (affinity)
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index d71ea0f4466f..5352932badd8 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -24,9 +24,6 @@ config RISCV
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_BINFMT_FLAT
- select ARCH_HAS_CRC32 if RISCV_ISA_ZBC
- select ARCH_HAS_CRC64 if 64BIT && RISCV_ISA_ZBC
- select ARCH_HAS_CRC_T10DIF if RISCV_ISA_ZBC
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL if MMU
select ARCH_HAS_DEBUG_VM_PGTABLE
@@ -98,6 +95,7 @@ config RISCV
select CLONE_BACKWARDS
select COMMON_CLK
select CPU_PM if CPU_IDLE || HIBERNATION || SUSPEND
+ select DYNAMIC_FTRACE if FUNCTION_TRACER
select EDAC_SUPPORT
select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && !DYNAMIC_FTRACE)
select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if DYNAMIC_FTRACE
@@ -136,13 +134,13 @@ config RISCV
select HAVE_ARCH_KASAN if MMU && 64BIT
select HAVE_ARCH_KASAN_VMALLOC if MMU && 64BIT
select HAVE_ARCH_KFENCE if MMU && 64BIT
+ select HAVE_ARCH_KSTACK_ERASE
select HAVE_ARCH_KGDB if !XIP_KERNEL
select HAVE_ARCH_KGDB_QXFER_PKT
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP_FILTER
- select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU
@@ -162,7 +160,7 @@ config RISCV
select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
select HAVE_FUNCTION_GRAPH_TRACER if HAVE_DYNAMIC_FTRACE_WITH_ARGS
select HAVE_FUNCTION_GRAPH_FREGS
- select HAVE_FUNCTION_TRACER if !XIP_KERNEL
+ select HAVE_FUNCTION_TRACER if !XIP_KERNEL && HAVE_DYNAMIC_FTRACE
select HAVE_EBPF_JIT if MMU
select HAVE_GUP_FAST if MMU
select HAVE_FUNCTION_ARG_ACCESS_API
diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig
index cd9b776602f8..a75d6325607b 100644
--- a/arch/riscv/crypto/Kconfig
+++ b/arch/riscv/crypto/Kconfig
@@ -28,17 +28,6 @@ config CRYPTO_GHASH_RISCV64
Architecture: riscv64 using:
- Zvkg vector crypto extension
-config CRYPTO_SHA512_RISCV64
- tristate "Hash functions: SHA-384 and SHA-512"
- depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
- select CRYPTO_SHA512
- help
- SHA-384 and SHA-512 secure hash algorithm (FIPS 180)
-
- Architecture: riscv64 using:
- - Zvknhb vector crypto extension
- - Zvkb vector crypto extension
-
config CRYPTO_SM3_RISCV64
tristate "Hash functions: SM3 (ShangMi 3)"
depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile
index e10e8257734e..183495a95cc0 100644
--- a/arch/riscv/crypto/Makefile
+++ b/arch/riscv/crypto/Makefile
@@ -7,9 +7,6 @@ aes-riscv64-y := aes-riscv64-glue.o aes-riscv64-zvkned.o \
obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o
ghash-riscv64-y := ghash-riscv64-glue.o ghash-riscv64-zvkg.o
-obj-$(CONFIG_CRYPTO_SHA512_RISCV64) += sha512-riscv64.o
-sha512-riscv64-y := sha512-riscv64-glue.o sha512-riscv64-zvknhb-zvkb.o
-
obj-$(CONFIG_CRYPTO_SM3_RISCV64) += sm3-riscv64.o
sm3-riscv64-y := sm3-riscv64-glue.o sm3-riscv64-zvksh-zvkb.o
diff --git a/arch/riscv/crypto/sha512-riscv64-glue.c b/arch/riscv/crypto/sha512-riscv64-glue.c
deleted file mode 100644
index 4634fca78ae2..000000000000
--- a/arch/riscv/crypto/sha512-riscv64-glue.c
+++ /dev/null
@@ -1,124 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-512 and SHA-384 using the RISC-V vector crypto extensions
- *
- * Copyright (C) 2023 VRULL GmbH
- * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
- *
- * Copyright (C) 2023 SiFive, Inc.
- * Author: Jerry Shih <jerry.shih@sifive.com>
- */
-
-#include <asm/simd.h>
-#include <asm/vector.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-/*
- * Note: the asm function only uses the 'state' field of struct sha512_state.
- * It is assumed to be the first field.
- */
-asmlinkage void sha512_transform_zvknhb_zvkb(
- struct sha512_state *state, const u8 *data, int num_blocks);
-
-static void sha512_block(struct sha512_state *state, const u8 *data,
- int num_blocks)
-{
- /*
- * Ensure struct sha512_state begins directly with the SHA-512
- * 512-bit internal state, as this is what the asm function expects.
- */
- BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
-
- if (crypto_simd_usable()) {
- kernel_vector_begin();
- sha512_transform_zvknhb_zvkb(state, data, num_blocks);
- kernel_vector_end();
- } else {
- sha512_generic_block_fn(state, data, num_blocks);
- }
-}
-
-static int riscv64_sha512_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_base_do_update_blocks(desc, data, len, sha512_block);
-}
-
-static int riscv64_sha512_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- sha512_base_do_finup(desc, data, len, sha512_block);
- return sha512_base_finish(desc, out);
-}
-
-static int riscv64_sha512_digest(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha512_base_init(desc) ?:
- riscv64_sha512_finup(desc, data, len, out);
-}
-
-static struct shash_alg riscv64_sha512_algs[] = {
- {
- .init = sha512_base_init,
- .update = riscv64_sha512_update,
- .finup = riscv64_sha512_finup,
- .digest = riscv64_sha512_digest,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA512_DIGEST_SIZE,
- .base = {
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_priority = 300,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_name = "sha512",
- .cra_driver_name = "sha512-riscv64-zvknhb-zvkb",
- .cra_module = THIS_MODULE,
- },
- }, {
- .init = sha384_base_init,
- .update = riscv64_sha512_update,
- .finup = riscv64_sha512_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA384_DIGEST_SIZE,
- .base = {
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_priority = 300,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_name = "sha384",
- .cra_driver_name = "sha384-riscv64-zvknhb-zvkb",
- .cra_module = THIS_MODULE,
- },
- },
-};
-
-static int __init riscv64_sha512_mod_init(void)
-{
- if (riscv_isa_extension_available(NULL, ZVKNHB) &&
- riscv_isa_extension_available(NULL, ZVKB) &&
- riscv_vector_vlen() >= 128)
- return crypto_register_shashes(riscv64_sha512_algs,
- ARRAY_SIZE(riscv64_sha512_algs));
-
- return -ENODEV;
-}
-
-static void __exit riscv64_sha512_mod_exit(void)
-{
- crypto_unregister_shashes(riscv64_sha512_algs,
- ARRAY_SIZE(riscv64_sha512_algs));
-}
-
-module_init(riscv64_sha512_mod_init);
-module_exit(riscv64_sha512_mod_exit);
-
-MODULE_DESCRIPTION("SHA-512 (RISC-V accelerated)");
-MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_CRYPTO("sha512");
-MODULE_ALIAS_CRYPTO("sha384");
diff --git a/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S b/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S
deleted file mode 100644
index 89f4a10d12dd..000000000000
--- a/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S
+++ /dev/null
@@ -1,203 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
-//
-// This file is dual-licensed, meaning that you can use it under your
-// choice of either of the following two licenses:
-//
-// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the Apache License 2.0 (the "License"). You can obtain
-// a copy in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-//
-// or
-//
-// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
-// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
-// Copyright 2024 Google LLC
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions
-// are met:
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// The generated code of this file depends on the following RISC-V extensions:
-// - RV64I
-// - RISC-V Vector ('V') with VLEN >= 128
-// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknhb')
-// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
-
-#include <linux/linkage.h>
-
-.text
-.option arch, +zvknhb, +zvkb
-
-#define STATEP a0
-#define DATA a1
-#define NUM_BLOCKS a2
-
-#define STATEP_C a3
-#define K a4
-
-#define MASK v0
-#define INDICES v1
-#define W0 v10 // LMUL=2
-#define W1 v12 // LMUL=2
-#define W2 v14 // LMUL=2
-#define W3 v16 // LMUL=2
-#define VTMP v20 // LMUL=2
-#define FEBA v22 // LMUL=2
-#define HGDC v24 // LMUL=2
-#define PREV_FEBA v26 // LMUL=2
-#define PREV_HGDC v28 // LMUL=2
-
-// Do 4 rounds of SHA-512. w0 contains the current 4 message schedule words.
-//
-// If not all the message schedule words have been computed yet, then this also
-// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4
-// message schedule words; this macro computes the group after w3 and writes it
-// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3,
-// w0), so the caller must cycle through the registers accordingly.
-.macro sha512_4rounds last, w0, w1, w2, w3
- vle64.v VTMP, (K)
- addi K, K, 32
- vadd.vv VTMP, VTMP, \w0
- vsha2cl.vv HGDC, FEBA, VTMP
- vsha2ch.vv FEBA, HGDC, VTMP
-.if !\last
- vmerge.vvm VTMP, \w2, \w1, MASK
- vsha2ms.vv \w0, VTMP, \w3
-.endif
-.endm
-
-.macro sha512_16rounds last
- sha512_4rounds \last, W0, W1, W2, W3
- sha512_4rounds \last, W1, W2, W3, W0
- sha512_4rounds \last, W2, W3, W0, W1
- sha512_4rounds \last, W3, W0, W1, W2
-.endm
-
-// void sha512_transform_zvknhb_zvkb(u64 state[8], const u8 *data,
-// int num_blocks);
-SYM_FUNC_START(sha512_transform_zvknhb_zvkb)
-
- // Setup mask for the vmerge to replace the first word (idx==0) in
- // message scheduling. There are 4 words, so an 8-bit mask suffices.
- vsetivli zero, 1, e8, m1, ta, ma
- vmv.v.i MASK, 0x01
-
- // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we
- // need {f,e,b,a},{h,g,d,c}. The dst vtype is e64m2 and the index vtype
- // is e8mf4. We use index-load with the i8 indices {40, 32, 8, 0},
- // loaded using the 32-bit little endian value 0x00082028.
- li t0, 0x00082028
- vsetivli zero, 1, e32, m1, ta, ma
- vmv.v.x INDICES, t0
- addi STATEP_C, STATEP, 16
- vsetivli zero, 4, e64, m2, ta, ma
- vluxei8.v FEBA, (STATEP), INDICES
- vluxei8.v HGDC, (STATEP_C), INDICES
-
-.Lnext_block:
- la K, K512
- addi NUM_BLOCKS, NUM_BLOCKS, -1
-
- // Save the previous state, as it's needed later.
- vmv.v.v PREV_FEBA, FEBA
- vmv.v.v PREV_HGDC, HGDC
-
- // Load the next 1024-bit message block and endian-swap each 64-bit word
- vle64.v W0, (DATA)
- vrev8.v W0, W0
- addi DATA, DATA, 32
- vle64.v W1, (DATA)
- vrev8.v W1, W1
- addi DATA, DATA, 32
- vle64.v W2, (DATA)
- vrev8.v W2, W2
- addi DATA, DATA, 32
- vle64.v W3, (DATA)
- vrev8.v W3, W3
- addi DATA, DATA, 32
-
- // Do the 80 rounds of SHA-512.
- sha512_16rounds 0
- sha512_16rounds 0
- sha512_16rounds 0
- sha512_16rounds 0
- sha512_16rounds 1
-
- // Add the previous state.
- vadd.vv FEBA, FEBA, PREV_FEBA
- vadd.vv HGDC, HGDC, PREV_HGDC
-
- // Repeat if more blocks remain.
- bnez NUM_BLOCKS, .Lnext_block
-
- // Store the new state and return.
- vsuxei8.v FEBA, (STATEP), INDICES
- vsuxei8.v HGDC, (STATEP_C), INDICES
- ret
-SYM_FUNC_END(sha512_transform_zvknhb_zvkb)
-
-.section ".rodata"
-.p2align 3
-.type K512, @object
-K512:
- .dword 0x428a2f98d728ae22, 0x7137449123ef65cd
- .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
- .dword 0x3956c25bf348b538, 0x59f111f1b605d019
- .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
- .dword 0xd807aa98a3030242, 0x12835b0145706fbe
- .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
- .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
- .dword 0x9bdc06a725c71235, 0xc19bf174cf692694
- .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
- .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
- .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
- .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
- .dword 0x983e5152ee66dfab, 0xa831c66d2db43210
- .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
- .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
- .dword 0x06ca6351e003826f, 0x142929670a0e6e70
- .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
- .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
- .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
- .dword 0x81c2c92e47edaee6, 0x92722c851482353b
- .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
- .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
- .dword 0xd192e819d6ef5218, 0xd69906245565a910
- .dword 0xf40e35855771202a, 0x106aa07032bbd1b8
- .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
- .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
- .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
- .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
- .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
- .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
- .dword 0x90befffa23631e28, 0xa4506cebde82bde9
- .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
- .dword 0xca273eceea26619c, 0xd186b8c721c0c207
- .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
- .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
- .dword 0x113f9804bef90dae, 0x1b710b35131c471b
- .dword 0x28db77f523047d84, 0x32caab7b40c72493
- .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
- .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
- .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
-.size K512, . - K512
diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h
index 525e50db24f7..b88a6218b7f2 100644
--- a/arch/riscv/include/asm/uaccess.h
+++ b/arch/riscv/include/asm/uaccess.h
@@ -311,8 +311,8 @@ do { \
do { \
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && \
!IS_ALIGNED((uintptr_t)__gu_ptr, sizeof(*__gu_ptr))) { \
- __inttype(x) val = (__inttype(x))x; \
- if (__asm_copy_to_user_sum_enabled(__gu_ptr, &(val), sizeof(*__gu_ptr))) \
+ __inttype(x) ___val = (__inttype(x))x; \
+ if (__asm_copy_to_user_sum_enabled(__gu_ptr, &(___val), sizeof(*__gu_ptr))) \
goto label; \
break; \
} \
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 75656afa2d6b..3a0ec6fd5956 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -220,7 +220,7 @@ SYM_CODE_START_NOALIGN(ret_from_exception)
#endif
bnez s0, 1f
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
call stackleak_erase_on_task_stack
#endif
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index 4c6c24380cfd..8d18d6727f0f 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -14,6 +14,18 @@
#include <asm/text-patching.h>
#ifdef CONFIG_DYNAMIC_FTRACE
+void ftrace_arch_code_modify_prepare(void)
+ __acquires(&text_mutex)
+{
+ mutex_lock(&text_mutex);
+}
+
+void ftrace_arch_code_modify_post_process(void)
+ __releases(&text_mutex)
+{
+ mutex_unlock(&text_mutex);
+}
+
unsigned long ftrace_call_adjust(unsigned long addr)
{
if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
@@ -29,10 +41,8 @@ unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip)
void arch_ftrace_update_code(int command)
{
- mutex_lock(&text_mutex);
command |= FTRACE_MAY_SLEEP;
ftrace_modify_all_code(command);
- mutex_unlock(&text_mutex);
flush_icache_all();
}
@@ -149,6 +159,8 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
unsigned int nops[2], offset;
int ret;
+ guard(mutex)(&text_mutex);
+
ret = ftrace_rec_set_nop_ops(rec);
if (ret)
return ret;
@@ -157,9 +169,7 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
nops[0] = to_auipc_t0(offset);
nops[1] = RISCV_INSN_NOP4;
- mutex_lock(&text_mutex);
ret = patch_insn_write((void *)pc, nops, 2 * MCOUNT_INSN_SIZE);
- mutex_unlock(&text_mutex);
return ret;
}
diff --git a/arch/riscv/kernel/pi/Makefile b/arch/riscv/kernel/pi/Makefile
index 81d69d45c06c..7dd15be69c90 100644
--- a/arch/riscv/kernel/pi/Makefile
+++ b/arch/riscv/kernel/pi/Makefile
@@ -2,7 +2,7 @@
# This file was copied from arm64/kernel/pi/Makefile.
KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \
- -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \
+ -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_KSTACK_ERASE) \
$(call cc-option,-mbranch-protection=none) \
-I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \
-include $(srctree)/include/linux/hidden.h \
diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
index ea67e9fb7a58..8e86305831ea 100644
--- a/arch/riscv/kernel/ptrace.c
+++ b/arch/riscv/kernel/ptrace.c
@@ -186,7 +186,7 @@ static int tagged_addr_ctrl_set(struct task_struct *target,
static const struct user_regset riscv_user_regset[] = {
[REGSET_X] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = ELF_NGREG,
.size = sizeof(elf_greg_t),
.align = sizeof(elf_greg_t),
@@ -195,7 +195,7 @@ static const struct user_regset riscv_user_regset[] = {
},
#ifdef CONFIG_FPU
[REGSET_F] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = ELF_NFPREG,
.size = sizeof(elf_fpreg_t),
.align = sizeof(elf_fpreg_t),
@@ -205,7 +205,7 @@ static const struct user_regset riscv_user_regset[] = {
#endif
#ifdef CONFIG_RISCV_ISA_V
[REGSET_V] = {
- .core_note_type = NT_RISCV_VECTOR,
+ USER_REGSET_NOTE_TYPE(RISCV_VECTOR),
.align = 16,
.n = ((32 * RISCV_MAX_VLENB) +
sizeof(struct __riscv_v_regset_state)) / sizeof(__u32),
@@ -216,7 +216,7 @@ static const struct user_regset riscv_user_regset[] = {
#endif
#ifdef CONFIG_RISCV_ISA_SUPM
[REGSET_TAGGED_ADDR_CTRL] = {
- .core_note_type = NT_RISCV_TAGGED_ADDR_CTRL,
+ USER_REGSET_NOTE_TYPE(RISCV_TAGGED_ADDR_CTRL),
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
@@ -380,7 +380,7 @@ static int compat_riscv_gpr_set(struct task_struct *target,
static const struct user_regset compat_riscv_user_regset[] = {
[REGSET_X] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = ELF_NGREG,
.size = sizeof(compat_elf_greg_t),
.align = sizeof(compat_elf_greg_t),
@@ -389,7 +389,7 @@ static const struct user_regset compat_riscv_user_regset[] = {
},
#ifdef CONFIG_FPU
[REGSET_F] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = ELF_NFPREG,
.size = sizeof(elf_fpreg_t),
.align = sizeof(elf_fpreg_t),
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 9c83848797a7..80230de167de 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -6,6 +6,7 @@
#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/irqflags.h>
#include <linux/randomize_kstack.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
@@ -151,7 +152,9 @@ asmlinkage __visible __trap_section void name(struct pt_regs *regs) \
{ \
if (user_mode(regs)) { \
irqentry_enter_from_user_mode(regs); \
+ local_irq_enable(); \
do_trap_error(regs, signo, code, regs->epc, "Oops - " str); \
+ local_irq_disable(); \
irqentry_exit_to_user_mode(regs); \
} else { \
irqentry_state_t state = irqentry_nmi_enter(regs); \
@@ -173,17 +176,14 @@ asmlinkage __visible __trap_section void do_trap_insn_illegal(struct pt_regs *re
if (user_mode(regs)) {
irqentry_enter_from_user_mode(regs);
-
local_irq_enable();
handled = riscv_v_first_use_handler(regs);
-
- local_irq_disable();
-
if (!handled)
do_trap_error(regs, SIGILL, ILL_ILLOPC, regs->epc,
"Oops - illegal instruction");
+ local_irq_disable();
irqentry_exit_to_user_mode(regs);
} else {
irqentry_state_t state = irqentry_nmi_enter(regs);
@@ -308,9 +308,11 @@ asmlinkage __visible __trap_section void do_trap_break(struct pt_regs *regs)
{
if (user_mode(regs)) {
irqentry_enter_from_user_mode(regs);
+ local_irq_enable();
handle_break(regs);
+ local_irq_disable();
irqentry_exit_to_user_mode(regs);
} else {
irqentry_state_t state = irqentry_nmi_enter(regs);
diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index 93043924fe6c..f760e4fcc052 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -461,7 +461,7 @@ static int handle_scalar_misaligned_load(struct pt_regs *regs)
}
if (!fp)
- SET_RD(insn, regs, val.data_ulong << shift >> shift);
+ SET_RD(insn, regs, (long)(val.data_ulong << shift) >> shift);
else if (len == 8)
set_f64_rd(insn, regs, val.data_u64);
else
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 0baec92d2f55..bbc031124974 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y += crypto/
lib-y += delay.o
lib-y += memcpy.o
lib-y += memset.o
@@ -16,12 +15,6 @@ endif
lib-$(CONFIG_MMU) += uaccess.o
lib-$(CONFIG_64BIT) += tishift.o
lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o
-obj-$(CONFIG_CRC32_ARCH) += crc32-riscv.o
-crc32-riscv-y := crc32.o crc32_msb.o crc32_lsb.o
-obj-$(CONFIG_CRC64_ARCH) += crc64-riscv.o
-crc64-riscv-y := crc64.o crc64_msb.o crc64_lsb.o
-obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-riscv.o
-crc-t10dif-riscv-y := crc-t10dif.o crc16_msb.o
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
lib-$(CONFIG_RISCV_ISA_V) += xor.o
lib-$(CONFIG_RISCV_ISA_V) += riscv_v_helpers.o
diff --git a/arch/riscv/lib/crc-clmul-consts.h b/arch/riscv/lib/crc-clmul-consts.h
deleted file mode 100644
index 8d73449235ef..000000000000
--- a/arch/riscv/lib/crc-clmul-consts.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * CRC constants generated by:
- *
- * ./scripts/gen-crc-consts.py riscv_clmul crc16_msb_0x8bb7,crc32_msb_0x04c11db7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
- *
- * Do not edit manually.
- */
-
-struct crc_clmul_consts {
- unsigned long fold_across_2_longs_const_hi;
- unsigned long fold_across_2_longs_const_lo;
- unsigned long barrett_reduction_const_1;
- unsigned long barrett_reduction_const_2;
-};
-
-/*
- * Constants generated for most-significant-bit-first CRC-16 using
- * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
- */
-static const struct crc_clmul_consts crc16_msb_0x8bb7_consts __maybe_unused = {
-#ifdef CONFIG_64BIT
- .fold_across_2_longs_const_hi = 0x0000000000001faa, /* x^192 mod G */
- .fold_across_2_longs_const_lo = 0x000000000000a010, /* x^128 mod G */
- .barrett_reduction_const_1 = 0xfb2d2bfc0e99d245, /* floor(x^79 / G) */
- .barrett_reduction_const_2 = 0x0000000000008bb7, /* G - x^16 */
-#else
- .fold_across_2_longs_const_hi = 0x00005890, /* x^96 mod G */
- .fold_across_2_longs_const_lo = 0x0000f249, /* x^64 mod G */
- .barrett_reduction_const_1 = 0xfb2d2bfc, /* floor(x^47 / G) */
- .barrett_reduction_const_2 = 0x00008bb7, /* G - x^16 */
-#endif
-};
-
-/*
- * Constants generated for most-significant-bit-first CRC-32 using
- * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
- * x^5 + x^4 + x^2 + x^1 + x^0
- */
-static const struct crc_clmul_consts crc32_msb_0x04c11db7_consts __maybe_unused = {
-#ifdef CONFIG_64BIT
- .fold_across_2_longs_const_hi = 0x00000000c5b9cd4c, /* x^192 mod G */
- .fold_across_2_longs_const_lo = 0x00000000e8a45605, /* x^128 mod G */
- .barrett_reduction_const_1 = 0x826880efa40da72d, /* floor(x^95 / G) */
- .barrett_reduction_const_2 = 0x0000000004c11db7, /* G - x^32 */
-#else
- .fold_across_2_longs_const_hi = 0xf200aa66, /* x^96 mod G */
- .fold_across_2_longs_const_lo = 0x490d678d, /* x^64 mod G */
- .barrett_reduction_const_1 = 0x826880ef, /* floor(x^63 / G) */
- .barrett_reduction_const_2 = 0x04c11db7, /* G - x^32 */
-#endif
-};
-
-/*
- * Constants generated for least-significant-bit-first CRC-32 using
- * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
- * x^5 + x^4 + x^2 + x^1 + x^0
- */
-static const struct crc_clmul_consts crc32_lsb_0xedb88320_consts __maybe_unused = {
-#ifdef CONFIG_64BIT
- .fold_across_2_longs_const_hi = 0x65673b4600000000, /* x^191 mod G */
- .fold_across_2_longs_const_lo = 0x9ba54c6f00000000, /* x^127 mod G */
- .barrett_reduction_const_1 = 0xb4e5b025f7011641, /* floor(x^95 / G) */
- .barrett_reduction_const_2 = 0x00000000edb88320, /* (G - x^32) * x^32 */
-#else
- .fold_across_2_longs_const_hi = 0xccaa009e, /* x^95 mod G */
- .fold_across_2_longs_const_lo = 0xb8bc6765, /* x^63 mod G */
- .barrett_reduction_const_1 = 0xf7011641, /* floor(x^63 / G) */
- .barrett_reduction_const_2 = 0xedb88320, /* (G - x^32) * x^0 */
-#endif
-};
-
-/*
- * Constants generated for least-significant-bit-first CRC-32 using
- * G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 +
- * x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0
- */
-static const struct crc_clmul_consts crc32_lsb_0x82f63b78_consts __maybe_unused = {
-#ifdef CONFIG_64BIT
- .fold_across_2_longs_const_hi = 0x3743f7bd00000000, /* x^191 mod G */
- .fold_across_2_longs_const_lo = 0x3171d43000000000, /* x^127 mod G */
- .barrett_reduction_const_1 = 0x4869ec38dea713f1, /* floor(x^95 / G) */
- .barrett_reduction_const_2 = 0x0000000082f63b78, /* (G - x^32) * x^32 */
-#else
- .fold_across_2_longs_const_hi = 0x493c7d27, /* x^95 mod G */
- .fold_across_2_longs_const_lo = 0xdd45aab8, /* x^63 mod G */
- .barrett_reduction_const_1 = 0xdea713f1, /* floor(x^63 / G) */
- .barrett_reduction_const_2 = 0x82f63b78, /* (G - x^32) * x^0 */
-#endif
-};
-
-/*
- * Constants generated for most-significant-bit-first CRC-64 using
- * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
- * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
- * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
- * x^7 + x^4 + x^1 + x^0
- */
-#ifdef CONFIG_64BIT
-static const struct crc_clmul_consts crc64_msb_0x42f0e1eba9ea3693_consts __maybe_unused = {
- .fold_across_2_longs_const_hi = 0x4eb938a7d257740e, /* x^192 mod G */
- .fold_across_2_longs_const_lo = 0x05f5c3c7eb52fab6, /* x^128 mod G */
- .barrett_reduction_const_1 = 0xabc694e836627c39, /* floor(x^127 / G) */
- .barrett_reduction_const_2 = 0x42f0e1eba9ea3693, /* G - x^64 */
-};
-#endif
-
-/*
- * Constants generated for least-significant-bit-first CRC-64 using
- * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 +
- * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 +
- * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 +
- * x^4 + x^3 + x^0
- */
-#ifdef CONFIG_64BIT
-static const struct crc_clmul_consts crc64_lsb_0x9a6c9329ac4bc9b5_consts __maybe_unused = {
- .fold_across_2_longs_const_hi = 0xeadc41fd2ba3d420, /* x^191 mod G */
- .fold_across_2_longs_const_lo = 0x21e9761e252621ac, /* x^127 mod G */
- .barrett_reduction_const_1 = 0x27ecfa329aef9f77, /* floor(x^127 / G) */
- .barrett_reduction_const_2 = 0x9a6c9329ac4bc9b5, /* (G - x^64) * x^0 */
-};
-#endif
diff --git a/arch/riscv/lib/crc-clmul-template.h b/arch/riscv/lib/crc-clmul-template.h
deleted file mode 100644
index 77187e7f1762..000000000000
--- a/arch/riscv/lib/crc-clmul-template.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* Copyright 2025 Google LLC */
-
-/*
- * This file is a "template" that generates a CRC function optimized using the
- * RISC-V Zbc (scalar carryless multiplication) extension. The includer of this
- * file must define the following parameters to specify the type of CRC:
- *
- * crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
- * LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
- * mapping between bits and polynomial coefficients
- * 1 for a lsb (least-significant-bit) first CRC, i.e. reflected
- * mapping between bits and polynomial coefficients
- */
-
-#include <asm/byteorder.h>
-#include <linux/minmax.h>
-
-#define CRC_BITS (8 * sizeof(crc_t)) /* a.k.a. 'n' */
-
-static inline unsigned long clmul(unsigned long a, unsigned long b)
-{
- unsigned long res;
-
- asm(".option push\n"
- ".option arch,+zbc\n"
- "clmul %0, %1, %2\n"
- ".option pop\n"
- : "=r" (res) : "r" (a), "r" (b));
- return res;
-}
-
-static inline unsigned long clmulh(unsigned long a, unsigned long b)
-{
- unsigned long res;
-
- asm(".option push\n"
- ".option arch,+zbc\n"
- "clmulh %0, %1, %2\n"
- ".option pop\n"
- : "=r" (res) : "r" (a), "r" (b));
- return res;
-}
-
-static inline unsigned long clmulr(unsigned long a, unsigned long b)
-{
- unsigned long res;
-
- asm(".option push\n"
- ".option arch,+zbc\n"
- "clmulr %0, %1, %2\n"
- ".option pop\n"
- : "=r" (res) : "r" (a), "r" (b));
- return res;
-}
-
-/*
- * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
- * polynomial whose bit order matches the CRC's bit order.
- */
-#ifdef CONFIG_64BIT
-# if LSB_CRC
-# define crc_load_long(x) le64_to_cpup(x)
-# else
-# define crc_load_long(x) be64_to_cpup(x)
-# endif
-#else
-# if LSB_CRC
-# define crc_load_long(x) le32_to_cpup(x)
-# else
-# define crc_load_long(x) be32_to_cpup(x)
-# endif
-#endif
-
-/* XOR @crc into the end of @msgpoly that represents the high-order terms. */
-static inline unsigned long
-crc_clmul_prep(crc_t crc, unsigned long msgpoly)
-{
-#if LSB_CRC
- return msgpoly ^ crc;
-#else
- return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
-#endif
-}
-
-/*
- * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
- * modulo the generator polynomial G. This gives the CRC of @msgpoly.
- */
-static inline crc_t
-crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
-{
- unsigned long tmp;
-
- /*
- * First step of Barrett reduction with integrated multiplication by
- * x^n: calculate floor((msgpoly * x^n) / G). This is the value by
- * which G needs to be multiplied to cancel out the x^n and higher terms
- * of msgpoly * x^n. Do it using the following formula:
- *
- * msb-first:
- * floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
- * lsb-first:
- * floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
- *
- * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
- * which fits a long exactly. Using any lower power of x there would
- * not carry enough precision through the calculation, while using any
- * higher power of x would require extra instructions to handle a wider
- * multiplication. In the msb-first case, using this power of x results
- * in needing a floored division by x^(BITS_PER_LONG-1), which matches
- * what clmulr produces. In the lsb-first case, a factor of x gets
- * implicitly introduced by each carryless multiplication (shown as
- * '* x' above), and the floored division instead needs to be by
- * x^BITS_PER_LONG which matches what clmul produces.
- */
-#if LSB_CRC
- tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
-#else
- tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
-#endif
-
- /*
- * Second step of Barrett reduction:
- *
- * crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
- *
- * This reduces (msgpoly * x^n) modulo G by adding the appropriate
- * multiple of G to it. The result uses only the x^0..x^(n-1) terms.
- * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
- * terms in the first place, it is more efficient to do the equivalent:
- *
- * crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
- *
- * In the lsb-first case further modify it to the following which avoids
- * a shift, as the crc ends up in the physically low n bits from clmulr:
- *
- * product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
- * crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
- *
- * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
- * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above. The
- * cast of the result to crc_t is essential, as it applies the mod x^n!
- */
-#if LSB_CRC
- return clmulr(tmp, consts->barrett_reduction_const_2);
-#else
- return clmul(tmp, consts->barrett_reduction_const_2);
-#endif
-}
-
-/* Update @crc with the data from @msgpoly. */
-static inline crc_t
-crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
- const struct crc_clmul_consts *consts)
-{
- return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
-}
-
-/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
-static inline crc_t
-crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
- const struct crc_clmul_consts *consts)
-{
- unsigned long msgpoly;
- size_t i;
-
-#if LSB_CRC
- msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
- for (i = 1; i < len; i++)
- msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
-#else
- msgpoly = p[0];
- for (i = 1; i < len; i++)
- msgpoly = (msgpoly << 8) ^ p[i];
-#endif
-
- if (len >= sizeof(crc_t)) {
- #if LSB_CRC
- msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
- #else
- msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
- #endif
- return crc_clmul_long(msgpoly, consts);
- }
-#if LSB_CRC
- msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
- return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
-#else
- msgpoly ^= crc >> (CRC_BITS - 8*len);
- return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
-#endif
-}
-
-static inline crc_t
-crc_clmul(crc_t crc, const void *p, size_t len,
- const struct crc_clmul_consts *consts)
-{
- size_t align;
-
- /* This implementation assumes that the CRC fits in an unsigned long. */
- BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));
-
- /* If the buffer is not long-aligned, align it. */
- align = (unsigned long)p % sizeof(unsigned long);
- if (align && len) {
- align = min(sizeof(unsigned long) - align, len);
- crc = crc_clmul_update_partial(crc, p, align, consts);
- p += align;
- len -= align;
- }
-
- if (len >= 4 * sizeof(unsigned long)) {
- unsigned long m0, m1;
-
- m0 = crc_clmul_prep(crc, crc_load_long(p));
- m1 = crc_load_long(p + sizeof(unsigned long));
- p += 2 * sizeof(unsigned long);
- len -= 2 * sizeof(unsigned long);
- /*
- * Main loop. Each iteration starts with a message polynomial
- * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
- * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
- * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
- * "folds" that back into a congruent (modulo G) value that uses
- * just m0 and m1 again. This is done by multiplying m0 by the
- * precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by
- * the precomputed constant (x^(2*BITS_PER_LONG) mod G), then
- * adding the results to m2 and m3 as appropriate. Each such
- * multiplication produces a result twice the length of a long,
- * which in RISC-V is two instructions clmul and clmulh.
- *
- * This could be changed to fold across more than 2 longs at a
- * time if there is a CPU that can take advantage of it.
- */
- do {
- unsigned long p0, p1, p2, p3;
-
- p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
- p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
- p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
- p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
- m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
- m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
- crc_load_long(p + sizeof(unsigned long));
-
- p += 2 * sizeof(unsigned long);
- len -= 2 * sizeof(unsigned long);
- } while (len >= 2 * sizeof(unsigned long));
-
- crc = crc_clmul_long(m0, consts);
- crc = crc_clmul_update_long(crc, m1, consts);
- }
-
- while (len >= sizeof(unsigned long)) {
- crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
- p += sizeof(unsigned long);
- len -= sizeof(unsigned long);
- }
-
- if (len)
- crc = crc_clmul_update_partial(crc, p, len, consts);
-
- return crc;
-}
diff --git a/arch/riscv/lib/crc-clmul.h b/arch/riscv/lib/crc-clmul.h
deleted file mode 100644
index dd1736245815..000000000000
--- a/arch/riscv/lib/crc-clmul.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* Copyright 2025 Google LLC */
-
-#ifndef _RISCV_CRC_CLMUL_H
-#define _RISCV_CRC_CLMUL_H
-
-#include <linux/types.h>
-#include "crc-clmul-consts.h"
-
-u16 crc16_msb_clmul(u16 crc, const void *p, size_t len,
- const struct crc_clmul_consts *consts);
-u32 crc32_msb_clmul(u32 crc, const void *p, size_t len,
- const struct crc_clmul_consts *consts);
-u32 crc32_lsb_clmul(u32 crc, const void *p, size_t len,
- const struct crc_clmul_consts *consts);
-#ifdef CONFIG_64BIT
-u64 crc64_msb_clmul(u64 crc, const void *p, size_t len,
- const struct crc_clmul_consts *consts);
-u64 crc64_lsb_clmul(u64 crc, const void *p, size_t len,
- const struct crc_clmul_consts *consts);
-#endif
-
-#endif /* _RISCV_CRC_CLMUL_H */
diff --git a/arch/riscv/lib/crc-t10dif.c b/arch/riscv/lib/crc-t10dif.c
deleted file mode 100644
index e6b0051ccd86..000000000000
--- a/arch/riscv/lib/crc-t10dif.c
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * RISC-V optimized CRC-T10DIF function
- *
- * Copyright 2025 Google LLC
- */
-
-#include <asm/hwcap.h>
-#include <asm/alternative-macros.h>
-#include <linux/crc-t10dif.h>
-#include <linux/module.h>
-
-#include "crc-clmul.h"
-
-u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
-{
- if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
- return crc16_msb_clmul(crc, p, len, &crc16_msb_0x8bb7_consts);
- return crc_t10dif_generic(crc, p, len);
-}
-EXPORT_SYMBOL(crc_t10dif_arch);
-
-MODULE_DESCRIPTION("RISC-V optimized CRC-T10DIF function");
-MODULE_LICENSE("GPL");
diff --git a/arch/riscv/lib/crc16_msb.c b/arch/riscv/lib/crc16_msb.c
deleted file mode 100644
index 554d295e95f5..000000000000
--- a/arch/riscv/lib/crc16_msb.c
+++ /dev/null
@@ -1,18 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * RISC-V optimized most-significant-bit-first CRC16
- *
- * Copyright 2025 Google LLC
- */
-
-#include "crc-clmul.h"
-
-typedef u16 crc_t;
-#define LSB_CRC 0
-#include "crc-clmul-template.h"
-
-u16 crc16_msb_clmul(u16 crc, const void *p, size_t len,
- const struct crc_clmul_consts *consts)
-{
- return crc_clmul(crc, p, len, consts);
-}
diff --git a/arch/riscv/lib/crc32.c b/arch/riscv/lib/crc32.c
deleted file mode 100644
index a3188b7d9c40..000000000000
--- a/arch/riscv/lib/crc32.c
+++ /dev/null
@@ -1,53 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * RISC-V optimized CRC32 functions
- *
- * Copyright 2025 Google LLC
- */
-
-#include <asm/hwcap.h>
-#include <asm/alternative-macros.h>
-#include <linux/crc32.h>
-#include <linux/module.h>
-
-#include "crc-clmul.h"
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
- return crc32_lsb_clmul(crc, p, len,
- &crc32_lsb_0xedb88320_consts);
- return crc32_le_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
- return crc32_msb_clmul(crc, p, len,
- &crc32_msb_0x04c11db7_consts);
- return crc32_be_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
- return crc32_lsb_clmul(crc, p, len,
- &crc32_lsb_0x82f63b78_consts);
- return crc32c_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_optimizations(void)
-{
- if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
- return CRC32_LE_OPTIMIZATION |
- CRC32_BE_OPTIMIZATION |
- CRC32C_OPTIMIZATION;
- return 0;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_DESCRIPTION("RISC-V optimized CRC32 functions");
-MODULE_LICENSE("GPL");
diff --git a/arch/riscv/lib/crc32_lsb.c b/arch/riscv/lib/crc32_lsb.c
deleted file mode 100644
index 72fd67e7470c..000000000000
--- a/arch/riscv/lib/crc32_lsb.c
+++ /dev/null
@@ -1,18 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * RISC-V optimized least-significant-bit-first CRC32
- *
- * Copyright 2025 Google LLC
- */
-
-#include "crc-clmul.h"
-
-typedef u32 crc_t;
-#define LSB_CRC 1
-#include "crc-clmul-template.h"
-
-u32 crc32_lsb_clmul(u32 crc, const void *p, size_t len,
- const struct crc_clmul_consts *consts)
-{
- return crc_clmul(crc, p, len, consts);
-}
diff --git a/arch/riscv/lib/crc32_msb.c b/arch/riscv/lib/crc32_msb.c
deleted file mode 100644
index fdbeaccc369f..000000000000
--- a/arch/riscv/lib/crc32_msb.c
+++ /dev/null
@@ -1,18 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * RISC-V optimized most-significant-bit-first CRC32
- *
- * Copyright 2025 Google LLC
- */
-
-#include "crc-clmul.h"
-
-typedef u32 crc_t;
-#define LSB_CRC 0
-#include "crc-clmul-template.h"
-
-u32 crc32_msb_clmul(u32 crc, const void *p, size_t len,
- const struct crc_clmul_consts *consts)
-{
- return crc_clmul(crc, p, len, consts);
-}
diff --git a/arch/riscv/lib/crc64.c b/arch/riscv/lib/crc64.c
deleted file mode 100644
index f0015a27836a..000000000000
--- a/arch/riscv/lib/crc64.c
+++ /dev/null
@@ -1,34 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * RISC-V optimized CRC64 functions
- *
- * Copyright 2025 Google LLC
- */
-
-#include <asm/hwcap.h>
-#include <asm/alternative-macros.h>
-#include <linux/crc64.h>
-#include <linux/module.h>
-
-#include "crc-clmul.h"
-
-u64 crc64_be_arch(u64 crc, const u8 *p, size_t len)
-{
- if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
- return crc64_msb_clmul(crc, p, len,
- &crc64_msb_0x42f0e1eba9ea3693_consts);
- return crc64_be_generic(crc, p, len);
-}
-EXPORT_SYMBOL(crc64_be_arch);
-
-u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
-{
- if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
- return crc64_lsb_clmul(crc, p, len,
- &crc64_lsb_0x9a6c9329ac4bc9b5_consts);
- return crc64_nvme_generic(crc, p, len);
-}
-EXPORT_SYMBOL(crc64_nvme_arch);
-
-MODULE_DESCRIPTION("RISC-V optimized CRC64 functions");
-MODULE_LICENSE("GPL");
diff --git a/arch/riscv/lib/crc64_lsb.c b/arch/riscv/lib/crc64_lsb.c
deleted file mode 100644
index c5371bb85d90..000000000000
--- a/arch/riscv/lib/crc64_lsb.c
+++ /dev/null
@@ -1,18 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * RISC-V optimized least-significant-bit-first CRC64
- *
- * Copyright 2025 Google LLC
- */
-
-#include "crc-clmul.h"
-
-typedef u64 crc_t;
-#define LSB_CRC 1
-#include "crc-clmul-template.h"
-
-u64 crc64_lsb_clmul(u64 crc, const void *p, size_t len,
- const struct crc_clmul_consts *consts)
-{
- return crc_clmul(crc, p, len, consts);
-}
diff --git a/arch/riscv/lib/crc64_msb.c b/arch/riscv/lib/crc64_msb.c
deleted file mode 100644
index 1925d1dbe225..000000000000
--- a/arch/riscv/lib/crc64_msb.c
+++ /dev/null
@@ -1,18 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * RISC-V optimized most-significant-bit-first CRC64
- *
- * Copyright 2025 Google LLC
- */
-
-#include "crc-clmul.h"
-
-typedef u64 crc_t;
-#define LSB_CRC 0
-#include "crc-clmul-template.h"
-
-u64 crc64_msb_clmul(u64 crc, const void *p, size_t len,
- const struct crc_clmul_consts *consts)
-{
- return crc_clmul(crc, p, len, consts);
-}
diff --git a/arch/riscv/lib/crypto/Kconfig b/arch/riscv/lib/crypto/Kconfig
deleted file mode 100644
index 47c99ea97ce2..000000000000
--- a/arch/riscv/lib/crypto/Kconfig
+++ /dev/null
@@ -1,16 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_CHACHA_RISCV64
- tristate
- depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
- default CRYPTO_LIB_CHACHA
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
-
-config CRYPTO_SHA256_RISCV64
- tristate
- depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
- default CRYPTO_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
- select CRYPTO_LIB_SHA256_GENERIC
diff --git a/arch/riscv/lib/crypto/Makefile b/arch/riscv/lib/crypto/Makefile
deleted file mode 100644
index b7cb877a2c07..000000000000
--- a/arch/riscv/lib/crypto/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o
-chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o
-
-obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o
-sha256-riscv64-y := sha256.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o
diff --git a/arch/riscv/lib/crypto/chacha-riscv64-glue.c b/arch/riscv/lib/crypto/chacha-riscv64-glue.c
deleted file mode 100644
index 8c3f11d79be3..000000000000
--- a/arch/riscv/lib/crypto/chacha-riscv64-glue.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * ChaCha stream cipher (RISC-V optimized)
- *
- * Copyright (C) 2023 SiFive, Inc.
- * Author: Jerry Shih <jerry.shih@sifive.com>
- */
-
-#include <asm/simd.h>
-#include <asm/vector.h>
-#include <crypto/chacha.h>
-#include <crypto/internal/simd.h>
-#include <linux/linkage.h>
-#include <linux/module.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_zvkb);
-
-asmlinkage void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out,
- size_t nblocks, int nrounds);
-
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- hchacha_block_generic(state, out, nrounds);
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- u8 block_buffer[CHACHA_BLOCK_SIZE];
- unsigned int full_blocks = bytes / CHACHA_BLOCK_SIZE;
- unsigned int tail_bytes = bytes % CHACHA_BLOCK_SIZE;
-
- if (!static_branch_likely(&use_zvkb) || !crypto_simd_usable())
- return chacha_crypt_generic(state, dst, src, bytes, nrounds);
-
- kernel_vector_begin();
- if (full_blocks) {
- chacha_zvkb(state, src, dst, full_blocks, nrounds);
- src += full_blocks * CHACHA_BLOCK_SIZE;
- dst += full_blocks * CHACHA_BLOCK_SIZE;
- }
- if (tail_bytes) {
- memcpy(block_buffer, src, tail_bytes);
- chacha_zvkb(state, block_buffer, block_buffer, 1, nrounds);
- memcpy(dst, block_buffer, tail_bytes);
- }
- kernel_vector_end();
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- return static_key_enabled(&use_zvkb);
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init riscv64_chacha_mod_init(void)
-{
- if (riscv_isa_extension_available(NULL, ZVKB) &&
- riscv_vector_vlen() >= 128)
- static_branch_enable(&use_zvkb);
- return 0;
-}
-subsys_initcall(riscv64_chacha_mod_init);
-
-static void __exit riscv64_chacha_mod_exit(void)
-{
-}
-module_exit(riscv64_chacha_mod_exit);
-
-MODULE_DESCRIPTION("ChaCha stream cipher (RISC-V optimized)");
-MODULE_AUTHOR("Jerry Shih <jerry.shih@sifive.com>");
-MODULE_LICENSE("GPL");
diff --git a/arch/riscv/lib/crypto/chacha-riscv64-zvkb.S b/arch/riscv/lib/crypto/chacha-riscv64-zvkb.S
deleted file mode 100644
index b777d0b4e379..000000000000
--- a/arch/riscv/lib/crypto/chacha-riscv64-zvkb.S
+++ /dev/null
@@ -1,297 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
-//
-// This file is dual-licensed, meaning that you can use it under your
-// choice of either of the following two licenses:
-//
-// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the Apache License 2.0 (the "License"). You can obtain
-// a copy in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-//
-// or
-//
-// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
-// Copyright 2024 Google LLC
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions
-// are met:
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// The generated code of this file depends on the following RISC-V extensions:
-// - RV64I
-// - RISC-V Vector ('V') with VLEN >= 128
-// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
-
-#include <linux/linkage.h>
-
-.text
-.option arch, +zvkb
-
-#define STATEP a0
-#define INP a1
-#define OUTP a2
-#define NBLOCKS a3
-#define NROUNDS a4
-
-#define CONSTS0 a5
-#define CONSTS1 a6
-#define CONSTS2 a7
-#define CONSTS3 t0
-#define TMP t1
-#define VL t2
-#define STRIDE t3
-#define ROUND_CTR t4
-#define KEY0 s0
-#define KEY1 s1
-#define KEY2 s2
-#define KEY3 s3
-#define KEY4 s4
-#define KEY5 s5
-#define KEY6 s6
-#define KEY7 s7
-#define COUNTER s8
-#define NONCE0 s9
-#define NONCE1 s10
-#define NONCE2 s11
-
-.macro chacha_round a0, b0, c0, d0, a1, b1, c1, d1, \
- a2, b2, c2, d2, a3, b3, c3, d3
- // a += b; d ^= a; d = rol(d, 16);
- vadd.vv \a0, \a0, \b0
- vadd.vv \a1, \a1, \b1
- vadd.vv \a2, \a2, \b2
- vadd.vv \a3, \a3, \b3
- vxor.vv \d0, \d0, \a0
- vxor.vv \d1, \d1, \a1
- vxor.vv \d2, \d2, \a2
- vxor.vv \d3, \d3, \a3
- vror.vi \d0, \d0, 32 - 16
- vror.vi \d1, \d1, 32 - 16
- vror.vi \d2, \d2, 32 - 16
- vror.vi \d3, \d3, 32 - 16
-
- // c += d; b ^= c; b = rol(b, 12);
- vadd.vv \c0, \c0, \d0
- vadd.vv \c1, \c1, \d1
- vadd.vv \c2, \c2, \d2
- vadd.vv \c3, \c3, \d3
- vxor.vv \b0, \b0, \c0
- vxor.vv \b1, \b1, \c1
- vxor.vv \b2, \b2, \c2
- vxor.vv \b3, \b3, \c3
- vror.vi \b0, \b0, 32 - 12
- vror.vi \b1, \b1, 32 - 12
- vror.vi \b2, \b2, 32 - 12
- vror.vi \b3, \b3, 32 - 12
-
- // a += b; d ^= a; d = rol(d, 8);
- vadd.vv \a0, \a0, \b0
- vadd.vv \a1, \a1, \b1
- vadd.vv \a2, \a2, \b2
- vadd.vv \a3, \a3, \b3
- vxor.vv \d0, \d0, \a0
- vxor.vv \d1, \d1, \a1
- vxor.vv \d2, \d2, \a2
- vxor.vv \d3, \d3, \a3
- vror.vi \d0, \d0, 32 - 8
- vror.vi \d1, \d1, 32 - 8
- vror.vi \d2, \d2, 32 - 8
- vror.vi \d3, \d3, 32 - 8
-
- // c += d; b ^= c; b = rol(b, 7);
- vadd.vv \c0, \c0, \d0
- vadd.vv \c1, \c1, \d1
- vadd.vv \c2, \c2, \d2
- vadd.vv \c3, \c3, \d3
- vxor.vv \b0, \b0, \c0
- vxor.vv \b1, \b1, \c1
- vxor.vv \b2, \b2, \c2
- vxor.vv \b3, \b3, \c3
- vror.vi \b0, \b0, 32 - 7
- vror.vi \b1, \b1, 32 - 7
- vror.vi \b2, \b2, 32 - 7
- vror.vi \b3, \b3, 32 - 7
-.endm
-
-// void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out,
-// size_t nblocks, int nrounds);
-//
-// |nblocks| is the number of 64-byte blocks to process, and must be nonzero.
-//
-// |state| gives the ChaCha state matrix, including the 32-bit counter in
-// state->x[12] following the RFC7539 convention; note that this differs from
-// the original Salsa20 paper which uses a 64-bit counter in state->x[12..13].
-// The updated 32-bit counter is written back to state->x[12] before returning.
-SYM_FUNC_START(chacha_zvkb)
- addi sp, sp, -96
- sd s0, 0(sp)
- sd s1, 8(sp)
- sd s2, 16(sp)
- sd s3, 24(sp)
- sd s4, 32(sp)
- sd s5, 40(sp)
- sd s6, 48(sp)
- sd s7, 56(sp)
- sd s8, 64(sp)
- sd s9, 72(sp)
- sd s10, 80(sp)
- sd s11, 88(sp)
-
- li STRIDE, 64
-
- // Set up the initial state matrix in scalar registers.
- lw CONSTS0, 0(STATEP)
- lw CONSTS1, 4(STATEP)
- lw CONSTS2, 8(STATEP)
- lw CONSTS3, 12(STATEP)
- lw KEY0, 16(STATEP)
- lw KEY1, 20(STATEP)
- lw KEY2, 24(STATEP)
- lw KEY3, 28(STATEP)
- lw KEY4, 32(STATEP)
- lw KEY5, 36(STATEP)
- lw KEY6, 40(STATEP)
- lw KEY7, 44(STATEP)
- lw COUNTER, 48(STATEP)
- lw NONCE0, 52(STATEP)
- lw NONCE1, 56(STATEP)
- lw NONCE2, 60(STATEP)
-
-.Lblock_loop:
- // Set vl to the number of blocks to process in this iteration.
- vsetvli VL, NBLOCKS, e32, m1, ta, ma
-
- // Set up the initial state matrix for the next VL blocks in v0-v15.
- // v{i} holds the i'th 32-bit word of the state matrix for all blocks.
- // Note that only the counter word, at index 12, differs across blocks.
- vmv.v.x v0, CONSTS0
- vmv.v.x v1, CONSTS1
- vmv.v.x v2, CONSTS2
- vmv.v.x v3, CONSTS3
- vmv.v.x v4, KEY0
- vmv.v.x v5, KEY1
- vmv.v.x v6, KEY2
- vmv.v.x v7, KEY3
- vmv.v.x v8, KEY4
- vmv.v.x v9, KEY5
- vmv.v.x v10, KEY6
- vmv.v.x v11, KEY7
- vid.v v12
- vadd.vx v12, v12, COUNTER
- vmv.v.x v13, NONCE0
- vmv.v.x v14, NONCE1
- vmv.v.x v15, NONCE2
-
- // Load the first half of the input data for each block into v16-v23.
- // v{16+i} holds the i'th 32-bit word for all blocks.
- vlsseg8e32.v v16, (INP), STRIDE
-
- mv ROUND_CTR, NROUNDS
-.Lnext_doubleround:
- addi ROUND_CTR, ROUND_CTR, -2
- // column round
- chacha_round v0, v4, v8, v12, v1, v5, v9, v13, \
- v2, v6, v10, v14, v3, v7, v11, v15
- // diagonal round
- chacha_round v0, v5, v10, v15, v1, v6, v11, v12, \
- v2, v7, v8, v13, v3, v4, v9, v14
- bnez ROUND_CTR, .Lnext_doubleround
-
- // Load the second half of the input data for each block into v24-v31.
- // v{24+i} holds the {8+i}'th 32-bit word for all blocks.
- addi TMP, INP, 32
- vlsseg8e32.v v24, (TMP), STRIDE
-
- // Finalize the first half of the keystream for each block.
- vadd.vx v0, v0, CONSTS0
- vadd.vx v1, v1, CONSTS1
- vadd.vx v2, v2, CONSTS2
- vadd.vx v3, v3, CONSTS3
- vadd.vx v4, v4, KEY0
- vadd.vx v5, v5, KEY1
- vadd.vx v6, v6, KEY2
- vadd.vx v7, v7, KEY3
-
- // Encrypt/decrypt the first half of the data for each block.
- vxor.vv v16, v16, v0
- vxor.vv v17, v17, v1
- vxor.vv v18, v18, v2
- vxor.vv v19, v19, v3
- vxor.vv v20, v20, v4
- vxor.vv v21, v21, v5
- vxor.vv v22, v22, v6
- vxor.vv v23, v23, v7
-
- // Store the first half of the output data for each block.
- vssseg8e32.v v16, (OUTP), STRIDE
-
- // Finalize the second half of the keystream for each block.
- vadd.vx v8, v8, KEY4
- vadd.vx v9, v9, KEY5
- vadd.vx v10, v10, KEY6
- vadd.vx v11, v11, KEY7
- vid.v v0
- vadd.vx v12, v12, COUNTER
- vadd.vx v13, v13, NONCE0
- vadd.vx v14, v14, NONCE1
- vadd.vx v15, v15, NONCE2
- vadd.vv v12, v12, v0
-
- // Encrypt/decrypt the second half of the data for each block.
- vxor.vv v24, v24, v8
- vxor.vv v25, v25, v9
- vxor.vv v26, v26, v10
- vxor.vv v27, v27, v11
- vxor.vv v29, v29, v13
- vxor.vv v28, v28, v12
- vxor.vv v30, v30, v14
- vxor.vv v31, v31, v15
-
- // Store the second half of the output data for each block.
- addi TMP, OUTP, 32
- vssseg8e32.v v24, (TMP), STRIDE
-
- // Update the counter, the remaining number of blocks, and the input and
- // output pointers according to the number of blocks processed (VL).
- add COUNTER, COUNTER, VL
- sub NBLOCKS, NBLOCKS, VL
- slli TMP, VL, 6
- add OUTP, OUTP, TMP
- add INP, INP, TMP
- bnez NBLOCKS, .Lblock_loop
-
- sw COUNTER, 48(STATEP)
- ld s0, 0(sp)
- ld s1, 8(sp)
- ld s2, 16(sp)
- ld s3, 24(sp)
- ld s4, 32(sp)
- ld s5, 40(sp)
- ld s6, 48(sp)
- ld s7, 56(sp)
- ld s8, 64(sp)
- ld s9, 72(sp)
- ld s10, 80(sp)
- ld s11, 88(sp)
- addi sp, sp, 96
- ret
-SYM_FUNC_END(chacha_zvkb)
diff --git a/arch/riscv/lib/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S b/arch/riscv/lib/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
deleted file mode 100644
index fad501ad0617..000000000000
--- a/arch/riscv/lib/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
+++ /dev/null
@@ -1,225 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
-//
-// This file is dual-licensed, meaning that you can use it under your
-// choice of either of the following two licenses:
-//
-// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the Apache License 2.0 (the "License"). You can obtain
-// a copy in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-//
-// or
-//
-// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
-// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
-// Copyright 2024 Google LLC
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions
-// are met:
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// The generated code of this file depends on the following RISC-V extensions:
-// - RV64I
-// - RISC-V Vector ('V') with VLEN >= 128
-// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknha' or 'Zvknhb')
-// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
-
-#include <linux/linkage.h>
-
-.text
-.option arch, +zvknha, +zvkb
-
-#define STATEP a0
-#define DATA a1
-#define NUM_BLOCKS a2
-
-#define STATEP_C a3
-
-#define MASK v0
-#define INDICES v1
-#define W0 v2
-#define W1 v3
-#define W2 v4
-#define W3 v5
-#define VTMP v6
-#define FEBA v7
-#define HGDC v8
-#define K0 v10
-#define K1 v11
-#define K2 v12
-#define K3 v13
-#define K4 v14
-#define K5 v15
-#define K6 v16
-#define K7 v17
-#define K8 v18
-#define K9 v19
-#define K10 v20
-#define K11 v21
-#define K12 v22
-#define K13 v23
-#define K14 v24
-#define K15 v25
-#define PREV_FEBA v26
-#define PREV_HGDC v27
-
-// Do 4 rounds of SHA-256. w0 contains the current 4 message schedule words.
-//
-// If not all the message schedule words have been computed yet, then this also
-// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4
-// message schedule words; this macro computes the group after w3 and writes it
-// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3,
-// w0), so the caller must cycle through the registers accordingly.
-.macro sha256_4rounds last, k, w0, w1, w2, w3
- vadd.vv VTMP, \k, \w0
- vsha2cl.vv HGDC, FEBA, VTMP
- vsha2ch.vv FEBA, HGDC, VTMP
-.if !\last
- vmerge.vvm VTMP, \w2, \w1, MASK
- vsha2ms.vv \w0, VTMP, \w3
-.endif
-.endm
-
-.macro sha256_16rounds last, k0, k1, k2, k3
- sha256_4rounds \last, \k0, W0, W1, W2, W3
- sha256_4rounds \last, \k1, W1, W2, W3, W0
- sha256_4rounds \last, \k2, W2, W3, W0, W1
- sha256_4rounds \last, \k3, W3, W0, W1, W2
-.endm
-
-// void sha256_transform_zvknha_or_zvknhb_zvkb(u32 state[SHA256_STATE_WORDS],
-// const u8 *data, size_t nblocks);
-SYM_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb)
-
- // Load the round constants into K0-K15.
- vsetivli zero, 4, e32, m1, ta, ma
- la t0, K256
- vle32.v K0, (t0)
- addi t0, t0, 16
- vle32.v K1, (t0)
- addi t0, t0, 16
- vle32.v K2, (t0)
- addi t0, t0, 16
- vle32.v K3, (t0)
- addi t0, t0, 16
- vle32.v K4, (t0)
- addi t0, t0, 16
- vle32.v K5, (t0)
- addi t0, t0, 16
- vle32.v K6, (t0)
- addi t0, t0, 16
- vle32.v K7, (t0)
- addi t0, t0, 16
- vle32.v K8, (t0)
- addi t0, t0, 16
- vle32.v K9, (t0)
- addi t0, t0, 16
- vle32.v K10, (t0)
- addi t0, t0, 16
- vle32.v K11, (t0)
- addi t0, t0, 16
- vle32.v K12, (t0)
- addi t0, t0, 16
- vle32.v K13, (t0)
- addi t0, t0, 16
- vle32.v K14, (t0)
- addi t0, t0, 16
- vle32.v K15, (t0)
-
- // Setup mask for the vmerge to replace the first word (idx==0) in
- // message scheduling. There are 4 words, so an 8-bit mask suffices.
- vsetivli zero, 1, e8, m1, ta, ma
- vmv.v.i MASK, 0x01
-
- // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we
- // need {f,e,b,a},{h,g,d,c}. The dst vtype is e32m1 and the index vtype
- // is e8mf4. We use index-load with the i8 indices {20, 16, 4, 0},
- // loaded using the 32-bit little endian value 0x00041014.
- li t0, 0x00041014
- vsetivli zero, 1, e32, m1, ta, ma
- vmv.v.x INDICES, t0
- addi STATEP_C, STATEP, 8
- vsetivli zero, 4, e32, m1, ta, ma
- vluxei8.v FEBA, (STATEP), INDICES
- vluxei8.v HGDC, (STATEP_C), INDICES
-
-.Lnext_block:
- addi NUM_BLOCKS, NUM_BLOCKS, -1
-
- // Save the previous state, as it's needed later.
- vmv.v.v PREV_FEBA, FEBA
- vmv.v.v PREV_HGDC, HGDC
-
- // Load the next 512-bit message block and endian-swap each 32-bit word.
- vle32.v W0, (DATA)
- vrev8.v W0, W0
- addi DATA, DATA, 16
- vle32.v W1, (DATA)
- vrev8.v W1, W1
- addi DATA, DATA, 16
- vle32.v W2, (DATA)
- vrev8.v W2, W2
- addi DATA, DATA, 16
- vle32.v W3, (DATA)
- vrev8.v W3, W3
- addi DATA, DATA, 16
-
- // Do the 64 rounds of SHA-256.
- sha256_16rounds 0, K0, K1, K2, K3
- sha256_16rounds 0, K4, K5, K6, K7
- sha256_16rounds 0, K8, K9, K10, K11
- sha256_16rounds 1, K12, K13, K14, K15
-
- // Add the previous state.
- vadd.vv FEBA, FEBA, PREV_FEBA
- vadd.vv HGDC, HGDC, PREV_HGDC
-
- // Repeat if more blocks remain.
- bnez NUM_BLOCKS, .Lnext_block
-
- // Store the new state and return.
- vsuxei8.v FEBA, (STATEP), INDICES
- vsuxei8.v HGDC, (STATEP_C), INDICES
- ret
-SYM_FUNC_END(sha256_transform_zvknha_or_zvknhb_zvkb)
-
-.section ".rodata"
-.p2align 2
-.type K256, @object
-K256:
- .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
- .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
- .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
- .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
- .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
- .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
- .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
- .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
- .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
- .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
- .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
- .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
- .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
- .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
- .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
- .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-.size K256, . - K256
diff --git a/arch/riscv/lib/crypto/sha256.c b/arch/riscv/lib/crypto/sha256.c
deleted file mode 100644
index 71808397dff4..000000000000
--- a/arch/riscv/lib/crypto/sha256.c
+++ /dev/null
@@ -1,67 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 (RISC-V accelerated)
- *
- * Copyright (C) 2022 VRULL GmbH
- * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
- *
- * Copyright (C) 2023 SiFive, Inc.
- * Author: Jerry Shih <jerry.shih@sifive.com>
- */
-
-#include <asm/vector.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha256_transform_zvknha_or_zvknhb_zvkb(
- u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
-
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- if (static_branch_likely(&have_extensions)) {
- kernel_vector_begin();
- sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks);
- kernel_vector_end();
- } else {
- sha256_blocks_generic(state, data, nblocks);
- }
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
-
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- sha256_blocks_generic(state, data, nblocks);
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-bool sha256_is_arch_optimized(void)
-{
- return static_key_enabled(&have_extensions);
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init riscv64_sha256_mod_init(void)
-{
- /* Both zvknha and zvknhb provide the SHA-256 instructions. */
- if ((riscv_isa_extension_available(NULL, ZVKNHA) ||
- riscv_isa_extension_available(NULL, ZVKNHB)) &&
- riscv_isa_extension_available(NULL, ZVKB) &&
- riscv_vector_vlen() >= 128)
- static_branch_enable(&have_extensions);
- return 0;
-}
-subsys_initcall(riscv64_sha256_mod_init);
-
-static void __exit riscv64_sha256_mod_exit(void)
-{
-}
-module_exit(riscv64_sha256_mod_exit);
-
-MODULE_DESCRIPTION("SHA-256 (RISC-V accelerated)");
-MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
-MODULE_LICENSE("GPL");
diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile
index fb9c917c9b45..240592e3f5c2 100644
--- a/arch/riscv/purgatory/Makefile
+++ b/arch/riscv/purgatory/Makefile
@@ -53,7 +53,7 @@ targets += purgatory.ro purgatory.chk
PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel
PURGATORY_CFLAGS := -mcmodel=medany -ffreestanding -fno-zero-initialized-in-bss
-PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING
+PURGATORY_CFLAGS += $(DISABLE_KSTACK_ERASE) -DDISABLE_BRANCH_PROFILING
PURGATORY_CFLAGS += -fno-stack-protector -g0
# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
diff --git a/arch/riscv/purgatory/purgatory.c b/arch/riscv/purgatory/purgatory.c
index 80596ab5fb62..bbd5cfa4d741 100644
--- a/arch/riscv/purgatory/purgatory.c
+++ b/arch/riscv/purgatory/purgatory.c
@@ -20,14 +20,14 @@ struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX] __section(".kex
static int verify_sha256_digest(void)
{
struct kexec_sha_region *ptr, *end;
- struct sha256_state ss;
+ struct sha256_ctx sctx;
u8 digest[SHA256_DIGEST_SIZE];
- sha256_init(&ss);
+ sha256_init(&sctx);
end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions);
for (ptr = purgatory_sha_regions; ptr < end; ptr++)
- sha256_update(&ss, (uint8_t *)(ptr->start), ptr->len);
- sha256_final(&ss, digest);
+ sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len);
+ sha256_final(&sctx, digest);
if (memcmp(digest, purgatory_sha256_digest, sizeof(digest)) != 0)
return 1;
return 0;
diff --git a/arch/riscv/tools/relocs_check.sh b/arch/riscv/tools/relocs_check.sh
index baeb2e7b2290..742993e6a8cb 100755
--- a/arch/riscv/tools/relocs_check.sh
+++ b/arch/riscv/tools/relocs_check.sh
@@ -14,7 +14,9 @@ bad_relocs=$(
${srctree}/scripts/relocs_check.sh "$@" |
# These relocations are okay
# R_RISCV_RELATIVE
- grep -F -w -v 'R_RISCV_RELATIVE'
+ # R_RISCV_NONE
+ grep -F -w -v 'R_RISCV_RELATIVE
+R_RISCV_NONE'
)
if [ -z "$bad_relocs" ]; then
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 0c16dc443e2f..25a773e6596e 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -75,7 +75,6 @@ config S390
select ARCH_ENABLE_MEMORY_HOTREMOVE
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
select ARCH_HAS_CPU_FINALIZE_INIT
- select ARCH_HAS_CRC32
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE
@@ -176,10 +175,10 @@ config S390
select HAVE_ARCH_KCSAN
select HAVE_ARCH_KMSAN
select HAVE_ARCH_KFENCE
+ select HAVE_ARCH_KSTACK_ERASE
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_SOFT_DIRTY
- select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_VMAP_STACK
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 8ecad727497e..a7db7ed28720 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -804,8 +804,6 @@ CONFIG_CRYPTO_USER_API_HASH=m
CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
-CONFIG_CRYPTO_SHA512_S390=m
-CONFIG_CRYPTO_SHA1_S390=m
CONFIG_CRYPTO_SHA3_256_S390=m
CONFIG_CRYPTO_SHA3_512_S390=m
CONFIG_CRYPTO_GHASH_S390=m
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index c13a77765162..0217ed5616bf 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -791,8 +791,6 @@ CONFIG_CRYPTO_USER_API_HASH=m
CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
-CONFIG_CRYPTO_SHA512_S390=m
-CONFIG_CRYPTO_SHA1_S390=m
CONFIG_CRYPTO_SHA3_256_S390=m
CONFIG_CRYPTO_SHA3_512_S390=m
CONFIG_CRYPTO_GHASH_S390=m
diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig
index e2c27588b21a..03f73fbd38b6 100644
--- a/arch/s390/crypto/Kconfig
+++ b/arch/s390/crypto/Kconfig
@@ -2,26 +2,6 @@
menu "Accelerated Cryptographic Algorithms for CPU (s390)"
-config CRYPTO_SHA512_S390
- tristate "Hash functions: SHA-384 and SHA-512"
- select CRYPTO_HASH
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: s390
-
- It is available as of z10.
-
-config CRYPTO_SHA1_S390
- tristate "Hash functions: SHA-1"
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: s390
-
- It is available as of z990.
-
config CRYPTO_SHA3_256_S390
tristate "Hash functions: SHA3-224 and SHA3-256"
select CRYPTO_HASH
diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index 21757d86cd49..1e5a1038d491 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -3,8 +3,6 @@
# Cryptographic API
#
-obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o sha_common.o
-obj-$(CONFIG_CRYPTO_SHA512_S390) += sha512_s390.o sha_common.o
obj-$(CONFIG_CRYPTO_SHA3_256_S390) += sha3_256_s390.o sha_common.o
obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o
obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o
diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c
deleted file mode 100644
index 9b0d55be1239..000000000000
--- a/arch/s390/crypto/sha1_s390.c
+++ /dev/null
@@ -1,105 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Cryptographic API.
- *
- * s390 implementation of the SHA1 Secure Hash Algorithm.
- *
- * Derived from cryptoapi implementation, adapted for in-place
- * scatterlist interface. Originally based on the public domain
- * implementation written by Steve Reid.
- *
- * s390 Version:
- * Copyright IBM Corp. 2003, 2007
- * Author(s): Thomas Spatzier
- * Jan Glauber (jan.glauber@de.ibm.com)
- *
- * Derived from "crypto/sha1_generic.c"
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- */
-#include <asm/cpacf.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "sha.h"
-
-static int s390_sha1_init(struct shash_desc *desc)
-{
- struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
-
- sctx->state[0] = SHA1_H0;
- sctx->state[1] = SHA1_H1;
- sctx->state[2] = SHA1_H2;
- sctx->state[3] = SHA1_H3;
- sctx->state[4] = SHA1_H4;
- sctx->count = 0;
- sctx->func = CPACF_KIMD_SHA_1;
- sctx->first_message_part = 0;
-
- return 0;
-}
-
-static int s390_sha1_export(struct shash_desc *desc, void *out)
-{
- struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
- struct sha1_state *octx = out;
-
- octx->count = sctx->count;
- memcpy(octx->state, sctx->state, sizeof(octx->state));
- return 0;
-}
-
-static int s390_sha1_import(struct shash_desc *desc, const void *in)
-{
- struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
- const struct sha1_state *ictx = in;
-
- sctx->count = ictx->count;
- memcpy(sctx->state, ictx->state, sizeof(ictx->state));
- sctx->func = CPACF_KIMD_SHA_1;
- sctx->first_message_part = 0;
- return 0;
-}
-
-static struct shash_alg alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = s390_sha1_init,
- .update = s390_sha_update_blocks,
- .finup = s390_sha_finup,
- .export = s390_sha1_export,
- .import = s390_sha1_import,
- .descsize = S390_SHA_CTX_SIZE,
- .statesize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name= "sha1-s390",
- .cra_priority = 300,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init sha1_s390_init(void)
-{
- if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1))
- return -ENODEV;
- return crypto_register_shash(&alg);
-}
-
-static void __exit sha1_s390_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha1_s390_init);
-module_exit(sha1_s390_fini);
-
-MODULE_ALIAS_CRYPTO("sha1");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm");
diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c
deleted file mode 100644
index 6cbbf5e8555f..000000000000
--- a/arch/s390/crypto/sha512_s390.c
+++ /dev/null
@@ -1,154 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Cryptographic API.
- *
- * s390 implementation of the SHA512 and SHA38 Secure Hash Algorithm.
- *
- * Copyright IBM Corp. 2007
- * Author(s): Jan Glauber (jang@de.ibm.com)
- */
-#include <asm/cpacf.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <linux/cpufeature.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "sha.h"
-
-static int sha512_init(struct shash_desc *desc)
-{
- struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
-
- ctx->sha512.state[0] = SHA512_H0;
- ctx->sha512.state[1] = SHA512_H1;
- ctx->sha512.state[2] = SHA512_H2;
- ctx->sha512.state[3] = SHA512_H3;
- ctx->sha512.state[4] = SHA512_H4;
- ctx->sha512.state[5] = SHA512_H5;
- ctx->sha512.state[6] = SHA512_H6;
- ctx->sha512.state[7] = SHA512_H7;
- ctx->count = 0;
- ctx->sha512.count_hi = 0;
- ctx->func = CPACF_KIMD_SHA_512;
- ctx->first_message_part = 0;
-
- return 0;
-}
-
-static int sha512_export(struct shash_desc *desc, void *out)
-{
- struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
- struct sha512_state *octx = out;
-
- octx->count[0] = sctx->count;
- octx->count[1] = sctx->sha512.count_hi;
- memcpy(octx->state, sctx->state, sizeof(octx->state));
- return 0;
-}
-
-static int sha512_import(struct shash_desc *desc, const void *in)
-{
- struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
- const struct sha512_state *ictx = in;
-
- sctx->count = ictx->count[0];
- sctx->sha512.count_hi = ictx->count[1];
-
- memcpy(sctx->state, ictx->state, sizeof(ictx->state));
- sctx->func = CPACF_KIMD_SHA_512;
- sctx->first_message_part = 0;
- return 0;
-}
-
-static struct shash_alg sha512_alg = {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_init,
- .update = s390_sha_update_blocks,
- .finup = s390_sha_finup,
- .export = sha512_export,
- .import = sha512_import,
- .descsize = sizeof(struct s390_sha_ctx),
- .statesize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name= "sha512-s390",
- .cra_priority = 300,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-MODULE_ALIAS_CRYPTO("sha512");
-
-static int sha384_init(struct shash_desc *desc)
-{
- struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
-
- ctx->sha512.state[0] = SHA384_H0;
- ctx->sha512.state[1] = SHA384_H1;
- ctx->sha512.state[2] = SHA384_H2;
- ctx->sha512.state[3] = SHA384_H3;
- ctx->sha512.state[4] = SHA384_H4;
- ctx->sha512.state[5] = SHA384_H5;
- ctx->sha512.state[6] = SHA384_H6;
- ctx->sha512.state[7] = SHA384_H7;
- ctx->count = 0;
- ctx->sha512.count_hi = 0;
- ctx->func = CPACF_KIMD_SHA_512;
- ctx->first_message_part = 0;
-
- return 0;
-}
-
-static struct shash_alg sha384_alg = {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_init,
- .update = s390_sha_update_blocks,
- .finup = s390_sha_finup,
- .export = sha512_export,
- .import = sha512_import,
- .descsize = sizeof(struct s390_sha_ctx),
- .statesize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name= "sha384-s390",
- .cra_priority = 300,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_ctxsize = sizeof(struct s390_sha_ctx),
- .cra_module = THIS_MODULE,
- }
-};
-
-MODULE_ALIAS_CRYPTO("sha384");
-
-static int __init init(void)
-{
- int ret;
-
- if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512))
- return -ENODEV;
- if ((ret = crypto_register_shash(&sha512_alg)) < 0)
- goto out;
- if ((ret = crypto_register_shash(&sha384_alg)) < 0)
- crypto_unregister_shash(&sha512_alg);
-out:
- return ret;
-}
-
-static void __exit fini(void)
-{
- crypto_unregister_shash(&sha512_alg);
- crypto_unregister_shash(&sha384_alg);
-}
-
-module_cpu_feature_match(S390_CPU_FEATURE_MSA, init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA512 and SHA-384 Secure Hash Algorithm");
diff --git a/arch/s390/hypfs/hypfs.h b/arch/s390/hypfs/hypfs.h
index 83ebf54cca6b..4dc2e068e0ff 100644
--- a/arch/s390/hypfs/hypfs.h
+++ b/arch/s390/hypfs/hypfs.h
@@ -48,7 +48,7 @@ void hypfs_sprp_exit(void);
int __hypfs_fs_init(void);
-static inline int hypfs_fs_init(void)
+static __always_inline int hypfs_fs_init(void)
{
if (IS_ENABLED(CONFIG_S390_HYPFS_FS))
return __hypfs_fs_init();
diff --git a/arch/s390/hypfs/hypfs_diag.h b/arch/s390/hypfs/hypfs_diag.h
index 7090eff27fef..b5218135b8fe 100644
--- a/arch/s390/hypfs/hypfs_diag.h
+++ b/arch/s390/hypfs/hypfs_diag.h
@@ -19,7 +19,7 @@ int diag204_store(void *buf, int pages);
int __hypfs_diag_fs_init(void);
void __hypfs_diag_fs_exit(void);
-static inline int hypfs_diag_fs_init(void)
+static __always_inline int hypfs_diag_fs_init(void)
{
if (IS_ENABLED(CONFIG_S390_HYPFS_FS))
return __hypfs_diag_fs_init();
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 0f00f4b06d51..75b0fbb236d0 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -124,7 +124,7 @@ _LPP_OFFSET = __LC_LPP
#endif
.macro STACKLEAK_ERASE
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
brasl %r14,stackleak_erase_on_task_stack
#endif
.endm
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index e1240f6b29fa..494216c4b4f3 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -1209,7 +1209,7 @@ static int s390_runtime_instr_set(struct task_struct *target,
static const struct user_regset s390_regsets[] = {
{
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(s390_regs) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
@@ -1217,7 +1217,7 @@ static const struct user_regset s390_regsets[] = {
.set = s390_regs_set,
},
{
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(s390_fp_regs) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
@@ -1225,7 +1225,7 @@ static const struct user_regset s390_regsets[] = {
.set = s390_fpregs_set,
},
{
- .core_note_type = NT_S390_SYSTEM_CALL,
+ USER_REGSET_NOTE_TYPE(S390_SYSTEM_CALL),
.n = 1,
.size = sizeof(unsigned int),
.align = sizeof(unsigned int),
@@ -1233,7 +1233,7 @@ static const struct user_regset s390_regsets[] = {
.set = s390_system_call_set,
},
{
- .core_note_type = NT_S390_LAST_BREAK,
+ USER_REGSET_NOTE_TYPE(S390_LAST_BREAK),
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
@@ -1241,7 +1241,7 @@ static const struct user_regset s390_regsets[] = {
.set = s390_last_break_set,
},
{
- .core_note_type = NT_S390_TDB,
+ USER_REGSET_NOTE_TYPE(S390_TDB),
.n = 1,
.size = 256,
.align = 1,
@@ -1249,7 +1249,7 @@ static const struct user_regset s390_regsets[] = {
.set = s390_tdb_set,
},
{
- .core_note_type = NT_S390_VXRS_LOW,
+ USER_REGSET_NOTE_TYPE(S390_VXRS_LOW),
.n = __NUM_VXRS_LOW,
.size = sizeof(__u64),
.align = sizeof(__u64),
@@ -1257,7 +1257,7 @@ static const struct user_regset s390_regsets[] = {
.set = s390_vxrs_low_set,
},
{
- .core_note_type = NT_S390_VXRS_HIGH,
+ USER_REGSET_NOTE_TYPE(S390_VXRS_HIGH),
.n = __NUM_VXRS_HIGH,
.size = sizeof(__vector128),
.align = sizeof(__vector128),
@@ -1265,7 +1265,7 @@ static const struct user_regset s390_regsets[] = {
.set = s390_vxrs_high_set,
},
{
- .core_note_type = NT_S390_GS_CB,
+ USER_REGSET_NOTE_TYPE(S390_GS_CB),
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
@@ -1273,7 +1273,7 @@ static const struct user_regset s390_regsets[] = {
.set = s390_gs_cb_set,
},
{
- .core_note_type = NT_S390_GS_BC,
+ USER_REGSET_NOTE_TYPE(S390_GS_BC),
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
@@ -1281,7 +1281,7 @@ static const struct user_regset s390_regsets[] = {
.set = s390_gs_bc_set,
},
{
- .core_note_type = NT_S390_RI_CB,
+ USER_REGSET_NOTE_TYPE(S390_RI_CB),
.n = sizeof(struct runtime_instr_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
@@ -1413,7 +1413,7 @@ static int s390_compat_last_break_set(struct task_struct *target,
static const struct user_regset s390_compat_regsets[] = {
{
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(s390_compat_regs) / sizeof(compat_long_t),
.size = sizeof(compat_long_t),
.align = sizeof(compat_long_t),
@@ -1421,7 +1421,7 @@ static const struct user_regset s390_compat_regsets[] = {
.set = s390_compat_regs_set,
},
{
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(s390_fp_regs) / sizeof(compat_long_t),
.size = sizeof(compat_long_t),
.align = sizeof(compat_long_t),
@@ -1429,7 +1429,7 @@ static const struct user_regset s390_compat_regsets[] = {
.set = s390_fpregs_set,
},
{
- .core_note_type = NT_S390_SYSTEM_CALL,
+ USER_REGSET_NOTE_TYPE(S390_SYSTEM_CALL),
.n = 1,
.size = sizeof(compat_uint_t),
.align = sizeof(compat_uint_t),
@@ -1437,7 +1437,7 @@ static const struct user_regset s390_compat_regsets[] = {
.set = s390_system_call_set,
},
{
- .core_note_type = NT_S390_LAST_BREAK,
+ USER_REGSET_NOTE_TYPE(S390_LAST_BREAK),
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
@@ -1445,7 +1445,7 @@ static const struct user_regset s390_compat_regsets[] = {
.set = s390_compat_last_break_set,
},
{
- .core_note_type = NT_S390_TDB,
+ USER_REGSET_NOTE_TYPE(S390_TDB),
.n = 1,
.size = 256,
.align = 1,
@@ -1453,7 +1453,7 @@ static const struct user_regset s390_compat_regsets[] = {
.set = s390_tdb_set,
},
{
- .core_note_type = NT_S390_VXRS_LOW,
+ USER_REGSET_NOTE_TYPE(S390_VXRS_LOW),
.n = __NUM_VXRS_LOW,
.size = sizeof(__u64),
.align = sizeof(__u64),
@@ -1461,7 +1461,7 @@ static const struct user_regset s390_compat_regsets[] = {
.set = s390_vxrs_low_set,
},
{
- .core_note_type = NT_S390_VXRS_HIGH,
+ USER_REGSET_NOTE_TYPE(S390_VXRS_HIGH),
.n = __NUM_VXRS_HIGH,
.size = sizeof(__vector128),
.align = sizeof(__vector128),
@@ -1469,7 +1469,7 @@ static const struct user_regset s390_compat_regsets[] = {
.set = s390_vxrs_high_set,
},
{
- .core_note_type = NT_S390_HIGH_GPRS,
+ USER_REGSET_NOTE_TYPE(S390_HIGH_GPRS),
.n = sizeof(s390_compat_regs_high) / sizeof(compat_long_t),
.size = sizeof(compat_long_t),
.align = sizeof(compat_long_t),
@@ -1477,7 +1477,7 @@ static const struct user_regset s390_compat_regsets[] = {
.set = s390_compat_regs_high_set,
},
{
- .core_note_type = NT_S390_GS_CB,
+ USER_REGSET_NOTE_TYPE(S390_GS_CB),
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
@@ -1485,7 +1485,7 @@ static const struct user_regset s390_compat_regsets[] = {
.set = s390_gs_cb_set,
},
{
- .core_note_type = NT_S390_GS_BC,
+ USER_REGSET_NOTE_TYPE(S390_GS_BC),
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
@@ -1493,7 +1493,7 @@ static const struct user_regset s390_compat_regsets[] = {
.set = s390_gs_bc_set,
},
{
- .core_note_type = NT_S390_RI_CB,
+ USER_REGSET_NOTE_TYPE(S390_RI_CB),
.n = sizeof(struct runtime_instr_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index a4569b96ef06..8a6744d658db 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -470,3 +470,5 @@
465 common listxattrat sys_listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr sys_file_setattr
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index cd35cdbfa871..f43f897d3fc0 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -3,7 +3,6 @@
# Makefile for s390-specific library files..
#
-obj-y += crypto/
lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o
lib-y += csum-partial.o
obj-y += mem.o xor.o
@@ -25,6 +24,3 @@ obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
obj-$(CONFIG_EXPOLINE_EXTERN) += expoline.o
-
-obj-$(CONFIG_CRC32_ARCH) += crc32-s390.o
-crc32-s390-y := crc32.o crc32le-vx.o crc32be-vx.o
diff --git a/arch/s390/lib/crc32-vx.h b/arch/s390/lib/crc32-vx.h
deleted file mode 100644
index 652c96e1a822..000000000000
--- a/arch/s390/lib/crc32-vx.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _CRC32_VX_S390_H
-#define _CRC32_VX_S390_H
-
-#include <linux/types.h>
-
-u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
-u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
-u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
-
-#endif /* _CRC32_VX_S390_H */
diff --git a/arch/s390/lib/crc32.c b/arch/s390/lib/crc32.c
deleted file mode 100644
index 3c4b344417c1..000000000000
--- a/arch/s390/lib/crc32.c
+++ /dev/null
@@ -1,77 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * CRC-32 implemented with the z/Architecture Vector Extension Facility.
- *
- * Copyright IBM Corp. 2015
- * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
- */
-#define KMSG_COMPONENT "crc32-vx"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/module.h>
-#include <linux/cpufeature.h>
-#include <linux/crc32.h>
-#include <asm/fpu.h>
-#include "crc32-vx.h"
-
-#define VX_MIN_LEN 64
-#define VX_ALIGNMENT 16L
-#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
-
-/*
- * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension
- *
- * Creates a function to perform a particular CRC-32 computation. Depending
- * on the message buffer, the hardware-accelerated or software implementation
- * is used. Note that the message buffer is aligned to improve fetch
- * operations of VECTOR LOAD MULTIPLE instructions.
- */
-#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \
- u32 ___fname(u32 crc, const u8 *data, size_t datalen) \
- { \
- unsigned long prealign, aligned, remaining; \
- DECLARE_KERNEL_FPU_ONSTACK16(vxstate); \
- \
- if (datalen < VX_MIN_LEN + VX_ALIGN_MASK || !cpu_has_vx()) \
- return ___crc32_sw(crc, data, datalen); \
- \
- if ((unsigned long)data & VX_ALIGN_MASK) { \
- prealign = VX_ALIGNMENT - \
- ((unsigned long)data & VX_ALIGN_MASK); \
- datalen -= prealign; \
- crc = ___crc32_sw(crc, data, prealign); \
- data = (void *)((unsigned long)data + prealign); \
- } \
- \
- aligned = datalen & ~VX_ALIGN_MASK; \
- remaining = datalen & VX_ALIGN_MASK; \
- \
- kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW); \
- crc = ___crc32_vx(crc, data, aligned); \
- kernel_fpu_end(&vxstate, KERNEL_VXR_LOW); \
- \
- if (remaining) \
- crc = ___crc32_sw(crc, data + aligned, remaining); \
- \
- return crc; \
- } \
- EXPORT_SYMBOL(___fname);
-
-DEFINE_CRC32_VX(crc32_le_arch, crc32_le_vgfm_16, crc32_le_base)
-DEFINE_CRC32_VX(crc32_be_arch, crc32_be_vgfm_16, crc32_be_base)
-DEFINE_CRC32_VX(crc32c_arch, crc32c_le_vgfm_16, crc32c_base)
-
-u32 crc32_optimizations(void)
-{
- if (cpu_has_vx()) {
- return CRC32_LE_OPTIMIZATION |
- CRC32_BE_OPTIMIZATION |
- CRC32C_OPTIMIZATION;
- }
- return 0;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_AUTHOR("Hendrik Brueckner <brueckner@linux.vnet.ibm.com>");
-MODULE_DESCRIPTION("CRC-32 algorithms using z/Architecture Vector Extension Facility");
-MODULE_LICENSE("GPL");
diff --git a/arch/s390/lib/crc32be-vx.c b/arch/s390/lib/crc32be-vx.c
deleted file mode 100644
index fed7c9c70d05..000000000000
--- a/arch/s390/lib/crc32be-vx.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Hardware-accelerated CRC-32 variants for Linux on z Systems
- *
- * Use the z/Architecture Vector Extension Facility to accelerate the
- * computing of CRC-32 checksums.
- *
- * This CRC-32 implementation algorithm processes the most-significant
- * bit first (BE).
- *
- * Copyright IBM Corp. 2015
- * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
- */
-
-#include <linux/types.h>
-#include <asm/fpu.h>
-#include "crc32-vx.h"
-
-/* Vector register range containing CRC-32 constants */
-#define CONST_R1R2 9
-#define CONST_R3R4 10
-#define CONST_R5 11
-#define CONST_R6 12
-#define CONST_RU_POLY 13
-#define CONST_CRC_POLY 14
-
-/*
- * The CRC-32 constant block contains reduction constants to fold and
- * process particular chunks of the input data stream in parallel.
- *
- * For the CRC-32 variants, the constants are precomputed according to
- * these definitions:
- *
- * R1 = x4*128+64 mod P(x)
- * R2 = x4*128 mod P(x)
- * R3 = x128+64 mod P(x)
- * R4 = x128 mod P(x)
- * R5 = x96 mod P(x)
- * R6 = x64 mod P(x)
- *
- * Barret reduction constant, u, is defined as floor(x**64 / P(x)).
- *
- * where P(x) is the polynomial in the normal domain and the P'(x) is the
- * polynomial in the reversed (bitreflected) domain.
- *
- * Note that the constant definitions below are extended in order to compute
- * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
- * The rightmost doubleword can be 0 to prevent contribution to the result or
- * can be multiplied by 1 to perform an XOR without the need for a separate
- * VECTOR EXCLUSIVE OR instruction.
- *
- * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
- *
- * P(x) = 0x04C11DB7
- * P'(x) = 0xEDB88320
- */
-
-static unsigned long constants_CRC_32_BE[] = {
- 0x08833794c, 0x0e6228b11, /* R1, R2 */
- 0x0c5b9cd4c, 0x0e8a45605, /* R3, R4 */
- 0x0f200aa66, 1UL << 32, /* R5, x32 */
- 0x0490d678d, 1, /* R6, 1 */
- 0x104d101df, 0, /* u */
- 0x104C11DB7, 0, /* P(x) */
-};
-
-/**
- * crc32_be_vgfm_16 - Compute CRC-32 (BE variant) with vector registers
- * @crc: Initial CRC value, typically ~0.
- * @buf: Input buffer pointer, performance might be improved if the
- * buffer is on a doubleword boundary.
- * @size: Size of the buffer, must be 64 bytes or greater.
- *
- * Register usage:
- * V0: Initial CRC value and intermediate constants and results.
- * V1..V4: Data for CRC computation.
- * V5..V8: Next data chunks that are fetched from the input buffer.
- * V9..V14: CRC-32 constants.
- */
-u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
-{
- /* Load CRC-32 constants */
- fpu_vlm(CONST_R1R2, CONST_CRC_POLY, &constants_CRC_32_BE);
- fpu_vzero(0);
-
- /* Load the initial CRC value into the leftmost word of V0. */
- fpu_vlvgf(0, crc, 0);
-
- /* Load a 64-byte data chunk and XOR with CRC */
- fpu_vlm(1, 4, buf);
- fpu_vx(1, 0, 1);
- buf += 64;
- size -= 64;
-
- while (size >= 64) {
- /* Load the next 64-byte data chunk into V5 to V8 */
- fpu_vlm(5, 8, buf);
-
- /*
- * Perform a GF(2) multiplication of the doublewords in V1 with
- * the reduction constants in V0. The intermediate result is
- * then folded (accumulated) with the next data chunk in V5 and
- * stored in V1. Repeat this step for the register contents
- * in V2, V3, and V4 respectively.
- */
- fpu_vgfmag(1, CONST_R1R2, 1, 5);
- fpu_vgfmag(2, CONST_R1R2, 2, 6);
- fpu_vgfmag(3, CONST_R1R2, 3, 7);
- fpu_vgfmag(4, CONST_R1R2, 4, 8);
- buf += 64;
- size -= 64;
- }
-
- /* Fold V1 to V4 into a single 128-bit value in V1 */
- fpu_vgfmag(1, CONST_R3R4, 1, 2);
- fpu_vgfmag(1, CONST_R3R4, 1, 3);
- fpu_vgfmag(1, CONST_R3R4, 1, 4);
-
- while (size >= 16) {
- fpu_vl(2, buf);
- fpu_vgfmag(1, CONST_R3R4, 1, 2);
- buf += 16;
- size -= 16;
- }
-
- /*
- * The R5 constant is used to fold a 128-bit value into an 96-bit value
- * that is XORed with the next 96-bit input data chunk. To use a single
- * VGFMG instruction, multiply the rightmost 64-bit with x^32 (1<<32) to
- * form an intermediate 96-bit value (with appended zeros) which is then
- * XORed with the intermediate reduction result.
- */
- fpu_vgfmg(1, CONST_R5, 1);
-
- /*
- * Further reduce the remaining 96-bit value to a 64-bit value using a
- * single VGFMG, the rightmost doubleword is multiplied with 0x1. The
- * intermediate result is then XORed with the product of the leftmost
- * doubleword with R6. The result is a 64-bit value and is subject to
- * the Barret reduction.
- */
- fpu_vgfmg(1, CONST_R6, 1);
-
- /*
- * The input values to the Barret reduction are the degree-63 polynomial
- * in V1 (R(x)), degree-32 generator polynomial, and the reduction
- * constant u. The Barret reduction result is the CRC value of R(x) mod
- * P(x).
- *
- * The Barret reduction algorithm is defined as:
- *
- * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
- * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
- * 3. C(x) = R(x) XOR T2(x) mod x^32
- *
- * Note: To compensate the division by x^32, use the vector unpack
- * instruction to move the leftmost word into the leftmost doubleword
- * of the vector register. The rightmost doubleword is multiplied
- * with zero to not contribute to the intermediate results.
- */
-
- /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
- fpu_vupllf(2, 1);
- fpu_vgfmg(2, CONST_RU_POLY, 2);
-
- /*
- * Compute the GF(2) product of the CRC polynomial in VO with T1(x) in
- * V2 and XOR the intermediate result, T2(x), with the value in V1.
- * The final result is in the rightmost word of V2.
- */
- fpu_vupllf(2, 2);
- fpu_vgfmag(2, CONST_CRC_POLY, 2, 1);
- return fpu_vlgvf(2, 3);
-}
diff --git a/arch/s390/lib/crc32le-vx.c b/arch/s390/lib/crc32le-vx.c
deleted file mode 100644
index 2f629f394df7..000000000000
--- a/arch/s390/lib/crc32le-vx.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Hardware-accelerated CRC-32 variants for Linux on z Systems
- *
- * Use the z/Architecture Vector Extension Facility to accelerate the
- * computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet
- * and Castagnoli.
- *
- * This CRC-32 implementation algorithm is bitreflected and processes
- * the least-significant bit first (Little-Endian).
- *
- * Copyright IBM Corp. 2015
- * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
- */
-
-#include <linux/types.h>
-#include <asm/fpu.h>
-#include "crc32-vx.h"
-
-/* Vector register range containing CRC-32 constants */
-#define CONST_PERM_LE2BE 9
-#define CONST_R2R1 10
-#define CONST_R4R3 11
-#define CONST_R5 12
-#define CONST_RU_POLY 13
-#define CONST_CRC_POLY 14
-
-/*
- * The CRC-32 constant block contains reduction constants to fold and
- * process particular chunks of the input data stream in parallel.
- *
- * For the CRC-32 variants, the constants are precomputed according to
- * these definitions:
- *
- * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
- * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
- * R3 = [(x128+32 mod P'(x) << 32)]' << 1
- * R4 = [(x128-32 mod P'(x) << 32)]' << 1
- * R5 = [(x64 mod P'(x) << 32)]' << 1
- * R6 = [(x32 mod P'(x) << 32)]' << 1
- *
- * The bitreflected Barret reduction constant, u', is defined as
- * the bit reversal of floor(x**64 / P(x)).
- *
- * where P(x) is the polynomial in the normal domain and the P'(x) is the
- * polynomial in the reversed (bitreflected) domain.
- *
- * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
- *
- * P(x) = 0x04C11DB7
- * P'(x) = 0xEDB88320
- *
- * CRC-32C (Castagnoli) polynomials:
- *
- * P(x) = 0x1EDC6F41
- * P'(x) = 0x82F63B78
- */
-
-static unsigned long constants_CRC_32_LE[] = {
- 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */
- 0x1c6e41596, 0x154442bd4, /* R2, R1 */
- 0x0ccaa009e, 0x1751997d0, /* R4, R3 */
- 0x0, 0x163cd6124, /* R5 */
- 0x0, 0x1f7011641, /* u' */
- 0x0, 0x1db710641 /* P'(x) << 1 */
-};
-
-static unsigned long constants_CRC_32C_LE[] = {
- 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */
- 0x09e4addf8, 0x740eef02, /* R2, R1 */
- 0x14cd00bd6, 0xf20c0dfe, /* R4, R3 */
- 0x0, 0x0dd45aab8, /* R5 */
- 0x0, 0x0dea713f1, /* u' */
- 0x0, 0x105ec76f0 /* P'(x) << 1 */
-};
-
-/**
- * crc32_le_vgfm_generic - Compute CRC-32 (LE variant) with vector registers
- * @crc: Initial CRC value, typically ~0.
- * @buf: Input buffer pointer, performance might be improved if the
- * buffer is on a doubleword boundary.
- * @size: Size of the buffer, must be 64 bytes or greater.
- * @constants: CRC-32 constant pool base pointer.
- *
- * Register usage:
- * V0: Initial CRC value and intermediate constants and results.
- * V1..V4: Data for CRC computation.
- * V5..V8: Next data chunks that are fetched from the input buffer.
- * V9: Constant for BE->LE conversion and shift operations
- * V10..V14: CRC-32 constants.
- */
-static u32 crc32_le_vgfm_generic(u32 crc, unsigned char const *buf, size_t size, unsigned long *constants)
-{
- /* Load CRC-32 constants */
- fpu_vlm(CONST_PERM_LE2BE, CONST_CRC_POLY, constants);
-
- /*
- * Load the initial CRC value.
- *
- * The CRC value is loaded into the rightmost word of the
- * vector register and is later XORed with the LSB portion
- * of the loaded input data.
- */
- fpu_vzero(0); /* Clear V0 */
- fpu_vlvgf(0, crc, 3); /* Load CRC into rightmost word */
-
- /* Load a 64-byte data chunk and XOR with CRC */
- fpu_vlm(1, 4, buf);
- fpu_vperm(1, 1, 1, CONST_PERM_LE2BE);
- fpu_vperm(2, 2, 2, CONST_PERM_LE2BE);
- fpu_vperm(3, 3, 3, CONST_PERM_LE2BE);
- fpu_vperm(4, 4, 4, CONST_PERM_LE2BE);
-
- fpu_vx(1, 0, 1); /* V1 ^= CRC */
- buf += 64;
- size -= 64;
-
- while (size >= 64) {
- fpu_vlm(5, 8, buf);
- fpu_vperm(5, 5, 5, CONST_PERM_LE2BE);
- fpu_vperm(6, 6, 6, CONST_PERM_LE2BE);
- fpu_vperm(7, 7, 7, CONST_PERM_LE2BE);
- fpu_vperm(8, 8, 8, CONST_PERM_LE2BE);
- /*
- * Perform a GF(2) multiplication of the doublewords in V1 with
- * the R1 and R2 reduction constants in V0. The intermediate
- * result is then folded (accumulated) with the next data chunk
- * in V5 and stored in V1. Repeat this step for the register
- * contents in V2, V3, and V4 respectively.
- */
- fpu_vgfmag(1, CONST_R2R1, 1, 5);
- fpu_vgfmag(2, CONST_R2R1, 2, 6);
- fpu_vgfmag(3, CONST_R2R1, 3, 7);
- fpu_vgfmag(4, CONST_R2R1, 4, 8);
- buf += 64;
- size -= 64;
- }
-
- /*
- * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
- * and R4 and accumulating the next 128-bit chunk until a single 128-bit
- * value remains.
- */
- fpu_vgfmag(1, CONST_R4R3, 1, 2);
- fpu_vgfmag(1, CONST_R4R3, 1, 3);
- fpu_vgfmag(1, CONST_R4R3, 1, 4);
-
- while (size >= 16) {
- fpu_vl(2, buf);
- fpu_vperm(2, 2, 2, CONST_PERM_LE2BE);
- fpu_vgfmag(1, CONST_R4R3, 1, 2);
- buf += 16;
- size -= 16;
- }
-
- /*
- * Set up a vector register for byte shifts. The shift value must
- * be loaded in bits 1-4 in byte element 7 of a vector register.
- * Shift by 8 bytes: 0x40
- * Shift by 4 bytes: 0x20
- */
- fpu_vleib(9, 0x40, 7);
-
- /*
- * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
- * to move R4 into the rightmost doubleword and set the leftmost
- * doubleword to 0x1.
- */
- fpu_vsrlb(0, CONST_R4R3, 9);
- fpu_vleig(0, 1, 0);
-
- /*
- * Compute GF(2) product of V1 and V0. The rightmost doubleword
- * of V1 is multiplied with R4. The leftmost doubleword of V1 is
- * multiplied by 0x1 and is then XORed with rightmost product.
- * Implicitly, the intermediate leftmost product becomes padded
- */
- fpu_vgfmg(1, 0, 1);
-
- /*
- * Now do the final 32-bit fold by multiplying the rightmost word
- * in V1 with R5 and XOR the result with the remaining bits in V1.
- *
- * To achieve this by a single VGFMAG, right shift V1 by a word
- * and store the result in V2 which is then accumulated. Use the
- * vector unpack instruction to load the rightmost half of the
- * doubleword into the rightmost doubleword element of V1; the other
- * half is loaded in the leftmost doubleword.
- * The vector register with CONST_R5 contains the R5 constant in the
- * rightmost doubleword and the leftmost doubleword is zero to ignore
- * the leftmost product of V1.
- */
- fpu_vleib(9, 0x20, 7); /* Shift by words */
- fpu_vsrlb(2, 1, 9); /* Store remaining bits in V2 */
- fpu_vupllf(1, 1); /* Split rightmost doubleword */
- fpu_vgfmag(1, CONST_R5, 1, 2); /* V1 = (V1 * R5) XOR V2 */
-
- /*
- * Apply a Barret reduction to compute the final 32-bit CRC value.
- *
- * The input values to the Barret reduction are the degree-63 polynomial
- * in V1 (R(x)), degree-32 generator polynomial, and the reduction
- * constant u. The Barret reduction result is the CRC value of R(x) mod
- * P(x).
- *
- * The Barret reduction algorithm is defined as:
- *
- * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
- * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
- * 3. C(x) = R(x) XOR T2(x) mod x^32
- *
- * Note: The leftmost doubleword of vector register containing
- * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
- * is zero and does not contribute to the final result.
- */
-
- /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
- fpu_vupllf(2, 1);
- fpu_vgfmg(2, CONST_RU_POLY, 2);
-
- /*
- * Compute the GF(2) product of the CRC polynomial with T1(x) in
- * V2 and XOR the intermediate result, T2(x), with the value in V1.
- * The final result is stored in word element 2 of V2.
- */
- fpu_vupllf(2, 2);
- fpu_vgfmag(2, CONST_CRC_POLY, 2, 1);
-
- return fpu_vlgvf(2, 2);
-}
-
-u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
-{
- return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32_LE[0]);
-}
-
-u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
-{
- return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32C_LE[0]);
-}
diff --git a/arch/s390/lib/crypto/Kconfig b/arch/s390/lib/crypto/Kconfig
deleted file mode 100644
index e3f855ef4393..000000000000
--- a/arch/s390/lib/crypto/Kconfig
+++ /dev/null
@@ -1,13 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_CHACHA_S390
- tristate
- default CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_SHA256_S390
- tristate
- default CRYPTO_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256
- select CRYPTO_LIB_SHA256_GENERIC
diff --git a/arch/s390/lib/crypto/Makefile b/arch/s390/lib/crypto/Makefile
deleted file mode 100644
index 5df30f1e7930..000000000000
--- a/arch/s390/lib/crypto/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o
-chacha_s390-y := chacha-glue.o chacha-s390.o
-
-obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256-s390.o
-sha256-s390-y := sha256.o
diff --git a/arch/s390/lib/crypto/chacha-glue.c b/arch/s390/lib/crypto/chacha-glue.c
deleted file mode 100644
index f95ba3483bbc..000000000000
--- a/arch/s390/lib/crypto/chacha-glue.c
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ChaCha stream cipher (s390 optimized)
- *
- * Copyright IBM Corp. 2021
- */
-
-#define KMSG_COMPONENT "chacha_s390"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <crypto/chacha.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-#include <asm/fpu.h>
-#include "chacha-s390.h"
-
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- /* TODO: implement hchacha_block_arch() in assembly */
- hchacha_block_generic(state, out, nrounds);
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- /* s390 chacha20 implementation has 20 rounds hard-coded,
- * it cannot handle a block of data or less, but otherwise
- * it can handle data of arbitrary size
- */
- if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) {
- chacha_crypt_generic(state, dst, src, bytes, nrounds);
- } else {
- DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
-
- kernel_fpu_begin(&vxstate, KERNEL_VXR);
- chacha20_vx(dst, src, bytes, &state->x[4], &state->x[12]);
- kernel_fpu_end(&vxstate, KERNEL_VXR);
-
- state->x[12] += round_up(bytes, CHACHA_BLOCK_SIZE) /
- CHACHA_BLOCK_SIZE;
- }
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- return cpu_has_vx();
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/s390/lib/crypto/chacha-s390.S b/arch/s390/lib/crypto/chacha-s390.S
deleted file mode 100644
index 63f3102678c0..000000000000
--- a/arch/s390/lib/crypto/chacha-s390.S
+++ /dev/null
@@ -1,908 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Original implementation written by Andy Polyakov, @dot-asm.
- * This is an adaptation of the original code for kernel use.
- *
- * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
- */
-
-#include <linux/linkage.h>
-#include <asm/nospec-insn.h>
-#include <asm/fpu-insn.h>
-
-#define SP %r15
-#define FRAME (16 * 8 + 4 * 8)
-
- .data
- .balign 32
-
-SYM_DATA_START_LOCAL(sigma)
- .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
- .long 1,0,0,0
- .long 2,0,0,0
- .long 3,0,0,0
- .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap
-
- .long 0,1,2,3
- .long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma
- .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e
- .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32
- .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574
-SYM_DATA_END(sigma)
-
- .previous
-
- GEN_BR_THUNK %r14
-
- .text
-
-#############################################################################
-# void chacha20_vx_4x(u8 *out, counst u8 *inp, size_t len,
-# counst u32 *key, const u32 *counter)
-
-#define OUT %r2
-#define INP %r3
-#define LEN %r4
-#define KEY %r5
-#define COUNTER %r6
-
-#define BEPERM %v31
-#define CTR %v26
-
-#define K0 %v16
-#define K1 %v17
-#define K2 %v18
-#define K3 %v19
-
-#define XA0 %v0
-#define XA1 %v1
-#define XA2 %v2
-#define XA3 %v3
-
-#define XB0 %v4
-#define XB1 %v5
-#define XB2 %v6
-#define XB3 %v7
-
-#define XC0 %v8
-#define XC1 %v9
-#define XC2 %v10
-#define XC3 %v11
-
-#define XD0 %v12
-#define XD1 %v13
-#define XD2 %v14
-#define XD3 %v15
-
-#define XT0 %v27
-#define XT1 %v28
-#define XT2 %v29
-#define XT3 %v30
-
-SYM_FUNC_START(chacha20_vx_4x)
- stmg %r6,%r7,6*8(SP)
-
- larl %r7,sigma
- lhi %r0,10
- lhi %r1,0
-
- VL K0,0,,%r7 # load sigma
- VL K1,0,,KEY # load key
- VL K2,16,,KEY
- VL K3,0,,COUNTER # load counter
-
- VL BEPERM,0x40,,%r7
- VL CTR,0x50,,%r7
-
- VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma
-
- VREPF XB0,K1,0 # smash the key
- VREPF XB1,K1,1
- VREPF XB2,K1,2
- VREPF XB3,K1,3
-
- VREPF XD0,K3,0
- VREPF XD1,K3,1
- VREPF XD2,K3,2
- VREPF XD3,K3,3
- VAF XD0,XD0,CTR
-
- VREPF XC0,K2,0
- VREPF XC1,K2,1
- VREPF XC2,K2,2
- VREPF XC3,K2,3
-
-.Loop_4x:
- VAF XA0,XA0,XB0
- VX XD0,XD0,XA0
- VERLLF XD0,XD0,16
-
- VAF XA1,XA1,XB1
- VX XD1,XD1,XA1
- VERLLF XD1,XD1,16
-
- VAF XA2,XA2,XB2
- VX XD2,XD2,XA2
- VERLLF XD2,XD2,16
-
- VAF XA3,XA3,XB3
- VX XD3,XD3,XA3
- VERLLF XD3,XD3,16
-
- VAF XC0,XC0,XD0
- VX XB0,XB0,XC0
- VERLLF XB0,XB0,12
-
- VAF XC1,XC1,XD1
- VX XB1,XB1,XC1
- VERLLF XB1,XB1,12
-
- VAF XC2,XC2,XD2
- VX XB2,XB2,XC2
- VERLLF XB2,XB2,12
-
- VAF XC3,XC3,XD3
- VX XB3,XB3,XC3
- VERLLF XB3,XB3,12
-
- VAF XA0,XA0,XB0
- VX XD0,XD0,XA0
- VERLLF XD0,XD0,8
-
- VAF XA1,XA1,XB1
- VX XD1,XD1,XA1
- VERLLF XD1,XD1,8
-
- VAF XA2,XA2,XB2
- VX XD2,XD2,XA2
- VERLLF XD2,XD2,8
-
- VAF XA3,XA3,XB3
- VX XD3,XD3,XA3
- VERLLF XD3,XD3,8
-
- VAF XC0,XC0,XD0
- VX XB0,XB0,XC0
- VERLLF XB0,XB0,7
-
- VAF XC1,XC1,XD1
- VX XB1,XB1,XC1
- VERLLF XB1,XB1,7
-
- VAF XC2,XC2,XD2
- VX XB2,XB2,XC2
- VERLLF XB2,XB2,7
-
- VAF XC3,XC3,XD3
- VX XB3,XB3,XC3
- VERLLF XB3,XB3,7
-
- VAF XA0,XA0,XB1
- VX XD3,XD3,XA0
- VERLLF XD3,XD3,16
-
- VAF XA1,XA1,XB2
- VX XD0,XD0,XA1
- VERLLF XD0,XD0,16
-
- VAF XA2,XA2,XB3
- VX XD1,XD1,XA2
- VERLLF XD1,XD1,16
-
- VAF XA3,XA3,XB0
- VX XD2,XD2,XA3
- VERLLF XD2,XD2,16
-
- VAF XC2,XC2,XD3
- VX XB1,XB1,XC2
- VERLLF XB1,XB1,12
-
- VAF XC3,XC3,XD0
- VX XB2,XB2,XC3
- VERLLF XB2,XB2,12
-
- VAF XC0,XC0,XD1
- VX XB3,XB3,XC0
- VERLLF XB3,XB3,12
-
- VAF XC1,XC1,XD2
- VX XB0,XB0,XC1
- VERLLF XB0,XB0,12
-
- VAF XA0,XA0,XB1
- VX XD3,XD3,XA0
- VERLLF XD3,XD3,8
-
- VAF XA1,XA1,XB2
- VX XD0,XD0,XA1
- VERLLF XD0,XD0,8
-
- VAF XA2,XA2,XB3
- VX XD1,XD1,XA2
- VERLLF XD1,XD1,8
-
- VAF XA3,XA3,XB0
- VX XD2,XD2,XA3
- VERLLF XD2,XD2,8
-
- VAF XC2,XC2,XD3
- VX XB1,XB1,XC2
- VERLLF XB1,XB1,7
-
- VAF XC3,XC3,XD0
- VX XB2,XB2,XC3
- VERLLF XB2,XB2,7
-
- VAF XC0,XC0,XD1
- VX XB3,XB3,XC0
- VERLLF XB3,XB3,7
-
- VAF XC1,XC1,XD2
- VX XB0,XB0,XC1
- VERLLF XB0,XB0,7
- brct %r0,.Loop_4x
-
- VAF XD0,XD0,CTR
-
- VMRHF XT0,XA0,XA1 # transpose data
- VMRHF XT1,XA2,XA3
- VMRLF XT2,XA0,XA1
- VMRLF XT3,XA2,XA3
- VPDI XA0,XT0,XT1,0b0000
- VPDI XA1,XT0,XT1,0b0101
- VPDI XA2,XT2,XT3,0b0000
- VPDI XA3,XT2,XT3,0b0101
-
- VMRHF XT0,XB0,XB1
- VMRHF XT1,XB2,XB3
- VMRLF XT2,XB0,XB1
- VMRLF XT3,XB2,XB3
- VPDI XB0,XT0,XT1,0b0000
- VPDI XB1,XT0,XT1,0b0101
- VPDI XB2,XT2,XT3,0b0000
- VPDI XB3,XT2,XT3,0b0101
-
- VMRHF XT0,XC0,XC1
- VMRHF XT1,XC2,XC3
- VMRLF XT2,XC0,XC1
- VMRLF XT3,XC2,XC3
- VPDI XC0,XT0,XT1,0b0000
- VPDI XC1,XT0,XT1,0b0101
- VPDI XC2,XT2,XT3,0b0000
- VPDI XC3,XT2,XT3,0b0101
-
- VMRHF XT0,XD0,XD1
- VMRHF XT1,XD2,XD3
- VMRLF XT2,XD0,XD1
- VMRLF XT3,XD2,XD3
- VPDI XD0,XT0,XT1,0b0000
- VPDI XD1,XT0,XT1,0b0101
- VPDI XD2,XT2,XT3,0b0000
- VPDI XD3,XT2,XT3,0b0101
-
- VAF XA0,XA0,K0
- VAF XB0,XB0,K1
- VAF XC0,XC0,K2
- VAF XD0,XD0,K3
-
- VPERM XA0,XA0,XA0,BEPERM
- VPERM XB0,XB0,XB0,BEPERM
- VPERM XC0,XC0,XC0,BEPERM
- VPERM XD0,XD0,XD0,BEPERM
-
- VLM XT0,XT3,0,INP,0
-
- VX XT0,XT0,XA0
- VX XT1,XT1,XB0
- VX XT2,XT2,XC0
- VX XT3,XT3,XD0
-
- VSTM XT0,XT3,0,OUT,0
-
- la INP,0x40(INP)
- la OUT,0x40(OUT)
- aghi LEN,-0x40
-
- VAF XA0,XA1,K0
- VAF XB0,XB1,K1
- VAF XC0,XC1,K2
- VAF XD0,XD1,K3
-
- VPERM XA0,XA0,XA0,BEPERM
- VPERM XB0,XB0,XB0,BEPERM
- VPERM XC0,XC0,XC0,BEPERM
- VPERM XD0,XD0,XD0,BEPERM
-
- clgfi LEN,0x40
- jl .Ltail_4x
-
- VLM XT0,XT3,0,INP,0
-
- VX XT0,XT0,XA0
- VX XT1,XT1,XB0
- VX XT2,XT2,XC0
- VX XT3,XT3,XD0
-
- VSTM XT0,XT3,0,OUT,0
-
- la INP,0x40(INP)
- la OUT,0x40(OUT)
- aghi LEN,-0x40
- je .Ldone_4x
-
- VAF XA0,XA2,K0
- VAF XB0,XB2,K1
- VAF XC0,XC2,K2
- VAF XD0,XD2,K3
-
- VPERM XA0,XA0,XA0,BEPERM
- VPERM XB0,XB0,XB0,BEPERM
- VPERM XC0,XC0,XC0,BEPERM
- VPERM XD0,XD0,XD0,BEPERM
-
- clgfi LEN,0x40
- jl .Ltail_4x
-
- VLM XT0,XT3,0,INP,0
-
- VX XT0,XT0,XA0
- VX XT1,XT1,XB0
- VX XT2,XT2,XC0
- VX XT3,XT3,XD0
-
- VSTM XT0,XT3,0,OUT,0
-
- la INP,0x40(INP)
- la OUT,0x40(OUT)
- aghi LEN,-0x40
- je .Ldone_4x
-
- VAF XA0,XA3,K0
- VAF XB0,XB3,K1
- VAF XC0,XC3,K2
- VAF XD0,XD3,K3
-
- VPERM XA0,XA0,XA0,BEPERM
- VPERM XB0,XB0,XB0,BEPERM
- VPERM XC0,XC0,XC0,BEPERM
- VPERM XD0,XD0,XD0,BEPERM
-
- clgfi LEN,0x40
- jl .Ltail_4x
-
- VLM XT0,XT3,0,INP,0
-
- VX XT0,XT0,XA0
- VX XT1,XT1,XB0
- VX XT2,XT2,XC0
- VX XT3,XT3,XD0
-
- VSTM XT0,XT3,0,OUT,0
-
-.Ldone_4x:
- lmg %r6,%r7,6*8(SP)
- BR_EX %r14
-
-.Ltail_4x:
- VLR XT0,XC0
- VLR XT1,XD0
-
- VST XA0,8*8+0x00,,SP
- VST XB0,8*8+0x10,,SP
- VST XT0,8*8+0x20,,SP
- VST XT1,8*8+0x30,,SP
-
- lghi %r1,0
-
-.Loop_tail_4x:
- llgc %r5,0(%r1,INP)
- llgc %r6,8*8(%r1,SP)
- xr %r6,%r5
- stc %r6,0(%r1,OUT)
- la %r1,1(%r1)
- brct LEN,.Loop_tail_4x
-
- lmg %r6,%r7,6*8(SP)
- BR_EX %r14
-SYM_FUNC_END(chacha20_vx_4x)
-
-#undef OUT
-#undef INP
-#undef LEN
-#undef KEY
-#undef COUNTER
-
-#undef BEPERM
-
-#undef K0
-#undef K1
-#undef K2
-#undef K3
-
-
-#############################################################################
-# void chacha20_vx(u8 *out, counst u8 *inp, size_t len,
-# counst u32 *key, const u32 *counter)
-
-#define OUT %r2
-#define INP %r3
-#define LEN %r4
-#define KEY %r5
-#define COUNTER %r6
-
-#define BEPERM %v31
-
-#define K0 %v27
-#define K1 %v24
-#define K2 %v25
-#define K3 %v26
-
-#define A0 %v0
-#define B0 %v1
-#define C0 %v2
-#define D0 %v3
-
-#define A1 %v4
-#define B1 %v5
-#define C1 %v6
-#define D1 %v7
-
-#define A2 %v8
-#define B2 %v9
-#define C2 %v10
-#define D2 %v11
-
-#define A3 %v12
-#define B3 %v13
-#define C3 %v14
-#define D3 %v15
-
-#define A4 %v16
-#define B4 %v17
-#define C4 %v18
-#define D4 %v19
-
-#define A5 %v20
-#define B5 %v21
-#define C5 %v22
-#define D5 %v23
-
-#define T0 %v27
-#define T1 %v28
-#define T2 %v29
-#define T3 %v30
-
-SYM_FUNC_START(chacha20_vx)
- clgfi LEN,256
- jle chacha20_vx_4x
- stmg %r6,%r7,6*8(SP)
-
- lghi %r1,-FRAME
- lgr %r0,SP
- la SP,0(%r1,SP)
- stg %r0,0(SP) # back-chain
-
- larl %r7,sigma
- lhi %r0,10
-
- VLM K1,K2,0,KEY,0 # load key
- VL K3,0,,COUNTER # load counter
-
- VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ...
-
-.Loop_outer_vx:
- VLR A0,K0
- VLR B0,K1
- VLR A1,K0
- VLR B1,K1
- VLR A2,K0
- VLR B2,K1
- VLR A3,K0
- VLR B3,K1
- VLR A4,K0
- VLR B4,K1
- VLR A5,K0
- VLR B5,K1
-
- VLR D0,K3
- VAF D1,K3,T1 # K[3]+1
- VAF D2,K3,T2 # K[3]+2
- VAF D3,K3,T3 # K[3]+3
- VAF D4,D2,T2 # K[3]+4
- VAF D5,D2,T3 # K[3]+5
-
- VLR C0,K2
- VLR C1,K2
- VLR C2,K2
- VLR C3,K2
- VLR C4,K2
- VLR C5,K2
-
- VLR T1,D1
- VLR T2,D2
- VLR T3,D3
-
-.Loop_vx:
- VAF A0,A0,B0
- VAF A1,A1,B1
- VAF A2,A2,B2
- VAF A3,A3,B3
- VAF A4,A4,B4
- VAF A5,A5,B5
- VX D0,D0,A0
- VX D1,D1,A1
- VX D2,D2,A2
- VX D3,D3,A3
- VX D4,D4,A4
- VX D5,D5,A5
- VERLLF D0,D0,16
- VERLLF D1,D1,16
- VERLLF D2,D2,16
- VERLLF D3,D3,16
- VERLLF D4,D4,16
- VERLLF D5,D5,16
-
- VAF C0,C0,D0
- VAF C1,C1,D1
- VAF C2,C2,D2
- VAF C3,C3,D3
- VAF C4,C4,D4
- VAF C5,C5,D5
- VX B0,B0,C0
- VX B1,B1,C1
- VX B2,B2,C2
- VX B3,B3,C3
- VX B4,B4,C4
- VX B5,B5,C5
- VERLLF B0,B0,12
- VERLLF B1,B1,12
- VERLLF B2,B2,12
- VERLLF B3,B3,12
- VERLLF B4,B4,12
- VERLLF B5,B5,12
-
- VAF A0,A0,B0
- VAF A1,A1,B1
- VAF A2,A2,B2
- VAF A3,A3,B3
- VAF A4,A4,B4
- VAF A5,A5,B5
- VX D0,D0,A0
- VX D1,D1,A1
- VX D2,D2,A2
- VX D3,D3,A3
- VX D4,D4,A4
- VX D5,D5,A5
- VERLLF D0,D0,8
- VERLLF D1,D1,8
- VERLLF D2,D2,8
- VERLLF D3,D3,8
- VERLLF D4,D4,8
- VERLLF D5,D5,8
-
- VAF C0,C0,D0
- VAF C1,C1,D1
- VAF C2,C2,D2
- VAF C3,C3,D3
- VAF C4,C4,D4
- VAF C5,C5,D5
- VX B0,B0,C0
- VX B1,B1,C1
- VX B2,B2,C2
- VX B3,B3,C3
- VX B4,B4,C4
- VX B5,B5,C5
- VERLLF B0,B0,7
- VERLLF B1,B1,7
- VERLLF B2,B2,7
- VERLLF B3,B3,7
- VERLLF B4,B4,7
- VERLLF B5,B5,7
-
- VSLDB C0,C0,C0,8
- VSLDB C1,C1,C1,8
- VSLDB C2,C2,C2,8
- VSLDB C3,C3,C3,8
- VSLDB C4,C4,C4,8
- VSLDB C5,C5,C5,8
- VSLDB B0,B0,B0,4
- VSLDB B1,B1,B1,4
- VSLDB B2,B2,B2,4
- VSLDB B3,B3,B3,4
- VSLDB B4,B4,B4,4
- VSLDB B5,B5,B5,4
- VSLDB D0,D0,D0,12
- VSLDB D1,D1,D1,12
- VSLDB D2,D2,D2,12
- VSLDB D3,D3,D3,12
- VSLDB D4,D4,D4,12
- VSLDB D5,D5,D5,12
-
- VAF A0,A0,B0
- VAF A1,A1,B1
- VAF A2,A2,B2
- VAF A3,A3,B3
- VAF A4,A4,B4
- VAF A5,A5,B5
- VX D0,D0,A0
- VX D1,D1,A1
- VX D2,D2,A2
- VX D3,D3,A3
- VX D4,D4,A4
- VX D5,D5,A5
- VERLLF D0,D0,16
- VERLLF D1,D1,16
- VERLLF D2,D2,16
- VERLLF D3,D3,16
- VERLLF D4,D4,16
- VERLLF D5,D5,16
-
- VAF C0,C0,D0
- VAF C1,C1,D1
- VAF C2,C2,D2
- VAF C3,C3,D3
- VAF C4,C4,D4
- VAF C5,C5,D5
- VX B0,B0,C0
- VX B1,B1,C1
- VX B2,B2,C2
- VX B3,B3,C3
- VX B4,B4,C4
- VX B5,B5,C5
- VERLLF B0,B0,12
- VERLLF B1,B1,12
- VERLLF B2,B2,12
- VERLLF B3,B3,12
- VERLLF B4,B4,12
- VERLLF B5,B5,12
-
- VAF A0,A0,B0
- VAF A1,A1,B1
- VAF A2,A2,B2
- VAF A3,A3,B3
- VAF A4,A4,B4
- VAF A5,A5,B5
- VX D0,D0,A0
- VX D1,D1,A1
- VX D2,D2,A2
- VX D3,D3,A3
- VX D4,D4,A4
- VX D5,D5,A5
- VERLLF D0,D0,8
- VERLLF D1,D1,8
- VERLLF D2,D2,8
- VERLLF D3,D3,8
- VERLLF D4,D4,8
- VERLLF D5,D5,8
-
- VAF C0,C0,D0
- VAF C1,C1,D1
- VAF C2,C2,D2
- VAF C3,C3,D3
- VAF C4,C4,D4
- VAF C5,C5,D5
- VX B0,B0,C0
- VX B1,B1,C1
- VX B2,B2,C2
- VX B3,B3,C3
- VX B4,B4,C4
- VX B5,B5,C5
- VERLLF B0,B0,7
- VERLLF B1,B1,7
- VERLLF B2,B2,7
- VERLLF B3,B3,7
- VERLLF B4,B4,7
- VERLLF B5,B5,7
-
- VSLDB C0,C0,C0,8
- VSLDB C1,C1,C1,8
- VSLDB C2,C2,C2,8
- VSLDB C3,C3,C3,8
- VSLDB C4,C4,C4,8
- VSLDB C5,C5,C5,8
- VSLDB B0,B0,B0,12
- VSLDB B1,B1,B1,12
- VSLDB B2,B2,B2,12
- VSLDB B3,B3,B3,12
- VSLDB B4,B4,B4,12
- VSLDB B5,B5,B5,12
- VSLDB D0,D0,D0,4
- VSLDB D1,D1,D1,4
- VSLDB D2,D2,D2,4
- VSLDB D3,D3,D3,4
- VSLDB D4,D4,D4,4
- VSLDB D5,D5,D5,4
- brct %r0,.Loop_vx
-
- VAF A0,A0,K0
- VAF B0,B0,K1
- VAF C0,C0,K2
- VAF D0,D0,K3
- VAF A1,A1,K0
- VAF D1,D1,T1 # +K[3]+1
-
- VPERM A0,A0,A0,BEPERM
- VPERM B0,B0,B0,BEPERM
- VPERM C0,C0,C0,BEPERM
- VPERM D0,D0,D0,BEPERM
-
- clgfi LEN,0x40
- jl .Ltail_vx
-
- VAF D2,D2,T2 # +K[3]+2
- VAF D3,D3,T3 # +K[3]+3
- VLM T0,T3,0,INP,0
-
- VX A0,A0,T0
- VX B0,B0,T1
- VX C0,C0,T2
- VX D0,D0,T3
-
- VLM K0,T3,0,%r7,4 # re-load sigma and increments
-
- VSTM A0,D0,0,OUT,0
-
- la INP,0x40(INP)
- la OUT,0x40(OUT)
- aghi LEN,-0x40
- je .Ldone_vx
-
- VAF B1,B1,K1
- VAF C1,C1,K2
-
- VPERM A0,A1,A1,BEPERM
- VPERM B0,B1,B1,BEPERM
- VPERM C0,C1,C1,BEPERM
- VPERM D0,D1,D1,BEPERM
-
- clgfi LEN,0x40
- jl .Ltail_vx
-
- VLM A1,D1,0,INP,0
-
- VX A0,A0,A1
- VX B0,B0,B1
- VX C0,C0,C1
- VX D0,D0,D1
-
- VSTM A0,D0,0,OUT,0
-
- la INP,0x40(INP)
- la OUT,0x40(OUT)
- aghi LEN,-0x40
- je .Ldone_vx
-
- VAF A2,A2,K0
- VAF B2,B2,K1
- VAF C2,C2,K2
-
- VPERM A0,A2,A2,BEPERM
- VPERM B0,B2,B2,BEPERM
- VPERM C0,C2,C2,BEPERM
- VPERM D0,D2,D2,BEPERM
-
- clgfi LEN,0x40
- jl .Ltail_vx
-
- VLM A1,D1,0,INP,0
-
- VX A0,A0,A1
- VX B0,B0,B1
- VX C0,C0,C1
- VX D0,D0,D1
-
- VSTM A0,D0,0,OUT,0
-
- la INP,0x40(INP)
- la OUT,0x40(OUT)
- aghi LEN,-0x40
- je .Ldone_vx
-
- VAF A3,A3,K0
- VAF B3,B3,K1
- VAF C3,C3,K2
- VAF D2,K3,T3 # K[3]+3
-
- VPERM A0,A3,A3,BEPERM
- VPERM B0,B3,B3,BEPERM
- VPERM C0,C3,C3,BEPERM
- VPERM D0,D3,D3,BEPERM
-
- clgfi LEN,0x40
- jl .Ltail_vx
-
- VAF D3,D2,T1 # K[3]+4
- VLM A1,D1,0,INP,0
-
- VX A0,A0,A1
- VX B0,B0,B1
- VX C0,C0,C1
- VX D0,D0,D1
-
- VSTM A0,D0,0,OUT,0
-
- la INP,0x40(INP)
- la OUT,0x40(OUT)
- aghi LEN,-0x40
- je .Ldone_vx
-
- VAF A4,A4,K0
- VAF B4,B4,K1
- VAF C4,C4,K2
- VAF D4,D4,D3 # +K[3]+4
- VAF D3,D3,T1 # K[3]+5
- VAF K3,D2,T3 # K[3]+=6
-
- VPERM A0,A4,A4,BEPERM
- VPERM B0,B4,B4,BEPERM
- VPERM C0,C4,C4,BEPERM
- VPERM D0,D4,D4,BEPERM
-
- clgfi LEN,0x40
- jl .Ltail_vx
-
- VLM A1,D1,0,INP,0
-
- VX A0,A0,A1
- VX B0,B0,B1
- VX C0,C0,C1
- VX D0,D0,D1
-
- VSTM A0,D0,0,OUT,0
-
- la INP,0x40(INP)
- la OUT,0x40(OUT)
- aghi LEN,-0x40
- je .Ldone_vx
-
- VAF A5,A5,K0
- VAF B5,B5,K1
- VAF C5,C5,K2
- VAF D5,D5,D3 # +K[3]+5
-
- VPERM A0,A5,A5,BEPERM
- VPERM B0,B5,B5,BEPERM
- VPERM C0,C5,C5,BEPERM
- VPERM D0,D5,D5,BEPERM
-
- clgfi LEN,0x40
- jl .Ltail_vx
-
- VLM A1,D1,0,INP,0
-
- VX A0,A0,A1
- VX B0,B0,B1
- VX C0,C0,C1
- VX D0,D0,D1
-
- VSTM A0,D0,0,OUT,0
-
- la INP,0x40(INP)
- la OUT,0x40(OUT)
- lhi %r0,10
- aghi LEN,-0x40
- jne .Loop_outer_vx
-
-.Ldone_vx:
- lmg %r6,%r7,FRAME+6*8(SP)
- la SP,FRAME(SP)
- BR_EX %r14
-
-.Ltail_vx:
- VSTM A0,D0,8*8,SP,3
- lghi %r1,0
-
-.Loop_tail_vx:
- llgc %r5,0(%r1,INP)
- llgc %r6,8*8(%r1,SP)
- xr %r6,%r5
- stc %r6,0(%r1,OUT)
- la %r1,1(%r1)
- brct LEN,.Loop_tail_vx
-
- lmg %r6,%r7,FRAME+6*8(SP)
- la SP,FRAME(SP)
- BR_EX %r14
-SYM_FUNC_END(chacha20_vx)
-
-.previous
diff --git a/arch/s390/lib/crypto/chacha-s390.h b/arch/s390/lib/crypto/chacha-s390.h
deleted file mode 100644
index 733744ce30f5..000000000000
--- a/arch/s390/lib/crypto/chacha-s390.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * s390 ChaCha stream cipher.
- *
- * Copyright IBM Corp. 2021
- */
-
-#ifndef _CHACHA_S390_H
-#define _CHACHA_S390_H
-
-void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key,
- const u32 *counter);
-
-#endif /* _CHACHA_S390_H */
diff --git a/arch/s390/lib/crypto/sha256.c b/arch/s390/lib/crypto/sha256.c
deleted file mode 100644
index 7dfe120fafab..000000000000
--- a/arch/s390/lib/crypto/sha256.c
+++ /dev/null
@@ -1,47 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF)
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/cpacf.h>
-#include <crypto/internal/sha2.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256);
-
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- if (static_branch_likely(&have_cpacf_sha256))
- cpacf_kimd(CPACF_KIMD_SHA_256, state, data,
- nblocks * SHA256_BLOCK_SIZE);
- else
- sha256_blocks_generic(state, data, nblocks);
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-bool sha256_is_arch_optimized(void)
-{
- return static_key_enabled(&have_cpacf_sha256);
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init sha256_s390_mod_init(void)
-{
- if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
- cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256))
- static_branch_enable(&have_cpacf_sha256);
- return 0;
-}
-subsys_initcall(sha256_s390_mod_init);
-
-static void __exit sha256_s390_mod_exit(void)
-{
-}
-module_exit(sha256_s390_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 using the CP Assist for Cryptographic Functions (CPACF)");
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 074bf4fb4ce2..e4953453d254 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -142,7 +142,7 @@ bool force_dma_unencrypted(struct device *dev)
}
/* protected virtualization */
-static void pv_init(void)
+static void __init pv_init(void)
{
if (!is_prot_virt_guest())
return;
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index c7f8313ba449..0c9a35782c83 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -566,7 +566,15 @@ static void bpf_jit_plt(struct bpf_plt *plt, void *ret, void *target)
{
memcpy(plt, &bpf_plt, sizeof(*plt));
plt->ret = ret;
- plt->target = target;
+ /*
+ * (target == NULL) implies that the branch to this PLT entry was
+ * patched and became a no-op. However, some CPU could have jumped
+ * to this PLT entry before patching and may be still executing it.
+ *
+ * Since the intention in this case is to make the PLT entry a no-op,
+ * make the target point to the return label instead of NULL.
+ */
+ plt->target = target ?: ret;
}
/*
diff --git a/arch/s390/purgatory/purgatory.c b/arch/s390/purgatory/purgatory.c
index 030efda05dbe..ecb38102187c 100644
--- a/arch/s390/purgatory/purgatory.c
+++ b/arch/s390/purgatory/purgatory.c
@@ -16,7 +16,7 @@ int verify_sha256_digest(void)
{
struct kexec_sha_region *ptr, *end;
u8 digest[SHA256_DIGEST_SIZE];
- struct sha256_state sctx;
+ struct sha256_ctx sctx;
sha256_init(&sctx);
end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions);
diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index 36f50ad81e83..06f765d71a29 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -291,7 +291,7 @@ static const struct user_regset sh_regsets[] = {
* PC, PR, SR, GBR, MACH, MACL, TRA
*/
[REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = ELF_NGREG,
.size = sizeof(long),
.align = sizeof(long),
@@ -301,7 +301,7 @@ static const struct user_regset sh_regsets[] = {
#ifdef CONFIG_SH_FPU
[REGSET_FPU] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_fpu_struct) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 52a7652fcff6..5e9c9eff5539 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -471,3 +471,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 0f88123925a4..dcfdb7f1dae9 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -110,7 +110,6 @@ config SPARC64
select HAVE_SETUP_PER_CPU_AREA
select NEED_PER_CPU_EMBED_FIRST_CHUNK
select NEED_PER_CPU_PAGE_FIRST_CHUNK
- select ARCH_HAS_CRC32
config ARCH_PROC_KCORE_TEXT
def_bool y
diff --git a/arch/sparc/crypto/Kconfig b/arch/sparc/crypto/Kconfig
index a6ba319c42dc..f5b2e720fec3 100644
--- a/arch/sparc/crypto/Kconfig
+++ b/arch/sparc/crypto/Kconfig
@@ -26,26 +26,6 @@ config CRYPTO_MD5_SPARC64
Architecture: sparc64 using crypto instructions, when available
-config CRYPTO_SHA1_SPARC64
- tristate "Hash functions: SHA-1"
- depends on SPARC64
- select CRYPTO_SHA1
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: sparc64
-
-config CRYPTO_SHA512_SPARC64
- tristate "Hash functions: SHA-384 and SHA-512"
- depends on SPARC64
- select CRYPTO_SHA512
- select CRYPTO_HASH
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: sparc64 using crypto instructions, when available
-
config CRYPTO_AES_SPARC64
tristate "Ciphers: AES, modes: ECB, CBC, CTR"
depends on SPARC64
diff --git a/arch/sparc/crypto/Makefile b/arch/sparc/crypto/Makefile
index 701c39edb0d7..0d05a17988c4 100644
--- a/arch/sparc/crypto/Makefile
+++ b/arch/sparc/crypto/Makefile
@@ -3,16 +3,12 @@
# Arch-specific CryptoAPI modules.
#
-obj-$(CONFIG_CRYPTO_SHA1_SPARC64) += sha1-sparc64.o
-obj-$(CONFIG_CRYPTO_SHA512_SPARC64) += sha512-sparc64.o
obj-$(CONFIG_CRYPTO_MD5_SPARC64) += md5-sparc64.o
obj-$(CONFIG_CRYPTO_AES_SPARC64) += aes-sparc64.o
obj-$(CONFIG_CRYPTO_DES_SPARC64) += des-sparc64.o
obj-$(CONFIG_CRYPTO_CAMELLIA_SPARC64) += camellia-sparc64.o
-sha1-sparc64-y := sha1_asm.o sha1_glue.o
-sha512-sparc64-y := sha512_asm.o sha512_glue.o
md5-sparc64-y := md5_asm.o md5_glue.o
aes-sparc64-y := aes_asm.o aes_glue.o
diff --git a/arch/sparc/crypto/sha1_asm.S b/arch/sparc/crypto/sha1_asm.S
deleted file mode 100644
index 00b46bac1b08..000000000000
--- a/arch/sparc/crypto/sha1_asm.S
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/linkage.h>
-#include <asm/opcodes.h>
-#include <asm/visasm.h>
-
-ENTRY(sha1_sparc64_transform)
- /* %o0 = digest, %o1 = data, %o2 = rounds */
- VISEntryHalf
- ld [%o0 + 0x00], %f0
- ld [%o0 + 0x04], %f1
- ld [%o0 + 0x08], %f2
- andcc %o1, 0x7, %g0
- ld [%o0 + 0x0c], %f3
- bne,pn %xcc, 10f
- ld [%o0 + 0x10], %f4
-
-1:
- ldd [%o1 + 0x00], %f8
- ldd [%o1 + 0x08], %f10
- ldd [%o1 + 0x10], %f12
- ldd [%o1 + 0x18], %f14
- ldd [%o1 + 0x20], %f16
- ldd [%o1 + 0x28], %f18
- ldd [%o1 + 0x30], %f20
- ldd [%o1 + 0x38], %f22
-
- SHA1
-
- subcc %o2, 1, %o2
- bne,pt %xcc, 1b
- add %o1, 0x40, %o1
-
-5:
- st %f0, [%o0 + 0x00]
- st %f1, [%o0 + 0x04]
- st %f2, [%o0 + 0x08]
- st %f3, [%o0 + 0x0c]
- st %f4, [%o0 + 0x10]
- retl
- VISExitHalf
-10:
- alignaddr %o1, %g0, %o1
-
- ldd [%o1 + 0x00], %f10
-1:
- ldd [%o1 + 0x08], %f12
- ldd [%o1 + 0x10], %f14
- ldd [%o1 + 0x18], %f16
- ldd [%o1 + 0x20], %f18
- ldd [%o1 + 0x28], %f20
- ldd [%o1 + 0x30], %f22
- ldd [%o1 + 0x38], %f24
- ldd [%o1 + 0x40], %f26
-
- faligndata %f10, %f12, %f8
- faligndata %f12, %f14, %f10
- faligndata %f14, %f16, %f12
- faligndata %f16, %f18, %f14
- faligndata %f18, %f20, %f16
- faligndata %f20, %f22, %f18
- faligndata %f22, %f24, %f20
- faligndata %f24, %f26, %f22
-
- SHA1
-
- subcc %o2, 1, %o2
- fsrc2 %f26, %f10
- bne,pt %xcc, 1b
- add %o1, 0x40, %o1
-
- ba,a,pt %xcc, 5b
-ENDPROC(sha1_sparc64_transform)
diff --git a/arch/sparc/crypto/sha1_glue.c b/arch/sparc/crypto/sha1_glue.c
deleted file mode 100644
index ef19d5023b1b..000000000000
--- a/arch/sparc/crypto/sha1_glue.c
+++ /dev/null
@@ -1,94 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* Glue code for SHA1 hashing optimized for sparc64 crypto opcodes.
- *
- * This is based largely upon arch/x86/crypto/sha1_ssse3_glue.c
- *
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- * Copyright (c) Mathias Krause <minipli@googlemail.com>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <asm/elf.h>
-#include <asm/opcodes.h>
-#include <asm/pstate.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha1_sparc64_transform(struct sha1_state *digest,
- const u8 *data, int rounds);
-
-static int sha1_sparc64_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_base_do_update_blocks(desc, data, len,
- sha1_sparc64_transform);
-}
-
-/* Add padding and return the message digest. */
-static int sha1_sparc64_finup(struct shash_desc *desc, const u8 *src,
- unsigned int len, u8 *out)
-{
- sha1_base_do_finup(desc, src, len, sha1_sparc64_transform);
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_sparc64_update,
- .finup = sha1_sparc64_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name= "sha1-sparc64",
- .cra_priority = SPARC_CR_OPCODE_PRIORITY,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static bool __init sparc64_has_sha1_opcode(void)
-{
- unsigned long cfr;
-
- if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
- return false;
-
- __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
- if (!(cfr & CFR_SHA1))
- return false;
-
- return true;
-}
-
-static int __init sha1_sparc64_mod_init(void)
-{
- if (sparc64_has_sha1_opcode()) {
- pr_info("Using sparc64 sha1 opcode optimized SHA-1 implementation\n");
- return crypto_register_shash(&alg);
- }
- pr_info("sparc64 sha1 opcode not available.\n");
- return -ENODEV;
-}
-
-static void __exit sha1_sparc64_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_init(sha1_sparc64_mod_init);
-module_exit(sha1_sparc64_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, sparc64 sha1 opcode accelerated");
-
-MODULE_ALIAS_CRYPTO("sha1");
-
-#include "crop_devid.c"
diff --git a/arch/sparc/crypto/sha512_asm.S b/arch/sparc/crypto/sha512_asm.S
deleted file mode 100644
index 9932b4fe1b59..000000000000
--- a/arch/sparc/crypto/sha512_asm.S
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/linkage.h>
-#include <asm/opcodes.h>
-#include <asm/visasm.h>
-
-ENTRY(sha512_sparc64_transform)
- /* %o0 = digest, %o1 = data, %o2 = rounds */
- VISEntry
- ldd [%o0 + 0x00], %f0
- ldd [%o0 + 0x08], %f2
- ldd [%o0 + 0x10], %f4
- ldd [%o0 + 0x18], %f6
- ldd [%o0 + 0x20], %f8
- ldd [%o0 + 0x28], %f10
- andcc %o1, 0x7, %g0
- ldd [%o0 + 0x30], %f12
- bne,pn %xcc, 10f
- ldd [%o0 + 0x38], %f14
-
-1:
- ldd [%o1 + 0x00], %f16
- ldd [%o1 + 0x08], %f18
- ldd [%o1 + 0x10], %f20
- ldd [%o1 + 0x18], %f22
- ldd [%o1 + 0x20], %f24
- ldd [%o1 + 0x28], %f26
- ldd [%o1 + 0x30], %f28
- ldd [%o1 + 0x38], %f30
- ldd [%o1 + 0x40], %f32
- ldd [%o1 + 0x48], %f34
- ldd [%o1 + 0x50], %f36
- ldd [%o1 + 0x58], %f38
- ldd [%o1 + 0x60], %f40
- ldd [%o1 + 0x68], %f42
- ldd [%o1 + 0x70], %f44
- ldd [%o1 + 0x78], %f46
-
- SHA512
-
- subcc %o2, 1, %o2
- bne,pt %xcc, 1b
- add %o1, 0x80, %o1
-
-5:
- std %f0, [%o0 + 0x00]
- std %f2, [%o0 + 0x08]
- std %f4, [%o0 + 0x10]
- std %f6, [%o0 + 0x18]
- std %f8, [%o0 + 0x20]
- std %f10, [%o0 + 0x28]
- std %f12, [%o0 + 0x30]
- std %f14, [%o0 + 0x38]
- retl
- VISExit
-10:
- alignaddr %o1, %g0, %o1
-
- ldd [%o1 + 0x00], %f18
-1:
- ldd [%o1 + 0x08], %f20
- ldd [%o1 + 0x10], %f22
- ldd [%o1 + 0x18], %f24
- ldd [%o1 + 0x20], %f26
- ldd [%o1 + 0x28], %f28
- ldd [%o1 + 0x30], %f30
- ldd [%o1 + 0x38], %f32
- ldd [%o1 + 0x40], %f34
- ldd [%o1 + 0x48], %f36
- ldd [%o1 + 0x50], %f38
- ldd [%o1 + 0x58], %f40
- ldd [%o1 + 0x60], %f42
- ldd [%o1 + 0x68], %f44
- ldd [%o1 + 0x70], %f46
- ldd [%o1 + 0x78], %f48
- ldd [%o1 + 0x80], %f50
-
- faligndata %f18, %f20, %f16
- faligndata %f20, %f22, %f18
- faligndata %f22, %f24, %f20
- faligndata %f24, %f26, %f22
- faligndata %f26, %f28, %f24
- faligndata %f28, %f30, %f26
- faligndata %f30, %f32, %f28
- faligndata %f32, %f34, %f30
- faligndata %f34, %f36, %f32
- faligndata %f36, %f38, %f34
- faligndata %f38, %f40, %f36
- faligndata %f40, %f42, %f38
- faligndata %f42, %f44, %f40
- faligndata %f44, %f46, %f42
- faligndata %f46, %f48, %f44
- faligndata %f48, %f50, %f46
-
- SHA512
-
- subcc %o2, 1, %o2
- fsrc2 %f50, %f18
- bne,pt %xcc, 1b
- add %o1, 0x80, %o1
-
- ba,a,pt %xcc, 5b
-ENDPROC(sha512_sparc64_transform)
diff --git a/arch/sparc/crypto/sha512_glue.c b/arch/sparc/crypto/sha512_glue.c
deleted file mode 100644
index 47b9277b6877..000000000000
--- a/arch/sparc/crypto/sha512_glue.c
+++ /dev/null
@@ -1,122 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* Glue code for SHA512 hashing optimized for sparc64 crypto opcodes.
- *
- * This is based largely upon crypto/sha512_generic.c
- *
- * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <asm/elf.h>
-#include <asm/opcodes.h>
-#include <asm/pstate.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha512_sparc64_transform(u64 *digest, const char *data,
- unsigned int rounds);
-
-static void sha512_block(struct sha512_state *sctx, const u8 *src, int blocks)
-{
- sha512_sparc64_transform(sctx->state, src, blocks);
-}
-
-static int sha512_sparc64_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_base_do_update_blocks(desc, data, len, sha512_block);
-}
-
-static int sha512_sparc64_finup(struct shash_desc *desc, const u8 *src,
- unsigned int len, u8 *out)
-{
- sha512_base_do_finup(desc, src, len, sha512_block);
- return sha512_base_finish(desc, out);
-}
-
-static struct shash_alg sha512 = {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_sparc64_update,
- .finup = sha512_sparc64_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name= "sha512-sparc64",
- .cra_priority = SPARC_CR_OPCODE_PRIORITY,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static struct shash_alg sha384 = {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_sparc64_update,
- .finup = sha512_sparc64_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name= "sha384-sparc64",
- .cra_priority = SPARC_CR_OPCODE_PRIORITY,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static bool __init sparc64_has_sha512_opcode(void)
-{
- unsigned long cfr;
-
- if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
- return false;
-
- __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
- if (!(cfr & CFR_SHA512))
- return false;
-
- return true;
-}
-
-static int __init sha512_sparc64_mod_init(void)
-{
- if (sparc64_has_sha512_opcode()) {
- int ret = crypto_register_shash(&sha384);
- if (ret < 0)
- return ret;
-
- ret = crypto_register_shash(&sha512);
- if (ret < 0) {
- crypto_unregister_shash(&sha384);
- return ret;
- }
-
- pr_info("Using sparc64 sha512 opcode optimized SHA-512/SHA-384 implementation\n");
- return 0;
- }
- pr_info("sparc64 sha512 opcode not available.\n");
- return -ENODEV;
-}
-
-static void __exit sha512_sparc64_mod_fini(void)
-{
- crypto_unregister_shash(&sha384);
- crypto_unregister_shash(&sha512);
-}
-
-module_init(sha512_sparc64_mod_init);
-module_exit(sha512_sparc64_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-384 and SHA-512 Secure Hash Algorithm, sparc64 sha512 opcode accelerated");
-
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha512");
-
-#include "crop_devid.c"
diff --git a/arch/sparc/kernel/ptrace_32.c b/arch/sparc/kernel/ptrace_32.c
index c273ccebea46..c56333975fb1 100644
--- a/arch/sparc/kernel/ptrace_32.c
+++ b/arch/sparc/kernel/ptrace_32.c
@@ -218,7 +218,7 @@ static const struct user_regset sparc32_regsets[] = {
* PSR, PC, nPC, Y, WIM, TBR
*/
[REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = 38,
.size = sizeof(u32), .align = sizeof(u32),
.regset_get = genregs32_get, .set = genregs32_set
@@ -234,7 +234,7 @@ static const struct user_regset sparc32_regsets[] = {
* FPU QUEUE (64 32-bit ints)
*/
[REGSET_FP] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = 99,
.size = sizeof(u32), .align = sizeof(u32),
.regset_get = fpregs32_get, .set = fpregs32_set
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c
index 4deba5b6eddb..9fc67fa9336f 100644
--- a/arch/sparc/kernel/ptrace_64.c
+++ b/arch/sparc/kernel/ptrace_64.c
@@ -420,7 +420,7 @@ static const struct user_regset sparc64_regsets[] = {
* TSTATE, TPC, TNPC, Y
*/
[REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = 36,
.size = sizeof(u64), .align = sizeof(u64),
.regset_get = genregs64_get, .set = genregs64_set
@@ -432,7 +432,7 @@ static const struct user_regset sparc64_regsets[] = {
* FPRS
*/
[REGSET_FP] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = 35,
.size = sizeof(u64), .align = sizeof(u64),
.regset_get = fpregs64_get, .set = fpregs64_set
@@ -750,7 +750,7 @@ static const struct user_regset sparc32_regsets[] = {
* PSR, PC, nPC, Y, WIM, TBR
*/
[REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = 38,
.size = sizeof(u32), .align = sizeof(u32),
.regset_get = genregs32_get, .set = genregs32_set
@@ -766,7 +766,7 @@ static const struct user_regset sparc32_regsets[] = {
* FPU QUEUE (64 32-bit ints)
*/
[REGSET_FP] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = 99,
.size = sizeof(u32), .align = sizeof(u32),
.regset_get = fpregs32_get, .set = fpregs32_set
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 83e45eb6c095..ebb7d06d1044 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -513,3 +513,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile
index 5cf9781d68b4..ee5091dd67ed 100644
--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@@ -4,7 +4,6 @@
asflags-y := -ansi -DST_DIV0=0x02
-obj-y += crypto/
lib-$(CONFIG_SPARC32) += ashrdi3.o
lib-$(CONFIG_SPARC32) += memcpy.o memset.o
lib-y += strlen.o
@@ -54,5 +53,3 @@ lib-$(CONFIG_SPARC64) += mcount.o ipcsum.o xor.o hweight.o ffs.o
obj-$(CONFIG_SPARC64) += iomap.o
obj-$(CONFIG_SPARC32) += atomic32.o
obj-$(CONFIG_SPARC64) += PeeCeeI.o
-obj-$(CONFIG_CRC32_ARCH) += crc32-sparc.o
-crc32-sparc-y := crc32.o crc32c_asm.o
diff --git a/arch/sparc/lib/crc32.c b/arch/sparc/lib/crc32.c
deleted file mode 100644
index 40d4720a42a1..000000000000
--- a/arch/sparc/lib/crc32.c
+++ /dev/null
@@ -1,93 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* CRC32c (Castagnoli), sparc64 crc32c opcode accelerated
- *
- * This is based largely upon arch/x86/crypto/crc32c-intel.c
- *
- * Copyright (C) 2008 Intel Corporation
- * Authors: Austin Zhang <austin_zhang@linux.intel.com>
- * Kent Liu <kent.liu@intel.com>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/crc32.h>
-#include <asm/pstate.h>
-#include <asm/elf.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32c_opcode);
-
-u32 crc32_le_arch(u32 crc, const u8 *data, size_t len)
-{
- return crc32_le_base(crc, data, len);
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-void crc32c_sparc64(u32 *crcp, const u64 *data, size_t len);
-
-u32 crc32c_arch(u32 crc, const u8 *data, size_t len)
-{
- size_t n = -(uintptr_t)data & 7;
-
- if (!static_branch_likely(&have_crc32c_opcode))
- return crc32c_base(crc, data, len);
-
- if (n) {
- /* Data isn't 8-byte aligned. Align it. */
- n = min(n, len);
- crc = crc32c_base(crc, data, n);
- data += n;
- len -= n;
- }
- n = len & ~7U;
- if (n) {
- crc32c_sparc64(&crc, (const u64 *)data, n);
- data += n;
- len -= n;
- }
- if (len)
- crc = crc32c_base(crc, data, len);
- return crc;
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *data, size_t len)
-{
- return crc32_be_base(crc, data, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-static int __init crc32_sparc_init(void)
-{
- unsigned long cfr;
-
- if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
- return 0;
-
- __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
- if (!(cfr & CFR_CRC32C))
- return 0;
-
- static_branch_enable(&have_crc32c_opcode);
- pr_info("Using sparc64 crc32c opcode optimized CRC32C implementation\n");
- return 0;
-}
-subsys_initcall(crc32_sparc_init);
-
-static void __exit crc32_sparc_exit(void)
-{
-}
-module_exit(crc32_sparc_exit);
-
-u32 crc32_optimizations(void)
-{
- if (static_key_enabled(&have_crc32c_opcode))
- return CRC32C_OPTIMIZATION;
- return 0;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("CRC32c (Castagnoli), sparc64 crc32c opcode accelerated");
diff --git a/arch/sparc/lib/crc32c_asm.S b/arch/sparc/lib/crc32c_asm.S
deleted file mode 100644
index 4db873850f44..000000000000
--- a/arch/sparc/lib/crc32c_asm.S
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/linkage.h>
-#include <asm/opcodes.h>
-#include <asm/visasm.h>
-#include <asm/asi.h>
-
-ENTRY(crc32c_sparc64)
- /* %o0=crc32p, %o1=data_ptr, %o2=len */
- VISEntryHalf
- lda [%o0] ASI_PL, %f1
-1: ldd [%o1], %f2
- CRC32C(0,2,0)
- subcc %o2, 8, %o2
- bne,pt %icc, 1b
- add %o1, 0x8, %o1
- sta %f1, [%o0] ASI_PL
- VISExitHalf
-2: retl
- nop
-ENDPROC(crc32c_sparc64)
diff --git a/arch/sparc/lib/crypto/Kconfig b/arch/sparc/lib/crypto/Kconfig
deleted file mode 100644
index e5c3e4d3dba6..000000000000
--- a/arch/sparc/lib/crypto/Kconfig
+++ /dev/null
@@ -1,8 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_SHA256_SPARC64
- tristate
- depends on SPARC64
- default CRYPTO_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256
- select CRYPTO_LIB_SHA256_GENERIC
diff --git a/arch/sparc/lib/crypto/Makefile b/arch/sparc/lib/crypto/Makefile
deleted file mode 100644
index 75ee244ad6f7..000000000000
--- a/arch/sparc/lib/crypto/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_SHA256_SPARC64) += sha256-sparc64.o
-sha256-sparc64-y := sha256.o sha256_asm.o
diff --git a/arch/sparc/lib/crypto/sha256.c b/arch/sparc/lib/crypto/sha256.c
deleted file mode 100644
index 8bdec2db08b3..000000000000
--- a/arch/sparc/lib/crypto/sha256.c
+++ /dev/null
@@ -1,64 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * SHA-256 accelerated using the sparc64 sha256 opcodes
- *
- * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
- * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <asm/elf.h>
-#include <asm/opcodes.h>
-#include <asm/pstate.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes);
-
-asmlinkage void sha256_sparc64_transform(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- if (static_branch_likely(&have_sha256_opcodes))
- sha256_sparc64_transform(state, data, nblocks);
- else
- sha256_blocks_generic(state, data, nblocks);
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-bool sha256_is_arch_optimized(void)
-{
- return static_key_enabled(&have_sha256_opcodes);
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init sha256_sparc64_mod_init(void)
-{
- unsigned long cfr;
-
- if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
- return 0;
-
- __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
- if (!(cfr & CFR_SHA256))
- return 0;
-
- static_branch_enable(&have_sha256_opcodes);
- pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n");
- return 0;
-}
-subsys_initcall(sha256_sparc64_mod_init);
-
-static void __exit sha256_sparc64_mod_exit(void)
-{
-}
-module_exit(sha256_sparc64_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 accelerated using the sparc64 sha256 opcodes");
diff --git a/arch/sparc/lib/crypto/sha256_asm.S b/arch/sparc/lib/crypto/sha256_asm.S
deleted file mode 100644
index ddcdd3daf31e..000000000000
--- a/arch/sparc/lib/crypto/sha256_asm.S
+++ /dev/null
@@ -1,78 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/linkage.h>
-#include <asm/opcodes.h>
-#include <asm/visasm.h>
-
-ENTRY(sha256_sparc64_transform)
- /* %o0 = state, %o1 = data, %o2 = nblocks */
- VISEntryHalf
- ld [%o0 + 0x00], %f0
- ld [%o0 + 0x04], %f1
- ld [%o0 + 0x08], %f2
- ld [%o0 + 0x0c], %f3
- ld [%o0 + 0x10], %f4
- ld [%o0 + 0x14], %f5
- andcc %o1, 0x7, %g0
- ld [%o0 + 0x18], %f6
- bne,pn %xcc, 10f
- ld [%o0 + 0x1c], %f7
-
-1:
- ldd [%o1 + 0x00], %f8
- ldd [%o1 + 0x08], %f10
- ldd [%o1 + 0x10], %f12
- ldd [%o1 + 0x18], %f14
- ldd [%o1 + 0x20], %f16
- ldd [%o1 + 0x28], %f18
- ldd [%o1 + 0x30], %f20
- ldd [%o1 + 0x38], %f22
-
- SHA256
-
- subcc %o2, 1, %o2
- bne,pt %xcc, 1b
- add %o1, 0x40, %o1
-
-5:
- st %f0, [%o0 + 0x00]
- st %f1, [%o0 + 0x04]
- st %f2, [%o0 + 0x08]
- st %f3, [%o0 + 0x0c]
- st %f4, [%o0 + 0x10]
- st %f5, [%o0 + 0x14]
- st %f6, [%o0 + 0x18]
- st %f7, [%o0 + 0x1c]
- retl
- VISExitHalf
-10:
- alignaddr %o1, %g0, %o1
-
- ldd [%o1 + 0x00], %f10
-1:
- ldd [%o1 + 0x08], %f12
- ldd [%o1 + 0x10], %f14
- ldd [%o1 + 0x18], %f16
- ldd [%o1 + 0x20], %f18
- ldd [%o1 + 0x28], %f20
- ldd [%o1 + 0x30], %f22
- ldd [%o1 + 0x38], %f24
- ldd [%o1 + 0x40], %f26
-
- faligndata %f10, %f12, %f8
- faligndata %f12, %f14, %f10
- faligndata %f14, %f16, %f12
- faligndata %f16, %f18, %f14
- faligndata %f18, %f20, %f16
- faligndata %f20, %f22, %f18
- faligndata %f22, %f24, %f20
- faligndata %f24, %f26, %f22
-
- SHA256
-
- subcc %o2, 1, %o2
- fsrc2 %f26, %f10
- bne,pt %xcc, 1b
- add %o1, 0x40, %o1
-
- ba,a,pt %xcc, 5b
-ENDPROC(sha256_sparc64_transform)
diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile
index fdc4a8f5a49c..683b2d408224 100644
--- a/arch/sparc/vdso/Makefile
+++ b/arch/sparc/vdso/Makefile
@@ -48,7 +48,7 @@ CFL := $(PROFILING) -mcmodel=medlow -fPIC -O2 -fasynchronous-unwind-tables -m64
SPARC_REG_CFLAGS = -ffixed-g4 -ffixed-g5 $(call cc-option,-fcall-used-g5) $(call cc-option,-fcall-used-g7)
-$(vobjs): KBUILD_CFLAGS := $(filter-out $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(SPARC_REG_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(SPARC_REG_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
#
# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
@@ -79,6 +79,7 @@ KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
KBUILD_CFLAGS_32 := $(filter-out -mcmodel=medlow,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(KSTACK_ERASE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(SPARC_REG_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 += -m32 -msoft-float -fpic
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index 04ab3b653a48..b6810db24ca4 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -15,7 +15,6 @@ generic-y += mcs_spinlock.h
generic-y += mmiowb.h
generic-y += module.h
generic-y += module.lds.h
-generic-y += param.h
generic-y += parport.h
generic-y += percpu.h
generic-y += preempt.h
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8bed9030ad47..edaab220d9c1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -79,9 +79,6 @@ config X86
select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
select ARCH_HAS_CPU_FINALIZE_INIT
select ARCH_HAS_CPU_PASID if IOMMU_SVA
- select ARCH_HAS_CRC32
- select ARCH_HAS_CRC64 if X86_64
- select ARCH_HAS_CRC_T10DIF
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
@@ -204,13 +201,13 @@ config X86
select HAVE_ARCH_KFENCE
select HAVE_ARCH_KMSAN if X86_64
select HAVE_ARCH_KGDB
+ select HAVE_ARCH_KSTACK_ERASE
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT
select HAVE_ARCH_PREL32_RELOCATIONS
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
- select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile
index db3255b979bd..342d79f0ab6a 100644
--- a/arch/x86/coco/sev/Makefile
+++ b/arch/x86/coco/sev/Makefile
@@ -5,5 +5,6 @@ obj-y += core.o sev-nmi.o vc-handle.o
# Clang 14 and older may fail to respect __no_sanitize_undefined when inlining
UBSAN_SANITIZE_sev-nmi.o := n
-# GCC may fail to respect __no_sanitize_address when inlining
+# GCC may fail to respect __no_sanitize_address or __no_kcsan when inlining
KASAN_SANITIZE_sev-nmi.o := n
+KCSAN_SANITIZE_sev-nmi.o := n
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 56cfdc79e2c6..94016c60561e 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -376,33 +376,6 @@ config CRYPTO_POLYVAL_CLMUL_NI
Architecture: x86_64 using:
- CLMUL-NI (carry-less multiplication new instructions)
-config CRYPTO_SHA1_SSSE3
- tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)"
- depends on 64BIT
- select CRYPTO_SHA1
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX (Advanced Vector Extensions)
- - AVX2 (Advanced Vector Extensions 2)
- - SHA-NI (SHA Extensions New Instructions)
-
-config CRYPTO_SHA512_SSSE3
- tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)"
- depends on 64BIT
- select CRYPTO_SHA512
- select CRYPTO_HASH
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX (Advanced Vector Extensions)
- - AVX2 (Advanced Vector Extensions 2)
-
config CRYPTO_SM3_AVX_X86_64
tristate "Hash functions: SM3 (AVX)"
depends on 64BIT
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index aa289a9e0153..d402963d6b57 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -51,12 +51,6 @@ ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy)
aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o
endif
-obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
-sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o
-
-obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
-sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
-
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
deleted file mode 100644
index 4b49bdc95265..000000000000
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ /dev/null
@@ -1,700 +0,0 @@
-/*
- * Implement fast SHA-1 with AVX2 instructions. (x86_64)
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Ilya Albrekht <ilya.albrekht@intel.com>
- * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
- * Ronen Zohar <ronen.zohar@intel.com>
- * Chandramouli Narayanan <mouli@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/*
- * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
- *
- *This implementation is based on the previous SSSE3 release:
- *Visit http://software.intel.com/en-us/articles/
- *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
- *
- *Updates 20-byte SHA-1 record at start of 'state', from 'input', for
- *even number of 'blocks' consecutive 64-byte blocks.
- *
- *extern "C" void sha1_transform_avx2(
- * struct sha1_state *state, const u8* input, int blocks );
- */
-
-#include <linux/linkage.h>
-
-#define CTX %rdi /* arg1 */
-#define BUF %rsi /* arg2 */
-#define CNT %rdx /* arg3 */
-
-#define REG_A %ecx
-#define REG_B %esi
-#define REG_C %edi
-#define REG_D %eax
-#define REG_E %edx
-#define REG_TB %ebx
-#define REG_TA %r12d
-#define REG_RA %rcx
-#define REG_RB %rsi
-#define REG_RC %rdi
-#define REG_RD %rax
-#define REG_RE %rdx
-#define REG_RTA %r12
-#define REG_RTB %rbx
-#define REG_T1 %r11d
-#define xmm_mov vmovups
-#define avx2_zeroupper vzeroupper
-#define RND_F1 1
-#define RND_F2 2
-#define RND_F3 3
-
-.macro REGALLOC
- .set A, REG_A
- .set B, REG_B
- .set C, REG_C
- .set D, REG_D
- .set E, REG_E
- .set TB, REG_TB
- .set TA, REG_TA
-
- .set RA, REG_RA
- .set RB, REG_RB
- .set RC, REG_RC
- .set RD, REG_RD
- .set RE, REG_RE
-
- .set RTA, REG_RTA
- .set RTB, REG_RTB
-
- .set T1, REG_T1
-.endm
-
-#define HASH_PTR %r9
-#define BLOCKS_CTR %r8
-#define BUFFER_PTR %r10
-#define BUFFER_PTR2 %r13
-
-#define PRECALC_BUF %r14
-#define WK_BUF %r15
-
-#define W_TMP %xmm0
-#define WY_TMP %ymm0
-#define WY_TMP2 %ymm9
-
-# AVX2 variables
-#define WY0 %ymm3
-#define WY4 %ymm5
-#define WY08 %ymm7
-#define WY12 %ymm8
-#define WY16 %ymm12
-#define WY20 %ymm13
-#define WY24 %ymm14
-#define WY28 %ymm15
-
-#define YMM_SHUFB_BSWAP %ymm10
-
-/*
- * Keep 2 iterations precalculated at a time:
- * - 80 DWORDs per iteration * 2
- */
-#define W_SIZE (80*2*2 +16)
-
-#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
-#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
-
-
-.macro UPDATE_HASH hash, val
- add \hash, \val
- mov \val, \hash
-.endm
-
-.macro PRECALC_RESET_WY
- .set WY_00, WY0
- .set WY_04, WY4
- .set WY_08, WY08
- .set WY_12, WY12
- .set WY_16, WY16
- .set WY_20, WY20
- .set WY_24, WY24
- .set WY_28, WY28
- .set WY_32, WY_00
-.endm
-
-.macro PRECALC_ROTATE_WY
- /* Rotate macros */
- .set WY_32, WY_28
- .set WY_28, WY_24
- .set WY_24, WY_20
- .set WY_20, WY_16
- .set WY_16, WY_12
- .set WY_12, WY_08
- .set WY_08, WY_04
- .set WY_04, WY_00
- .set WY_00, WY_32
-
- /* Define register aliases */
- .set WY, WY_00
- .set WY_minus_04, WY_04
- .set WY_minus_08, WY_08
- .set WY_minus_12, WY_12
- .set WY_minus_16, WY_16
- .set WY_minus_20, WY_20
- .set WY_minus_24, WY_24
- .set WY_minus_28, WY_28
- .set WY_minus_32, WY
-.endm
-
-.macro PRECALC_00_15
- .if (i == 0) # Initialize and rotate registers
- PRECALC_RESET_WY
- PRECALC_ROTATE_WY
- .endif
-
- /* message scheduling pre-compute for rounds 0-15 */
- .if ((i & 7) == 0)
- /*
- * blended AVX2 and ALU instruction scheduling
- * 1 vector iteration per 8 rounds
- */
- vmovdqu (i * 2)(BUFFER_PTR), W_TMP
- .elseif ((i & 7) == 1)
- vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
- WY_TMP, WY_TMP
- .elseif ((i & 7) == 2)
- vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
- .elseif ((i & 7) == 4)
- vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
- .elseif ((i & 7) == 7)
- vmovdqu WY_TMP, PRECALC_WK(i&~7)
-
- PRECALC_ROTATE_WY
- .endif
-.endm
-
-.macro PRECALC_16_31
- /*
- * message scheduling pre-compute for rounds 16-31
- * calculating last 32 w[i] values in 8 XMM registers
- * pre-calculate K+w[i] values and store to mem
- * for later load by ALU add instruction
- *
- * "brute force" vectorization for rounds 16-31 only
- * due to w[i]->w[i-3] dependency
- */
- .if ((i & 7) == 0)
- /*
- * blended AVX2 and ALU instruction scheduling
- * 1 vector iteration per 8 rounds
- */
- /* w[i-14] */
- vpalignr $8, WY_minus_16, WY_minus_12, WY
- vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
- .elseif ((i & 7) == 1)
- vpxor WY_minus_08, WY, WY
- vpxor WY_minus_16, WY_TMP, WY_TMP
- .elseif ((i & 7) == 2)
- vpxor WY_TMP, WY, WY
- vpslldq $12, WY, WY_TMP2
- .elseif ((i & 7) == 3)
- vpslld $1, WY, WY_TMP
- vpsrld $31, WY, WY
- .elseif ((i & 7) == 4)
- vpor WY, WY_TMP, WY_TMP
- vpslld $2, WY_TMP2, WY
- .elseif ((i & 7) == 5)
- vpsrld $30, WY_TMP2, WY_TMP2
- vpxor WY, WY_TMP, WY_TMP
- .elseif ((i & 7) == 7)
- vpxor WY_TMP2, WY_TMP, WY
- vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
- vmovdqu WY_TMP, PRECALC_WK(i&~7)
-
- PRECALC_ROTATE_WY
- .endif
-.endm
-
-.macro PRECALC_32_79
- /*
- * in SHA-1 specification:
- * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
- * instead we do equal:
- * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
- * allows more efficient vectorization
- * since w[i]=>w[i-3] dependency is broken
- */
-
- .if ((i & 7) == 0)
- /*
- * blended AVX2 and ALU instruction scheduling
- * 1 vector iteration per 8 rounds
- */
- vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
- .elseif ((i & 7) == 1)
- /* W is W_minus_32 before xor */
- vpxor WY_minus_28, WY, WY
- .elseif ((i & 7) == 2)
- vpxor WY_minus_16, WY_TMP, WY_TMP
- .elseif ((i & 7) == 3)
- vpxor WY_TMP, WY, WY
- .elseif ((i & 7) == 4)
- vpslld $2, WY, WY_TMP
- .elseif ((i & 7) == 5)
- vpsrld $30, WY, WY
- vpor WY, WY_TMP, WY
- .elseif ((i & 7) == 7)
- vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
- vmovdqu WY_TMP, PRECALC_WK(i&~7)
-
- PRECALC_ROTATE_WY
- .endif
-.endm
-
-.macro PRECALC r, s
- .set i, \r
-
- .if (i < 40)
- .set K_XMM, 32*0
- .elseif (i < 80)
- .set K_XMM, 32*1
- .elseif (i < 120)
- .set K_XMM, 32*2
- .else
- .set K_XMM, 32*3
- .endif
-
- .if (i<32)
- PRECALC_00_15 \s
- .elseif (i<64)
- PRECALC_16_31 \s
- .elseif (i < 160)
- PRECALC_32_79 \s
- .endif
-.endm
-
-.macro ROTATE_STATE
- .set T_REG, E
- .set E, D
- .set D, C
- .set C, B
- .set B, TB
- .set TB, A
- .set A, T_REG
-
- .set T_REG, RE
- .set RE, RD
- .set RD, RC
- .set RC, RB
- .set RB, RTB
- .set RTB, RA
- .set RA, T_REG
-.endm
-
-/* Macro relies on saved ROUND_Fx */
-
-.macro RND_FUN f, r
- .if (\f == RND_F1)
- ROUND_F1 \r
- .elseif (\f == RND_F2)
- ROUND_F2 \r
- .elseif (\f == RND_F3)
- ROUND_F3 \r
- .endif
-.endm
-
-.macro RR r
- .set round_id, (\r % 80)
-
- .if (round_id == 0) /* Precalculate F for first round */
- .set ROUND_FUNC, RND_F1
- mov B, TB
-
- rorx $(32-30), B, B /* b>>>2 */
- andn D, TB, T1
- and C, TB
- xor T1, TB
- .endif
-
- RND_FUN ROUND_FUNC, \r
- ROTATE_STATE
-
- .if (round_id == 18)
- .set ROUND_FUNC, RND_F2
- .elseif (round_id == 38)
- .set ROUND_FUNC, RND_F3
- .elseif (round_id == 58)
- .set ROUND_FUNC, RND_F2
- .endif
-
- .set round_id, ( (\r+1) % 80)
-
- RND_FUN ROUND_FUNC, (\r+1)
- ROTATE_STATE
-.endm
-
-.macro ROUND_F1 r
- add WK(\r), E
-
- andn C, A, T1 /* ~b&d */
- lea (RE,RTB), E /* Add F from the previous round */
-
- rorx $(32-5), A, TA /* T2 = A >>> 5 */
- rorx $(32-30),A, TB /* b>>>2 for next round */
-
- PRECALC (\r) /* msg scheduling for next 2 blocks */
-
- /*
- * Calculate F for the next round
- * (b & c) ^ andn[b, d]
- */
- and B, A /* b&c */
- xor T1, A /* F1 = (b&c) ^ (~b&d) */
-
- lea (RE,RTA), E /* E += A >>> 5 */
-.endm
-
-.macro ROUND_F2 r
- add WK(\r), E
- lea (RE,RTB), E /* Add F from the previous round */
-
- /* Calculate F for the next round */
- rorx $(32-5), A, TA /* T2 = A >>> 5 */
- .if ((round_id) < 79)
- rorx $(32-30), A, TB /* b>>>2 for next round */
- .endif
- PRECALC (\r) /* msg scheduling for next 2 blocks */
-
- .if ((round_id) < 79)
- xor B, A
- .endif
-
- add TA, E /* E += A >>> 5 */
-
- .if ((round_id) < 79)
- xor C, A
- .endif
-.endm
-
-.macro ROUND_F3 r
- add WK(\r), E
- PRECALC (\r) /* msg scheduling for next 2 blocks */
-
- lea (RE,RTB), E /* Add F from the previous round */
-
- mov B, T1
- or A, T1
-
- rorx $(32-5), A, TA /* T2 = A >>> 5 */
- rorx $(32-30), A, TB /* b>>>2 for next round */
-
- /* Calculate F for the next round
- * (b and c) or (d and (b or c))
- */
- and C, T1
- and B, A
- or T1, A
-
- add TA, E /* E += A >>> 5 */
-
-.endm
-
-/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
- * %1 + %2 >= %3 ? %4 : 0
- */
-.macro ADD_IF_GE a, b, c, d
- mov \a, RTA
- add $\d, RTA
- cmp $\c, \b
- cmovge RTA, \a
-.endm
-
-/*
- * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
- */
-.macro SHA1_PIPELINED_MAIN_BODY
-
- REGALLOC
-
- mov (HASH_PTR), A
- mov 4(HASH_PTR), B
- mov 8(HASH_PTR), C
- mov 12(HASH_PTR), D
- mov 16(HASH_PTR), E
-
- mov %rsp, PRECALC_BUF
- lea (2*4*80+32)(%rsp), WK_BUF
-
- # Precalc WK for first 2 blocks
- ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
- .set i, 0
- .rept 160
- PRECALC i
- .set i, i + 1
- .endr
-
- /* Go to next block if needed */
- ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
- ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
- xchg WK_BUF, PRECALC_BUF
-
- .align 32
-.L_loop:
- /*
- * code loops through more than one block
- * we use K_BASE value as a signal of a last block,
- * it is set below by: cmovae BUFFER_PTR, K_BASE
- */
- test BLOCKS_CTR, BLOCKS_CTR
- jnz .L_begin
- .align 32
- jmp .L_end
- .align 32
-.L_begin:
-
- /*
- * Do first block
- * rounds: 0,2,4,6,8
- */
- .set j, 0
- .rept 5
- RR j
- .set j, j+2
- .endr
-
- /*
- * rounds:
- * 10,12,14,16,18
- * 20,22,24,26,28
- * 30,32,34,36,38
- * 40,42,44,46,48
- * 50,52,54,56,58
- */
- .rept 25
- RR j
- .set j, j+2
- .endr
-
- /* Update Counter */
- sub $1, BLOCKS_CTR
- /* Move to the next block only if needed*/
- ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
- /*
- * rounds
- * 60,62,64,66,68
- * 70,72,74,76,78
- */
- .rept 10
- RR j
- .set j, j+2
- .endr
-
- UPDATE_HASH (HASH_PTR), A
- UPDATE_HASH 4(HASH_PTR), TB
- UPDATE_HASH 8(HASH_PTR), C
- UPDATE_HASH 12(HASH_PTR), D
- UPDATE_HASH 16(HASH_PTR), E
-
- test BLOCKS_CTR, BLOCKS_CTR
- jz .L_loop
-
- mov TB, B
-
- /* Process second block */
- /*
- * rounds
- * 0+80, 2+80, 4+80, 6+80, 8+80
- * 10+80,12+80,14+80,16+80,18+80
- */
-
- .set j, 0
- .rept 10
- RR j+80
- .set j, j+2
- .endr
-
- /*
- * rounds
- * 20+80,22+80,24+80,26+80,28+80
- * 30+80,32+80,34+80,36+80,38+80
- */
- .rept 10
- RR j+80
- .set j, j+2
- .endr
-
- /*
- * rounds
- * 40+80,42+80,44+80,46+80,48+80
- * 50+80,52+80,54+80,56+80,58+80
- */
- .rept 10
- RR j+80
- .set j, j+2
- .endr
-
- /* update counter */
- sub $1, BLOCKS_CTR
- /* Move to the next block only if needed*/
- ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
-
- /*
- * rounds
- * 60+80,62+80,64+80,66+80,68+80
- * 70+80,72+80,74+80,76+80,78+80
- */
- .rept 10
- RR j+80
- .set j, j+2
- .endr
-
- UPDATE_HASH (HASH_PTR), A
- UPDATE_HASH 4(HASH_PTR), TB
- UPDATE_HASH 8(HASH_PTR), C
- UPDATE_HASH 12(HASH_PTR), D
- UPDATE_HASH 16(HASH_PTR), E
-
- /* Reset state for AVX2 reg permutation */
- mov A, TA
- mov TB, A
- mov C, TB
- mov E, C
- mov D, B
- mov TA, D
-
- REGALLOC
-
- xchg WK_BUF, PRECALC_BUF
-
- jmp .L_loop
-
- .align 32
-.L_end:
-
-.endm
-/*
- * macro implements SHA-1 function's body for several 64-byte blocks
- * param: function's name
- */
-.macro SHA1_VECTOR_ASM name
- SYM_FUNC_START(\name)
-
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- RESERVE_STACK = (W_SIZE*4 + 8+24)
-
- /* Align stack */
- push %rbp
- mov %rsp, %rbp
- and $~(0x20-1), %rsp
- sub $RESERVE_STACK, %rsp
-
- avx2_zeroupper
-
- /* Setup initial values */
- mov CTX, HASH_PTR
- mov BUF, BUFFER_PTR
-
- mov BUF, BUFFER_PTR2
- mov CNT, BLOCKS_CTR
-
- xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
-
- SHA1_PIPELINED_MAIN_BODY
-
- avx2_zeroupper
-
- mov %rbp, %rsp
- pop %rbp
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
- RET
-
- SYM_FUNC_END(\name)
-.endm
-
-.section .rodata
-
-#define K1 0x5a827999
-#define K2 0x6ed9eba1
-#define K3 0x8f1bbcdc
-#define K4 0xca62c1d6
-
-.align 128
-K_XMM_AR:
- .long K1, K1, K1, K1
- .long K1, K1, K1, K1
- .long K2, K2, K2, K2
- .long K2, K2, K2, K2
- .long K3, K3, K3, K3
- .long K3, K3, K3, K3
- .long K4, K4, K4, K4
- .long K4, K4, K4, K4
-
-BSWAP_SHUFB_CTL:
- .long 0x00010203
- .long 0x04050607
- .long 0x08090a0b
- .long 0x0c0d0e0f
- .long 0x00010203
- .long 0x04050607
- .long 0x08090a0b
- .long 0x0c0d0e0f
-.text
-
-SHA1_VECTOR_ASM sha1_transform_avx2
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
deleted file mode 100644
index cade913d4882..000000000000
--- a/arch/x86/crypto/sha1_ni_asm.S
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * Intel SHA Extensions optimized implementation of a SHA-1 update function
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Sean Gulley <sean.m.gulley@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-#define DIGEST_PTR %rdi /* 1st arg */
-#define DATA_PTR %rsi /* 2nd arg */
-#define NUM_BLKS %rdx /* 3rd arg */
-
-/* gcc conversion */
-#define FRAME_SIZE 32 /* space for 2x16 bytes */
-
-#define ABCD %xmm0
-#define E0 %xmm1 /* Need two E's b/c they ping pong */
-#define E1 %xmm2
-#define MSG0 %xmm3
-#define MSG1 %xmm4
-#define MSG2 %xmm5
-#define MSG3 %xmm6
-#define SHUF_MASK %xmm7
-
-
-/*
- * Intel SHA Extensions optimized implementation of a SHA-1 update function
- *
- * The function takes a pointer to the current hash values, a pointer to the
- * input data, and a number of 64 byte blocks to process. Once all blocks have
- * been processed, the digest pointer is updated with the resulting hash value.
- * The function only processes complete blocks, there is no functionality to
- * store partial blocks. All message padding and hash value initialization must
- * be done outside the update function.
- *
- * The indented lines in the loop are instructions related to rounds processing.
- * The non-indented lines are instructions related to the message schedule.
- *
- * void sha1_ni_transform(uint32_t *digest, const void *data,
- uint32_t numBlocks)
- * digest : pointer to digest
- * data: pointer to input data
- * numBlocks: Number of blocks to process
- */
-.text
-SYM_TYPED_FUNC_START(sha1_ni_transform)
- push %rbp
- mov %rsp, %rbp
- sub $FRAME_SIZE, %rsp
- and $~0xF, %rsp
-
- shl $6, NUM_BLKS /* convert to bytes */
- jz .Ldone_hash
- add DATA_PTR, NUM_BLKS /* pointer to end of data */
-
- /* load initial hash values */
- pinsrd $3, 1*16(DIGEST_PTR), E0
- movdqu 0*16(DIGEST_PTR), ABCD
- pand UPPER_WORD_MASK(%rip), E0
- pshufd $0x1B, ABCD, ABCD
-
- movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
-
-.Lloop0:
- /* Save hash values for addition after rounds */
- movdqa E0, (0*16)(%rsp)
- movdqa ABCD, (1*16)(%rsp)
-
- /* Rounds 0-3 */
- movdqu 0*16(DATA_PTR), MSG0
- pshufb SHUF_MASK, MSG0
- paddd MSG0, E0
- movdqa ABCD, E1
- sha1rnds4 $0, E0, ABCD
-
- /* Rounds 4-7 */
- movdqu 1*16(DATA_PTR), MSG1
- pshufb SHUF_MASK, MSG1
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1rnds4 $0, E1, ABCD
- sha1msg1 MSG1, MSG0
-
- /* Rounds 8-11 */
- movdqu 2*16(DATA_PTR), MSG2
- pshufb SHUF_MASK, MSG2
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1rnds4 $0, E0, ABCD
- sha1msg1 MSG2, MSG1
- pxor MSG2, MSG0
-
- /* Rounds 12-15 */
- movdqu 3*16(DATA_PTR), MSG3
- pshufb SHUF_MASK, MSG3
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1msg2 MSG3, MSG0
- sha1rnds4 $0, E1, ABCD
- sha1msg1 MSG3, MSG2
- pxor MSG3, MSG1
-
- /* Rounds 16-19 */
- sha1nexte MSG0, E0
- movdqa ABCD, E1
- sha1msg2 MSG0, MSG1
- sha1rnds4 $0, E0, ABCD
- sha1msg1 MSG0, MSG3
- pxor MSG0, MSG2
-
- /* Rounds 20-23 */
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1msg2 MSG1, MSG2
- sha1rnds4 $1, E1, ABCD
- sha1msg1 MSG1, MSG0
- pxor MSG1, MSG3
-
- /* Rounds 24-27 */
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1msg2 MSG2, MSG3
- sha1rnds4 $1, E0, ABCD
- sha1msg1 MSG2, MSG1
- pxor MSG2, MSG0
-
- /* Rounds 28-31 */
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1msg2 MSG3, MSG0
- sha1rnds4 $1, E1, ABCD
- sha1msg1 MSG3, MSG2
- pxor MSG3, MSG1
-
- /* Rounds 32-35 */
- sha1nexte MSG0, E0
- movdqa ABCD, E1
- sha1msg2 MSG0, MSG1
- sha1rnds4 $1, E0, ABCD
- sha1msg1 MSG0, MSG3
- pxor MSG0, MSG2
-
- /* Rounds 36-39 */
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1msg2 MSG1, MSG2
- sha1rnds4 $1, E1, ABCD
- sha1msg1 MSG1, MSG0
- pxor MSG1, MSG3
-
- /* Rounds 40-43 */
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1msg2 MSG2, MSG3
- sha1rnds4 $2, E0, ABCD
- sha1msg1 MSG2, MSG1
- pxor MSG2, MSG0
-
- /* Rounds 44-47 */
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1msg2 MSG3, MSG0
- sha1rnds4 $2, E1, ABCD
- sha1msg1 MSG3, MSG2
- pxor MSG3, MSG1
-
- /* Rounds 48-51 */
- sha1nexte MSG0, E0
- movdqa ABCD, E1
- sha1msg2 MSG0, MSG1
- sha1rnds4 $2, E0, ABCD
- sha1msg1 MSG0, MSG3
- pxor MSG0, MSG2
-
- /* Rounds 52-55 */
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1msg2 MSG1, MSG2
- sha1rnds4 $2, E1, ABCD
- sha1msg1 MSG1, MSG0
- pxor MSG1, MSG3
-
- /* Rounds 56-59 */
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1msg2 MSG2, MSG3
- sha1rnds4 $2, E0, ABCD
- sha1msg1 MSG2, MSG1
- pxor MSG2, MSG0
-
- /* Rounds 60-63 */
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1msg2 MSG3, MSG0
- sha1rnds4 $3, E1, ABCD
- sha1msg1 MSG3, MSG2
- pxor MSG3, MSG1
-
- /* Rounds 64-67 */
- sha1nexte MSG0, E0
- movdqa ABCD, E1
- sha1msg2 MSG0, MSG1
- sha1rnds4 $3, E0, ABCD
- sha1msg1 MSG0, MSG3
- pxor MSG0, MSG2
-
- /* Rounds 68-71 */
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1msg2 MSG1, MSG2
- sha1rnds4 $3, E1, ABCD
- pxor MSG1, MSG3
-
- /* Rounds 72-75 */
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1msg2 MSG2, MSG3
- sha1rnds4 $3, E0, ABCD
-
- /* Rounds 76-79 */
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1rnds4 $3, E1, ABCD
-
- /* Add current hash values with previously saved */
- sha1nexte (0*16)(%rsp), E0
- paddd (1*16)(%rsp), ABCD
-
- /* Increment data pointer and loop if more to process */
- add $64, DATA_PTR
- cmp NUM_BLKS, DATA_PTR
- jne .Lloop0
-
- /* Write hash values back in the correct order */
- pshufd $0x1B, ABCD, ABCD
- movdqu ABCD, 0*16(DIGEST_PTR)
- pextrd $3, E0, 1*16(DIGEST_PTR)
-
-.Ldone_hash:
- mov %rbp, %rsp
- pop %rbp
-
- RET
-SYM_FUNC_END(sha1_ni_transform)
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x000102030405060708090a0b0c0d0e0f
-
-.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
-.align 16
-UPPER_WORD_MASK:
- .octa 0xFFFFFFFF000000000000000000000000
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
deleted file mode 100644
index f54988c80eb4..000000000000
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ /dev/null
@@ -1,554 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
- * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
- * processors. CPUs supporting Intel(R) AVX extensions will get an additional
- * boost.
- *
- * This work was inspired by the vectorized implementation of Dean Gaudet.
- * Additional information on it can be found at:
- * http://www.arctic.org/~dean/crypto/sha1.html
- *
- * It was improved upon with more efficient vectorization of the message
- * scheduling. This implementation has also been optimized for all current and
- * several future generations of Intel CPUs.
- *
- * See this article for more information about the implementation details:
- * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
- *
- * Copyright (C) 2010, Intel Corp.
- * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
- * Ronen Zohar <ronen.zohar@intel.com>
- *
- * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
- * Author: Mathias Krause <minipli@googlemail.com>
- */
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-#define CTX %rdi // arg1
-#define BUF %rsi // arg2
-#define CNT %rdx // arg3
-
-#define REG_A %ecx
-#define REG_B %esi
-#define REG_C %edi
-#define REG_D %r12d
-#define REG_E %edx
-
-#define REG_T1 %eax
-#define REG_T2 %ebx
-
-#define K_BASE %r8
-#define HASH_PTR %r9
-#define BUFFER_PTR %r10
-#define BUFFER_END %r11
-
-#define W_TMP1 %xmm0
-#define W_TMP2 %xmm9
-
-#define W0 %xmm1
-#define W4 %xmm2
-#define W8 %xmm3
-#define W12 %xmm4
-#define W16 %xmm5
-#define W20 %xmm6
-#define W24 %xmm7
-#define W28 %xmm8
-
-#define XMM_SHUFB_BSWAP %xmm10
-
-/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */
-#define WK(t) (((t) & 15) * 4)(%rsp)
-#define W_PRECALC_AHEAD 16
-
-/*
- * This macro implements the SHA-1 function's body for single 64-byte block
- * param: function's name
- */
-.macro SHA1_VECTOR_ASM name
- SYM_TYPED_FUNC_START(\name)
-
- push %rbx
- push %r12
- push %rbp
- mov %rsp, %rbp
-
- sub $64, %rsp # allocate workspace
- and $~15, %rsp # align stack
-
- mov CTX, HASH_PTR
- mov BUF, BUFFER_PTR
-
- shl $6, CNT # multiply by 64
- add BUF, CNT
- mov CNT, BUFFER_END
-
- lea K_XMM_AR(%rip), K_BASE
- xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
-
- SHA1_PIPELINED_MAIN_BODY
-
- # cleanup workspace
- mov $8, %ecx
- mov %rsp, %rdi
- xor %eax, %eax
- rep stosq
-
- mov %rbp, %rsp # deallocate workspace
- pop %rbp
- pop %r12
- pop %rbx
- RET
-
- SYM_FUNC_END(\name)
-.endm
-
-/*
- * This macro implements 80 rounds of SHA-1 for one 64-byte block
- */
-.macro SHA1_PIPELINED_MAIN_BODY
- INIT_REGALLOC
-
- mov (HASH_PTR), A
- mov 4(HASH_PTR), B
- mov 8(HASH_PTR), C
- mov 12(HASH_PTR), D
- mov 16(HASH_PTR), E
-
- .set i, 0
- .rept W_PRECALC_AHEAD
- W_PRECALC i
- .set i, (i+1)
- .endr
-
-.align 4
-1:
- RR F1,A,B,C,D,E,0
- RR F1,D,E,A,B,C,2
- RR F1,B,C,D,E,A,4
- RR F1,E,A,B,C,D,6
- RR F1,C,D,E,A,B,8
-
- RR F1,A,B,C,D,E,10
- RR F1,D,E,A,B,C,12
- RR F1,B,C,D,E,A,14
- RR F1,E,A,B,C,D,16
- RR F1,C,D,E,A,B,18
-
- RR F2,A,B,C,D,E,20
- RR F2,D,E,A,B,C,22
- RR F2,B,C,D,E,A,24
- RR F2,E,A,B,C,D,26
- RR F2,C,D,E,A,B,28
-
- RR F2,A,B,C,D,E,30
- RR F2,D,E,A,B,C,32
- RR F2,B,C,D,E,A,34
- RR F2,E,A,B,C,D,36
- RR F2,C,D,E,A,B,38
-
- RR F3,A,B,C,D,E,40
- RR F3,D,E,A,B,C,42
- RR F3,B,C,D,E,A,44
- RR F3,E,A,B,C,D,46
- RR F3,C,D,E,A,B,48
-
- RR F3,A,B,C,D,E,50
- RR F3,D,E,A,B,C,52
- RR F3,B,C,D,E,A,54
- RR F3,E,A,B,C,D,56
- RR F3,C,D,E,A,B,58
-
- add $64, BUFFER_PTR # move to the next 64-byte block
- cmp BUFFER_END, BUFFER_PTR # if the current is the last one use
- cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun
-
- RR F4,A,B,C,D,E,60
- RR F4,D,E,A,B,C,62
- RR F4,B,C,D,E,A,64
- RR F4,E,A,B,C,D,66
- RR F4,C,D,E,A,B,68
-
- RR F4,A,B,C,D,E,70
- RR F4,D,E,A,B,C,72
- RR F4,B,C,D,E,A,74
- RR F4,E,A,B,C,D,76
- RR F4,C,D,E,A,B,78
-
- UPDATE_HASH (HASH_PTR), A
- UPDATE_HASH 4(HASH_PTR), B
- UPDATE_HASH 8(HASH_PTR), C
- UPDATE_HASH 12(HASH_PTR), D
- UPDATE_HASH 16(HASH_PTR), E
-
- RESTORE_RENAMED_REGS
- cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end
- jne 1b
-.endm
-
-.macro INIT_REGALLOC
- .set A, REG_A
- .set B, REG_B
- .set C, REG_C
- .set D, REG_D
- .set E, REG_E
- .set T1, REG_T1
- .set T2, REG_T2
-.endm
-
-.macro RESTORE_RENAMED_REGS
- # order is important (REG_C is where it should be)
- mov B, REG_B
- mov D, REG_D
- mov A, REG_A
- mov E, REG_E
-.endm
-
-.macro SWAP_REG_NAMES a, b
- .set _T, \a
- .set \a, \b
- .set \b, _T
-.endm
-
-.macro F1 b, c, d
- mov \c, T1
- SWAP_REG_NAMES \c, T1
- xor \d, T1
- and \b, T1
- xor \d, T1
-.endm
-
-.macro F2 b, c, d
- mov \d, T1
- SWAP_REG_NAMES \d, T1
- xor \c, T1
- xor \b, T1
-.endm
-
-.macro F3 b, c ,d
- mov \c, T1
- SWAP_REG_NAMES \c, T1
- mov \b, T2
- or \b, T1
- and \c, T2
- and \d, T1
- or T2, T1
-.endm
-
-.macro F4 b, c, d
- F2 \b, \c, \d
-.endm
-
-.macro UPDATE_HASH hash, val
- add \hash, \val
- mov \val, \hash
-.endm
-
-/*
- * RR does two rounds of SHA-1 back to back with W[] pre-calc
- * t1 = F(b, c, d); e += w(i)
- * e += t1; b <<= 30; d += w(i+1);
- * t1 = F(a, b, c);
- * d += t1; a <<= 5;
- * e += a;
- * t1 = e; a >>= 7;
- * t1 <<= 5;
- * d += t1;
- */
-.macro RR F, a, b, c, d, e, round
- add WK(\round), \e
- \F \b, \c, \d # t1 = F(b, c, d);
- W_PRECALC (\round + W_PRECALC_AHEAD)
- rol $30, \b
- add T1, \e
- add WK(\round + 1), \d
-
- \F \a, \b, \c
- W_PRECALC (\round + W_PRECALC_AHEAD + 1)
- rol $5, \a
- add \a, \e
- add T1, \d
- ror $7, \a # (a <<r 5) >>r 7) => a <<r 30)
-
- mov \e, T1
- SWAP_REG_NAMES \e, T1
-
- rol $5, T1
- add T1, \d
-
- # write: \a, \b
- # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
-.endm
-
-.macro W_PRECALC r
- .set i, \r
-
- .if (i < 20)
- .set K_XMM, 0
- .elseif (i < 40)
- .set K_XMM, 16
- .elseif (i < 60)
- .set K_XMM, 32
- .elseif (i < 80)
- .set K_XMM, 48
- .endif
-
- .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
- .set i, ((\r) % 80) # pre-compute for the next iteration
- .if (i == 0)
- W_PRECALC_RESET
- .endif
- W_PRECALC_00_15
- .elseif (i<32)
- W_PRECALC_16_31
- .elseif (i < 80) // rounds 32-79
- W_PRECALC_32_79
- .endif
-.endm
-
-.macro W_PRECALC_RESET
- .set W, W0
- .set W_minus_04, W4
- .set W_minus_08, W8
- .set W_minus_12, W12
- .set W_minus_16, W16
- .set W_minus_20, W20
- .set W_minus_24, W24
- .set W_minus_28, W28
- .set W_minus_32, W
-.endm
-
-.macro W_PRECALC_ROTATE
- .set W_minus_32, W_minus_28
- .set W_minus_28, W_minus_24
- .set W_minus_24, W_minus_20
- .set W_minus_20, W_minus_16
- .set W_minus_16, W_minus_12
- .set W_minus_12, W_minus_08
- .set W_minus_08, W_minus_04
- .set W_minus_04, W
- .set W, W_minus_32
-.endm
-
-.macro W_PRECALC_SSSE3
-
-.macro W_PRECALC_00_15
- W_PRECALC_00_15_SSSE3
-.endm
-.macro W_PRECALC_16_31
- W_PRECALC_16_31_SSSE3
-.endm
-.macro W_PRECALC_32_79
- W_PRECALC_32_79_SSSE3
-.endm
-
-/* message scheduling pre-compute for rounds 0-15 */
-.macro W_PRECALC_00_15_SSSE3
- .if ((i & 3) == 0)
- movdqu (i*4)(BUFFER_PTR), W_TMP1
- .elseif ((i & 3) == 1)
- pshufb XMM_SHUFB_BSWAP, W_TMP1
- movdqa W_TMP1, W
- .elseif ((i & 3) == 2)
- paddd (K_BASE), W_TMP1
- .elseif ((i & 3) == 3)
- movdqa W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-/* message scheduling pre-compute for rounds 16-31
- *
- * - calculating last 32 w[i] values in 8 XMM registers
- * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
- * instruction
- *
- * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
- * dependency, but improves for 32-79
- */
-.macro W_PRECALC_16_31_SSSE3
- # blended scheduling of vector and scalar instruction streams, one 4-wide
- # vector iteration / 4 scalar rounds
- .if ((i & 3) == 0)
- movdqa W_minus_12, W
- palignr $8, W_minus_16, W # w[i-14]
- movdqa W_minus_04, W_TMP1
- psrldq $4, W_TMP1 # w[i-3]
- pxor W_minus_08, W
- .elseif ((i & 3) == 1)
- pxor W_minus_16, W_TMP1
- pxor W_TMP1, W
- movdqa W, W_TMP2
- movdqa W, W_TMP1
- pslldq $12, W_TMP2
- .elseif ((i & 3) == 2)
- psrld $31, W
- pslld $1, W_TMP1
- por W, W_TMP1
- movdqa W_TMP2, W
- psrld $30, W_TMP2
- pslld $2, W
- .elseif ((i & 3) == 3)
- pxor W, W_TMP1
- pxor W_TMP2, W_TMP1
- movdqa W_TMP1, W
- paddd K_XMM(K_BASE), W_TMP1
- movdqa W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-/* message scheduling pre-compute for rounds 32-79
- *
- * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
- * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
- * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
- */
-.macro W_PRECALC_32_79_SSSE3
- .if ((i & 3) == 0)
- movdqa W_minus_04, W_TMP1
- pxor W_minus_28, W # W is W_minus_32 before xor
- palignr $8, W_minus_08, W_TMP1
- .elseif ((i & 3) == 1)
- pxor W_minus_16, W
- pxor W_TMP1, W
- movdqa W, W_TMP1
- .elseif ((i & 3) == 2)
- psrld $30, W
- pslld $2, W_TMP1
- por W, W_TMP1
- .elseif ((i & 3) == 3)
- movdqa W_TMP1, W
- paddd K_XMM(K_BASE), W_TMP1
- movdqa W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-.endm // W_PRECALC_SSSE3
-
-
-#define K1 0x5a827999
-#define K2 0x6ed9eba1
-#define K3 0x8f1bbcdc
-#define K4 0xca62c1d6
-
-.section .rodata
-.align 16
-
-K_XMM_AR:
- .long K1, K1, K1, K1
- .long K2, K2, K2, K2
- .long K3, K3, K3, K3
- .long K4, K4, K4, K4
-
-BSWAP_SHUFB_CTL:
- .long 0x00010203
- .long 0x04050607
- .long 0x08090a0b
- .long 0x0c0d0e0f
-
-
-.section .text
-
-W_PRECALC_SSSE3
-.macro xmm_mov a, b
- movdqu \a,\b
-.endm
-
-/*
- * SSSE3 optimized implementation:
- *
- * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
- * const u8 *data, int blocks);
- *
- * Note that struct sha1_state is assumed to begin with u32 state[5].
- */
-SHA1_VECTOR_ASM sha1_transform_ssse3
-
-.macro W_PRECALC_AVX
-
-.purgem W_PRECALC_00_15
-.macro W_PRECALC_00_15
- W_PRECALC_00_15_AVX
-.endm
-.purgem W_PRECALC_16_31
-.macro W_PRECALC_16_31
- W_PRECALC_16_31_AVX
-.endm
-.purgem W_PRECALC_32_79
-.macro W_PRECALC_32_79
- W_PRECALC_32_79_AVX
-.endm
-
-.macro W_PRECALC_00_15_AVX
- .if ((i & 3) == 0)
- vmovdqu (i*4)(BUFFER_PTR), W_TMP1
- .elseif ((i & 3) == 1)
- vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
- .elseif ((i & 3) == 2)
- vpaddd (K_BASE), W, W_TMP1
- .elseif ((i & 3) == 3)
- vmovdqa W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-.macro W_PRECALC_16_31_AVX
- .if ((i & 3) == 0)
- vpalignr $8, W_minus_16, W_minus_12, W # w[i-14]
- vpsrldq $4, W_minus_04, W_TMP1 # w[i-3]
- vpxor W_minus_08, W, W
- vpxor W_minus_16, W_TMP1, W_TMP1
- .elseif ((i & 3) == 1)
- vpxor W_TMP1, W, W
- vpslldq $12, W, W_TMP2
- vpslld $1, W, W_TMP1
- .elseif ((i & 3) == 2)
- vpsrld $31, W, W
- vpor W, W_TMP1, W_TMP1
- vpslld $2, W_TMP2, W
- vpsrld $30, W_TMP2, W_TMP2
- .elseif ((i & 3) == 3)
- vpxor W, W_TMP1, W_TMP1
- vpxor W_TMP2, W_TMP1, W
- vpaddd K_XMM(K_BASE), W, W_TMP1
- vmovdqu W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-.macro W_PRECALC_32_79_AVX
- .if ((i & 3) == 0)
- vpalignr $8, W_minus_08, W_minus_04, W_TMP1
- vpxor W_minus_28, W, W # W is W_minus_32 before xor
- .elseif ((i & 3) == 1)
- vpxor W_minus_16, W_TMP1, W_TMP1
- vpxor W_TMP1, W, W
- .elseif ((i & 3) == 2)
- vpslld $2, W, W_TMP1
- vpsrld $30, W, W
- vpor W, W_TMP1, W
- .elseif ((i & 3) == 3)
- vpaddd K_XMM(K_BASE), W, W_TMP1
- vmovdqu W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-.endm // W_PRECALC_AVX
-
-W_PRECALC_AVX
-.purgem xmm_mov
-.macro xmm_mov a, b
- vmovdqu \a,\b
-.endm
-
-
-/* AVX optimized implementation:
- * extern "C" void sha1_transform_avx(struct sha1_state *state,
- * const u8 *data, int blocks);
- */
-SHA1_VECTOR_ASM sha1_transform_avx
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
deleted file mode 100644
index 0a912bfc86c5..000000000000
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ /dev/null
@@ -1,324 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API.
- *
- * Glue code for the SHA1 Secure Hash Algorithm assembler implementations
- * using SSSE3, AVX, AVX2, and SHA-NI instructions.
- *
- * This file is based on sha1_generic.c
- *
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- * Copyright (c) Mathias Krause <minipli@googlemail.com>
- * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-static const struct x86_cpu_id module_cpu_ids[] = {
- X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
-
-static inline int sha1_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, sha1_block_fn *sha1_xform)
-{
- int remain;
-
- /*
- * Make sure struct sha1_state begins directly with the SHA1
- * 160-bit internal state, as this is what the asm functions expect.
- */
- BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
-
- kernel_fpu_begin();
- remain = sha1_base_do_update_blocks(desc, data, len, sha1_xform);
- kernel_fpu_end();
-
- return remain;
-}
-
-static inline int sha1_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out,
- sha1_block_fn *sha1_xform)
-{
- kernel_fpu_begin();
- sha1_base_do_finup(desc, data, len, sha1_xform);
- kernel_fpu_end();
-
- return sha1_base_finish(desc, out);
-}
-
-asmlinkage void sha1_transform_ssse3(struct sha1_state *state,
- const u8 *data, int blocks);
-
-static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_update(desc, data, len, sha1_transform_ssse3);
-}
-
-static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha1_finup(desc, data, len, out, sha1_transform_ssse3);
-}
-
-static struct shash_alg sha1_ssse3_alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_ssse3_update,
- .finup = sha1_ssse3_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-ssse3",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int register_sha1_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- return crypto_register_shash(&sha1_ssse3_alg);
- return 0;
-}
-
-static void unregister_sha1_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_shash(&sha1_ssse3_alg);
-}
-
-asmlinkage void sha1_transform_avx(struct sha1_state *state,
- const u8 *data, int blocks);
-
-static int sha1_avx_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_update(desc, data, len, sha1_transform_avx);
-}
-
-static int sha1_avx_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha1_finup(desc, data, len, out, sha1_transform_avx);
-}
-
-static struct shash_alg sha1_avx_alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_avx_update,
- .finup = sha1_avx_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-avx",
- .cra_priority = 160,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static bool avx_usable(void)
-{
- if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
- if (boot_cpu_has(X86_FEATURE_AVX))
- pr_info("AVX detected but unusable.\n");
- return false;
- }
-
- return true;
-}
-
-static int register_sha1_avx(void)
-{
- if (avx_usable())
- return crypto_register_shash(&sha1_avx_alg);
- return 0;
-}
-
-static void unregister_sha1_avx(void)
-{
- if (avx_usable())
- crypto_unregister_shash(&sha1_avx_alg);
-}
-
-#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
-
-asmlinkage void sha1_transform_avx2(struct sha1_state *state,
- const u8 *data, int blocks);
-
-static bool avx2_usable(void)
-{
- if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
- && boot_cpu_has(X86_FEATURE_BMI1)
- && boot_cpu_has(X86_FEATURE_BMI2))
- return true;
-
- return false;
-}
-
-static inline void sha1_apply_transform_avx2(struct sha1_state *state,
- const u8 *data, int blocks)
-{
- /* Select the optimal transform based on data block size */
- if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE)
- sha1_transform_avx2(state, data, blocks);
- else
- sha1_transform_avx(state, data, blocks);
-}
-
-static int sha1_avx2_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_update(desc, data, len, sha1_apply_transform_avx2);
-}
-
-static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2);
-}
-
-static struct shash_alg sha1_avx2_alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_avx2_update,
- .finup = sha1_avx2_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-avx2",
- .cra_priority = 170,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int register_sha1_avx2(void)
-{
- if (avx2_usable())
- return crypto_register_shash(&sha1_avx2_alg);
- return 0;
-}
-
-static void unregister_sha1_avx2(void)
-{
- if (avx2_usable())
- crypto_unregister_shash(&sha1_avx2_alg);
-}
-
-asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data,
- int rounds);
-
-static int sha1_ni_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_update(desc, data, len, sha1_ni_transform);
-}
-
-static int sha1_ni_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha1_finup(desc, data, len, out, sha1_ni_transform);
-}
-
-static struct shash_alg sha1_ni_alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_ni_update,
- .finup = sha1_ni_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-ni",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int register_sha1_ni(void)
-{
- if (boot_cpu_has(X86_FEATURE_SHA_NI))
- return crypto_register_shash(&sha1_ni_alg);
- return 0;
-}
-
-static void unregister_sha1_ni(void)
-{
- if (boot_cpu_has(X86_FEATURE_SHA_NI))
- crypto_unregister_shash(&sha1_ni_alg);
-}
-
-static int __init sha1_ssse3_mod_init(void)
-{
- if (!x86_match_cpu(module_cpu_ids))
- return -ENODEV;
-
- if (register_sha1_ssse3())
- goto fail;
-
- if (register_sha1_avx()) {
- unregister_sha1_ssse3();
- goto fail;
- }
-
- if (register_sha1_avx2()) {
- unregister_sha1_avx();
- unregister_sha1_ssse3();
- goto fail;
- }
-
- if (register_sha1_ni()) {
- unregister_sha1_avx2();
- unregister_sha1_avx();
- unregister_sha1_ssse3();
- goto fail;
- }
-
- return 0;
-fail:
- return -ENODEV;
-}
-
-static void __exit sha1_ssse3_mod_fini(void)
-{
- unregister_sha1_ni();
- unregister_sha1_avx2();
- unregister_sha1_avx();
- unregister_sha1_ssse3();
-}
-
-module_init(sha1_ssse3_mod_init);
-module_exit(sha1_ssse3_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-
-MODULE_ALIAS_CRYPTO("sha1");
-MODULE_ALIAS_CRYPTO("sha1-ssse3");
-MODULE_ALIAS_CRYPTO("sha1-avx");
-MODULE_ALIAS_CRYPTO("sha1-avx2");
-MODULE_ALIAS_CRYPTO("sha1-ni");
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
deleted file mode 100644
index 5bfce4b045fd..000000000000
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ /dev/null
@@ -1,423 +0,0 @@
-########################################################################
-# Implement fast SHA-512 with AVX instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# David Cote <david.m.cote@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-512 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-.text
-
-# Virtual Registers
-# ARG1
-digest = %rdi
-# ARG2
-msg = %rsi
-# ARG3
-msglen = %rdx
-T1 = %rcx
-T2 = %r8
-a_64 = %r9
-b_64 = %r10
-c_64 = %r11
-d_64 = %r12
-e_64 = %r13
-f_64 = %r14
-g_64 = %r15
-h_64 = %rbx
-tmp0 = %rax
-
-# Local variables (stack frame)
-
-# Message Schedule
-W_SIZE = 80*8
-# W[t] + K[t] | W[t+1] + K[t+1]
-WK_SIZE = 2*8
-
-frame_W = 0
-frame_WK = frame_W + W_SIZE
-frame_size = frame_WK + WK_SIZE
-
-# Useful QWORD "arrays" for simpler memory references
-# MSG, DIGEST, K_t, W_t are arrays
-# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
-
-# Input message (arg1)
-#define MSG(i) 8*i(msg)
-
-# Output Digest (arg2)
-#define DIGEST(i) 8*i(digest)
-
-# SHA Constants (static mem)
-#define K_t(i) 8*i+K512(%rip)
-
-# Message Schedule (stack frame)
-#define W_t(i) 8*i+frame_W(%rsp)
-
-# W[t]+K[t] (stack frame)
-#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
-
-.macro RotateState
- # Rotate symbols a..h right
- TMP = h_64
- h_64 = g_64
- g_64 = f_64
- f_64 = e_64
- e_64 = d_64
- d_64 = c_64
- c_64 = b_64
- b_64 = a_64
- a_64 = TMP
-.endm
-
-.macro RORQ p1 p2
- # shld is faster than ror on Sandybridge
- shld $(64-\p2), \p1, \p1
-.endm
-
-.macro SHA512_Round rnd
- # Compute Round %%t
- mov f_64, T1 # T1 = f
- mov e_64, tmp0 # tmp = e
- xor g_64, T1 # T1 = f ^ g
- RORQ tmp0, 23 # 41 # tmp = e ror 23
- and e_64, T1 # T1 = (f ^ g) & e
- xor e_64, tmp0 # tmp = (e ror 23) ^ e
- xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
- idx = \rnd
- add WK_2(idx), T1 # W[t] + K[t] from message scheduler
- RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4
- xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
- mov a_64, T2 # T2 = a
- add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
- RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
- add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
- mov a_64, tmp0 # tmp = a
- xor c_64, T2 # T2 = a ^ c
- and c_64, tmp0 # tmp = a & c
- and b_64, T2 # T2 = (a ^ c) & b
- xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
- mov a_64, tmp0 # tmp = a
- RORQ tmp0, 5 # 39 # tmp = a ror 5
- xor a_64, tmp0 # tmp = (a ror 5) ^ a
- add T1, d_64 # e(next_state) = d + T1
- RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6
- xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
- lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
- RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
- add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
- RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_avx rnd
- # Compute rounds t-2 and t-1
- # Compute message schedule QWORDS t and t+1
-
- # Two rounds are computed based on the values for K[t-2]+W[t-2] and
- # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
- # scheduler.
- # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
- # They are then added to their respective SHA512 constants at
- # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
- # For brievity, the comments following vectored instructions only refer to
- # the first of a pair of QWORDS.
- # Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
- # The computation of the message schedule and the rounds are tightly
- # stitched to take advantage of instruction-level parallelism.
-
- idx = \rnd - 2
- vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2]
- idx = \rnd - 15
- vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
- mov f_64, T1
- vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61
- mov e_64, tmp0
- vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1
- xor g_64, T1
- RORQ tmp0, 23 # 41
- vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19
- and e_64, T1
- xor e_64, tmp0
- vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19
- xor g_64, T1
- idx = \rnd
- add WK_2(idx), T1#
- vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8
- RORQ tmp0, 4 # 18
- vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6
- xor e_64, tmp0
- mov a_64, T2
- add h_64, T1
- vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8
- RORQ tmp0, 14 # 14
- add tmp0, T1
- vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7
- mov a_64, tmp0
- xor c_64, T2
- vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3
- and c_64, tmp0
- and b_64, T2
- vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3
- xor tmp0, T2
- mov a_64, tmp0
- vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63
- RORQ tmp0, 5 # 39
- vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63
- xor a_64, tmp0
- add T1, d_64
- RORQ tmp0, 6 # 34
- xor a_64, tmp0
- vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
- # W[t-15]>>7 ^ W[t-15]<<63
- lea (T1, T2), h_64
- RORQ tmp0, 28 # 28
- vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25
- add tmp0, h_64
- RotateState
- vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
- # W[t-2]<<25
- mov f_64, T1
- vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2])
- mov e_64, tmp0
- xor g_64, T1
- idx = \rnd - 16
- vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16]
- idx = \rnd - 7
- vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
- RORQ tmp0, 23 # 41
- and e_64, T1
- xor e_64, tmp0
- xor g_64, T1
- vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56
- idx = \rnd + 1
- add WK_2(idx), T1
- vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15])
- RORQ tmp0, 4 # 18
- vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
- xor e_64, tmp0
- vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
- # s0(W[t-15]) + W[t-16]
- mov a_64, T2
- add h_64, T1
- RORQ tmp0, 14 # 14
- add tmp0, T1
- idx = \rnd
- vmovdqa %xmm0, W_t(idx) # Store W[t]
- vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t]
- vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
- mov a_64, tmp0
- xor c_64, T2
- and c_64, tmp0
- and b_64, T2
- xor tmp0, T2
- mov a_64, tmp0
- RORQ tmp0, 5 # 39
- xor a_64, tmp0
- add T1, d_64
- RORQ tmp0, 6 # 34
- xor a_64, tmp0
- lea (T1, T2), h_64
- RORQ tmp0, 28 # 28
- add tmp0, h_64
- RotateState
-.endm
-
-########################################################################
-# void sha512_transform_avx(sha512_state *state, const u8 *data, int blocks)
-# Purpose: Updates the SHA512 digest stored at "state" with the message
-# stored in "data".
-# The size of the message pointed to by "data" must be an integer multiple
-# of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks
-########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_avx)
- test msglen, msglen
- je .Lnowork
-
- # Save GPRs
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- # Allocate Stack Space
- push %rbp
- mov %rsp, %rbp
- sub $frame_size, %rsp
- and $~(0x20 - 1), %rsp
-
-.Lupdateblock:
-
- # Load state variables
- mov DIGEST(0), a_64
- mov DIGEST(1), b_64
- mov DIGEST(2), c_64
- mov DIGEST(3), d_64
- mov DIGEST(4), e_64
- mov DIGEST(5), f_64
- mov DIGEST(6), g_64
- mov DIGEST(7), h_64
-
- t = 0
- .rept 80/2 + 1
- # (80 rounds) / (2 rounds/iteration) + (1 iteration)
- # +1 iteration because the scheduler leads hashing by 1 iteration
- .if t < 2
- # BSWAP 2 QWORDS
- vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1
- vmovdqu MSG(t), %xmm0
- vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
- vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
- vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
- vmovdqa %xmm0, WK_2(t) # Store into WK for rounds
- .elseif t < 16
- # BSWAP 2 QWORDS# Compute 2 Rounds
- vmovdqu MSG(t), %xmm0
- vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
- SHA512_Round t-2 # Round t-2
- vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
- vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
- SHA512_Round t-1 # Round t-1
- vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK
- .elseif t < 79
- # Schedule 2 QWORDS# Compute 2 Rounds
- SHA512_2Sched_2Round_avx t
- .else
- # Compute 2 Rounds
- SHA512_Round t-2
- SHA512_Round t-1
- .endif
- t = t+2
- .endr
-
- # Update digest
- add a_64, DIGEST(0)
- add b_64, DIGEST(1)
- add c_64, DIGEST(2)
- add d_64, DIGEST(3)
- add e_64, DIGEST(4)
- add f_64, DIGEST(5)
- add g_64, DIGEST(6)
- add h_64, DIGEST(7)
-
- # Advance to next message block
- add $16*8, msg
- dec msglen
- jnz .Lupdateblock
-
- # Restore Stack Pointer
- mov %rbp, %rsp
- pop %rbp
-
- # Restore GPRs
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
-.Lnowork:
- RET
-SYM_FUNC_END(sha512_transform_avx)
-
-########################################################################
-### Binary Data
-
-.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
-.align 16
-# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
-XMM_QWORD_BSWAP:
- .octa 0x08090a0b0c0d0e0f0001020304050607
-
-# Mergeable 640-byte rodata section. This allows linker to merge the table
-# with other, exactly the same 640-byte fragment of another rodata section
-# (if such section exists).
-.section .rodata.cst640.K512, "aM", @progbits, 640
-.align 64
-# K[t] used in SHA512 hashing
-K512:
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
deleted file mode 100644
index 24973f42c43f..000000000000
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ /dev/null
@@ -1,750 +0,0 @@
-########################################################################
-# Implement fast SHA-512 with AVX2 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# David Cote <david.m.cote@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-512 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-# This code schedules 1 blocks at a time, with 4 lanes per block
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-.text
-
-# Virtual Registers
-Y_0 = %ymm4
-Y_1 = %ymm5
-Y_2 = %ymm6
-Y_3 = %ymm7
-
-YTMP0 = %ymm0
-YTMP1 = %ymm1
-YTMP2 = %ymm2
-YTMP3 = %ymm3
-YTMP4 = %ymm8
-XFER = YTMP0
-
-BYTE_FLIP_MASK = %ymm9
-
-# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
-CTX1 = %rdi
-CTX2 = %r12
-# 2nd arg
-INP = %rsi
-# 3rd arg
-NUM_BLKS = %rdx
-
-c = %rcx
-d = %r8
-e = %rdx
-y3 = %rsi
-
-TBL = %rdi # clobbers CTX1
-
-a = %rax
-b = %rbx
-
-f = %r9
-g = %r10
-h = %r11
-old_h = %r11
-
-T1 = %r12 # clobbers CTX2
-y0 = %r13
-y1 = %r14
-y2 = %r15
-
-# Local variables (stack frame)
-XFER_SIZE = 4*8
-SRND_SIZE = 1*8
-INP_SIZE = 1*8
-INPEND_SIZE = 1*8
-CTX_SIZE = 1*8
-
-frame_XFER = 0
-frame_SRND = frame_XFER + XFER_SIZE
-frame_INP = frame_SRND + SRND_SIZE
-frame_INPEND = frame_INP + INP_SIZE
-frame_CTX = frame_INPEND + INPEND_SIZE
-frame_size = frame_CTX + CTX_SIZE
-
-## assume buffers not aligned
-#define VMOVDQ vmovdqu
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-
-# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
-# Load ymm with mem and byte swap each dword
-.macro COPY_YMM_AND_BSWAP p1 p2 p3
- VMOVDQ \p2, \p1
- vpshufb \p3, \p1, \p1
-.endm
-# rotate_Ys
-# Rotate values of symbols Y0...Y3
-.macro rotate_Ys
- Y_ = Y_0
- Y_0 = Y_1
- Y_1 = Y_2
- Y_2 = Y_3
- Y_3 = Y_
-.endm
-
-# RotateState
-.macro RotateState
- # Rotate symbols a..h right
- old_h = h
- TMP_ = h
- h = g
- g = f
- f = e
- e = d
- d = c
- c = b
- b = a
- a = TMP_
-.endm
-
-# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
-# YDST = {YSRC1, YSRC2} >> RVAL*8
-.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
- vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI}
- vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
-################################### RND N + 0 #########################################
-
- # Extract w[t-7]
- MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
- # Calculate w[t-16] + w[t-7]
- vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
- # Extract w[t-15]
- MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
-
- # Calculate sigma0
-
- # Calculate w[t-15] ror 1
- vpsrlq $1, YTMP1, YTMP2
- vpsllq $(64-1), YTMP1, YTMP3
- vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
- # Calculate w[t-15] shr 7
- vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- add frame_XFER(%rsp),h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
- mov f, y2 # y2 = f # CH
- rorx $34, a, T1 # T1 = a >> 34 # S0B
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- xor g, y2 # y2 = f^g # CH
- rorx $14, e, y1 # y1 = (e >> 14) # S1
-
- and e, y2 # y2 = (f^g)&e # CH
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- add h, d # d = k + w + h + d # --
-
- and b, y3 # y3 = (a|c)&b # MAJA
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
-
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
-
- add y0, y2 # y2 = S1 + CH # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-################################### RND N + 1 #########################################
-
- # Calculate w[t-15] ror 8
- vpsrlq $8, YTMP1, YTMP2
- vpsllq $(64-8), YTMP1, YTMP1
- vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
- # XOR the three components
- vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
- vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0
-
-
- # Add three components, w[t-16], w[t-7] and sigma0
- vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
- # Move to appropriate lanes for calculating w[16] and w[17]
- vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
- # Move to appropriate lanes for calculating w[18] and w[19]
- vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
-
- # Calculate w[16] and w[17] in both 128 bit lanes
-
- # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
- vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
- vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
-
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- mov f, y2 # y2 = f # CH
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
-
- and b, y3 # y3 = (a|c)&b # MAJA
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
-
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-
-################################### RND N + 2 #########################################
-
- vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
- vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
- vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
- vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
- # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
-
- # Add sigma1 to the other compunents to get w[16] and w[17]
- vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
-
- # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
- vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
-
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- or c, y3 # y3 = a|c # MAJA
- mov f, y2 # y2 = f # CH
- xor g, y2 # y2 = f^g # CH
-
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- and e, y2 # y2 = (f^g)&e # CH
-
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-################################### RND N + 3 #########################################
-
- vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
- vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
- vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
- vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
- # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
-
- # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
- # to newly calculated sigma1 to get w[18] and w[19]
- vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
-
- # Form w[19, w[18], w17], w[16]
- vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- mov f, y2 # y2 = f # CH
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- add y0, y2 # y2 = S1 + CH # --
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- rorx $28, a, T1 # T1 = (a >> 28) # S0
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
-
- add y1, h # h = k + w + h + S0 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
- rotate_Ys
-.endm
-
-.macro DO_4ROUNDS
-
-################################### RND N + 0 #########################################
-
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- RotateState
-
-################################### RND N + 1 #########################################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- RotateState
-
-################################### RND N + 2 #########################################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- RotateState
-
-################################### RND N + 3 #########################################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-.endm
-
-########################################################################
-# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks)
-# Purpose: Updates the SHA512 digest stored at "state" with the message
-# stored in "data".
-# The size of the message pointed to by "data" must be an integer multiple
-# of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks
-########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_rorx)
- # Save GPRs
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- # Allocate Stack Space
- push %rbp
- mov %rsp, %rbp
- sub $frame_size, %rsp
- and $~(0x20 - 1), %rsp
-
- shl $7, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- add INP, NUM_BLKS # pointer to end of data
- mov NUM_BLKS, frame_INPEND(%rsp)
-
- ## load initial digest
- mov 8*0(CTX1), a
- mov 8*1(CTX1), b
- mov 8*2(CTX1), c
- mov 8*3(CTX1), d
- mov 8*4(CTX1), e
- mov 8*5(CTX1), f
- mov 8*6(CTX1), g
- mov 8*7(CTX1), h
-
- # save %rdi (CTX) before it gets clobbered
- mov %rdi, frame_CTX(%rsp)
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
-
-.Lloop0:
- lea K512(%rip), TBL
-
- ## byte swap first 16 dwords
- COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK
-
- mov INP, frame_INP(%rsp)
-
- ## schedule 64 input dwords, by doing 12 rounds of 4 each
- movq $4, frame_SRND(%rsp)
-
-.align 16
-.Lloop1:
- vpaddq (TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddq 1*32(TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddq 2*32(TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddq 3*32(TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- add $(4*32), TBL
- FOUR_ROUNDS_AND_SCHED
-
- subq $1, frame_SRND(%rsp)
- jne .Lloop1
-
- movq $2, frame_SRND(%rsp)
-.Lloop2:
- vpaddq (TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- DO_4ROUNDS
- vpaddq 1*32(TBL), Y_1, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- add $(2*32), TBL
- DO_4ROUNDS
-
- vmovdqa Y_2, Y_0
- vmovdqa Y_3, Y_1
-
- subq $1, frame_SRND(%rsp)
- jne .Lloop2
-
- mov frame_CTX(%rsp), CTX2
- addm 8*0(CTX2), a
- addm 8*1(CTX2), b
- addm 8*2(CTX2), c
- addm 8*3(CTX2), d
- addm 8*4(CTX2), e
- addm 8*5(CTX2), f
- addm 8*6(CTX2), g
- addm 8*7(CTX2), h
-
- mov frame_INP(%rsp), INP
- add $128, INP
- cmp frame_INPEND(%rsp), INP
- jne .Lloop0
-
-.Ldone_hash:
-
- # Restore Stack Pointer
- mov %rbp, %rsp
- pop %rbp
-
- # Restore GPRs
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
- vzeroupper
- RET
-SYM_FUNC_END(sha512_transform_rorx)
-
-########################################################################
-### Binary Data
-
-
-# Mergeable 640-byte rodata section. This allows linker to merge the table
-# with other, exactly the same 640-byte fragment of another rodata section
-# (if such section exists).
-.section .rodata.cst640.K512, "aM", @progbits, 640
-.align 64
-# K[t] used in SHA512 hashing
-K512:
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
-
-.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
-.align 32
-# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x08090a0b0c0d0e0f0001020304050607
- .octa 0x18191a1b1c1d1e1f1011121314151617
-
-.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
-.align 32
-MASK_YMM_LO:
- .octa 0x00000000000000000000000000000000
- .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
deleted file mode 100644
index 30a2c4777f9d..000000000000
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ /dev/null
@@ -1,425 +0,0 @@
-########################################################################
-# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# David Cote <david.m.cote@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-512 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-.text
-
-# Virtual Registers
-# ARG1
-digest = %rdi
-# ARG2
-msg = %rsi
-# ARG3
-msglen = %rdx
-T1 = %rcx
-T2 = %r8
-a_64 = %r9
-b_64 = %r10
-c_64 = %r11
-d_64 = %r12
-e_64 = %r13
-f_64 = %r14
-g_64 = %r15
-h_64 = %rbx
-tmp0 = %rax
-
-# Local variables (stack frame)
-
-W_SIZE = 80*8
-WK_SIZE = 2*8
-
-frame_W = 0
-frame_WK = frame_W + W_SIZE
-frame_size = frame_WK + WK_SIZE
-
-# Useful QWORD "arrays" for simpler memory references
-# MSG, DIGEST, K_t, W_t are arrays
-# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
-
-# Input message (arg1)
-#define MSG(i) 8*i(msg)
-
-# Output Digest (arg2)
-#define DIGEST(i) 8*i(digest)
-
-# SHA Constants (static mem)
-#define K_t(i) 8*i+K512(%rip)
-
-# Message Schedule (stack frame)
-#define W_t(i) 8*i+frame_W(%rsp)
-
-# W[t]+K[t] (stack frame)
-#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
-
-.macro RotateState
- # Rotate symbols a..h right
- TMP = h_64
- h_64 = g_64
- g_64 = f_64
- f_64 = e_64
- e_64 = d_64
- d_64 = c_64
- c_64 = b_64
- b_64 = a_64
- a_64 = TMP
-.endm
-
-.macro SHA512_Round rnd
-
- # Compute Round %%t
- mov f_64, T1 # T1 = f
- mov e_64, tmp0 # tmp = e
- xor g_64, T1 # T1 = f ^ g
- ror $23, tmp0 # 41 # tmp = e ror 23
- and e_64, T1 # T1 = (f ^ g) & e
- xor e_64, tmp0 # tmp = (e ror 23) ^ e
- xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
- idx = \rnd
- add WK_2(idx), T1 # W[t] + K[t] from message scheduler
- ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4
- xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
- mov a_64, T2 # T2 = a
- add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
- ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
- add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
- mov a_64, tmp0 # tmp = a
- xor c_64, T2 # T2 = a ^ c
- and c_64, tmp0 # tmp = a & c
- and b_64, T2 # T2 = (a ^ c) & b
- xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
- mov a_64, tmp0 # tmp = a
- ror $5, tmp0 # 39 # tmp = a ror 5
- xor a_64, tmp0 # tmp = (a ror 5) ^ a
- add T1, d_64 # e(next_state) = d + T1
- ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6
- xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
- lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
- ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
- add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
- RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_sse rnd
-
- # Compute rounds t-2 and t-1
- # Compute message schedule QWORDS t and t+1
-
- # Two rounds are computed based on the values for K[t-2]+W[t-2] and
- # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
- # scheduler.
- # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
- # They are then added to their respective SHA512 constants at
- # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
- # For brievity, the comments following vectored instructions only refer to
- # the first of a pair of QWORDS.
- # Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
- # The computation of the message schedule and the rounds are tightly
- # stitched to take advantage of instruction-level parallelism.
- # For clarity, integer instructions (for the rounds calculation) are indented
- # by one tab. Vectored instructions (for the message scheduler) are indented
- # by two tabs.
-
- mov f_64, T1
- idx = \rnd -2
- movdqa W_t(idx), %xmm2 # XMM2 = W[t-2]
- xor g_64, T1
- and e_64, T1
- movdqa %xmm2, %xmm0 # XMM0 = W[t-2]
- xor g_64, T1
- idx = \rnd
- add WK_2(idx), T1
- idx = \rnd - 15
- movdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
- mov e_64, tmp0
- ror $23, tmp0 # 41
- movdqa %xmm5, %xmm3 # XMM3 = W[t-15]
- xor e_64, tmp0
- ror $4, tmp0 # 18
- psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42
- xor e_64, tmp0
- ror $14, tmp0 # 14
- psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1
- add tmp0, T1
- add h_64, T1
- pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2]
- mov a_64, T2
- xor c_64, T2
- pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15]
- and b_64, T2
- mov a_64, tmp0
- psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
- and c_64, tmp0
- xor tmp0, T2
- psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
- mov a_64, tmp0
- ror $5, tmp0 # 39
- pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
- xor a_64, tmp0
- ror $6, tmp0 # 34
- pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
- xor a_64, tmp0
- ror $28, tmp0 # 28
- psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
- add tmp0, T2
- add T1, d_64
- psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
- lea (T1, T2), h_64
- RotateState
- movdqa %xmm2, %xmm1 # XMM1 = W[t-2]
- mov f_64, T1
- xor g_64, T1
- movdqa %xmm5, %xmm4 # XMM4 = W[t-15]
- and e_64, T1
- xor g_64, T1
- psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42
- idx = \rnd + 1
- add WK_2(idx), T1
- mov e_64, tmp0
- psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7
- ror $23, tmp0 # 41
- xor e_64, tmp0
- pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2]
- ror $4, tmp0 # 18
- xor e_64, tmp0
- pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15]
- ror $14, tmp0 # 14
- add tmp0, T1
- psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
- add h_64, T1
- mov a_64, T2
- psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
- xor c_64, T2
- and b_64, T2
- pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2])
- mov a_64, tmp0
- and c_64, tmp0
- idx = \rnd - 7
- movdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
- xor tmp0, T2
- pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15])
- mov a_64, tmp0
- paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15])
- ror $5, tmp0 # 39
- idx =\rnd-16
- paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
- xor a_64, tmp0
- paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
- ror $6, tmp0 # 34
- movdqa %xmm0, W_t(\rnd) # Store scheduled qwords
- xor a_64, tmp0
- paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t]
- ror $28, tmp0 # 28
- idx = \rnd
- movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
- add tmp0, T2
- add T1, d_64
- lea (T1, T2), h_64
- RotateState
-.endm
-
-########################################################################
-## void sha512_transform_ssse3(struct sha512_state *state, const u8 *data,
-## int blocks);
-# (struct sha512_state is assumed to begin with u64 state[8])
-# Purpose: Updates the SHA512 digest stored at "state" with the message
-# stored in "data".
-# The size of the message pointed to by "data" must be an integer multiple
-# of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks.
-########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_ssse3)
-
- test msglen, msglen
- je .Lnowork
-
- # Save GPRs
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- # Allocate Stack Space
- push %rbp
- mov %rsp, %rbp
- sub $frame_size, %rsp
- and $~(0x20 - 1), %rsp
-
-.Lupdateblock:
-
-# Load state variables
- mov DIGEST(0), a_64
- mov DIGEST(1), b_64
- mov DIGEST(2), c_64
- mov DIGEST(3), d_64
- mov DIGEST(4), e_64
- mov DIGEST(5), f_64
- mov DIGEST(6), g_64
- mov DIGEST(7), h_64
-
- t = 0
- .rept 80/2 + 1
- # (80 rounds) / (2 rounds/iteration) + (1 iteration)
- # +1 iteration because the scheduler leads hashing by 1 iteration
- .if t < 2
- # BSWAP 2 QWORDS
- movdqa XMM_QWORD_BSWAP(%rip), %xmm1
- movdqu MSG(t), %xmm0
- pshufb %xmm1, %xmm0 # BSWAP
- movdqa %xmm0, W_t(t) # Store Scheduled Pair
- paddq K_t(t), %xmm0 # Compute W[t]+K[t]
- movdqa %xmm0, WK_2(t) # Store into WK for rounds
- .elseif t < 16
- # BSWAP 2 QWORDS# Compute 2 Rounds
- movdqu MSG(t), %xmm0
- pshufb %xmm1, %xmm0 # BSWAP
- SHA512_Round t-2 # Round t-2
- movdqa %xmm0, W_t(t) # Store Scheduled Pair
- paddq K_t(t), %xmm0 # Compute W[t]+K[t]
- SHA512_Round t-1 # Round t-1
- movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK
- .elseif t < 79
- # Schedule 2 QWORDS# Compute 2 Rounds
- SHA512_2Sched_2Round_sse t
- .else
- # Compute 2 Rounds
- SHA512_Round t-2
- SHA512_Round t-1
- .endif
- t = t+2
- .endr
-
- # Update digest
- add a_64, DIGEST(0)
- add b_64, DIGEST(1)
- add c_64, DIGEST(2)
- add d_64, DIGEST(3)
- add e_64, DIGEST(4)
- add f_64, DIGEST(5)
- add g_64, DIGEST(6)
- add h_64, DIGEST(7)
-
- # Advance to next message block
- add $16*8, msg
- dec msglen
- jnz .Lupdateblock
-
- # Restore Stack Pointer
- mov %rbp, %rsp
- pop %rbp
-
- # Restore GPRs
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
-.Lnowork:
- RET
-SYM_FUNC_END(sha512_transform_ssse3)
-
-########################################################################
-### Binary Data
-
-.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
-.align 16
-# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
-XMM_QWORD_BSWAP:
- .octa 0x08090a0b0c0d0e0f0001020304050607
-
-# Mergeable 640-byte rodata section. This allows linker to merge the table
-# with other, exactly the same 640-byte fragment of another rodata section
-# (if such section exists).
-.section .rodata.cst640.K512, "aM", @progbits, 640
-.align 64
-# K[t] used in SHA512 hashing
-K512:
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
deleted file mode 100644
index 067684c54395..000000000000
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ /dev/null
@@ -1,322 +0,0 @@
-/*
- * Cryptographic API.
- *
- * Glue code for the SHA512 Secure Hash Algorithm assembler
- * implementation using supplemental SSE3 / AVX / AVX2 instructions.
- *
- * This file is based on sha512_generic.c
- *
- * Copyright (C) 2013 Intel Corporation
- * Author: Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-
-asmlinkage void sha512_transform_ssse3(struct sha512_state *state,
- const u8 *data, int blocks);
-
-static int sha512_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, sha512_block_fn *sha512_xform)
-{
- int remain;
-
- /*
- * Make sure struct sha512_state begins directly with the SHA512
- * 512-bit internal state, as this is what the asm functions expect.
- */
- BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
-
- kernel_fpu_begin();
- remain = sha512_base_do_update_blocks(desc, data, len, sha512_xform);
- kernel_fpu_end();
-
- return remain;
-}
-
-static int sha512_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out, sha512_block_fn *sha512_xform)
-{
- kernel_fpu_begin();
- sha512_base_do_finup(desc, data, len, sha512_xform);
- kernel_fpu_end();
-
- return sha512_base_finish(desc, out);
-}
-
-static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_update(desc, data, len, sha512_transform_ssse3);
-}
-
-static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha512_finup(desc, data, len, out, sha512_transform_ssse3);
-}
-
-static struct shash_alg sha512_ssse3_algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_ssse3_update,
- .finup = sha512_ssse3_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-ssse3",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_ssse3_update,
- .finup = sha512_ssse3_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-ssse3",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int register_sha512_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- return crypto_register_shashes(sha512_ssse3_algs,
- ARRAY_SIZE(sha512_ssse3_algs));
- return 0;
-}
-
-static void unregister_sha512_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_shashes(sha512_ssse3_algs,
- ARRAY_SIZE(sha512_ssse3_algs));
-}
-
-asmlinkage void sha512_transform_avx(struct sha512_state *state,
- const u8 *data, int blocks);
-static bool avx_usable(void)
-{
- if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
- if (boot_cpu_has(X86_FEATURE_AVX))
- pr_info("AVX detected but unusable.\n");
- return false;
- }
-
- return true;
-}
-
-static int sha512_avx_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_update(desc, data, len, sha512_transform_avx);
-}
-
-static int sha512_avx_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha512_finup(desc, data, len, out, sha512_transform_avx);
-}
-
-static struct shash_alg sha512_avx_algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_avx_update,
- .finup = sha512_avx_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-avx",
- .cra_priority = 160,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_avx_update,
- .finup = sha512_avx_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-avx",
- .cra_priority = 160,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int register_sha512_avx(void)
-{
- if (avx_usable())
- return crypto_register_shashes(sha512_avx_algs,
- ARRAY_SIZE(sha512_avx_algs));
- return 0;
-}
-
-static void unregister_sha512_avx(void)
-{
- if (avx_usable())
- crypto_unregister_shashes(sha512_avx_algs,
- ARRAY_SIZE(sha512_avx_algs));
-}
-
-asmlinkage void sha512_transform_rorx(struct sha512_state *state,
- const u8 *data, int blocks);
-
-static int sha512_avx2_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_update(desc, data, len, sha512_transform_rorx);
-}
-
-static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha512_finup(desc, data, len, out, sha512_transform_rorx);
-}
-
-static struct shash_alg sha512_avx2_algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_avx2_update,
- .finup = sha512_avx2_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-avx2",
- .cra_priority = 170,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_avx2_update,
- .finup = sha512_avx2_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-avx2",
- .cra_priority = 170,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static bool avx2_usable(void)
-{
- if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_BMI2))
- return true;
-
- return false;
-}
-
-static int register_sha512_avx2(void)
-{
- if (avx2_usable())
- return crypto_register_shashes(sha512_avx2_algs,
- ARRAY_SIZE(sha512_avx2_algs));
- return 0;
-}
-static const struct x86_cpu_id module_cpu_ids[] = {
- X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
-
-static void unregister_sha512_avx2(void)
-{
- if (avx2_usable())
- crypto_unregister_shashes(sha512_avx2_algs,
- ARRAY_SIZE(sha512_avx2_algs));
-}
-
-static int __init sha512_ssse3_mod_init(void)
-{
- if (!x86_match_cpu(module_cpu_ids))
- return -ENODEV;
-
- if (register_sha512_ssse3())
- goto fail;
-
- if (register_sha512_avx()) {
- unregister_sha512_ssse3();
- goto fail;
- }
-
- if (register_sha512_avx2()) {
- unregister_sha512_avx();
- unregister_sha512_ssse3();
- goto fail;
- }
-
- return 0;
-fail:
- return -ENODEV;
-}
-
-static void __exit sha512_ssse3_mod_fini(void)
-{
- unregister_sha512_avx2();
- unregister_sha512_avx();
- unregister_sha512_ssse3();
-}
-
-module_init(sha512_ssse3_mod_init);
-module_exit(sha512_ssse3_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-
-MODULE_ALIAS_CRYPTO("sha512");
-MODULE_ALIAS_CRYPTO("sha512-ssse3");
-MODULE_ALIAS_CRYPTO("sha512-avx");
-MODULE_ALIAS_CRYPTO("sha512-avx2");
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha384-ssse3");
-MODULE_ALIAS_CRYPTO("sha384-avx");
-MODULE_ALIAS_CRYPTO("sha384-avx2");
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index d83236b96f22..94519688b007 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -369,7 +369,7 @@ For 32-bit we have the following conventions - kernel is built with
.endm
.macro STACKLEAK_ERASE_NOCLOBBER
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
PUSH_AND_CLEAR_REGS
call stackleak_erase
POP_REGS
@@ -388,7 +388,7 @@ For 32-bit we have the following conventions - kernel is built with
#endif /* !CONFIG_X86_64 */
.macro STACKLEAK_ERASE
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
call stackleak_erase
#endif
.endm
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ac007ea00979..4877e16da69a 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -473,3 +473,5 @@
465 i386 listxattrat sys_listxattrat
466 i386 removexattrat sys_removexattrat
467 i386 open_tree_attr sys_open_tree_attr
+468 i386 file_getattr sys_file_getattr
+469 i386 file_setattr sys_file_setattr
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index cfb5ca41e30d..92cf0fe2291e 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -391,6 +391,8 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 54d3e9774d62..f247f5f5cb44 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -62,7 +62,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),)
endif
endif
-$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
$(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO
#
@@ -123,6 +123,7 @@ KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(KSTACK_ERASE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32))
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 3d1d3547095a..afdbda2dd7b7 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -34,6 +34,7 @@
#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>
+#include <linux/export.h>
void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 31f0d29cbc5e..090f5ac9f492 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -10,6 +10,7 @@
#include <linux/pci.h>
#include <linux/irq.h>
+#include <linux/export.h>
#include <asm/mshyperv.h>
static int hv_map_interrupt(union hv_device_id device_id, bool level,
@@ -46,7 +47,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
if (nr_bank < 0) {
local_irq_restore(flags);
pr_err("%s: unable to generate VP set\n", __func__);
- return EINVAL;
+ return -EINVAL;
}
intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
@@ -66,7 +67,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
if (!hv_result_success(status))
hv_status_err(status, "\n");
- return hv_result(status);
+ return hv_result_to_errno(status);
}
static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
@@ -88,7 +89,10 @@ static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
local_irq_restore(flags);
- return hv_result(status);
+ if (!hv_result_success(status))
+ hv_status_err(status, "\n");
+
+ return hv_result_to_errno(status);
}
#ifdef CONFIG_PCI_MSI
@@ -169,13 +173,34 @@ static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev)
return dev_id;
}
-static int hv_map_msi_interrupt(struct pci_dev *dev, int cpu, int vector,
- struct hv_interrupt_entry *entry)
+/**
+ * hv_map_msi_interrupt() - "Map" the MSI IRQ in the hypervisor.
+ * @data: Describes the IRQ
+ * @out_entry: Hypervisor (MSI) interrupt entry (can be NULL)
+ *
+ * Map the IRQ in the hypervisor by issuing a MAP_DEVICE_INTERRUPT hypercall.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int hv_map_msi_interrupt(struct irq_data *data,
+ struct hv_interrupt_entry *out_entry)
{
- union hv_device_id device_id = hv_build_pci_dev_id(dev);
+ struct irq_cfg *cfg = irqd_cfg(data);
+ struct hv_interrupt_entry dummy;
+ union hv_device_id device_id;
+ struct msi_desc *msidesc;
+ struct pci_dev *dev;
+ int cpu;
- return hv_map_interrupt(device_id, false, cpu, vector, entry);
+ msidesc = irq_data_get_msi_desc(data);
+ dev = msi_desc_to_pci_dev(msidesc);
+ device_id = hv_build_pci_dev_id(dev);
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
+
+ return hv_map_interrupt(device_id, false, cpu, cfg->vector,
+ out_entry ? out_entry : &dummy);
}
+EXPORT_SYMBOL_GPL(hv_map_msi_interrupt);
static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg)
{
@@ -188,13 +213,11 @@ static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi
static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry);
static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
+ struct hv_interrupt_entry *stored_entry;
+ struct irq_cfg *cfg = irqd_cfg(data);
struct msi_desc *msidesc;
struct pci_dev *dev;
- struct hv_interrupt_entry out_entry, *stored_entry;
- struct irq_cfg *cfg = irqd_cfg(data);
- const cpumask_t *affinity;
- int cpu;
- u64 status;
+ int ret;
msidesc = irq_data_get_msi_desc(data);
dev = msi_desc_to_pci_dev(msidesc);
@@ -204,9 +227,6 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return;
}
- affinity = irq_data_get_effective_affinity_mask(data);
- cpu = cpumask_first_and(affinity, cpu_online_mask);
-
if (data->chip_data) {
/*
* This interrupt is already mapped. Let's unmap first.
@@ -219,14 +239,12 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
stored_entry = data->chip_data;
data->chip_data = NULL;
- status = hv_unmap_msi_interrupt(dev, stored_entry);
+ ret = hv_unmap_msi_interrupt(dev, stored_entry);
kfree(stored_entry);
- if (status != HV_STATUS_SUCCESS) {
- hv_status_debug(status, "failed to unmap\n");
+ if (ret)
return;
- }
}
stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC);
@@ -235,15 +253,14 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return;
}
- status = hv_map_msi_interrupt(dev, cpu, cfg->vector, &out_entry);
- if (status != HV_STATUS_SUCCESS) {
+ ret = hv_map_msi_interrupt(data, stored_entry);
+ if (ret) {
kfree(stored_entry);
return;
}
- *stored_entry = out_entry;
data->chip_data = stored_entry;
- entry_to_msi_msg(&out_entry, msg);
+ entry_to_msi_msg(data->chip_data, msg);
return;
}
@@ -257,7 +274,6 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
{
struct hv_interrupt_entry old_entry;
struct msi_msg msg;
- u64 status;
if (!irqd->chip_data) {
pr_debug("%s: no chip data\n!", __func__);
@@ -270,10 +286,7 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
kfree(irqd->chip_data);
irqd->chip_data = NULL;
- status = hv_unmap_msi_interrupt(dev, &old_entry);
-
- if (status != HV_STATUS_SUCCESS)
- hv_status_err(status, "\n");
+ (void)hv_unmap_msi_interrupt(dev, &old_entry);
}
static void hv_msi_free_irq(struct irq_domain *domain,
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index e93a2f488ff7..ade6c665c97e 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -10,6 +10,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/cpu.h>
+#include <linux/export.h>
#include <asm/svm.h>
#include <asm/sev.h>
#include <asm/io.h>
diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c
index 1083dc8646f9..8ccbb7c4fc27 100644
--- a/arch/x86/hyperv/nested.c
+++ b/arch/x86/hyperv/nested.c
@@ -11,6 +11,7 @@
#include <linux/types.h>
+#include <linux/export.h>
#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
#include <asm/tlbflush.h>
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 5ab1a4598d00..a03aa6f999d1 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -158,13 +158,13 @@ static inline bool acpi_has_cpu_in_madt(void)
}
#define ACPI_HAVE_ARCH_SET_ROOT_POINTER
-static inline void acpi_arch_set_root_pointer(u64 addr)
+static __always_inline void acpi_arch_set_root_pointer(u64 addr)
{
x86_init.acpi.set_root_pointer(addr);
}
#define ACPI_HAVE_ARCH_GET_ROOT_POINTER
-static inline u64 acpi_arch_get_root_pointer(void)
+static __always_inline u64 acpi_arch_get_root_pointer(void)
{
return x86_init.acpi.get_root_pointer();
}
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 8b1b1abcef15..6bfdaeddbae8 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -5,7 +5,7 @@
#if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000
#define __head __section(".head.text") __no_sanitize_undefined __no_stack_protector
#else
-#define __head __section(".head.text") __no_sanitize_undefined
+#define __head __section(".head.text") __no_sanitize_undefined __no_sanitize_coverage
#endif
struct x86_mapping_info {
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index e1752ba47e67..abc4659f5809 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -112,12 +112,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
return hv_status;
}
-/* Hypercall to the L0 hypervisor */
-static inline u64 hv_do_nested_hypercall(u64 control, void *input, void *output)
-{
- return hv_do_hypercall(control | HV_HYPERCALL_NESTED, input, output);
-}
-
/* Fast hypercall with 8 bytes of input and no output */
static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1)
{
@@ -165,13 +159,6 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
return _hv_do_fast_hypercall8(control, input1);
}
-static inline u64 hv_do_fast_nested_hypercall8(u16 code, u64 input1)
-{
- u64 control = (u64)code | HV_HYPERCALL_FAST_BIT | HV_HYPERCALL_NESTED;
-
- return _hv_do_fast_hypercall8(control, input1);
-}
-
/* Fast hypercall with 16 bytes of input */
static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2)
{
@@ -223,13 +210,6 @@ static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
return _hv_do_fast_hypercall16(control, input1, input2);
}
-static inline u64 hv_do_fast_nested_hypercall16(u16 code, u64 input1, u64 input2)
-{
- u64 control = (u64)code | HV_HYPERCALL_FAST_BIT | HV_HYPERCALL_NESTED;
-
- return _hv_do_fast_hypercall16(control, input1, input2);
-}
-
extern struct hv_vp_assist_page **hv_vp_assist_page;
static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
@@ -262,6 +242,8 @@ static inline void hv_apic_init(void) {}
struct irq_domain *hv_create_pci_msi_domain(void);
+int hv_map_msi_interrupt(struct irq_data *data,
+ struct hv_interrupt_entry *out_entry);
int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
struct hv_interrupt_entry *entry);
int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index f607081a022a..e406a1e92c63 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -78,7 +78,7 @@ extern unsigned char secondary_startup_64[];
extern unsigned char secondary_startup_64_no_verify[];
#endif
-static inline size_t real_mode_size_needed(void)
+static __always_inline size_t real_mode_size_needed(void)
{
if (real_mode_header)
return 0; /* already allocated. */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 921c1c783bc1..8ae750cde0c6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -420,7 +420,7 @@ static u64 kvm_steal_clock(int cpu)
return steal;
}
-static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
+static inline __init void __set_percpu_decrypted(void *ptr, unsigned long size)
{
early_set_memory_decrypted((unsigned long) ptr, size);
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 095f04bdabdc..3dcadc13f09a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1236,7 +1236,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
static struct user_regset x86_64_regsets[] __ro_after_init = {
[REGSET64_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(struct user_regs_struct) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
@@ -1244,7 +1244,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
.set = genregs_set
},
[REGSET64_FP] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct fxregs_state) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
@@ -1253,7 +1253,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
.set = xfpregs_set
},
[REGSET64_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
+ USER_REGSET_NOTE_TYPE(X86_XSTATE),
.size = sizeof(u64),
.align = sizeof(u64),
.active = xstateregs_active,
@@ -1261,7 +1261,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
.set = xstateregs_set
},
[REGSET64_IOPERM] = {
- .core_note_type = NT_386_IOPERM,
+ USER_REGSET_NOTE_TYPE(386_IOPERM),
.n = IO_BITMAP_LONGS,
.size = sizeof(long),
.align = sizeof(long),
@@ -1270,7 +1270,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
},
#ifdef CONFIG_X86_USER_SHADOW_STACK
[REGSET64_SSP] = {
- .core_note_type = NT_X86_SHSTK,
+ USER_REGSET_NOTE_TYPE(X86_SHSTK),
.n = 1,
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1297,7 +1297,7 @@ static const struct user_regset_view user_x86_64_view = {
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
static struct user_regset x86_32_regsets[] __ro_after_init = {
[REGSET32_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(struct user_regs_struct32) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1305,7 +1305,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = genregs32_set
},
[REGSET32_FP] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1314,7 +1314,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = fpregs_set
},
[REGSET32_XFP] = {
- .core_note_type = NT_PRXFPREG,
+ USER_REGSET_NOTE_TYPE(PRXFPREG),
.n = sizeof(struct fxregs_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1323,7 +1323,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = xfpregs_set
},
[REGSET32_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
+ USER_REGSET_NOTE_TYPE(X86_XSTATE),
.size = sizeof(u64),
.align = sizeof(u64),
.active = xstateregs_active,
@@ -1331,7 +1331,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = xstateregs_set
},
[REGSET32_TLS] = {
- .core_note_type = NT_386_TLS,
+ USER_REGSET_NOTE_TYPE(386_TLS),
.n = GDT_ENTRY_TLS_ENTRIES,
.bias = GDT_ENTRY_TLS_MIN,
.size = sizeof(struct user_desc),
@@ -1341,7 +1341,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = regset_tls_set
},
[REGSET32_IOPERM] = {
- .core_note_type = NT_386_IOPERM,
+ USER_REGSET_NOTE_TYPE(386_IOPERM),
.n = IO_BITMAP_BYTES / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
index 8ae0f93ecbfd..ec2131c9fd20 100644
--- a/arch/x86/lib/.gitignore
+++ b/arch/x86/lib/.gitignore
@@ -1,2 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
+
inat-tables.c
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 4fa5c4e1ba8a..2dba7f83ef97 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -3,8 +3,6 @@
# Makefile for x86 specific library files.
#
-obj-y += crypto/
-
# Produces uninteresting flaky coverage.
KCOV_INSTRUMENT_delay.o := n
@@ -40,16 +38,6 @@ lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o
-obj-$(CONFIG_CRC32_ARCH) += crc32-x86.o
-crc32-x86-y := crc32.o crc32-pclmul.o
-crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o
-
-obj-$(CONFIG_CRC64_ARCH) += crc64-x86.o
-crc64-x86-y := crc64.o crc64-pclmul.o
-
-obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o
-crc-t10dif-x86-y := crc-t10dif.o crc16-msb-pclmul.o
-
obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
obj-y += iomem.o
diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h
deleted file mode 100644
index fcc63c064333..000000000000
--- a/arch/x86/lib/crc-pclmul-consts.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * CRC constants generated by:
- *
- * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
- *
- * Do not edit manually.
- */
-
-/*
- * CRC folding constants generated for most-significant-bit-first CRC-16 using
- * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
- */
-static const struct {
- u8 bswap_mask[16];
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = {
- .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
- .fold_across_2048_bits_consts = {
- 0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */
- 0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */
- },
- .fold_across_1024_bits_consts = {
- 0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */
- 0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */
- },
- .fold_across_512_bits_consts = {
- 0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */
- 0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */
- },
- .fold_across_256_bits_consts = {
- 0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */
- 0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */
- },
- .fold_across_128_bits_consts = {
- 0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */
- 0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */
- 0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */
- },
-};
-
-/*
- * CRC folding constants generated for least-significant-bit-first CRC-32 using
- * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
- * x^5 + x^4 + x^2 + x^1 + x^0
- */
-static const struct {
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = {
- .fold_across_2048_bits_consts = {
- 0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */
- 0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */
- },
- .fold_across_1024_bits_consts = {
- 0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */
- 0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */
- },
- .fold_across_512_bits_consts = {
- 0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */
- 0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */
- },
- .fold_across_256_bits_consts = {
- 0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */
- 0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */
- },
- .fold_across_128_bits_consts = {
- 0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */
- 0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */
- 0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */
- },
-};
-
-/*
- * CRC folding constants generated for most-significant-bit-first CRC-64 using
- * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
- * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
- * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
- * x^7 + x^4 + x^1 + x^0
- */
-static const struct {
- u8 bswap_mask[16];
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc64_msb_0x42f0e1eba9ea3693_consts ____cacheline_aligned __maybe_unused = {
- .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
- .fold_across_2048_bits_consts = {
- 0x7f52691a60ddc70d, /* LO64_TERMS: (x^2048 mod G) * x^0 */
- 0x7036b0389f6a0c82, /* HI64_TERMS: (x^2112 mod G) * x^0 */
- },
- .fold_across_1024_bits_consts = {
- 0x05cf79dea9ac37d6, /* LO64_TERMS: (x^1024 mod G) * x^0 */
- 0x001067e571d7d5c2, /* HI64_TERMS: (x^1088 mod G) * x^0 */
- },
- .fold_across_512_bits_consts = {
- 0x5f6843ca540df020, /* LO64_TERMS: (x^512 mod G) * x^0 */
- 0xddf4b6981205b83f, /* HI64_TERMS: (x^576 mod G) * x^0 */
- },
- .fold_across_256_bits_consts = {
- 0x571bee0a227ef92b, /* LO64_TERMS: (x^256 mod G) * x^0 */
- 0x44bef2a201b5200c, /* HI64_TERMS: (x^320 mod G) * x^0 */
- },
- .fold_across_128_bits_consts = {
- 0x05f5c3c7eb52fab6, /* LO64_TERMS: (x^128 mod G) * x^0 */
- 0x4eb938a7d257740e, /* HI64_TERMS: (x^192 mod G) * x^0 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0x42f0e1eba9ea3693, /* LO64_TERMS: (G - x^64) * x^0 */
- 0x578d29d06cc4f872, /* HI64_TERMS: (floor(x^127 / G) * x) - x^64 */
- },
-};
-
-/*
- * CRC folding constants generated for least-significant-bit-first CRC-64 using
- * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 +
- * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 +
- * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 +
- * x^4 + x^3 + x^0
- */
-static const struct {
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc64_lsb_0x9a6c9329ac4bc9b5_consts ____cacheline_aligned __maybe_unused = {
- .fold_across_2048_bits_consts = {
- 0x37ccd3e14069cabc, /* HI64_TERMS: (x^2111 mod G) * x^0 */
- 0xa043808c0f782663, /* LO64_TERMS: (x^2047 mod G) * x^0 */
- },
- .fold_across_1024_bits_consts = {
- 0xa1ca681e733f9c40, /* HI64_TERMS: (x^1087 mod G) * x^0 */
- 0x5f852fb61e8d92dc, /* LO64_TERMS: (x^1023 mod G) * x^0 */
- },
- .fold_across_512_bits_consts = {
- 0x0c32cdb31e18a84a, /* HI64_TERMS: (x^575 mod G) * x^0 */
- 0x62242240ace5045a, /* LO64_TERMS: (x^511 mod G) * x^0 */
- },
- .fold_across_256_bits_consts = {
- 0xb0bc2e589204f500, /* HI64_TERMS: (x^319 mod G) * x^0 */
- 0xe1e0bb9d45d7a44c, /* LO64_TERMS: (x^255 mod G) * x^0 */
- },
- .fold_across_128_bits_consts = {
- 0xeadc41fd2ba3d420, /* HI64_TERMS: (x^191 mod G) * x^0 */
- 0x21e9761e252621ac, /* LO64_TERMS: (x^127 mod G) * x^0 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0x27ecfa329aef9f77, /* HI64_TERMS: floor(x^127 / G) */
- 0x34d926535897936a, /* LO64_TERMS: (G - x^64 - x^0) / x */
- },
-};
diff --git a/arch/x86/lib/crc-pclmul-template.S b/arch/x86/lib/crc-pclmul-template.S
deleted file mode 100644
index ae0b6144c503..000000000000
--- a/arch/x86/lib/crc-pclmul-template.S
+++ /dev/null
@@ -1,582 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-//
-// Template to generate [V]PCLMULQDQ-based CRC functions for x86
-//
-// Copyright 2025 Google LLC
-//
-// Author: Eric Biggers <ebiggers@google.com>
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-// Offsets within the generated constants table
-.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only
-.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0
-.set OFFSETOF_SHUF_TABLE, 1*16
-.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16
-
-// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
-// corresponding non-VEX instruction plus any needed moves. The supported
-// instruction formats are:
-//
-// - Two-arg [src, dst], where the non-VEX format is the same.
-// - Three-arg [src1, src2, dst] where the non-VEX format is
-// [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too.
-//
-// \insn gives the instruction without a "v" prefix and including any immediate
-// argument if needed to make the instruction follow one of the above formats.
-// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
-// it first; this is needed when \arg1 is an unaligned mem operand.
-.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
-.if AVX_LEVEL == 0
- // VEX not allowed. Emulate it.
- .ifnb \arg3 // Three-arg [src1, src2, dst]
- .ifc "\arg2", "\arg3" // src2 == dst?
- .ifnb \unaligned_mem_tmp
- movdqu \arg1, \unaligned_mem_tmp
- \insn \unaligned_mem_tmp, \arg3
- .else
- \insn \arg1, \arg3
- .endif
- .else // src2 != dst
- .ifc "\arg1", "\arg3"
- .error "Can't have src1 == dst when src2 != dst"
- .endif
- .ifnb \unaligned_mem_tmp
- movdqu \arg1, \unaligned_mem_tmp
- movdqa \arg2, \arg3
- \insn \unaligned_mem_tmp, \arg3
- .else
- movdqa \arg2, \arg3
- \insn \arg1, \arg3
- .endif
- .endif
- .else // Two-arg [src, dst]
- .ifnb \unaligned_mem_tmp
- movdqu \arg1, \unaligned_mem_tmp
- \insn \unaligned_mem_tmp, \arg2
- .else
- \insn \arg1, \arg2
- .endif
- .endif
-.else
- // VEX is allowed. Emit the desired instruction directly.
- .ifnb \arg3
- v\insn \arg1, \arg2, \arg3
- .else
- v\insn \arg1, \arg2
- .endif
-.endif
-.endm
-
-// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
-// register of length VL.
-.macro _vbroadcast src, dst
-.if VL == 16
- _cond_vex movdqa, \src, \dst
-.elseif VL == 32
- vbroadcasti128 \src, \dst
-.else
- vbroadcasti32x4 \src, \dst
-.endif
-.endm
-
-// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
-// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
-.macro _load_data vl, src, bswap_mask, dst
-.if \vl < 64
- _cond_vex movdqu, "\src", \dst
-.else
- vmovdqu8 \src, \dst
-.endif
-.if !LSB_CRC
- _cond_vex pshufb, \bswap_mask, \dst, \dst
-.endif
-.endm
-
-.macro _prepare_v0 vl, v0, v1, bswap_mask
-.if LSB_CRC
- .if \vl < 64
- _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1
- .else
- vpxorq (BUF), \v0, \v0
- .endif
-.else
- _load_data \vl, (BUF), \bswap_mask, \v1
- .if \vl < 64
- _cond_vex pxor, \v1, \v0, \v0
- .else
- vpxorq \v1, \v0, \v0
- .endif
-.endif
-.endm
-
-// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
-// msb-first order or the physically high qword for lsb-first order
-#define LO64_TERMS 0
-
-// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
-// qword for msb-first order or the physically low qword for lsb-first order
-#define HI64_TERMS 1
-
-// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
-// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
-.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst
- _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
- \src1, \src2, \dst
-.endm
-
-// Fold \acc into \data and store the result back into \acc. \data can be an
-// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
-// byte-reflection is needed; otherwise it must be a vector register. \consts
-// is a vector register containing the needed fold constants, and \tmp is a
-// temporary vector register. All arguments must be the same length.
-.macro _fold_vec acc, data, consts, tmp
- _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
- _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc
-.if AVX_LEVEL <= 2
- _cond_vex pxor, \data, \tmp, \tmp
- _cond_vex pxor, \tmp, \acc, \acc
-.else
- vpternlogq $0x96, \data, \tmp, \acc
-.endif
-.endm
-
-// Fold \acc into \data and store the result back into \acc. \data is an
-// unaligned mem operand, \consts is a vector register containing the needed
-// fold constants, \bswap_mask is a vector register containing the
-// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
-// temporary vector registers. All arguments must have length \vl.
-.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2
-.if AVX_LEVEL == 0 || !LSB_CRC
- _load_data \vl, \data, \bswap_mask, \tmp1
- _fold_vec \acc, \tmp1, \consts, \tmp2
-.else
- _fold_vec \acc, \data, \consts, \tmp1
-.endif
-.endm
-
-// Load the constants for folding across 2**i vectors of length VL at a time
-// into all 128-bit lanes of the vector register CONSTS.
-.macro _load_vec_folding_consts i
- _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
- CONSTS
-.endm
-
-// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
-// the result back into \v0. If the remaining length mod \vl is nonzero, also
-// fold \vl data bytes from BUF. For both operations the fold distance is \vl.
-// \consts must be a register of length \vl containing the fold constants.
-.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2
- _fold_vec \v0, \v1, \consts, \tmp1
- test $\vl, LEN8
- jz .Lfold_vec_final_done\@
- _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
- add $\vl, BUF
-.Lfold_vec_final_done\@:
-.endm
-
-// This macro generates the body of a CRC function with the following prototype:
-//
-// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
-//
-// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
-// |buf| is the data to checksum. |len| is the data length in bytes, which must
-// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts
-// field of the constants struct that was generated for the chosen CRC variant.
-//
-// Moving onto the macro parameters, \n is the number of bits in the CRC, e.g.
-// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
-// the file is compiled in i386 mode, then the maximum supported value is 32.
-//
-// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
-// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0
-// if the CRC processes the most significant bit of each byte first, i.e. maps
-// bit0 to x^0, bit1 to x^1, bit7 to x^7.
-//
-// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
-//
-// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
-// 512 for AVX512.
-//
-// If \vl == 16 && \avx_level == 0, the generated code requires:
-// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
-//
-// If \vl == 32 && \avx_level == 2, the generated code requires:
-// VPCLMULQDQ && AVX2.
-//
-// If \vl == 64 && \avx_level == 512, the generated code requires:
-// VPCLMULQDQ && AVX512BW && AVX512VL.
-//
-// Other \vl and \avx_level combinations are either not supported or not useful.
-.macro _crc_pclmul n, lsb_crc, vl, avx_level
- .set LSB_CRC, \lsb_crc
- .set VL, \vl
- .set AVX_LEVEL, \avx_level
-
- // Define aliases for the xmm, ymm, or zmm registers according to VL.
-.irp i, 0,1,2,3,4,5,6,7
- .if VL == 16
- .set V\i, %xmm\i
- .set LOG2_VL, 4
- .elseif VL == 32
- .set V\i, %ymm\i
- .set LOG2_VL, 5
- .elseif VL == 64
- .set V\i, %zmm\i
- .set LOG2_VL, 6
- .else
- .error "Unsupported vector length"
- .endif
-.endr
- // Define aliases for the function parameters.
- // Note: when crc_t is shorter than u32, zero-extension to 32 bits is
- // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
- // when crc_t is shorter than u64.
-#ifdef __x86_64__
-.if \n <= 32
- .set CRC, %edi
-.else
- .set CRC, %rdi
-.endif
- .set BUF, %rsi
- .set LEN, %rdx
- .set LEN32, %edx
- .set LEN8, %dl
- .set CONSTS_PTR, %rcx
-#else
- // 32-bit support, assuming -mregparm=3 and not including support for
- // CRC-64 (which would use both eax and edx to pass the crc parameter).
- .set CRC, %eax
- .set BUF, %edx
- .set LEN, %ecx
- .set LEN32, %ecx
- .set LEN8, %cl
- .set CONSTS_PTR, %ebx // Passed on stack
-#endif
-
- // Define aliases for some local variables. V0-V5 are used without
- // aliases (for accumulators, data, temporary values, etc). Staying
- // within the first 8 vector registers keeps the code 32-bit SSE
- // compatible and reduces the size of 64-bit SSE code slightly.
- .set BSWAP_MASK, V6
- .set BSWAP_MASK_YMM, %ymm6
- .set BSWAP_MASK_XMM, %xmm6
- .set CONSTS, V7
- .set CONSTS_YMM, %ymm7
- .set CONSTS_XMM, %xmm7
-
- // Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
- // functions generated by this macro are called only by static_call.
- ANNOTATE_NOENDBR
-
-#ifdef __i386__
- push CONSTS_PTR
- mov 8(%esp), CONSTS_PTR
-#endif
-
- // Create a 128-bit vector that contains the initial CRC in the end
- // representing the high-order polynomial coefficients, and the rest 0.
- // If the CRC is msb-first, also load the byte-reflection table.
-.if \n <= 32
- _cond_vex movd, CRC, %xmm0
-.else
- _cond_vex movq, CRC, %xmm0
-.endif
-.if !LSB_CRC
- _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
- _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
-.endif
-
- // Load the first vector of data and XOR the initial CRC into the
- // appropriate end of the first 128-bit lane of data. If LEN < VL, then
- // use a short vector and jump ahead to the final reduction. (LEN >= 16
- // is guaranteed here but not necessarily LEN >= VL.)
-.if VL >= 32
- cmp $VL, LEN
- jae .Lat_least_1vec\@
- .if VL == 64
- cmp $32, LEN32
- jb .Lless_than_32bytes\@
- _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM
- add $32, BUF
- jmp .Lreduce_256bits_to_128bits\@
-.Lless_than_32bytes\@:
- .endif
- _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM
- add $16, BUF
- vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
- jmp .Lcheck_for_partial_block\@
-.Lat_least_1vec\@:
-.endif
- _prepare_v0 VL, V0, V1, BSWAP_MASK
-
- // Handle VL <= LEN < 4*VL.
- cmp $4*VL-1, LEN
- ja .Lat_least_4vecs\@
- add $VL, BUF
- // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
- // If VL==16 then load fold_across_128_bits_consts first, as the final
- // reduction depends on it and it won't be loaded anywhere else.
- cmp $2*VL-1, LEN32
-.if VL == 16
- _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
-.endif
- jbe .Lreduce_1vec_to_128bits\@
- // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to
- // the reduction from 2 vectors.
- _load_data VL, (BUF), BSWAP_MASK, V1
- add $VL, BUF
- jmp .Lreduce_2vecs_to_1\@
-
-.Lat_least_4vecs\@:
- // Load 3 more vectors of data.
- _load_data VL, 1*VL(BUF), BSWAP_MASK, V1
- _load_data VL, 2*VL(BUF), BSWAP_MASK, V2
- _load_data VL, 3*VL(BUF), BSWAP_MASK, V3
- sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32
- add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32
-
- // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
- // 4 vectors of data and write the result back to V0-V3.
- cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32
- jbe .Lreduce_4vecs_to_2\@
- _load_vec_folding_consts 2
-.Lfold_4vecs_loop\@:
- _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- sub $-4*VL, BUF
- add $-4*VL, LEN
- cmp $4*VL-1, LEN
- ja .Lfold_4vecs_loop\@
-
- // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold
- // two more vectors of data from BUF, if at least that much remains.
-.Lreduce_4vecs_to_2\@:
- _load_vec_folding_consts 1
- _fold_vec V0, V2, CONSTS, V4
- _fold_vec V1, V3, CONSTS, V4
- test $2*VL, LEN8
- jz .Lreduce_2vecs_to_1\@
- _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- sub $-2*VL, BUF
-
- // Fold V0 into V1 and write the result back to V0. Then fold one more
- // vector of data from BUF, if at least that much remains.
-.Lreduce_2vecs_to_1\@:
- _load_vec_folding_consts 0
- _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5
-
-.Lreduce_1vec_to_128bits\@:
-.if VL == 64
- // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
- // data from BUF, if at least that much remains.
- vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
- vextracti64x4 $1, %zmm0, %ymm1
- _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
-.Lreduce_256bits_to_128bits\@:
-.endif
-.if VL >= 32
- // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
- // data from BUF, if at least that much remains.
- vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
- vextracti128 $1, %ymm0, %xmm1
- _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
-.Lcheck_for_partial_block\@:
-.endif
- and $15, LEN32
- jz .Lreduce_128bits_to_crc\@
-
- // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now
- // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
- // and B is the polynomial of the remaining LEN data bytes. To reduce
- // this to 128 bits without needing fold constants for each possible
- // LEN, rearrange this expression into C1*(x^128) + C2, where
- // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
- // Then fold C1 into C2, which is just another fold across 128 bits.
-
-.if !LSB_CRC || AVX_LEVEL == 0
- // Load the last 16 data bytes. Note that originally LEN was >= 16.
- _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
-.endif // Else will use vpblendvb mem operand later.
-.if !LSB_CRC
- neg LEN // Needed for indexing shuf_table
-.endif
-
- // tmp = A*x^(8*LEN) mod x^128
- // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
- // i.e. right-shift by LEN bytes.
- // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
- // i.e. left-shift by LEN bytes.
- _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
- _cond_vex pshufb, %xmm3, %xmm0, %xmm1
-
- // C1 = floor(A / x^(128 - 8*LEN))
- // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
- // i.e. left-shift by 16-LEN bytes.
- // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
- // i.e. right-shift by 16-LEN bytes.
- _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
- %xmm0, %xmm0, unaligned_mem_tmp=%xmm4
-
- // C2 = tmp + B. This is just a blend of tmp with the last 16 data
- // bytes (reflected if msb-first). The blend mask is the shuffle table
- // that was used to create tmp. 0 selects tmp, and 1 last16databytes.
-.if AVX_LEVEL == 0
- movdqa %xmm0, %xmm4
- movdqa %xmm3, %xmm0
- pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand
- movdqa %xmm4, %xmm0
-.elseif LSB_CRC
- vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1
-.else
- vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-.endif
-
- // Fold C1 into C2 and store the 128-bit result in %xmm0.
- _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4
-
-.Lreduce_128bits_to_crc\@:
- // Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
- // polynomial stored in %xmm0 (using either lsb-first or msb-first bit
- // order according to LSB_CRC), and G is the CRC's generator polynomial.
-
- // First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
- //
- // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
- // x^n * (%xmm0 mod x^64)
- //
- // Store t0 * x^(64-n) in %xmm0. I.e., actually do:
- //
- // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
- // x^64 * (%xmm0 mod x^64)
- //
- // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
- // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
- // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
- // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
- // (considering the extra factor of x that gets implicitly introduced by
- // each pclmulqdq when using lsb-first order), is identical to the
- // constant that was used earlier for folding the LO64_TERMS across 128
- // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM.
- _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
-.if LSB_CRC
- _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
-.else
- _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
-.endif
- _cond_vex pxor, %xmm1, %xmm0, %xmm0
- // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
- // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).
-
- // First step of Barrett reduction: Compute floor(t0 / G). This is the
- // polynomial by which G needs to be multiplied to cancel out the x^n
- // and higher terms of t0, i.e. to reduce t0 mod G. First do:
- //
- // t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
- //
- // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
- // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest
- // value that makes enough precision be carried through the calculation.
- //
- // The '* x' makes it so the result is floor(t1 / x^64) rather than
- // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
- // can be extracted much more easily in the next step. In the lsb-first
- // case the '* x' happens implicitly. In the msb-first case it must be
- // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
- // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
- // the multiplication by the x^64 term is handled using a pxor. The
- // pxor causes the low 64 terms of t1 to be wrong, but they are unused.
- _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
- _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
-.if !LSB_CRC
- _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
-.endif
- // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).
-
- // Second step of Barrett reduction: Cancel out the x^n and higher terms
- // of t0 by subtracting the needed multiple of G. This gives the CRC:
- //
- // crc := t0 - (G * floor(t0 / G))
- //
- // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
- //
- // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
- //
- // Furthermore, since the resulting CRC is n-bit, if mod x^n is
- // explicitly applied to it then the x^n term of G makes no difference
- // in the result and can be omitted. This helps keep the constant
- // multiplier in 64 bits in most cases. This gives the following:
- //
- // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
- // crc := (%xmm0 / x^(64-n)) mod x^n
- //
- // In the lsb-first case, each pclmulqdq implicitly introduces
- // an extra factor of x, so in that case the constant that needs to be
- // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
- // For lsb-first CRCs where n=64, the extra factor of x cannot be as
- // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to
- // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC
- // polynomials have nonzero x^n and x^0 terms.) It works out as: the
- // CRC has be XORed with the physically low qword of %xmm1, representing
- // floor(t0 / G). The most efficient way to do that is to move it to
- // the physically high qword and use a ternlog to combine the two XORs.
-.if LSB_CRC && \n == 64
- _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2
- _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
- .if AVX_LEVEL <= 2
- _cond_vex pxor, %xmm2, %xmm0, %xmm0
- _cond_vex pxor, %xmm1, %xmm0, %xmm0
- .else
- vpternlogq $0x96, %xmm2, %xmm1, %xmm0
- .endif
- _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
-.else
- _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
- _cond_vex pxor, %xmm1, %xmm0, %xmm0
- .if \n == 8
- _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
- .elseif \n == 16
- _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
- .elseif \n == 32
- _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
- .else // \n == 64 && !LSB_CRC
- _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
- .endif
-.endif
-
-.if VL > 16
- vzeroupper // Needed when ymm or zmm registers may have been used.
-.endif
-#ifdef __i386__
- pop CONSTS_PTR
-#endif
- RET
-.endm
-
-#ifdef CONFIG_AS_VPCLMULQDQ
-#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
-SYM_FUNC_START(prefix##_pclmul_sse); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
-SYM_FUNC_END(prefix##_pclmul_sse); \
- \
-SYM_FUNC_START(prefix##_vpclmul_avx2); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \
-SYM_FUNC_END(prefix##_vpclmul_avx2); \
- \
-SYM_FUNC_START(prefix##_vpclmul_avx512); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \
-SYM_FUNC_END(prefix##_vpclmul_avx512);
-#else
-#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
-SYM_FUNC_START(prefix##_pclmul_sse); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
-SYM_FUNC_END(prefix##_pclmul_sse);
-#endif // !CONFIG_AS_VPCLMULQDQ
diff --git a/arch/x86/lib/crc-pclmul-template.h b/arch/x86/lib/crc-pclmul-template.h
deleted file mode 100644
index c5b3bfe11d8d..000000000000
--- a/arch/x86/lib/crc-pclmul-template.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are
- * instantiated by crc-pclmul-template.S
- *
- * Copyright 2025 Google LLC
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-#ifndef _CRC_PCLMUL_TEMPLATE_H
-#define _CRC_PCLMUL_TEMPLATE_H
-
-#include <asm/cpufeatures.h>
-#include <asm/simd.h>
-#include <crypto/internal/simd.h>
-#include <linux/static_call.h>
-#include "crc-pclmul-consts.h"
-
-#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \
-crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \
- const void *consts_ptr); \
-crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \
- const void *consts_ptr); \
-crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len, \
- const void *consts_ptr); \
-DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)
-
-#define INIT_CRC_PCLMUL(prefix) \
-do { \
- if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) && \
- boot_cpu_has(X86_FEATURE_VPCLMULQDQ) && \
- boot_cpu_has(X86_FEATURE_AVX2) && \
- cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) { \
- if (boot_cpu_has(X86_FEATURE_AVX512BW) && \
- boot_cpu_has(X86_FEATURE_AVX512VL) && \
- !boot_cpu_has(X86_FEATURE_PREFER_YMM) && \
- cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) { \
- static_call_update(prefix##_pclmul, \
- prefix##_vpclmul_avx512); \
- } else { \
- static_call_update(prefix##_pclmul, \
- prefix##_vpclmul_avx2); \
- } \
- } \
-} while (0)
-
-/*
- * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16
- * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD.
- *
- * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions.
- * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(),
- * varying by CPU and factors such as which parts of the "FPU" state userspace
- * has touched, which could result in a larger cutoff being better. Indeed, a
- * larger cutoff is usually better for a *single* message. However, the
- * overhead of the FPU section gets amortized if multiple FPU sections get
- * executed before returning to userspace, since the XSAVE and XRSTOR occur only
- * once. Considering that and the fact that the [V]PCLMULQDQ code is lighter on
- * the dcache than the table-based code is, a 16-byte cutoff seems to work well.
- */
-#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \
-do { \
- if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \
- crypto_simd_usable()) { \
- const void *consts_ptr; \
- \
- consts_ptr = (consts).fold_across_128_bits_consts; \
- kernel_fpu_begin(); \
- crc = static_call(prefix##_pclmul)((crc), (p), (len), \
- consts_ptr); \
- kernel_fpu_end(); \
- return crc; \
- } \
-} while (0)
-
-#endif /* _CRC_PCLMUL_TEMPLATE_H */
diff --git a/arch/x86/lib/crc-t10dif.c b/arch/x86/lib/crc-t10dif.c
deleted file mode 100644
index db7ce59c31ac..000000000000
--- a/arch/x86/lib/crc-t10dif.c
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * CRC-T10DIF using [V]PCLMULQDQ instructions
- *
- * Copyright 2024 Google LLC
- */
-
-#include <linux/crc-t10dif.h>
-#include <linux/module.h>
-#include "crc-pclmul-template.h"
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
-
-DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16);
-
-u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts,
- have_pclmulqdq);
- return crc_t10dif_generic(crc, p, len);
-}
-EXPORT_SYMBOL(crc_t10dif_arch);
-
-static int __init crc_t10dif_x86_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
- static_branch_enable(&have_pclmulqdq);
- INIT_CRC_PCLMUL(crc16_msb);
- }
- return 0;
-}
-subsys_initcall(crc_t10dif_x86_init);
-
-static void __exit crc_t10dif_x86_exit(void)
-{
-}
-module_exit(crc_t10dif_x86_exit);
-
-MODULE_DESCRIPTION("CRC-T10DIF using [V]PCLMULQDQ instructions");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crc16-msb-pclmul.S b/arch/x86/lib/crc16-msb-pclmul.S
deleted file mode 100644
index e9fe248093a8..000000000000
--- a/arch/x86/lib/crc16-msb-pclmul.S
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-// Copyright 2025 Google LLC
-
-#include "crc-pclmul-template.S"
-
-DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0)
diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S
deleted file mode 100644
index f20f40fb0172..000000000000
--- a/arch/x86/lib/crc32-pclmul.S
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-// Copyright 2025 Google LLC
-
-#include "crc-pclmul-template.S"
-
-DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
diff --git a/arch/x86/lib/crc32.c b/arch/x86/lib/crc32.c
deleted file mode 100644
index d09343e2cea9..000000000000
--- a/arch/x86/lib/crc32.c
+++ /dev/null
@@ -1,111 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * x86-optimized CRC32 functions
- *
- * Copyright (C) 2008 Intel Corporation
- * Copyright 2012 Xyratex Technology Limited
- * Copyright 2024 Google LLC
- */
-
-#include <linux/crc32.h>
-#include <linux/module.h>
-#include "crc-pclmul-template.h"
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
-
-DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
- have_pclmulqdq);
- return crc32_le_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-#ifdef CONFIG_X86_64
-#define CRC32_INST "crc32q %1, %q0"
-#else
-#define CRC32_INST "crc32l %1, %0"
-#endif
-
-/*
- * Use carryless multiply version of crc32c when buffer size is >= 512 to
- * account for FPU state save/restore overhead.
- */
-#define CRC32C_PCLMUL_BREAKEVEN 512
-
-asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- size_t num_longs;
-
- if (!static_branch_likely(&have_crc32))
- return crc32c_base(crc, p, len);
-
- if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
- static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
- kernel_fpu_begin();
- crc = crc32c_x86_3way(crc, p, len);
- kernel_fpu_end();
- return crc;
- }
-
- for (num_longs = len / sizeof(unsigned long);
- num_longs != 0; num_longs--, p += sizeof(unsigned long))
- asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
-
- if (sizeof(unsigned long) > 4 && (len & 4)) {
- asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
- p += 4;
- }
- if (len & 2) {
- asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
- p += 2;
- }
- if (len & 1)
- asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
-
- return crc;
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- return crc32_be_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-static int __init crc32_x86_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_XMM4_2))
- static_branch_enable(&have_crc32);
- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
- static_branch_enable(&have_pclmulqdq);
- INIT_CRC_PCLMUL(crc32_lsb);
- }
- return 0;
-}
-subsys_initcall(crc32_x86_init);
-
-static void __exit crc32_x86_exit(void)
-{
-}
-module_exit(crc32_x86_exit);
-
-u32 crc32_optimizations(void)
-{
- u32 optimizations = 0;
-
- if (static_key_enabled(&have_crc32))
- optimizations |= CRC32C_OPTIMIZATION;
- if (static_key_enabled(&have_pclmulqdq))
- optimizations |= CRC32_LE_OPTIMIZATION;
- return optimizations;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_DESCRIPTION("x86-optimized CRC32 functions");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crc32c-3way.S b/arch/x86/lib/crc32c-3way.S
deleted file mode 100644
index 9b8770503bbc..000000000000
--- a/arch/x86/lib/crc32c-3way.S
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
- *
- * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
- * downloaded from:
- * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
- * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
- *
- * Copyright (C) 2012 Intel Corporation.
- * Copyright 2024 Google LLC
- *
- * Authors:
- * Wajdi Feghali <wajdi.k.feghali@intel.com>
- * James Guilford <james.guilford@intel.com>
- * David Cote <david.m.cote@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/linkage.h>
-
-## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
-
-# Define threshold below which buffers are considered "small" and routed to
-# regular CRC code that does not interleave the CRC instructions.
-#define SMALL_SIZE 200
-
-# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
-
-.text
-SYM_FUNC_START(crc32c_x86_3way)
-#define crc0 %edi
-#define crc0_q %rdi
-#define bufp %rsi
-#define bufp_d %esi
-#define len %rdx
-#define len_dw %edx
-#define n_misaligned %ecx /* overlaps chunk_bytes! */
-#define n_misaligned_q %rcx
-#define chunk_bytes %ecx /* overlaps n_misaligned! */
-#define chunk_bytes_q %rcx
-#define crc1 %r8
-#define crc2 %r9
-
- cmp $SMALL_SIZE, len
- jb .Lsmall
-
- ################################################################
- ## 1) ALIGN:
- ################################################################
- mov bufp_d, n_misaligned
- neg n_misaligned
- and $7, n_misaligned # calculate the misalignment amount of
- # the address
- je .Laligned # Skip if aligned
-
- # Process 1 <= n_misaligned <= 7 bytes individually in order to align
- # the remaining data to an 8-byte boundary.
-.Ldo_align:
- movq (bufp), %rax
- add n_misaligned_q, bufp
- sub n_misaligned_q, len
-.Lalign_loop:
- crc32b %al, crc0 # compute crc32 of 1-byte
- shr $8, %rax # get next byte
- dec n_misaligned
- jne .Lalign_loop
-.Laligned:
-
- ################################################################
- ## 2) PROCESS BLOCK:
- ################################################################
-
- cmp $128*24, len
- jae .Lfull_block
-
-.Lpartial_block:
- # Compute floor(len / 24) to get num qwords to process from each lane.
- imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
- shr $16, %eax
- jmp .Lcrc_3lanes
-
-.Lfull_block:
- # Processing 128 qwords from each lane.
- mov $128, %eax
-
- ################################################################
- ## 3) CRC each of three lanes:
- ################################################################
-
-.Lcrc_3lanes:
- xor crc1,crc1
- xor crc2,crc2
- mov %eax, chunk_bytes
- shl $3, chunk_bytes # num bytes to process from each lane
- sub $5, %eax # 4 for 4x_loop, 1 for special last iter
- jl .Lcrc_3lanes_4x_done
-
- # Unroll the loop by a factor of 4 to reduce the overhead of the loop
- # bookkeeping instructions, which can compete with crc32q for the ALUs.
-.Lcrc_3lanes_4x_loop:
- crc32q (bufp), crc0_q
- crc32q (bufp,chunk_bytes_q), crc1
- crc32q (bufp,chunk_bytes_q,2), crc2
- crc32q 8(bufp), crc0_q
- crc32q 8(bufp,chunk_bytes_q), crc1
- crc32q 8(bufp,chunk_bytes_q,2), crc2
- crc32q 16(bufp), crc0_q
- crc32q 16(bufp,chunk_bytes_q), crc1
- crc32q 16(bufp,chunk_bytes_q,2), crc2
- crc32q 24(bufp), crc0_q
- crc32q 24(bufp,chunk_bytes_q), crc1
- crc32q 24(bufp,chunk_bytes_q,2), crc2
- add $32, bufp
- sub $4, %eax
- jge .Lcrc_3lanes_4x_loop
-
-.Lcrc_3lanes_4x_done:
- add $4, %eax
- jz .Lcrc_3lanes_last_qword
-
-.Lcrc_3lanes_1x_loop:
- crc32q (bufp), crc0_q
- crc32q (bufp,chunk_bytes_q), crc1
- crc32q (bufp,chunk_bytes_q,2), crc2
- add $8, bufp
- dec %eax
- jnz .Lcrc_3lanes_1x_loop
-
-.Lcrc_3lanes_last_qword:
- crc32q (bufp), crc0_q
- crc32q (bufp,chunk_bytes_q), crc1
-# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
-
- ################################################################
- ## 4) Combine three results:
- ################################################################
-
- lea (K_table-8)(%rip), %rax # first entry is for idx 1
- pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
- lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
- sub %rax, len # len -= chunk_bytes * 3
-
- movq crc0_q, %xmm1 # CRC for block 1
- pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
-
- movq crc1, %xmm2 # CRC for block 2
- pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
-
- pxor %xmm2,%xmm1
- movq %xmm1, %rax
- xor (bufp,chunk_bytes_q,2), %rax
- mov crc2, crc0_q
- crc32 %rax, crc0_q
- lea 8(bufp,chunk_bytes_q,2), bufp
-
- ################################################################
- ## 5) If more blocks remain, goto (2):
- ################################################################
-
- cmp $128*24, len
- jae .Lfull_block
- cmp $SMALL_SIZE, len
- jae .Lpartial_block
-
- #######################################################################
- ## 6) Process any remainder without interleaving:
- #######################################################################
-.Lsmall:
- test len_dw, len_dw
- jz .Ldone
- mov len_dw, %eax
- shr $3, %eax
- jz .Ldo_dword
-.Ldo_qwords:
- crc32q (bufp), crc0_q
- add $8, bufp
- dec %eax
- jnz .Ldo_qwords
-.Ldo_dword:
- test $4, len_dw
- jz .Ldo_word
- crc32l (bufp), crc0
- add $4, bufp
-.Ldo_word:
- test $2, len_dw
- jz .Ldo_byte
- crc32w (bufp), crc0
- add $2, bufp
-.Ldo_byte:
- test $1, len_dw
- jz .Ldone
- crc32b (bufp), crc0
-.Ldone:
- mov crc0, %eax
- RET
-SYM_FUNC_END(crc32c_x86_3way)
-
-.section .rodata, "a", @progbits
- ################################################################
- ## PCLMULQDQ tables
- ## Table is 128 entries x 2 words (8 bytes) each
- ################################################################
-.align 8
-K_table:
- .long 0x493c7d27, 0x00000001
- .long 0xba4fc28e, 0x493c7d27
- .long 0xddc0152b, 0xf20c0dfe
- .long 0x9e4addf8, 0xba4fc28e
- .long 0x39d3b296, 0x3da6d0cb
- .long 0x0715ce53, 0xddc0152b
- .long 0x47db8317, 0x1c291d04
- .long 0x0d3b6092, 0x9e4addf8
- .long 0xc96cfdc0, 0x740eef02
- .long 0x878a92a7, 0x39d3b296
- .long 0xdaece73e, 0x083a6eec
- .long 0xab7aff2a, 0x0715ce53
- .long 0x2162d385, 0xc49f4f67
- .long 0x83348832, 0x47db8317
- .long 0x299847d5, 0x2ad91c30
- .long 0xb9e02b86, 0x0d3b6092
- .long 0x18b33a4e, 0x6992cea2
- .long 0xb6dd949b, 0xc96cfdc0
- .long 0x78d9ccb7, 0x7e908048
- .long 0xbac2fd7b, 0x878a92a7
- .long 0xa60ce07b, 0x1b3d8f29
- .long 0xce7f39f4, 0xdaece73e
- .long 0x61d82e56, 0xf1d0f55e
- .long 0xd270f1a2, 0xab7aff2a
- .long 0xc619809d, 0xa87ab8a8
- .long 0x2b3cac5d, 0x2162d385
- .long 0x65863b64, 0x8462d800
- .long 0x1b03397f, 0x83348832
- .long 0xebb883bd, 0x71d111a8
- .long 0xb3e32c28, 0x299847d5
- .long 0x064f7f26, 0xffd852c6
- .long 0xdd7e3b0c, 0xb9e02b86
- .long 0xf285651c, 0xdcb17aa4
- .long 0x10746f3c, 0x18b33a4e
- .long 0xc7a68855, 0xf37c5aee
- .long 0x271d9844, 0xb6dd949b
- .long 0x8e766a0c, 0x6051d5a2
- .long 0x93a5f730, 0x78d9ccb7
- .long 0x6cb08e5c, 0x18b0d4ff
- .long 0x6b749fb2, 0xbac2fd7b
- .long 0x1393e203, 0x21f3d99c
- .long 0xcec3662e, 0xa60ce07b
- .long 0x96c515bb, 0x8f158014
- .long 0xe6fc4e6a, 0xce7f39f4
- .long 0x8227bb8a, 0xa00457f7
- .long 0xb0cd4768, 0x61d82e56
- .long 0x39c7ff35, 0x8d6d2c43
- .long 0xd7a4825c, 0xd270f1a2
- .long 0x0ab3844b, 0x00ac29cf
- .long 0x0167d312, 0xc619809d
- .long 0xf6076544, 0xe9adf796
- .long 0x26f6a60a, 0x2b3cac5d
- .long 0xa741c1bf, 0x96638b34
- .long 0x98d8d9cb, 0x65863b64
- .long 0x49c3cc9c, 0xe0e9f351
- .long 0x68bce87a, 0x1b03397f
- .long 0x57a3d037, 0x9af01f2d
- .long 0x6956fc3b, 0xebb883bd
- .long 0x42d98888, 0x2cff42cf
- .long 0x3771e98f, 0xb3e32c28
- .long 0xb42ae3d9, 0x88f25a3a
- .long 0x2178513a, 0x064f7f26
- .long 0xe0ac139e, 0x4e36f0b0
- .long 0x170076fa, 0xdd7e3b0c
- .long 0x444dd413, 0xbd6f81f8
- .long 0x6f345e45, 0xf285651c
- .long 0x41d17b64, 0x91c9bd4b
- .long 0xff0dba97, 0x10746f3c
- .long 0xa2b73df1, 0x885f087b
- .long 0xf872e54c, 0xc7a68855
- .long 0x1e41e9fc, 0x4c144932
- .long 0x86d8e4d2, 0x271d9844
- .long 0x651bd98b, 0x52148f02
- .long 0x5bb8f1bc, 0x8e766a0c
- .long 0xa90fd27a, 0xa3c6f37a
- .long 0xb3af077a, 0x93a5f730
- .long 0x4984d782, 0xd7c0557f
- .long 0xca6ef3ac, 0x6cb08e5c
- .long 0x234e0b26, 0x63ded06a
- .long 0xdd66cbbb, 0x6b749fb2
- .long 0x4597456a, 0x4d56973c
- .long 0xe9e28eb4, 0x1393e203
- .long 0x7b3ff57a, 0x9669c9df
- .long 0xc9c8b782, 0xcec3662e
- .long 0x3f70cc6f, 0xe417f38a
- .long 0x93e106a4, 0x96c515bb
- .long 0x62ec6c6d, 0x4b9e0f71
- .long 0xd813b325, 0xe6fc4e6a
- .long 0x0df04680, 0xd104b8fc
- .long 0x2342001e, 0x8227bb8a
- .long 0x0a2a8d7e, 0x5b397730
- .long 0x6d9a4957, 0xb0cd4768
- .long 0xe8b6368b, 0xe78eb416
- .long 0xd2c3ed1a, 0x39c7ff35
- .long 0x995a5724, 0x61ff0e01
- .long 0x9ef68d35, 0xd7a4825c
- .long 0x0c139b31, 0x8d96551c
- .long 0xf2271e60, 0x0ab3844b
- .long 0x0b0bf8ca, 0x0bf80dd2
- .long 0x2664fd8b, 0x0167d312
- .long 0xed64812d, 0x8821abed
- .long 0x02ee03b2, 0xf6076544
- .long 0x8604ae0f, 0x6a45d2b2
- .long 0x363bd6b3, 0x26f6a60a
- .long 0x135c83fd, 0xd8d26619
- .long 0x5fabe670, 0xa741c1bf
- .long 0x35ec3279, 0xde87806c
- .long 0x00bcf5f6, 0x98d8d9cb
- .long 0x8ae00689, 0x14338754
- .long 0x17f27698, 0x49c3cc9c
- .long 0x58ca5f00, 0x5bd2011f
- .long 0xaa7c7ad5, 0x68bce87a
- .long 0xb5cfca28, 0xdd07448e
- .long 0xded288f8, 0x57a3d037
- .long 0x59f229bc, 0xdde8f5b9
- .long 0x6d390dec, 0x6956fc3b
- .long 0x37170390, 0xa3e3e02c
- .long 0x6353c1cc, 0x42d98888
- .long 0xc4584f5c, 0xd73c7bea
- .long 0xf48642e9, 0x3771e98f
- .long 0x531377e2, 0x80ff0093
- .long 0xdd35bc8d, 0xb42ae3d9
- .long 0xb25b29f2, 0x8fe4c34d
- .long 0x9a5ede41, 0x2178513a
- .long 0xa563905d, 0xdf99fc11
- .long 0x45cddf4e, 0xe0ac139e
- .long 0xacfa3103, 0x6c23e841
- .long 0xa51b6135, 0x170076fa
diff --git a/arch/x86/lib/crc64-pclmul.S b/arch/x86/lib/crc64-pclmul.S
deleted file mode 100644
index 4173051b5197..000000000000
--- a/arch/x86/lib/crc64-pclmul.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-// Copyright 2025 Google LLC
-
-#include "crc-pclmul-template.S"
-
-DEFINE_CRC_PCLMUL_FUNCS(crc64_msb, /* bits= */ 64, /* lsb= */ 0)
-DEFINE_CRC_PCLMUL_FUNCS(crc64_lsb, /* bits= */ 64, /* lsb= */ 1)
diff --git a/arch/x86/lib/crc64.c b/arch/x86/lib/crc64.c
deleted file mode 100644
index 351a09f5813e..000000000000
--- a/arch/x86/lib/crc64.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * CRC64 using [V]PCLMULQDQ instructions
- *
- * Copyright 2025 Google LLC
- */
-
-#include <linux/crc64.h>
-#include <linux/module.h>
-#include "crc-pclmul-template.h"
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
-
-DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64);
-DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64);
-
-u64 crc64_be_arch(u64 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc64_msb, crc64_msb_0x42f0e1eba9ea3693_consts,
- have_pclmulqdq);
- return crc64_be_generic(crc, p, len);
-}
-EXPORT_SYMBOL_GPL(crc64_be_arch);
-
-u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc64_lsb, crc64_lsb_0x9a6c9329ac4bc9b5_consts,
- have_pclmulqdq);
- return crc64_nvme_generic(crc, p, len);
-}
-EXPORT_SYMBOL_GPL(crc64_nvme_arch);
-
-static int __init crc64_x86_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
- static_branch_enable(&have_pclmulqdq);
- INIT_CRC_PCLMUL(crc64_msb);
- INIT_CRC_PCLMUL(crc64_lsb);
- }
- return 0;
-}
-subsys_initcall(crc64_x86_init);
-
-static void __exit crc64_x86_exit(void)
-{
-}
-module_exit(crc64_x86_exit);
-
-MODULE_DESCRIPTION("CRC64 using [V]PCLMULQDQ instructions");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crypto/.gitignore b/arch/x86/lib/crypto/.gitignore
deleted file mode 100644
index 580c839bb177..000000000000
--- a/arch/x86/lib/crypto/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-poly1305-x86_64-cryptogams.S
diff --git a/arch/x86/lib/crypto/Kconfig b/arch/x86/lib/crypto/Kconfig
deleted file mode 100644
index 5e94cdee492c..000000000000
--- a/arch/x86/lib/crypto/Kconfig
+++ /dev/null
@@ -1,34 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_BLAKE2S_X86
- bool "Hash functions: BLAKE2s (SSSE3/AVX-512)"
- depends on 64BIT
- select CRYPTO_LIB_BLAKE2S_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- help
- BLAKE2s cryptographic hash function (RFC 7693)
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX-512 (Advanced Vector Extensions-512)
-
-config CRYPTO_CHACHA20_X86_64
- tristate
- depends on 64BIT
- default CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_X86_64
- tristate
- depends on 64BIT
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
-
-config CRYPTO_SHA256_X86_64
- tristate
- depends on 64BIT
- default CRYPTO_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
- select CRYPTO_LIB_SHA256_GENERIC
diff --git a/arch/x86/lib/crypto/Makefile b/arch/x86/lib/crypto/Makefile
deleted file mode 100644
index abceca3d31c0..000000000000
--- a/arch/x86/lib/crypto/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
-libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
-
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
-chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
-poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
-targets += poly1305-x86_64-cryptogams.S
-
-obj-$(CONFIG_CRYPTO_SHA256_X86_64) += sha256-x86_64.o
-sha256-x86_64-y := sha256.o sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256-ni-asm.o
-
-quiet_cmd_perlasm = PERLASM $@
- cmd_perlasm = $(PERL) $< > $@
-
-$(obj)/%.S: $(src)/%.pl FORCE
- $(call if_changed,perlasm)
diff --git a/arch/x86/lib/crypto/blake2s-core.S b/arch/x86/lib/crypto/blake2s-core.S
deleted file mode 100644
index ac1c845445a4..000000000000
--- a/arch/x86/lib/crypto/blake2s-core.S
+++ /dev/null
@@ -1,252 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
-.align 32
-IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
- .octa 0x5BE0CD191F83D9AB9B05688C510E527F
-.section .rodata.cst16.ROT16, "aM", @progbits, 16
-.align 16
-ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
-.section .rodata.cst16.ROR328, "aM", @progbits, 16
-.align 16
-ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
-.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
-.align 64
-SIGMA:
-.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
-.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
-.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
-.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
-.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
-.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
-.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
-.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
-.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
-.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
-.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
-.align 64
-SIGMA2:
-.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
-.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
-.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
-.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
-.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
-.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
-.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
-.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
-.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
-.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
-
-.text
-SYM_FUNC_START(blake2s_compress_ssse3)
- testq %rdx,%rdx
- je .Lendofloop
- movdqu (%rdi),%xmm0
- movdqu 0x10(%rdi),%xmm1
- movdqa ROT16(%rip),%xmm12
- movdqa ROR328(%rip),%xmm13
- movdqu 0x20(%rdi),%xmm14
- movq %rcx,%xmm15
- leaq SIGMA+0xa0(%rip),%r8
- jmp .Lbeginofloop
- .align 32
-.Lbeginofloop:
- movdqa %xmm0,%xmm10
- movdqa %xmm1,%xmm11
- paddq %xmm15,%xmm14
- movdqa IV(%rip),%xmm2
- movdqa %xmm14,%xmm3
- pxor IV+0x10(%rip),%xmm3
- leaq SIGMA(%rip),%rcx
-.Lroundloop:
- movzbl (%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0x1(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0x2(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x3(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- punpckldq %xmm5,%xmm4
- punpckldq %xmm7,%xmm6
- punpcklqdq %xmm6,%xmm4
- paddd %xmm4,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movzbl 0x4(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0x5(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x6(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0x7(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- punpckldq %xmm6,%xmm5
- punpckldq %xmm4,%xmm7
- punpcklqdq %xmm7,%xmm5
- paddd %xmm5,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x93,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x39,%xmm2,%xmm2
- movzbl 0x8(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x9(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0xa(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0xb(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- punpckldq %xmm7,%xmm6
- punpckldq %xmm5,%xmm4
- punpcklqdq %xmm4,%xmm6
- paddd %xmm6,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movzbl 0xc(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0xd(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0xe(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0xf(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- punpckldq %xmm4,%xmm7
- punpckldq %xmm6,%xmm5
- punpcklqdq %xmm5,%xmm7
- paddd %xmm7,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x39,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x93,%xmm2,%xmm2
- addq $0x10,%rcx
- cmpq %r8,%rcx
- jnz .Lroundloop
- pxor %xmm2,%xmm0
- pxor %xmm3,%xmm1
- pxor %xmm10,%xmm0
- pxor %xmm11,%xmm1
- addq $0x40,%rsi
- decq %rdx
- jnz .Lbeginofloop
- movdqu %xmm0,(%rdi)
- movdqu %xmm1,0x10(%rdi)
- movdqu %xmm14,0x20(%rdi)
-.Lendofloop:
- RET
-SYM_FUNC_END(blake2s_compress_ssse3)
-
-SYM_FUNC_START(blake2s_compress_avx512)
- vmovdqu (%rdi),%xmm0
- vmovdqu 0x10(%rdi),%xmm1
- vmovdqu 0x20(%rdi),%xmm4
- vmovq %rcx,%xmm5
- vmovdqa IV(%rip),%xmm14
- vmovdqa IV+16(%rip),%xmm15
- jmp .Lblake2s_compress_avx512_mainloop
-.align 32
-.Lblake2s_compress_avx512_mainloop:
- vmovdqa %xmm0,%xmm10
- vmovdqa %xmm1,%xmm11
- vpaddq %xmm5,%xmm4,%xmm4
- vmovdqa %xmm14,%xmm2
- vpxor %xmm15,%xmm4,%xmm3
- vmovdqu (%rsi),%ymm6
- vmovdqu 0x20(%rsi),%ymm7
- addq $0x40,%rsi
- leaq SIGMA2(%rip),%rax
- movb $0xa,%cl
-.Lblake2s_compress_avx512_roundloop:
- addq $0x40,%rax
- vmovdqa -0x40(%rax),%ymm8
- vmovdqa -0x20(%rax),%ymm9
- vpermi2d %ymm7,%ymm6,%ymm8
- vpermi2d %ymm7,%ymm6,%ymm9
- vmovdqa %ymm8,%ymm6
- vmovdqa %ymm9,%ymm7
- vpaddd %xmm8,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x10,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0xc,%xmm1,%xmm1
- vextracti128 $0x1,%ymm8,%xmm8
- vpaddd %xmm8,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x8,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0x7,%xmm1,%xmm1
- vpshufd $0x93,%xmm0,%xmm0
- vpshufd $0x4e,%xmm3,%xmm3
- vpshufd $0x39,%xmm2,%xmm2
- vpaddd %xmm9,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x10,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0xc,%xmm1,%xmm1
- vextracti128 $0x1,%ymm9,%xmm9
- vpaddd %xmm9,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x8,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0x7,%xmm1,%xmm1
- vpshufd $0x39,%xmm0,%xmm0
- vpshufd $0x4e,%xmm3,%xmm3
- vpshufd $0x93,%xmm2,%xmm2
- decb %cl
- jne .Lblake2s_compress_avx512_roundloop
- vpxor %xmm10,%xmm0,%xmm0
- vpxor %xmm11,%xmm1,%xmm1
- vpxor %xmm2,%xmm0,%xmm0
- vpxor %xmm3,%xmm1,%xmm1
- decq %rdx
- jne .Lblake2s_compress_avx512_mainloop
- vmovdqu %xmm0,(%rdi)
- vmovdqu %xmm1,0x10(%rdi)
- vmovdqu %xmm4,0x20(%rdi)
- vzeroupper
- RET
-SYM_FUNC_END(blake2s_compress_avx512)
diff --git a/arch/x86/lib/crypto/blake2s-glue.c b/arch/x86/lib/crypto/blake2s-glue.c
deleted file mode 100644
index adc296cd17c9..000000000000
--- a/arch/x86/lib/crypto/blake2s-glue.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <asm/cpufeature.h>
-#include <asm/fpu/api.h>
-#include <asm/processor.h>
-#include <asm/simd.h>
-#include <crypto/internal/blake2s.h>
-#include <linux/init.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/sizes.h>
-
-asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
- const u8 *block, const size_t nblocks,
- const u32 inc);
-asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
- const u8 *block, const size_t nblocks,
- const u32 inc);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
-
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
-{
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
-
- if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
- blake2s_compress_generic(state, block, nblocks, inc);
- return;
- }
-
- do {
- const size_t blocks = min_t(size_t, nblocks,
- SZ_4K / BLAKE2S_BLOCK_SIZE);
-
- kernel_fpu_begin();
- if (static_branch_likely(&blake2s_use_avx512))
- blake2s_compress_avx512(state, block, blocks, inc);
- else
- blake2s_compress_ssse3(state, block, blocks, inc);
- kernel_fpu_end();
-
- nblocks -= blocks;
- block += blocks * BLAKE2S_BLOCK_SIZE;
- } while (nblocks);
-}
-EXPORT_SYMBOL(blake2s_compress);
-
-static int __init blake2s_mod_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- static_branch_enable(&blake2s_use_ssse3);
-
- if (boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- boot_cpu_has(X86_FEATURE_AVX512VL) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
- XFEATURE_MASK_AVX512, NULL))
- static_branch_enable(&blake2s_use_avx512);
-
- return 0;
-}
-
-subsys_initcall(blake2s_mod_init);
diff --git a/arch/x86/lib/crypto/chacha-avx2-x86_64.S b/arch/x86/lib/crypto/chacha-avx2-x86_64.S
deleted file mode 100644
index f3d8fc018249..000000000000
--- a/arch/x86/lib/crypto/chacha-avx2-x86_64.S
+++ /dev/null
@@ -1,1021 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.ROT8, "aM", @progbits, 32
-.align 32
-ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
- .octa 0x0e0d0c0f0a09080b0605040702010003
-
-.section .rodata.cst32.ROT16, "aM", @progbits, 32
-.align 32
-ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
- .octa 0x0d0c0f0e09080b0a0504070601000302
-
-.section .rodata.cst32.CTRINC, "aM", @progbits, 32
-.align 32
-CTRINC: .octa 0x00000003000000020000000100000000
- .octa 0x00000007000000060000000500000004
-
-.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
-.align 32
-CTR2BL: .octa 0x00000000000000000000000000000000
- .octa 0x00000000000000000000000000000001
-
-.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
-.align 32
-CTR4BL: .octa 0x00000000000000000000000000000002
- .octa 0x00000000000000000000000000000003
-
-.text
-
-SYM_FUNC_START(chacha_2block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 2 data blocks output, o
- # %rdx: up to 2 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts two ChaCha blocks by loading the state
- # matrix twice across four AVX registers. It performs matrix operations
- # on four words in each matrix in parallel, but requires shuffling to
- # rearrange the words after each round.
-
- vzeroupper
-
- # x0..3[0-2] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
-
- vmovdqa %ymm0,%ymm8
- vmovdqa %ymm1,%ymm9
- vmovdqa %ymm2,%ymm10
- vmovdqa %ymm3,%ymm11
-
- vmovdqa ROT8(%rip),%ymm4
- vmovdqa ROT16(%rip),%ymm5
-
- mov %rcx,%rax
-
-.Ldoubleround:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm5,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm6
- vpslld $12,%ymm6,%ymm6
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm6,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm4,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm7
- vpslld $7,%ymm7,%ymm7
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm5,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm6
- vpslld $12,%ymm6,%ymm6
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm6,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm4,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm7
- vpslld $7,%ymm7,%ymm7
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
-
- sub $2,%r8d
- jnz .Ldoubleround
-
- # o0 = i0 ^ (x0 + s0)
- vpaddd %ymm8,%ymm0,%ymm7
- cmp $0x10,%rax
- jl .Lxorpart2
- vpxor 0x00(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x00(%rsi)
- vextracti128 $1,%ymm7,%xmm0
- # o1 = i1 ^ (x1 + s1)
- vpaddd %ymm9,%ymm1,%ymm7
- cmp $0x20,%rax
- jl .Lxorpart2
- vpxor 0x10(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x10(%rsi)
- vextracti128 $1,%ymm7,%xmm1
- # o2 = i2 ^ (x2 + s2)
- vpaddd %ymm10,%ymm2,%ymm7
- cmp $0x30,%rax
- jl .Lxorpart2
- vpxor 0x20(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x20(%rsi)
- vextracti128 $1,%ymm7,%xmm2
- # o3 = i3 ^ (x3 + s3)
- vpaddd %ymm11,%ymm3,%ymm7
- cmp $0x40,%rax
- jl .Lxorpart2
- vpxor 0x30(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x30(%rsi)
- vextracti128 $1,%ymm7,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm7
- cmp $0x50,%rax
- jl .Lxorpart2
- vpxor 0x40(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm7
- cmp $0x60,%rax
- jl .Lxorpart2
- vpxor 0x50(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm7
- cmp $0x70,%rax
- jl .Lxorpart2
- vpxor 0x60(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm7
- cmp $0x80,%rax
- jl .Lxorpart2
- vpxor 0x70(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x70(%rsi)
-
-.Ldone2:
- vzeroupper
- RET
-
-.Lxorpart2:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone2
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea 8(%rsp),%r10
- sub $0x10,%rsp
- and $~31,%rsp
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- vpxor 0x00(%rsp),%xmm7,%xmm7
- vmovdqa %xmm7,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- lea -8(%r10),%rsp
- jmp .Ldone2
-
-SYM_FUNC_END(chacha_2block_xor_avx2)
-
-SYM_FUNC_START(chacha_4block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts four ChaCha blocks by loading the state
- # matrix four times across eight AVX registers. It performs matrix
- # operations on four words in two matrices in parallel, sequentially
- # to the operations on the four words of the other two matrices. The
- # required word shuffling has a rather high latency, we can do the
- # arithmetic on two matrix-pairs without much slowdown.
-
- vzeroupper
-
- # x0..3[0-4] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vmovdqa %ymm0,%ymm4
- vmovdqa %ymm1,%ymm5
- vmovdqa %ymm2,%ymm6
- vmovdqa %ymm3,%ymm7
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
- vpaddd CTR4BL(%rip),%ymm7,%ymm7
-
- vmovdqa %ymm0,%ymm11
- vmovdqa %ymm1,%ymm12
- vmovdqa %ymm2,%ymm13
- vmovdqa %ymm3,%ymm14
- vmovdqa %ymm7,%ymm15
-
- vmovdqa ROT8(%rip),%ymm8
- vmovdqa ROT16(%rip),%ymm9
-
- mov %rcx,%rax
-
-.Ldoubleround4:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm9,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm9,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- vpshufd $0x39,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
- vpshufd $0x93,%ymm7,%ymm7
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm9,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm9,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- vpshufd $0x93,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
- vpshufd $0x39,%ymm7,%ymm7
-
- sub $2,%r8d
- jnz .Ldoubleround4
-
- # o0 = i0 ^ (x0 + s0), first block
- vpaddd %ymm11,%ymm0,%ymm10
- cmp $0x10,%rax
- jl .Lxorpart4
- vpxor 0x00(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x00(%rsi)
- vextracti128 $1,%ymm10,%xmm0
- # o1 = i1 ^ (x1 + s1), first block
- vpaddd %ymm12,%ymm1,%ymm10
- cmp $0x20,%rax
- jl .Lxorpart4
- vpxor 0x10(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x10(%rsi)
- vextracti128 $1,%ymm10,%xmm1
- # o2 = i2 ^ (x2 + s2), first block
- vpaddd %ymm13,%ymm2,%ymm10
- cmp $0x30,%rax
- jl .Lxorpart4
- vpxor 0x20(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x20(%rsi)
- vextracti128 $1,%ymm10,%xmm2
- # o3 = i3 ^ (x3 + s3), first block
- vpaddd %ymm14,%ymm3,%ymm10
- cmp $0x40,%rax
- jl .Lxorpart4
- vpxor 0x30(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x30(%rsi)
- vextracti128 $1,%ymm10,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm10
- cmp $0x50,%rax
- jl .Lxorpart4
- vpxor 0x40(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm10
- cmp $0x60,%rax
- jl .Lxorpart4
- vpxor 0x50(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm10
- cmp $0x70,%rax
- jl .Lxorpart4
- vpxor 0x60(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm10
- cmp $0x80,%rax
- jl .Lxorpart4
- vpxor 0x70(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x70(%rsi)
-
- # o0 = i0 ^ (x0 + s0), third block
- vpaddd %ymm11,%ymm4,%ymm10
- cmp $0x90,%rax
- jl .Lxorpart4
- vpxor 0x80(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x80(%rsi)
- vextracti128 $1,%ymm10,%xmm4
- # o1 = i1 ^ (x1 + s1), third block
- vpaddd %ymm12,%ymm5,%ymm10
- cmp $0xa0,%rax
- jl .Lxorpart4
- vpxor 0x90(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x90(%rsi)
- vextracti128 $1,%ymm10,%xmm5
- # o2 = i2 ^ (x2 + s2), third block
- vpaddd %ymm13,%ymm6,%ymm10
- cmp $0xb0,%rax
- jl .Lxorpart4
- vpxor 0xa0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xa0(%rsi)
- vextracti128 $1,%ymm10,%xmm6
- # o3 = i3 ^ (x3 + s3), third block
- vpaddd %ymm15,%ymm7,%ymm10
- cmp $0xc0,%rax
- jl .Lxorpart4
- vpxor 0xb0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xb0(%rsi)
- vextracti128 $1,%ymm10,%xmm7
-
- # xor and write fourth block
- vmovdqa %xmm4,%xmm10
- cmp $0xd0,%rax
- jl .Lxorpart4
- vpxor 0xc0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xc0(%rsi)
-
- vmovdqa %xmm5,%xmm10
- cmp $0xe0,%rax
- jl .Lxorpart4
- vpxor 0xd0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xd0(%rsi)
-
- vmovdqa %xmm6,%xmm10
- cmp $0xf0,%rax
- jl .Lxorpart4
- vpxor 0xe0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xe0(%rsi)
-
- vmovdqa %xmm7,%xmm10
- cmp $0x100,%rax
- jl .Lxorpart4
- vpxor 0xf0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xf0(%rsi)
-
-.Ldone4:
- vzeroupper
- RET
-
-.Lxorpart4:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone4
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea 8(%rsp),%r10
- sub $0x10,%rsp
- and $~31,%rsp
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- vpxor 0x00(%rsp),%xmm10,%xmm10
- vmovdqa %xmm10,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- lea -8(%r10),%rsp
- jmp .Ldone4
-
-SYM_FUNC_END(chacha_4block_xor_avx2)
-
-SYM_FUNC_START(chacha_8block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 8 data blocks output, o
- # %rdx: up to 8 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts eight consecutive ChaCha blocks by loading
- # the state matrix in AVX registers eight times. As we need some
- # scratch registers, we save the first four registers on the stack. The
- # algorithm performs each operation on the corresponding word of each
- # state matrix, hence requires no word shuffling. For final XORing step
- # we transpose the matrix by interleaving 32-, 64- and then 128-bit
- # words, which allows us to do XOR in AVX registers. 8/16-bit word
- # rotation is done with the slightly better performing byte shuffling,
- # 7/12-bit word rotation uses traditional shift+OR.
-
- vzeroupper
- # 4 * 32 byte stack, 32-byte aligned
- lea 8(%rsp),%r10
- and $~31, %rsp
- sub $0x80, %rsp
- mov %rcx,%rax
-
- # x0..15[0-7] = s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpbroadcastd 0x04(%rdi),%ymm1
- vpbroadcastd 0x08(%rdi),%ymm2
- vpbroadcastd 0x0c(%rdi),%ymm3
- vpbroadcastd 0x10(%rdi),%ymm4
- vpbroadcastd 0x14(%rdi),%ymm5
- vpbroadcastd 0x18(%rdi),%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm7
- vpbroadcastd 0x20(%rdi),%ymm8
- vpbroadcastd 0x24(%rdi),%ymm9
- vpbroadcastd 0x28(%rdi),%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm11
- vpbroadcastd 0x30(%rdi),%ymm12
- vpbroadcastd 0x34(%rdi),%ymm13
- vpbroadcastd 0x38(%rdi),%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm15
- # x0..3 on stack
- vmovdqa %ymm0,0x00(%rsp)
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa %ymm3,0x60(%rsp)
-
- vmovdqa CTRINC(%rip),%ymm1
- vmovdqa ROT8(%rip),%ymm2
- vmovdqa ROT16(%rip),%ymm3
-
- # x12 += counter values 0-3
- vpaddd %ymm1,%ymm12,%ymm12
-
-.Ldoubleround8:
- # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- vpaddd 0x00(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm3,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- vpaddd 0x20(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm3,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- vpaddd 0x40(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm3,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- vpaddd 0x60(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm3,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $12,%ymm4,%ymm0
- vpsrld $20,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $12,%ymm5,%ymm0
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $12,%ymm6,%ymm0
- vpsrld $20,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm11,%ymm7,%ymm7
- vpslld $12,%ymm7,%ymm0
- vpsrld $20,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
-
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- vpaddd 0x00(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm2,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- vpaddd 0x20(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm2,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- vpaddd 0x40(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm2,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- vpaddd 0x60(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm2,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm0
- vpsrld $25,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm0
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm0
- vpsrld $25,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm11,%ymm7,%ymm7
- vpslld $7,%ymm7,%ymm0
- vpsrld $25,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- vpaddd 0x00(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm3,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
- vpaddd 0x20(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm3,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- vpaddd 0x40(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm3,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- vpaddd 0x60(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm3,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxor %ymm10,%ymm5,%ymm5
- vpslld $12,%ymm5,%ymm0
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxor %ymm11,%ymm6,%ymm6
- vpslld $12,%ymm6,%ymm0
- vpsrld $20,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpslld $12,%ymm7,%ymm0
- vpsrld $20,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm9,%ymm4,%ymm4
- vpslld $12,%ymm4,%ymm0
- vpsrld $20,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- vpaddd 0x00(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm2,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- vpaddd 0x20(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm2,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- vpaddd 0x40(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm2,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- vpaddd 0x60(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm2,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxor %ymm10,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm0
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxor %ymm11,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm0
- vpsrld $25,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpslld $7,%ymm7,%ymm0
- vpsrld $25,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm9,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm0
- vpsrld $25,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
-
- sub $2,%r8d
- jnz .Ldoubleround8
-
- # x0..15[0-3] += s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpaddd 0x00(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpbroadcastd 0x04(%rdi),%ymm0
- vpaddd 0x20(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpbroadcastd 0x08(%rdi),%ymm0
- vpaddd 0x40(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpbroadcastd 0x0c(%rdi),%ymm0
- vpaddd 0x60(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpbroadcastd 0x10(%rdi),%ymm0
- vpaddd %ymm0,%ymm4,%ymm4
- vpbroadcastd 0x14(%rdi),%ymm0
- vpaddd %ymm0,%ymm5,%ymm5
- vpbroadcastd 0x18(%rdi),%ymm0
- vpaddd %ymm0,%ymm6,%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm0
- vpaddd %ymm0,%ymm7,%ymm7
- vpbroadcastd 0x20(%rdi),%ymm0
- vpaddd %ymm0,%ymm8,%ymm8
- vpbroadcastd 0x24(%rdi),%ymm0
- vpaddd %ymm0,%ymm9,%ymm9
- vpbroadcastd 0x28(%rdi),%ymm0
- vpaddd %ymm0,%ymm10,%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm0
- vpaddd %ymm0,%ymm11,%ymm11
- vpbroadcastd 0x30(%rdi),%ymm0
- vpaddd %ymm0,%ymm12,%ymm12
- vpbroadcastd 0x34(%rdi),%ymm0
- vpaddd %ymm0,%ymm13,%ymm13
- vpbroadcastd 0x38(%rdi),%ymm0
- vpaddd %ymm0,%ymm14,%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm0
- vpaddd %ymm0,%ymm15,%ymm15
-
- # x12 += counter values 0-3
- vpaddd %ymm1,%ymm12,%ymm12
-
- # interleave 32-bit words in state n, n+1
- vmovdqa 0x00(%rsp),%ymm0
- vmovdqa 0x20(%rsp),%ymm1
- vpunpckldq %ymm1,%ymm0,%ymm2
- vpunpckhdq %ymm1,%ymm0,%ymm1
- vmovdqa %ymm2,0x00(%rsp)
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa 0x40(%rsp),%ymm0
- vmovdqa 0x60(%rsp),%ymm1
- vpunpckldq %ymm1,%ymm0,%ymm2
- vpunpckhdq %ymm1,%ymm0,%ymm1
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa %ymm1,0x60(%rsp)
- vmovdqa %ymm4,%ymm0
- vpunpckldq %ymm5,%ymm0,%ymm4
- vpunpckhdq %ymm5,%ymm0,%ymm5
- vmovdqa %ymm6,%ymm0
- vpunpckldq %ymm7,%ymm0,%ymm6
- vpunpckhdq %ymm7,%ymm0,%ymm7
- vmovdqa %ymm8,%ymm0
- vpunpckldq %ymm9,%ymm0,%ymm8
- vpunpckhdq %ymm9,%ymm0,%ymm9
- vmovdqa %ymm10,%ymm0
- vpunpckldq %ymm11,%ymm0,%ymm10
- vpunpckhdq %ymm11,%ymm0,%ymm11
- vmovdqa %ymm12,%ymm0
- vpunpckldq %ymm13,%ymm0,%ymm12
- vpunpckhdq %ymm13,%ymm0,%ymm13
- vmovdqa %ymm14,%ymm0
- vpunpckldq %ymm15,%ymm0,%ymm14
- vpunpckhdq %ymm15,%ymm0,%ymm15
-
- # interleave 64-bit words in state n, n+2
- vmovdqa 0x00(%rsp),%ymm0
- vmovdqa 0x40(%rsp),%ymm2
- vpunpcklqdq %ymm2,%ymm0,%ymm1
- vpunpckhqdq %ymm2,%ymm0,%ymm2
- vmovdqa %ymm1,0x00(%rsp)
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa 0x20(%rsp),%ymm0
- vmovdqa 0x60(%rsp),%ymm2
- vpunpcklqdq %ymm2,%ymm0,%ymm1
- vpunpckhqdq %ymm2,%ymm0,%ymm2
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa %ymm2,0x60(%rsp)
- vmovdqa %ymm4,%ymm0
- vpunpcklqdq %ymm6,%ymm0,%ymm4
- vpunpckhqdq %ymm6,%ymm0,%ymm6
- vmovdqa %ymm5,%ymm0
- vpunpcklqdq %ymm7,%ymm0,%ymm5
- vpunpckhqdq %ymm7,%ymm0,%ymm7
- vmovdqa %ymm8,%ymm0
- vpunpcklqdq %ymm10,%ymm0,%ymm8
- vpunpckhqdq %ymm10,%ymm0,%ymm10
- vmovdqa %ymm9,%ymm0
- vpunpcklqdq %ymm11,%ymm0,%ymm9
- vpunpckhqdq %ymm11,%ymm0,%ymm11
- vmovdqa %ymm12,%ymm0
- vpunpcklqdq %ymm14,%ymm0,%ymm12
- vpunpckhqdq %ymm14,%ymm0,%ymm14
- vmovdqa %ymm13,%ymm0
- vpunpcklqdq %ymm15,%ymm0,%ymm13
- vpunpckhqdq %ymm15,%ymm0,%ymm15
-
- # interleave 128-bit words in state n, n+4
- # xor/write first four blocks
- vmovdqa 0x00(%rsp),%ymm1
- vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
- cmp $0x0020,%rax
- jl .Lxorpart8
- vpxor 0x0000(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0000(%rsi)
- vperm2i128 $0x31,%ymm4,%ymm1,%ymm4
-
- vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
- cmp $0x0040,%rax
- jl .Lxorpart8
- vpxor 0x0020(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0020(%rsi)
- vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
-
- vmovdqa 0x40(%rsp),%ymm1
- vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
- cmp $0x0060,%rax
- jl .Lxorpart8
- vpxor 0x0040(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0040(%rsi)
- vperm2i128 $0x31,%ymm6,%ymm1,%ymm6
-
- vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
- cmp $0x0080,%rax
- jl .Lxorpart8
- vpxor 0x0060(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0060(%rsi)
- vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
-
- vmovdqa 0x20(%rsp),%ymm1
- vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
- cmp $0x00a0,%rax
- jl .Lxorpart8
- vpxor 0x0080(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0080(%rsi)
- vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
-
- vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
- cmp $0x00c0,%rax
- jl .Lxorpart8
- vpxor 0x00a0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x00a0(%rsi)
- vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
-
- vmovdqa 0x60(%rsp),%ymm1
- vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
- cmp $0x00e0,%rax
- jl .Lxorpart8
- vpxor 0x00c0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x00c0(%rsi)
- vperm2i128 $0x31,%ymm7,%ymm1,%ymm7
-
- vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
- cmp $0x0100,%rax
- jl .Lxorpart8
- vpxor 0x00e0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x00e0(%rsi)
- vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
-
- # xor remaining blocks, write to output
- vmovdqa %ymm4,%ymm0
- cmp $0x0120,%rax
- jl .Lxorpart8
- vpxor 0x0100(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0100(%rsi)
-
- vmovdqa %ymm12,%ymm0
- cmp $0x0140,%rax
- jl .Lxorpart8
- vpxor 0x0120(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0120(%rsi)
-
- vmovdqa %ymm6,%ymm0
- cmp $0x0160,%rax
- jl .Lxorpart8
- vpxor 0x0140(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0140(%rsi)
-
- vmovdqa %ymm14,%ymm0
- cmp $0x0180,%rax
- jl .Lxorpart8
- vpxor 0x0160(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0160(%rsi)
-
- vmovdqa %ymm5,%ymm0
- cmp $0x01a0,%rax
- jl .Lxorpart8
- vpxor 0x0180(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0180(%rsi)
-
- vmovdqa %ymm13,%ymm0
- cmp $0x01c0,%rax
- jl .Lxorpart8
- vpxor 0x01a0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x01a0(%rsi)
-
- vmovdqa %ymm7,%ymm0
- cmp $0x01e0,%rax
- jl .Lxorpart8
- vpxor 0x01c0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x01c0(%rsi)
-
- vmovdqa %ymm15,%ymm0
- cmp $0x0200,%rax
- jl .Lxorpart8
- vpxor 0x01e0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x01e0(%rsi)
-
-.Ldone8:
- vzeroupper
- lea -8(%r10),%rsp
- RET
-
-.Lxorpart8:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x1f,%r9
- jz .Ldone8
- and $~0x1f,%rax
-
- mov %rsi,%r11
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- vpxor 0x00(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- jmp .Ldone8
-
-SYM_FUNC_END(chacha_8block_xor_avx2)
diff --git a/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S b/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S
deleted file mode 100644
index 259383e1ad44..000000000000
--- a/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S
+++ /dev/null
@@ -1,836 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
- *
- * Copyright (C) 2018 Martin Willi
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
-.align 32
-CTR2BL: .octa 0x00000000000000000000000000000000
- .octa 0x00000000000000000000000000000001
-
-.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
-.align 32
-CTR4BL: .octa 0x00000000000000000000000000000002
- .octa 0x00000000000000000000000000000003
-
-.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
-.align 32
-CTR8BL: .octa 0x00000003000000020000000100000000
- .octa 0x00000007000000060000000500000004
-
-.text
-
-SYM_FUNC_START(chacha_2block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 2 data blocks output, o
- # %rdx: up to 2 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts two ChaCha blocks by loading the state
- # matrix twice across four AVX registers. It performs matrix operations
- # on four words in each matrix in parallel, but requires shuffling to
- # rearrange the words after each round.
-
- vzeroupper
-
- # x0..3[0-2] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
-
- vmovdqa %ymm0,%ymm8
- vmovdqa %ymm1,%ymm9
- vmovdqa %ymm2,%ymm10
- vmovdqa %ymm3,%ymm11
-
-.Ldoubleround:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
-
- sub $2,%r8d
- jnz .Ldoubleround
-
- # o0 = i0 ^ (x0 + s0)
- vpaddd %ymm8,%ymm0,%ymm7
- cmp $0x10,%rcx
- jl .Lxorpart2
- vpxord 0x00(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x00(%rsi)
- vextracti128 $1,%ymm7,%xmm0
- # o1 = i1 ^ (x1 + s1)
- vpaddd %ymm9,%ymm1,%ymm7
- cmp $0x20,%rcx
- jl .Lxorpart2
- vpxord 0x10(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x10(%rsi)
- vextracti128 $1,%ymm7,%xmm1
- # o2 = i2 ^ (x2 + s2)
- vpaddd %ymm10,%ymm2,%ymm7
- cmp $0x30,%rcx
- jl .Lxorpart2
- vpxord 0x20(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x20(%rsi)
- vextracti128 $1,%ymm7,%xmm2
- # o3 = i3 ^ (x3 + s3)
- vpaddd %ymm11,%ymm3,%ymm7
- cmp $0x40,%rcx
- jl .Lxorpart2
- vpxord 0x30(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x30(%rsi)
- vextracti128 $1,%ymm7,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm7
- cmp $0x50,%rcx
- jl .Lxorpart2
- vpxord 0x40(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm7
- cmp $0x60,%rcx
- jl .Lxorpart2
- vpxord 0x50(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm7
- cmp $0x70,%rcx
- jl .Lxorpart2
- vpxord 0x60(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm7
- cmp $0x80,%rcx
- jl .Lxorpart2
- vpxord 0x70(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x70(%rsi)
-
-.Ldone2:
- vzeroupper
- RET
-
-.Lxorpart2:
- # xor remaining bytes from partial register into output
- mov %rcx,%rax
- and $0xf,%rcx
- jz .Ldone2
- mov %rax,%r9
- and $~0xf,%r9
-
- mov $1,%rax
- shld %cl,%rax,%rax
- sub $1,%rax
- kmovq %rax,%k1
-
- vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
- vpxord %xmm7,%xmm1,%xmm1
- vmovdqu8 %xmm1,(%rsi,%r9){%k1}
-
- jmp .Ldone2
-
-SYM_FUNC_END(chacha_2block_xor_avx512vl)
-
-SYM_FUNC_START(chacha_4block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts four ChaCha blocks by loading the state
- # matrix four times across eight AVX registers. It performs matrix
- # operations on four words in two matrices in parallel, sequentially
- # to the operations on the four words of the other two matrices. The
- # required word shuffling has a rather high latency, we can do the
- # arithmetic on two matrix-pairs without much slowdown.
-
- vzeroupper
-
- # x0..3[0-4] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vmovdqa %ymm0,%ymm4
- vmovdqa %ymm1,%ymm5
- vmovdqa %ymm2,%ymm6
- vmovdqa %ymm3,%ymm7
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
- vpaddd CTR4BL(%rip),%ymm7,%ymm7
-
- vmovdqa %ymm0,%ymm11
- vmovdqa %ymm1,%ymm12
- vmovdqa %ymm2,%ymm13
- vmovdqa %ymm3,%ymm14
- vmovdqa %ymm7,%ymm15
-
-.Ldoubleround4:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $16,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- vpshufd $0x39,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
- vpshufd $0x93,%ymm7,%ymm7
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $16,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- vpshufd $0x93,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
- vpshufd $0x39,%ymm7,%ymm7
-
- sub $2,%r8d
- jnz .Ldoubleround4
-
- # o0 = i0 ^ (x0 + s0), first block
- vpaddd %ymm11,%ymm0,%ymm10
- cmp $0x10,%rcx
- jl .Lxorpart4
- vpxord 0x00(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x00(%rsi)
- vextracti128 $1,%ymm10,%xmm0
- # o1 = i1 ^ (x1 + s1), first block
- vpaddd %ymm12,%ymm1,%ymm10
- cmp $0x20,%rcx
- jl .Lxorpart4
- vpxord 0x10(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x10(%rsi)
- vextracti128 $1,%ymm10,%xmm1
- # o2 = i2 ^ (x2 + s2), first block
- vpaddd %ymm13,%ymm2,%ymm10
- cmp $0x30,%rcx
- jl .Lxorpart4
- vpxord 0x20(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x20(%rsi)
- vextracti128 $1,%ymm10,%xmm2
- # o3 = i3 ^ (x3 + s3), first block
- vpaddd %ymm14,%ymm3,%ymm10
- cmp $0x40,%rcx
- jl .Lxorpart4
- vpxord 0x30(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x30(%rsi)
- vextracti128 $1,%ymm10,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm10
- cmp $0x50,%rcx
- jl .Lxorpart4
- vpxord 0x40(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm10
- cmp $0x60,%rcx
- jl .Lxorpart4
- vpxord 0x50(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm10
- cmp $0x70,%rcx
- jl .Lxorpart4
- vpxord 0x60(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm10
- cmp $0x80,%rcx
- jl .Lxorpart4
- vpxord 0x70(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x70(%rsi)
-
- # o0 = i0 ^ (x0 + s0), third block
- vpaddd %ymm11,%ymm4,%ymm10
- cmp $0x90,%rcx
- jl .Lxorpart4
- vpxord 0x80(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x80(%rsi)
- vextracti128 $1,%ymm10,%xmm4
- # o1 = i1 ^ (x1 + s1), third block
- vpaddd %ymm12,%ymm5,%ymm10
- cmp $0xa0,%rcx
- jl .Lxorpart4
- vpxord 0x90(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x90(%rsi)
- vextracti128 $1,%ymm10,%xmm5
- # o2 = i2 ^ (x2 + s2), third block
- vpaddd %ymm13,%ymm6,%ymm10
- cmp $0xb0,%rcx
- jl .Lxorpart4
- vpxord 0xa0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xa0(%rsi)
- vextracti128 $1,%ymm10,%xmm6
- # o3 = i3 ^ (x3 + s3), third block
- vpaddd %ymm15,%ymm7,%ymm10
- cmp $0xc0,%rcx
- jl .Lxorpart4
- vpxord 0xb0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xb0(%rsi)
- vextracti128 $1,%ymm10,%xmm7
-
- # xor and write fourth block
- vmovdqa %xmm4,%xmm10
- cmp $0xd0,%rcx
- jl .Lxorpart4
- vpxord 0xc0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xc0(%rsi)
-
- vmovdqa %xmm5,%xmm10
- cmp $0xe0,%rcx
- jl .Lxorpart4
- vpxord 0xd0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xd0(%rsi)
-
- vmovdqa %xmm6,%xmm10
- cmp $0xf0,%rcx
- jl .Lxorpart4
- vpxord 0xe0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xe0(%rsi)
-
- vmovdqa %xmm7,%xmm10
- cmp $0x100,%rcx
- jl .Lxorpart4
- vpxord 0xf0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xf0(%rsi)
-
-.Ldone4:
- vzeroupper
- RET
-
-.Lxorpart4:
- # xor remaining bytes from partial register into output
- mov %rcx,%rax
- and $0xf,%rcx
- jz .Ldone4
- mov %rax,%r9
- and $~0xf,%r9
-
- mov $1,%rax
- shld %cl,%rax,%rax
- sub $1,%rax
- kmovq %rax,%k1
-
- vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
- vpxord %xmm10,%xmm1,%xmm1
- vmovdqu8 %xmm1,(%rsi,%r9){%k1}
-
- jmp .Ldone4
-
-SYM_FUNC_END(chacha_4block_xor_avx512vl)
-
-SYM_FUNC_START(chacha_8block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 8 data blocks output, o
- # %rdx: up to 8 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts eight consecutive ChaCha blocks by loading
- # the state matrix in AVX registers eight times. Compared to AVX2, this
- # mostly benefits from the new rotate instructions in VL and the
- # additional registers.
-
- vzeroupper
-
- # x0..15[0-7] = s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpbroadcastd 0x04(%rdi),%ymm1
- vpbroadcastd 0x08(%rdi),%ymm2
- vpbroadcastd 0x0c(%rdi),%ymm3
- vpbroadcastd 0x10(%rdi),%ymm4
- vpbroadcastd 0x14(%rdi),%ymm5
- vpbroadcastd 0x18(%rdi),%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm7
- vpbroadcastd 0x20(%rdi),%ymm8
- vpbroadcastd 0x24(%rdi),%ymm9
- vpbroadcastd 0x28(%rdi),%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm11
- vpbroadcastd 0x30(%rdi),%ymm12
- vpbroadcastd 0x34(%rdi),%ymm13
- vpbroadcastd 0x38(%rdi),%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm15
-
- # x12 += counter values 0-3
- vpaddd CTR8BL(%rip),%ymm12,%ymm12
-
- vmovdqa64 %ymm0,%ymm16
- vmovdqa64 %ymm1,%ymm17
- vmovdqa64 %ymm2,%ymm18
- vmovdqa64 %ymm3,%ymm19
- vmovdqa64 %ymm4,%ymm20
- vmovdqa64 %ymm5,%ymm21
- vmovdqa64 %ymm6,%ymm22
- vmovdqa64 %ymm7,%ymm23
- vmovdqa64 %ymm8,%ymm24
- vmovdqa64 %ymm9,%ymm25
- vmovdqa64 %ymm10,%ymm26
- vmovdqa64 %ymm11,%ymm27
- vmovdqa64 %ymm12,%ymm28
- vmovdqa64 %ymm13,%ymm29
- vmovdqa64 %ymm14,%ymm30
- vmovdqa64 %ymm15,%ymm31
-
-.Ldoubleround8:
- # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- vpaddd %ymm0,%ymm4,%ymm0
- vpxord %ymm0,%ymm12,%ymm12
- vprold $16,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- vpaddd %ymm1,%ymm5,%ymm1
- vpxord %ymm1,%ymm13,%ymm13
- vprold $16,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- vpaddd %ymm2,%ymm6,%ymm2
- vpxord %ymm2,%ymm14,%ymm14
- vprold $16,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- vpaddd %ymm3,%ymm7,%ymm3
- vpxord %ymm3,%ymm15,%ymm15
- vprold $16,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxord %ymm8,%ymm4,%ymm4
- vprold $12,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxord %ymm9,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxord %ymm10,%ymm6,%ymm6
- vprold $12,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxord %ymm11,%ymm7,%ymm7
- vprold $12,%ymm7,%ymm7
-
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- vpaddd %ymm0,%ymm4,%ymm0
- vpxord %ymm0,%ymm12,%ymm12
- vprold $8,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- vpaddd %ymm1,%ymm5,%ymm1
- vpxord %ymm1,%ymm13,%ymm13
- vprold $8,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- vpaddd %ymm2,%ymm6,%ymm2
- vpxord %ymm2,%ymm14,%ymm14
- vprold $8,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- vpaddd %ymm3,%ymm7,%ymm3
- vpxord %ymm3,%ymm15,%ymm15
- vprold $8,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxord %ymm8,%ymm4,%ymm4
- vprold $7,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxord %ymm9,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxord %ymm10,%ymm6,%ymm6
- vprold $7,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxord %ymm11,%ymm7,%ymm7
- vprold $7,%ymm7,%ymm7
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- vpaddd %ymm0,%ymm5,%ymm0
- vpxord %ymm0,%ymm15,%ymm15
- vprold $16,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- vpaddd %ymm1,%ymm6,%ymm1
- vpxord %ymm1,%ymm12,%ymm12
- vprold $16,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- vpaddd %ymm2,%ymm7,%ymm2
- vpxord %ymm2,%ymm13,%ymm13
- vprold $16,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- vpaddd %ymm3,%ymm4,%ymm3
- vpxord %ymm3,%ymm14,%ymm14
- vprold $16,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxord %ymm10,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxord %ymm11,%ymm6,%ymm6
- vprold $12,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxord %ymm8,%ymm7,%ymm7
- vprold $12,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxord %ymm9,%ymm4,%ymm4
- vprold $12,%ymm4,%ymm4
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- vpaddd %ymm0,%ymm5,%ymm0
- vpxord %ymm0,%ymm15,%ymm15
- vprold $8,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- vpaddd %ymm1,%ymm6,%ymm1
- vpxord %ymm1,%ymm12,%ymm12
- vprold $8,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- vpaddd %ymm2,%ymm7,%ymm2
- vpxord %ymm2,%ymm13,%ymm13
- vprold $8,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- vpaddd %ymm3,%ymm4,%ymm3
- vpxord %ymm3,%ymm14,%ymm14
- vprold $8,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxord %ymm10,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxord %ymm11,%ymm6,%ymm6
- vprold $7,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxord %ymm8,%ymm7,%ymm7
- vprold $7,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxord %ymm9,%ymm4,%ymm4
- vprold $7,%ymm4,%ymm4
-
- sub $2,%r8d
- jnz .Ldoubleround8
-
- # x0..15[0-3] += s[0..15]
- vpaddd %ymm16,%ymm0,%ymm0
- vpaddd %ymm17,%ymm1,%ymm1
- vpaddd %ymm18,%ymm2,%ymm2
- vpaddd %ymm19,%ymm3,%ymm3
- vpaddd %ymm20,%ymm4,%ymm4
- vpaddd %ymm21,%ymm5,%ymm5
- vpaddd %ymm22,%ymm6,%ymm6
- vpaddd %ymm23,%ymm7,%ymm7
- vpaddd %ymm24,%ymm8,%ymm8
- vpaddd %ymm25,%ymm9,%ymm9
- vpaddd %ymm26,%ymm10,%ymm10
- vpaddd %ymm27,%ymm11,%ymm11
- vpaddd %ymm28,%ymm12,%ymm12
- vpaddd %ymm29,%ymm13,%ymm13
- vpaddd %ymm30,%ymm14,%ymm14
- vpaddd %ymm31,%ymm15,%ymm15
-
- # interleave 32-bit words in state n, n+1
- vpunpckldq %ymm1,%ymm0,%ymm16
- vpunpckhdq %ymm1,%ymm0,%ymm17
- vpunpckldq %ymm3,%ymm2,%ymm18
- vpunpckhdq %ymm3,%ymm2,%ymm19
- vpunpckldq %ymm5,%ymm4,%ymm20
- vpunpckhdq %ymm5,%ymm4,%ymm21
- vpunpckldq %ymm7,%ymm6,%ymm22
- vpunpckhdq %ymm7,%ymm6,%ymm23
- vpunpckldq %ymm9,%ymm8,%ymm24
- vpunpckhdq %ymm9,%ymm8,%ymm25
- vpunpckldq %ymm11,%ymm10,%ymm26
- vpunpckhdq %ymm11,%ymm10,%ymm27
- vpunpckldq %ymm13,%ymm12,%ymm28
- vpunpckhdq %ymm13,%ymm12,%ymm29
- vpunpckldq %ymm15,%ymm14,%ymm30
- vpunpckhdq %ymm15,%ymm14,%ymm31
-
- # interleave 64-bit words in state n, n+2
- vpunpcklqdq %ymm18,%ymm16,%ymm0
- vpunpcklqdq %ymm19,%ymm17,%ymm1
- vpunpckhqdq %ymm18,%ymm16,%ymm2
- vpunpckhqdq %ymm19,%ymm17,%ymm3
- vpunpcklqdq %ymm22,%ymm20,%ymm4
- vpunpcklqdq %ymm23,%ymm21,%ymm5
- vpunpckhqdq %ymm22,%ymm20,%ymm6
- vpunpckhqdq %ymm23,%ymm21,%ymm7
- vpunpcklqdq %ymm26,%ymm24,%ymm8
- vpunpcklqdq %ymm27,%ymm25,%ymm9
- vpunpckhqdq %ymm26,%ymm24,%ymm10
- vpunpckhqdq %ymm27,%ymm25,%ymm11
- vpunpcklqdq %ymm30,%ymm28,%ymm12
- vpunpcklqdq %ymm31,%ymm29,%ymm13
- vpunpckhqdq %ymm30,%ymm28,%ymm14
- vpunpckhqdq %ymm31,%ymm29,%ymm15
-
- # interleave 128-bit words in state n, n+4
- # xor/write first four blocks
- vmovdqa64 %ymm0,%ymm16
- vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
- cmp $0x0020,%rcx
- jl .Lxorpart8
- vpxord 0x0000(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0000(%rsi)
- vmovdqa64 %ymm16,%ymm0
- vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
-
- vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
- cmp $0x0040,%rcx
- jl .Lxorpart8
- vpxord 0x0020(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0020(%rsi)
- vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
-
- vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
- cmp $0x0060,%rcx
- jl .Lxorpart8
- vpxord 0x0040(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0040(%rsi)
- vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
-
- vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
- cmp $0x0080,%rcx
- jl .Lxorpart8
- vpxord 0x0060(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0060(%rsi)
- vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
-
- vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
- cmp $0x00a0,%rcx
- jl .Lxorpart8
- vpxord 0x0080(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0080(%rsi)
- vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
-
- vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
- cmp $0x00c0,%rcx
- jl .Lxorpart8
- vpxord 0x00a0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x00a0(%rsi)
- vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
-
- vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
- cmp $0x00e0,%rcx
- jl .Lxorpart8
- vpxord 0x00c0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x00c0(%rsi)
- vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
-
- vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
- cmp $0x0100,%rcx
- jl .Lxorpart8
- vpxord 0x00e0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x00e0(%rsi)
- vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
-
- # xor remaining blocks, write to output
- vmovdqa64 %ymm4,%ymm0
- cmp $0x0120,%rcx
- jl .Lxorpart8
- vpxord 0x0100(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0100(%rsi)
-
- vmovdqa64 %ymm12,%ymm0
- cmp $0x0140,%rcx
- jl .Lxorpart8
- vpxord 0x0120(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0120(%rsi)
-
- vmovdqa64 %ymm6,%ymm0
- cmp $0x0160,%rcx
- jl .Lxorpart8
- vpxord 0x0140(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0140(%rsi)
-
- vmovdqa64 %ymm14,%ymm0
- cmp $0x0180,%rcx
- jl .Lxorpart8
- vpxord 0x0160(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0160(%rsi)
-
- vmovdqa64 %ymm5,%ymm0
- cmp $0x01a0,%rcx
- jl .Lxorpart8
- vpxord 0x0180(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0180(%rsi)
-
- vmovdqa64 %ymm13,%ymm0
- cmp $0x01c0,%rcx
- jl .Lxorpart8
- vpxord 0x01a0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x01a0(%rsi)
-
- vmovdqa64 %ymm7,%ymm0
- cmp $0x01e0,%rcx
- jl .Lxorpart8
- vpxord 0x01c0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x01c0(%rsi)
-
- vmovdqa64 %ymm15,%ymm0
- cmp $0x0200,%rcx
- jl .Lxorpart8
- vpxord 0x01e0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x01e0(%rsi)
-
-.Ldone8:
- vzeroupper
- RET
-
-.Lxorpart8:
- # xor remaining bytes from partial register into output
- mov %rcx,%rax
- and $0x1f,%rcx
- jz .Ldone8
- mov %rax,%r9
- and $~0x1f,%r9
-
- mov $1,%rax
- shld %cl,%rax,%rax
- sub $1,%rax
- kmovq %rax,%k1
-
- vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
- vpxord %ymm0,%ymm1,%ymm1
- vmovdqu8 %ymm1,(%rsi,%r9){%k1}
-
- jmp .Ldone8
-
-SYM_FUNC_END(chacha_8block_xor_avx512vl)
diff --git a/arch/x86/lib/crypto/chacha-ssse3-x86_64.S b/arch/x86/lib/crypto/chacha-ssse3-x86_64.S
deleted file mode 100644
index 7111949cd5b9..000000000000
--- a/arch/x86/lib/crypto/chacha-ssse3-x86_64.S
+++ /dev/null
@@ -1,791 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-.section .rodata.cst16.ROT8, "aM", @progbits, 16
-.align 16
-ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
-.section .rodata.cst16.ROT16, "aM", @progbits, 16
-.align 16
-ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
-.section .rodata.cst16.CTRINC, "aM", @progbits, 16
-.align 16
-CTRINC: .octa 0x00000003000000020000000100000000
-
-.text
-
-/*
- * chacha_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
- * function performs matrix operations on four words in parallel, but requires
- * shuffling to rearrange the words after each round. 8/16-bit word rotation is
- * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
- * rotation uses traditional shift+OR.
- *
- * The round count is given in %r8d.
- *
- * Clobbers: %r8d, %xmm4-%xmm7
- */
-SYM_FUNC_START_LOCAL(chacha_permute)
-
- movdqa ROT8(%rip),%xmm4
- movdqa ROT16(%rip),%xmm5
-
-.Ldoubleround:
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm5,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm6
- pslld $12,%xmm6
- psrld $20,%xmm1
- por %xmm6,%xmm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm4,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm7
- pslld $7,%xmm7
- psrld $25,%xmm1
- por %xmm7,%xmm1
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- pshufd $0x39,%xmm1,%xmm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- pshufd $0x4e,%xmm2,%xmm2
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- pshufd $0x93,%xmm3,%xmm3
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm5,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm6
- pslld $12,%xmm6
- psrld $20,%xmm1
- por %xmm6,%xmm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm4,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm7
- pslld $7,%xmm7
- psrld $25,%xmm1
- por %xmm7,%xmm1
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- pshufd $0x93,%xmm1,%xmm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- pshufd $0x4e,%xmm2,%xmm2
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- pshufd $0x39,%xmm3,%xmm3
-
- sub $2,%r8d
- jnz .Ldoubleround
-
- RET
-SYM_FUNC_END(chacha_permute)
-
-SYM_FUNC_START(chacha_block_xor_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: up to 1 data block output, o
- # %rdx: up to 1 data block input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
- FRAME_BEGIN
-
- # x0..3 = s0..3
- movdqu 0x00(%rdi),%xmm0
- movdqu 0x10(%rdi),%xmm1
- movdqu 0x20(%rdi),%xmm2
- movdqu 0x30(%rdi),%xmm3
- movdqa %xmm0,%xmm8
- movdqa %xmm1,%xmm9
- movdqa %xmm2,%xmm10
- movdqa %xmm3,%xmm11
-
- mov %rcx,%rax
- call chacha_permute
-
- # o0 = i0 ^ (x0 + s0)
- paddd %xmm8,%xmm0
- cmp $0x10,%rax
- jl .Lxorpart
- movdqu 0x00(%rdx),%xmm4
- pxor %xmm4,%xmm0
- movdqu %xmm0,0x00(%rsi)
- # o1 = i1 ^ (x1 + s1)
- paddd %xmm9,%xmm1
- movdqa %xmm1,%xmm0
- cmp $0x20,%rax
- jl .Lxorpart
- movdqu 0x10(%rdx),%xmm0
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x10(%rsi)
- # o2 = i2 ^ (x2 + s2)
- paddd %xmm10,%xmm2
- movdqa %xmm2,%xmm0
- cmp $0x30,%rax
- jl .Lxorpart
- movdqu 0x20(%rdx),%xmm0
- pxor %xmm2,%xmm0
- movdqu %xmm0,0x20(%rsi)
- # o3 = i3 ^ (x3 + s3)
- paddd %xmm11,%xmm3
- movdqa %xmm3,%xmm0
- cmp $0x40,%rax
- jl .Lxorpart
- movdqu 0x30(%rdx),%xmm0
- pxor %xmm3,%xmm0
- movdqu %xmm0,0x30(%rsi)
-
-.Ldone:
- FRAME_END
- RET
-
-.Lxorpart:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea 8(%rsp),%r10
- sub $0x10,%rsp
- and $~31,%rsp
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- pxor 0x00(%rsp),%xmm0
- movdqa %xmm0,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- lea -8(%r10),%rsp
- jmp .Ldone
-
-SYM_FUNC_END(chacha_block_xor_ssse3)
-
-SYM_FUNC_START(hchacha_block_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: output (8 32-bit words)
- # %edx: nrounds
- FRAME_BEGIN
-
- movdqu 0x00(%rdi),%xmm0
- movdqu 0x10(%rdi),%xmm1
- movdqu 0x20(%rdi),%xmm2
- movdqu 0x30(%rdi),%xmm3
-
- mov %edx,%r8d
- call chacha_permute
-
- movdqu %xmm0,0x00(%rsi)
- movdqu %xmm3,0x10(%rsi)
-
- FRAME_END
- RET
-SYM_FUNC_END(hchacha_block_ssse3)
-
-SYM_FUNC_START(chacha_4block_xor_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts four consecutive ChaCha blocks by loading the
- # the state matrix in SSE registers four times. As we need some scratch
- # registers, we save the first four registers on the stack. The
- # algorithm performs each operation on the corresponding word of each
- # state matrix, hence requires no word shuffling. For final XORing step
- # we transpose the matrix by interleaving 32- and then 64-bit words,
- # which allows us to do XOR in SSE registers. 8/16-bit word rotation is
- # done with the slightly better performing SSSE3 byte shuffling,
- # 7/12-bit word rotation uses traditional shift+OR.
-
- lea 8(%rsp),%r10
- sub $0x80,%rsp
- and $~63,%rsp
- mov %rcx,%rax
-
- # x0..15[0-3] = s0..3[0..3]
- movq 0x00(%rdi),%xmm1
- pshufd $0x00,%xmm1,%xmm0
- pshufd $0x55,%xmm1,%xmm1
- movq 0x08(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- movq 0x10(%rdi),%xmm5
- pshufd $0x00,%xmm5,%xmm4
- pshufd $0x55,%xmm5,%xmm5
- movq 0x18(%rdi),%xmm7
- pshufd $0x00,%xmm7,%xmm6
- pshufd $0x55,%xmm7,%xmm7
- movq 0x20(%rdi),%xmm9
- pshufd $0x00,%xmm9,%xmm8
- pshufd $0x55,%xmm9,%xmm9
- movq 0x28(%rdi),%xmm11
- pshufd $0x00,%xmm11,%xmm10
- pshufd $0x55,%xmm11,%xmm11
- movq 0x30(%rdi),%xmm13
- pshufd $0x00,%xmm13,%xmm12
- pshufd $0x55,%xmm13,%xmm13
- movq 0x38(%rdi),%xmm15
- pshufd $0x00,%xmm15,%xmm14
- pshufd $0x55,%xmm15,%xmm15
- # x0..3 on stack
- movdqa %xmm0,0x00(%rsp)
- movdqa %xmm1,0x10(%rsp)
- movdqa %xmm2,0x20(%rsp)
- movdqa %xmm3,0x30(%rsp)
-
- movdqa CTRINC(%rip),%xmm1
- movdqa ROT8(%rip),%xmm2
- movdqa ROT16(%rip),%xmm3
-
- # x12 += counter values 0-3
- paddd %xmm1,%xmm12
-
-.Ldoubleround4:
- # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm3,%xmm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm3,%xmm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm3,%xmm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm3,%xmm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm4
- por %xmm0,%xmm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm5
- por %xmm0,%xmm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm6
- por %xmm0,%xmm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- paddd %xmm15,%xmm11
- pxor %xmm11,%xmm7
- movdqa %xmm7,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm7
- por %xmm0,%xmm7
-
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm2,%xmm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm2,%xmm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm2,%xmm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm2,%xmm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm4
- por %xmm0,%xmm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm5
- por %xmm0,%xmm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm6
- por %xmm0,%xmm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- paddd %xmm15,%xmm11
- pxor %xmm11,%xmm7
- movdqa %xmm7,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm7
- por %xmm0,%xmm7
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm3,%xmm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm3,%xmm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm3,%xmm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm3,%xmm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- paddd %xmm15,%xmm10
- pxor %xmm10,%xmm5
- movdqa %xmm5,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm5
- por %xmm0,%xmm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- paddd %xmm12,%xmm11
- pxor %xmm11,%xmm6
- movdqa %xmm6,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm6
- por %xmm0,%xmm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- paddd %xmm13,%xmm8
- pxor %xmm8,%xmm7
- movdqa %xmm7,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm7
- por %xmm0,%xmm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- paddd %xmm14,%xmm9
- pxor %xmm9,%xmm4
- movdqa %xmm4,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm4
- por %xmm0,%xmm4
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm2,%xmm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm2,%xmm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm2,%xmm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm2,%xmm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- paddd %xmm15,%xmm10
- pxor %xmm10,%xmm5
- movdqa %xmm5,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm5
- por %xmm0,%xmm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- paddd %xmm12,%xmm11
- pxor %xmm11,%xmm6
- movdqa %xmm6,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm6
- por %xmm0,%xmm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- paddd %xmm13,%xmm8
- pxor %xmm8,%xmm7
- movdqa %xmm7,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm7
- por %xmm0,%xmm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- paddd %xmm14,%xmm9
- pxor %xmm9,%xmm4
- movdqa %xmm4,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm4
- por %xmm0,%xmm4
-
- sub $2,%r8d
- jnz .Ldoubleround4
-
- # x0[0-3] += s0[0]
- # x1[0-3] += s0[1]
- movq 0x00(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd 0x00(%rsp),%xmm2
- movdqa %xmm2,0x00(%rsp)
- paddd 0x10(%rsp),%xmm3
- movdqa %xmm3,0x10(%rsp)
- # x2[0-3] += s0[2]
- # x3[0-3] += s0[3]
- movq 0x08(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd 0x20(%rsp),%xmm2
- movdqa %xmm2,0x20(%rsp)
- paddd 0x30(%rsp),%xmm3
- movdqa %xmm3,0x30(%rsp)
-
- # x4[0-3] += s1[0]
- # x5[0-3] += s1[1]
- movq 0x10(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm4
- paddd %xmm3,%xmm5
- # x6[0-3] += s1[2]
- # x7[0-3] += s1[3]
- movq 0x18(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm6
- paddd %xmm3,%xmm7
-
- # x8[0-3] += s2[0]
- # x9[0-3] += s2[1]
- movq 0x20(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm8
- paddd %xmm3,%xmm9
- # x10[0-3] += s2[2]
- # x11[0-3] += s2[3]
- movq 0x28(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm10
- paddd %xmm3,%xmm11
-
- # x12[0-3] += s3[0]
- # x13[0-3] += s3[1]
- movq 0x30(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm12
- paddd %xmm3,%xmm13
- # x14[0-3] += s3[2]
- # x15[0-3] += s3[3]
- movq 0x38(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm14
- paddd %xmm3,%xmm15
-
- # x12 += counter values 0-3
- paddd %xmm1,%xmm12
-
- # interleave 32-bit words in state n, n+1
- movdqa 0x00(%rsp),%xmm0
- movdqa 0x10(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpckldq %xmm1,%xmm2
- punpckhdq %xmm1,%xmm0
- movdqa %xmm2,0x00(%rsp)
- movdqa %xmm0,0x10(%rsp)
- movdqa 0x20(%rsp),%xmm0
- movdqa 0x30(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpckldq %xmm1,%xmm2
- punpckhdq %xmm1,%xmm0
- movdqa %xmm2,0x20(%rsp)
- movdqa %xmm0,0x30(%rsp)
- movdqa %xmm4,%xmm0
- punpckldq %xmm5,%xmm4
- punpckhdq %xmm5,%xmm0
- movdqa %xmm0,%xmm5
- movdqa %xmm6,%xmm0
- punpckldq %xmm7,%xmm6
- punpckhdq %xmm7,%xmm0
- movdqa %xmm0,%xmm7
- movdqa %xmm8,%xmm0
- punpckldq %xmm9,%xmm8
- punpckhdq %xmm9,%xmm0
- movdqa %xmm0,%xmm9
- movdqa %xmm10,%xmm0
- punpckldq %xmm11,%xmm10
- punpckhdq %xmm11,%xmm0
- movdqa %xmm0,%xmm11
- movdqa %xmm12,%xmm0
- punpckldq %xmm13,%xmm12
- punpckhdq %xmm13,%xmm0
- movdqa %xmm0,%xmm13
- movdqa %xmm14,%xmm0
- punpckldq %xmm15,%xmm14
- punpckhdq %xmm15,%xmm0
- movdqa %xmm0,%xmm15
-
- # interleave 64-bit words in state n, n+2
- movdqa 0x00(%rsp),%xmm0
- movdqa 0x20(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpcklqdq %xmm1,%xmm2
- punpckhqdq %xmm1,%xmm0
- movdqa %xmm2,0x00(%rsp)
- movdqa %xmm0,0x20(%rsp)
- movdqa 0x10(%rsp),%xmm0
- movdqa 0x30(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpcklqdq %xmm1,%xmm2
- punpckhqdq %xmm1,%xmm0
- movdqa %xmm2,0x10(%rsp)
- movdqa %xmm0,0x30(%rsp)
- movdqa %xmm4,%xmm0
- punpcklqdq %xmm6,%xmm4
- punpckhqdq %xmm6,%xmm0
- movdqa %xmm0,%xmm6
- movdqa %xmm5,%xmm0
- punpcklqdq %xmm7,%xmm5
- punpckhqdq %xmm7,%xmm0
- movdqa %xmm0,%xmm7
- movdqa %xmm8,%xmm0
- punpcklqdq %xmm10,%xmm8
- punpckhqdq %xmm10,%xmm0
- movdqa %xmm0,%xmm10
- movdqa %xmm9,%xmm0
- punpcklqdq %xmm11,%xmm9
- punpckhqdq %xmm11,%xmm0
- movdqa %xmm0,%xmm11
- movdqa %xmm12,%xmm0
- punpcklqdq %xmm14,%xmm12
- punpckhqdq %xmm14,%xmm0
- movdqa %xmm0,%xmm14
- movdqa %xmm13,%xmm0
- punpcklqdq %xmm15,%xmm13
- punpckhqdq %xmm15,%xmm0
- movdqa %xmm0,%xmm15
-
- # xor with corresponding input, write to output
- movdqa 0x00(%rsp),%xmm0
- cmp $0x10,%rax
- jl .Lxorpart4
- movdqu 0x00(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x00(%rsi)
-
- movdqu %xmm4,%xmm0
- cmp $0x20,%rax
- jl .Lxorpart4
- movdqu 0x10(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x10(%rsi)
-
- movdqu %xmm8,%xmm0
- cmp $0x30,%rax
- jl .Lxorpart4
- movdqu 0x20(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x20(%rsi)
-
- movdqu %xmm12,%xmm0
- cmp $0x40,%rax
- jl .Lxorpart4
- movdqu 0x30(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x30(%rsi)
-
- movdqa 0x20(%rsp),%xmm0
- cmp $0x50,%rax
- jl .Lxorpart4
- movdqu 0x40(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x40(%rsi)
-
- movdqu %xmm6,%xmm0
- cmp $0x60,%rax
- jl .Lxorpart4
- movdqu 0x50(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x50(%rsi)
-
- movdqu %xmm10,%xmm0
- cmp $0x70,%rax
- jl .Lxorpart4
- movdqu 0x60(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x60(%rsi)
-
- movdqu %xmm14,%xmm0
- cmp $0x80,%rax
- jl .Lxorpart4
- movdqu 0x70(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x70(%rsi)
-
- movdqa 0x10(%rsp),%xmm0
- cmp $0x90,%rax
- jl .Lxorpart4
- movdqu 0x80(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x80(%rsi)
-
- movdqu %xmm5,%xmm0
- cmp $0xa0,%rax
- jl .Lxorpart4
- movdqu 0x90(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x90(%rsi)
-
- movdqu %xmm9,%xmm0
- cmp $0xb0,%rax
- jl .Lxorpart4
- movdqu 0xa0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xa0(%rsi)
-
- movdqu %xmm13,%xmm0
- cmp $0xc0,%rax
- jl .Lxorpart4
- movdqu 0xb0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xb0(%rsi)
-
- movdqa 0x30(%rsp),%xmm0
- cmp $0xd0,%rax
- jl .Lxorpart4
- movdqu 0xc0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xc0(%rsi)
-
- movdqu %xmm7,%xmm0
- cmp $0xe0,%rax
- jl .Lxorpart4
- movdqu 0xd0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xd0(%rsi)
-
- movdqu %xmm11,%xmm0
- cmp $0xf0,%rax
- jl .Lxorpart4
- movdqu 0xe0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xe0(%rsi)
-
- movdqu %xmm15,%xmm0
- cmp $0x100,%rax
- jl .Lxorpart4
- movdqu 0xf0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xf0(%rsi)
-
-.Ldone4:
- lea -8(%r10),%rsp
- RET
-
-.Lxorpart4:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone4
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- pxor 0x00(%rsp),%xmm0
- movdqa %xmm0,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- jmp .Ldone4
-
-SYM_FUNC_END(chacha_4block_xor_ssse3)
diff --git a/arch/x86/lib/crypto/chacha_glue.c b/arch/x86/lib/crypto/chacha_glue.c
deleted file mode 100644
index 10b2c945f541..000000000000
--- a/arch/x86/lib/crypto/chacha_glue.c
+++ /dev/null
@@ -1,196 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * ChaCha and HChaCha functions (x86_64 optimized)
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <asm/simd.h>
-#include <crypto/chacha.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-
-asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void hchacha_block_ssse3(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
-
-asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-
-asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
-
-static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
-{
- len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
- return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
-}
-
-static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (static_branch_likely(&chacha_use_avx512vl)) {
- while (bytes >= CHACHA_BLOCK_SIZE * 8) {
- chacha_8block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 8;
- src += CHACHA_BLOCK_SIZE * 8;
- dst += CHACHA_BLOCK_SIZE * 8;
- state->x[12] += 8;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 4) {
- chacha_8block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- state->x[12] += chacha_advance(bytes, 8);
- return;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 2) {
- chacha_4block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- state->x[12] += chacha_advance(bytes, 4);
- return;
- }
- if (bytes) {
- chacha_2block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- state->x[12] += chacha_advance(bytes, 2);
- return;
- }
- }
-
- if (static_branch_likely(&chacha_use_avx2)) {
- while (bytes >= CHACHA_BLOCK_SIZE * 8) {
- chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 8;
- src += CHACHA_BLOCK_SIZE * 8;
- dst += CHACHA_BLOCK_SIZE * 8;
- state->x[12] += 8;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 4) {
- chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
- state->x[12] += chacha_advance(bytes, 8);
- return;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 2) {
- chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
- state->x[12] += chacha_advance(bytes, 4);
- return;
- }
- if (bytes > CHACHA_BLOCK_SIZE) {
- chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
- state->x[12] += chacha_advance(bytes, 2);
- return;
- }
- }
-
- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
- chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 4;
- src += CHACHA_BLOCK_SIZE * 4;
- dst += CHACHA_BLOCK_SIZE * 4;
- state->x[12] += 4;
- }
- if (bytes > CHACHA_BLOCK_SIZE) {
- chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
- state->x[12] += chacha_advance(bytes, 4);
- return;
- }
- if (bytes) {
- chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
- state->x[12]++;
- }
-}
-
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- if (!static_branch_likely(&chacha_use_simd)) {
- hchacha_block_generic(state, out, nrounds);
- } else {
- kernel_fpu_begin();
- hchacha_block_ssse3(state, out, nrounds);
- kernel_fpu_end();
- }
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (!static_branch_likely(&chacha_use_simd) ||
- bytes <= CHACHA_BLOCK_SIZE)
- return chacha_crypt_generic(state, dst, src, bytes, nrounds);
-
- do {
- unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
- kernel_fpu_begin();
- chacha_dosimd(state, dst, src, todo, nrounds);
- kernel_fpu_end();
-
- bytes -= todo;
- src += todo;
- dst += todo;
- } while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- return static_key_enabled(&chacha_use_simd);
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_simd_mod_init(void)
-{
- if (!boot_cpu_has(X86_FEATURE_SSSE3))
- return 0;
-
- static_branch_enable(&chacha_use_simd);
-
- if (boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
- static_branch_enable(&chacha_use_avx2);
-
- if (boot_cpu_has(X86_FEATURE_AVX512VL) &&
- boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
- static_branch_enable(&chacha_use_avx512vl);
- }
- return 0;
-}
-subsys_initcall(chacha_simd_mod_init);
-
-static void __exit chacha_simd_mod_exit(void)
-{
-}
-module_exit(chacha_simd_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)");
diff --git a/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl
deleted file mode 100644
index 501827254fed..000000000000
--- a/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl
+++ /dev/null
@@ -1,4253 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
-# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
-# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for x86_64.
-#
-# March 2015
-#
-# Initial release.
-#
-# December 2016
-#
-# Add AVX512F+VL+BW code path.
-#
-# November 2017
-#
-# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
-# executed even on Knights Landing. Trigger for modification was
-# observation that AVX512 code paths can negatively affect overall
-# Skylake-X system performance. Since we are likely to suppress
-# AVX512F capability flag [at least on Skylake-X], conversion serves
-# as kind of "investment protection". Note that next *lake processor,
-# Cannonlake, has AVX512IFMA code path to execute...
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone,
-# measured with rdtsc at fixed clock frequency.
-#
-# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512
-# P4 4.46/+120% -
-# Core 2 2.41/+90% -
-# Westmere 1.88/+120% -
-# Sandy Bridge 1.39/+140% 1.10
-# Haswell 1.14/+175% 1.11 0.65
-# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35]
-# Silvermont 2.83/+95% -
-# Knights L 3.60/? 1.65 1.10 0.41(***)
-# Goldmont 1.70/+180% -
-# VIA Nano 1.82/+150% -
-# Sledgehammer 1.38/+160% -
-# Bulldozer 2.30/+130% 0.97
-# Ryzen 1.15/+200% 1.08 1.18
-#
-# (*) improvement coefficients relative to clang are more modest and
-# are ~50% on most processors, in both cases we are comparing to
-# __int128 code;
-# (**) SSE2 implementation was attempted, but among non-AVX processors
-# it was faster than integer-only code only on older Intel P4 and
-# Core processors, 50-30%, less newer processor is, but slower on
-# contemporary ones, for example almost 2x slower on Atom, and as
-# former are naturally disappearing, SSE2 is deemed unnecessary;
-# (***) strangely enough performance seems to vary from core to core,
-# listed result is best case;
-
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-$kernel=0; $kernel=1 if (!$flavour && !$output);
-
-if (!$kernel) {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
-
- open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
- *STDOUT=*OUT;
-
- if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
- =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
- }
-
- if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
- `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
- $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
- $avx += 1 if ($1==2.11 && $2>=8);
- }
-
- if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
- `ml64 2>&1` =~ /Version ([0-9]+)\./) {
- $avx = ($1>=10) + ($1>=11);
- }
-
- if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
- $avx = ($2>=3.0) + ($2>3.0);
- }
-} else {
- $avx = 4; # The kernel uses ifdefs for this.
-}
-
-sub declare_function() {
- my ($name, $align, $nargs) = @_;
- if($kernel) {
- $code .= "SYM_FUNC_START($name)\n";
- $code .= ".L$name:\n";
- } else {
- $code .= ".globl $name\n";
- $code .= ".type $name,\@function,$nargs\n";
- $code .= ".align $align\n";
- $code .= "$name:\n";
- }
-}
-
-sub declare_typed_function() {
- my ($name, $align, $nargs) = @_;
- if($kernel) {
- $code .= "SYM_TYPED_FUNC_START($name)\n";
- $code .= ".L$name:\n";
- } else {
- $code .= ".globl $name\n";
- $code .= ".type $name,\@function,$nargs\n";
- $code .= ".align $align\n";
- $code .= "$name:\n";
- }
-}
-
-sub end_function() {
- my ($name) = @_;
- if($kernel) {
- $code .= "SYM_FUNC_END($name)\n";
- } else {
- $code .= ".size $name,.-$name\n";
- }
-}
-
-$code.=<<___ if $kernel;
-#include <linux/cfi_types.h>
-___
-
-if ($avx) {
-$code.=<<___ if $kernel;
-.section .rodata
-___
-$code.=<<___;
-.align 64
-.Lconst:
-.Lmask24:
-.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
-.L129:
-.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
-.Lmask26:
-.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lpermd_avx2:
-.long 2,2,2,3,2,0,2,1
-.Lpermd_avx512:
-.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
-
-.L2_44_inp_permd:
-.long 0,1,1,2,2,3,7,7
-.L2_44_inp_shift:
-.quad 0,12,24,64
-.L2_44_mask:
-.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
-.L2_44_shift_rgt:
-.quad 44,44,42,64
-.L2_44_shift_lft:
-.quad 8,8,10,64
-
-.align 64
-.Lx_mask44:
-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.Lx_mask42:
-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-___
-}
-$code.=<<___ if (!$kernel);
-.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-.align 16
-___
-
-my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
-my ($mac,$nonce)=($inp,$len); # *_emit arguments
-my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
-my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
-
-sub poly1305_iteration {
-# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
-# output: $h0-$h2 *= $r0-$r1
-$code.=<<___;
- mulq $h0 # h0*r1
- mov %rax,$d2
- mov $r0,%rax
- mov %rdx,$d3
-
- mulq $h0 # h0*r0
- mov %rax,$h0 # future $h0
- mov $r0,%rax
- mov %rdx,$d1
-
- mulq $h1 # h1*r0
- add %rax,$d2
- mov $s1,%rax
- adc %rdx,$d3
-
- mulq $h1 # h1*s1
- mov $h2,$h1 # borrow $h1
- add %rax,$h0
- adc %rdx,$d1
-
- imulq $s1,$h1 # h2*s1
- add $h1,$d2
- mov $d1,$h1
- adc \$0,$d3
-
- imulq $r0,$h2 # h2*r0
- add $d2,$h1
- mov \$-4,%rax # mask value
- adc $h2,$d3
-
- and $d3,%rax # last reduction step
- mov $d3,$h2
- shr \$2,$d3
- and \$3,$h2
- add $d3,%rax
- add %rax,$h0
- adc \$0,$h1
- adc \$0,$h2
-___
-}
-
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int64 h[3]; # current hash value base 2^64
-# unsigned __int64 r[2]; # key value base 2^64
-
-$code.=<<___;
-.text
-___
-$code.=<<___ if (!$kernel);
-.extern OPENSSL_ia32cap_P
-
-.globl poly1305_block_init_arch
-.hidden poly1305_block_init_arch
-.globl poly1305_blocks_x86_64
-.hidden poly1305_blocks_x86_64
-.globl poly1305_emit_x86_64
-.hidden poly1305_emit_x86_64
-___
-&declare_typed_function("poly1305_block_init_arch", 32, 3);
-$code.=<<___;
- xor %eax,%eax
- mov %rax,0($ctx) # initialize hash value
- mov %rax,8($ctx)
- mov %rax,16($ctx)
-
- test $inp,$inp
- je .Lno_key
-___
-$code.=<<___ if (!$kernel);
- lea poly1305_blocks_x86_64(%rip),%r10
- lea poly1305_emit_x86_64(%rip),%r11
-___
-$code.=<<___ if (!$kernel && $avx);
- mov OPENSSL_ia32cap_P+4(%rip),%r9
- lea poly1305_blocks_avx(%rip),%rax
- lea poly1305_emit_avx(%rip),%rcx
- bt \$`60-32`,%r9 # AVX?
- cmovc %rax,%r10
- cmovc %rcx,%r11
-___
-$code.=<<___ if (!$kernel && $avx>1);
- lea poly1305_blocks_avx2(%rip),%rax
- bt \$`5+32`,%r9 # AVX2?
- cmovc %rax,%r10
-___
-$code.=<<___ if (!$kernel && $avx>3);
- mov \$`(1<<31|1<<21|1<<16)`,%rax
- shr \$32,%r9
- and %rax,%r9
- cmp %rax,%r9
- je .Linit_base2_44
-___
-$code.=<<___;
- mov \$0x0ffffffc0fffffff,%rax
- mov \$0x0ffffffc0ffffffc,%rcx
- and 0($inp),%rax
- and 8($inp),%rcx
- mov %rax,24($ctx)
- mov %rcx,32($ctx)
-___
-$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
- mov %r10,0(%rdx)
- mov %r11,8(%rdx)
-___
-$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
- mov %r10d,0(%rdx)
- mov %r11d,4(%rdx)
-___
-$code.=<<___;
- mov \$1,%eax
-.Lno_key:
- RET
-___
-&end_function("poly1305_block_init_arch");
-
-&declare_function("poly1305_blocks_x86_64", 32, 4);
-$code.=<<___;
-.cfi_startproc
-.Lblocks:
- shr \$4,$len
- jz .Lno_data # too short
-
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
- push $ctx
-.cfi_push $ctx
-.Lblocks_body:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2
-
- mov $s1,$r1
- shr \$2,$s1
- mov $r1,%rax
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
- jmp .Loop
-
-.align 32
-.Loop:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
-___
-
- &poly1305_iteration();
-
-$code.=<<___;
- mov $r1,%rax
- dec %r15 # len-=16
- jnz .Loop
-
- mov 0(%rsp),$ctx
-.cfi_restore $ctx
-
- mov $h0,0($ctx) # store hash value
- mov $h1,8($ctx)
- mov $h2,16($ctx)
-
- mov 8(%rsp),%r15
-.cfi_restore %r15
- mov 16(%rsp),%r14
-.cfi_restore %r14
- mov 24(%rsp),%r13
-.cfi_restore %r13
- mov 32(%rsp),%r12
-.cfi_restore %r12
- mov 40(%rsp),%rbx
-.cfi_restore %rbx
- lea 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
-.Lno_data:
-.Lblocks_epilogue:
- RET
-.cfi_endproc
-___
-&end_function("poly1305_blocks_x86_64");
-
-&declare_function("poly1305_emit_x86_64", 32, 3);
-$code.=<<___;
-.Lemit:
- mov 0($ctx),%r8 # load hash value
- mov 8($ctx),%r9
- mov 16($ctx),%r10
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- RET
-___
-&end_function("poly1305_emit_x86_64");
-if ($avx) {
-
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int32 h[5]; # current hash value base 2^26
-# unsigned __int32 is_base2_26;
-# unsigned __int64 r[2]; # key value base 2^64
-# unsigned __int64 pad;
-# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
-#
-# where r^n are base 2^26 digits of degrees of multiplier key. There are
-# 5 digits, but last four are interleaved with multiples of 5, totalling
-# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
-
-my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
- map("%xmm$_",(0..15));
-
-$code.=<<___;
-.type __poly1305_block,\@abi-omnipotent
-.align 32
-__poly1305_block:
- push $ctx
-___
- &poly1305_iteration();
-$code.=<<___;
- pop $ctx
- RET
-.size __poly1305_block,.-__poly1305_block
-
-.type __poly1305_init_avx,\@abi-omnipotent
-.align 32
-__poly1305_init_avx:
- push %rbp
- mov %rsp,%rbp
- mov $r0,$h0
- mov $r1,$h1
- xor $h2,$h2
-
- lea 48+64($ctx),$ctx # size optimization
-
- mov $r1,%rax
- call __poly1305_block # r^2
-
- mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
- mov \$0x3ffffff,%edx
- mov $h0,$d1
- and $h0#d,%eax
- mov $r0,$d2
- and $r0#d,%edx
- mov %eax,`16*0+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*0+4-64`($ctx)
- shr \$26,$d2
-
- mov \$0x3ffffff,%eax
- mov \$0x3ffffff,%edx
- and $d1#d,%eax
- and $d2#d,%edx
- mov %eax,`16*1+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*1+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*2+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*2+4-64`($ctx)
- shr \$26,$d2
-
- mov $h1,%rax
- mov $r1,%rdx
- shl \$12,%rax
- shl \$12,%rdx
- or $d1,%rax
- or $d2,%rdx
- and \$0x3ffffff,%eax
- and \$0x3ffffff,%edx
- mov %eax,`16*3+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*3+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*4+0-64`($ctx)
- mov $h1,$d1
- mov %edx,`16*4+4-64`($ctx)
- mov $r1,$d2
-
- mov \$0x3ffffff,%eax
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- shr \$14,$d2
- and $d1#d,%eax
- and $d2#d,%edx
- mov %eax,`16*5+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*5+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*6+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*6+4-64`($ctx)
- shr \$26,$d2
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+0-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d2#d,`16*7+4-64`($ctx)
- lea ($d2,$d2,4),$d2 # *5
- mov $d1#d,`16*8+0-64`($ctx)
- mov $d2#d,`16*8+4-64`($ctx)
-
- mov $r1,%rax
- call __poly1305_block # r^3
-
- mov \$0x3ffffff,%eax # save r^3 base 2^26
- mov $h0,$d1
- and $h0#d,%eax
- shr \$26,$d1
- mov %eax,`16*0+12-64`($ctx)
-
- mov \$0x3ffffff,%edx
- and $d1#d,%edx
- mov %edx,`16*1+12-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*2+12-64`($ctx)
-
- mov $h1,%rax
- shl \$12,%rax
- or $d1,%rax
- and \$0x3ffffff,%eax
- mov %eax,`16*3+12-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov $h1,$d1
- mov %eax,`16*4+12-64`($ctx)
-
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- and $d1#d,%edx
- mov %edx,`16*5+12-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*6+12-64`($ctx)
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+12-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d1#d,`16*8+12-64`($ctx)
-
- mov $r1,%rax
- call __poly1305_block # r^4
-
- mov \$0x3ffffff,%eax # save r^4 base 2^26
- mov $h0,$d1
- and $h0#d,%eax
- shr \$26,$d1
- mov %eax,`16*0+8-64`($ctx)
-
- mov \$0x3ffffff,%edx
- and $d1#d,%edx
- mov %edx,`16*1+8-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*2+8-64`($ctx)
-
- mov $h1,%rax
- shl \$12,%rax
- or $d1,%rax
- and \$0x3ffffff,%eax
- mov %eax,`16*3+8-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov $h1,$d1
- mov %eax,`16*4+8-64`($ctx)
-
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- and $d1#d,%edx
- mov %edx,`16*5+8-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*6+8-64`($ctx)
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+8-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d1#d,`16*8+8-64`($ctx)
-
- lea -48-64($ctx),$ctx # size [de-]optimization
- pop %rbp
- RET
-.size __poly1305_init_avx,.-__poly1305_init_avx
-___
-
-&declare_function("poly1305_blocks_avx", 32, 4);
-$code.=<<___;
-.cfi_startproc
- mov 20($ctx),%r8d # is_base2_26
- cmp \$128,$len
- jae .Lblocks_avx
- test %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx:
- and \$-16,$len
- jz .Lno_data_avx
-
- vzeroupper
-
- test %r8d,%r8d
- jz .Lbase2_64_avx
-
- test \$31,$len
- jz .Leven_avx
-
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lblocks_avx_body:
-
- mov $len,%r15 # reassign $len
-
- mov 0($ctx),$d1 # load hash value
- mov 8($ctx),$d2
- mov 16($ctx),$h2#d
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- ################################# base 2^26 -> base 2^64
- mov $d1#d,$h0#d
- and \$`-1*(1<<31)`,$d1
- mov $d2,$r1 # borrow $r1
- mov $d2#d,$h1#d
- and \$`-1*(1<<31)`,$d2
-
- shr \$6,$d1
- shl \$52,$r1
- add $d1,$h0
- shr \$12,$h1
- shr \$18,$d2
- add $r1,$h0
- adc $d2,$h1
-
- mov $h2,$d1
- shl \$40,$d1
- shr \$24,$h2
- add $d1,$h1
- adc \$0,$h2 # can be partially reduced...
-
- mov \$-4,$d2 # ... so reduce
- mov $h2,$d1
- and $h2,$d2
- shr \$2,$d1
- and \$3,$h2
- add $d2,$d1 # =*5
- add $d1,$h0
- adc \$0,$h1
- adc \$0,$h2
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
-
- call __poly1305_block
-
- test $padbit,$padbit # if $padbit is zero,
- jz .Lstore_base2_64_avx # store hash in base 2^64 format
-
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$r0
- mov $h1,$r1
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$r0
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $r0,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$r1
- and \$0x3ffffff,$h1 # h[3]
- or $r1,$h2 # h[4]
-
- sub \$16,%r15
- jz .Lstore_base2_26_avx
-
- vmovd %rax#d,$H0
- vmovd %rdx#d,$H1
- vmovd $h0#d,$H2
- vmovd $h1#d,$H3
- vmovd $h2#d,$H4
- jmp .Lproceed_avx
-
-.align 32
-.Lstore_base2_64_avx:
- mov $h0,0($ctx)
- mov $h1,8($ctx)
- mov $h2,16($ctx) # note that is_base2_26 is zeroed
- jmp .Ldone_avx
-
-.align 16
-.Lstore_base2_26_avx:
- mov %rax#d,0($ctx) # store hash value base 2^26
- mov %rdx#d,4($ctx)
- mov $h0#d,8($ctx)
- mov $h1#d,12($ctx)
- mov $h2#d,16($ctx)
-.align 16
-.Ldone_avx:
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lno_data_avx:
-.Lblocks_avx_epilogue:
- RET
-.cfi_endproc
-
-.align 32
-.Lbase2_64_avx:
-.cfi_startproc
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lbase2_64_avx_body:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2#d
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- test \$31,$len
- jz .Linit_avx
-
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
-
-.Linit_avx:
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$d1
- mov $h1,$d2
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$d1
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $d1,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$d2
- and \$0x3ffffff,$h1 # h[3]
- or $d2,$h2 # h[4]
-
- vmovd %rax#d,$H0
- vmovd %rdx#d,$H1
- vmovd $h0#d,$H2
- vmovd $h1#d,$H3
- vmovd $h2#d,$H4
- movl \$1,20($ctx) # set is_base2_26
-
- call __poly1305_init_avx
-
-.Lproceed_avx:
- mov %r15,$len
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lbase2_64_avx_epilogue:
- jmp .Ldo_avx
-.cfi_endproc
-
-.align 32
-.Leven_avx:
-.cfi_startproc
- vmovd 4*0($ctx),$H0 # load hash value
- vmovd 4*1($ctx),$H1
- vmovd 4*2($ctx),$H2
- vmovd 4*3($ctx),$H3
- vmovd 4*4($ctx),$H4
-
-.Ldo_avx:
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- and \$-32,%rsp
- sub \$-8,%rsp
- lea -0x58(%rsp),%r11
- sub \$0x178,%rsp
-___
-$code.=<<___ if ($win64);
- lea -0xf8(%rsp),%r11
- sub \$0x218,%rsp
- vmovdqa %xmm6,0x50(%r11)
- vmovdqa %xmm7,0x60(%r11)
- vmovdqa %xmm8,0x70(%r11)
- vmovdqa %xmm9,0x80(%r11)
- vmovdqa %xmm10,0x90(%r11)
- vmovdqa %xmm11,0xa0(%r11)
- vmovdqa %xmm12,0xb0(%r11)
- vmovdqa %xmm13,0xc0(%r11)
- vmovdqa %xmm14,0xd0(%r11)
- vmovdqa %xmm15,0xe0(%r11)
-.Ldo_avx_body:
-___
-$code.=<<___;
- sub \$64,$len
- lea -32($inp),%rax
- cmovc %rax,$inp
-
- vmovdqu `16*3`($ctx),$D4 # preload r0^2
- lea `16*3+64`($ctx),$ctx # size optimization
- lea .Lconst(%rip),%rcx
-
- ################################################################
- # load input
- vmovdqu 16*2($inp),$T0
- vmovdqu 16*3($inp),$T1
- vmovdqa 64(%rcx),$MASK # .Lmask26
-
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpunpcklqdq $T3,$T2,$T3 # 2:3
-
- vpsrlq \$40,$T4,$T4 # 4
- vpsrlq \$26,$T0,$T1
- vpand $MASK,$T0,$T0 # 0
- vpsrlq \$4,$T3,$T2
- vpand $MASK,$T1,$T1 # 1
- vpsrlq \$30,$T3,$T3
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- jbe .Lskip_loop_avx
-
- # expand and copy pre-calculated table to stack
- vmovdqu `16*1-64`($ctx),$D1
- vmovdqu `16*2-64`($ctx),$D2
- vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
- vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
- vmovdqa $D3,-0x90(%r11)
- vmovdqa $D0,0x00(%rsp)
- vpshufd \$0xEE,$D1,$D4
- vmovdqu `16*3-64`($ctx),$D0
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D4,-0x80(%r11)
- vmovdqa $D1,0x10(%rsp)
- vpshufd \$0xEE,$D2,$D3
- vmovdqu `16*4-64`($ctx),$D1
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D3,-0x70(%r11)
- vmovdqa $D2,0x20(%rsp)
- vpshufd \$0xEE,$D0,$D4
- vmovdqu `16*5-64`($ctx),$D2
- vpshufd \$0x44,$D0,$D0
- vmovdqa $D4,-0x60(%r11)
- vmovdqa $D0,0x30(%rsp)
- vpshufd \$0xEE,$D1,$D3
- vmovdqu `16*6-64`($ctx),$D0
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D3,-0x50(%r11)
- vmovdqa $D1,0x40(%rsp)
- vpshufd \$0xEE,$D2,$D4
- vmovdqu `16*7-64`($ctx),$D1
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D4,-0x40(%r11)
- vmovdqa $D2,0x50(%rsp)
- vpshufd \$0xEE,$D0,$D3
- vmovdqu `16*8-64`($ctx),$D2
- vpshufd \$0x44,$D0,$D0
- vmovdqa $D3,-0x30(%r11)
- vmovdqa $D0,0x60(%rsp)
- vpshufd \$0xEE,$D1,$D4
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D4,-0x20(%r11)
- vmovdqa $D1,0x70(%rsp)
- vpshufd \$0xEE,$D2,$D3
- vmovdqa 0x00(%rsp),$D4 # preload r0^2
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D3,-0x10(%r11)
- vmovdqa $D2,0x80(%rsp)
-
- jmp .Loop_avx
-
-.align 32
-.Loop_avx:
- ################################################################
- # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- # \___________________/
- # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- # \___________________/ \____________________/
- #
- # Note that we start with inp[2:3]*r^2. This is because it
- # doesn't depend on reduction in previous iteration.
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
- # though note that $Tx and $Hx are "reversed" in this section,
- # and $D4 is preloaded with r0^2...
-
- vpmuludq $T0,$D4,$D0 # d0 = h0*r0
- vpmuludq $T1,$D4,$D1 # d1 = h1*r0
- vmovdqa $H2,0x20(%r11) # offload hash
- vpmuludq $T2,$D4,$D2 # d3 = h2*r0
- vmovdqa 0x10(%rsp),$H2 # r1^2
- vpmuludq $T3,$D4,$D3 # d3 = h3*r0
- vpmuludq $T4,$D4,$D4 # d4 = h4*r0
-
- vmovdqa $H0,0x00(%r11) #
- vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
- vmovdqa $H1,0x10(%r11) #
- vpmuludq $T3,$H2,$H1 # h3*r1
- vpaddq $H0,$D0,$D0 # d0 += h4*s1
- vpaddq $H1,$D4,$D4 # d4 += h3*r1
- vmovdqa $H3,0x30(%r11) #
- vpmuludq $T2,$H2,$H0 # h2*r1
- vpmuludq $T1,$H2,$H1 # h1*r1
- vpaddq $H0,$D3,$D3 # d3 += h2*r1
- vmovdqa 0x30(%rsp),$H3 # r2^2
- vpaddq $H1,$D2,$D2 # d2 += h1*r1
- vmovdqa $H4,0x40(%r11) #
- vpmuludq $T0,$H2,$H2 # h0*r1
- vpmuludq $T2,$H3,$H0 # h2*r2
- vpaddq $H2,$D1,$D1 # d1 += h0*r1
-
- vmovdqa 0x40(%rsp),$H4 # s2^2
- vpaddq $H0,$D4,$D4 # d4 += h2*r2
- vpmuludq $T1,$H3,$H1 # h1*r2
- vpmuludq $T0,$H3,$H3 # h0*r2
- vpaddq $H1,$D3,$D3 # d3 += h1*r2
- vmovdqa 0x50(%rsp),$H2 # r3^2
- vpaddq $H3,$D2,$D2 # d2 += h0*r2
- vpmuludq $T4,$H4,$H0 # h4*s2
- vpmuludq $T3,$H4,$H4 # h3*s2
- vpaddq $H0,$D1,$D1 # d1 += h4*s2
- vmovdqa 0x60(%rsp),$H3 # s3^2
- vpaddq $H4,$D0,$D0 # d0 += h3*s2
-
- vmovdqa 0x80(%rsp),$H4 # s4^2
- vpmuludq $T1,$H2,$H1 # h1*r3
- vpmuludq $T0,$H2,$H2 # h0*r3
- vpaddq $H1,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $T4,$H3,$H0 # h4*s3
- vpmuludq $T3,$H3,$H1 # h3*s3
- vpaddq $H0,$D2,$D2 # d2 += h4*s3
- vmovdqu 16*0($inp),$H0 # load input
- vpaddq $H1,$D1,$D1 # d1 += h3*s3
- vpmuludq $T2,$H3,$H3 # h2*s3
- vpmuludq $T2,$H4,$T2 # h2*s4
- vpaddq $H3,$D0,$D0 # d0 += h2*s3
-
- vmovdqu 16*1($inp),$H1 #
- vpaddq $T2,$D1,$D1 # d1 += h2*s4
- vpmuludq $T3,$H4,$T3 # h3*s4
- vpmuludq $T4,$H4,$T4 # h4*s4
- vpsrldq \$6,$H0,$H2 # splat input
- vpaddq $T3,$D2,$D2 # d2 += h3*s4
- vpaddq $T4,$D3,$D3 # d3 += h4*s4
- vpsrldq \$6,$H1,$H3 #
- vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
- vpmuludq $T1,$H4,$T0 # h1*s4
- vpunpckhqdq $H1,$H0,$H4 # 4
- vpaddq $T4,$D4,$D4 # d4 += h0*r4
- vmovdqa -0x90(%r11),$T4 # r0^4
- vpaddq $T0,$D0,$D0 # d0 += h1*s4
-
- vpunpcklqdq $H1,$H0,$H0 # 0:1
- vpunpcklqdq $H3,$H2,$H3 # 2:3
-
- #vpsrlq \$40,$H4,$H4 # 4
- vpsrldq \$`40/8`,$H4,$H4 # 4
- vpsrlq \$26,$H0,$H1
- vpand $MASK,$H0,$H0 # 0
- vpsrlq \$4,$H3,$H2
- vpand $MASK,$H1,$H1 # 1
- vpand 0(%rcx),$H4,$H4 # .Lmask24
- vpsrlq \$30,$H3,$H3
- vpand $MASK,$H2,$H2 # 2
- vpand $MASK,$H3,$H3 # 3
- vpor 32(%rcx),$H4,$H4 # padbit, yes, always
-
- vpaddq 0x00(%r11),$H0,$H0 # add hash value
- vpaddq 0x10(%r11),$H1,$H1
- vpaddq 0x20(%r11),$H2,$H2
- vpaddq 0x30(%r11),$H3,$H3
- vpaddq 0x40(%r11),$H4,$H4
-
- lea 16*2($inp),%rax
- lea 16*4($inp),$inp
- sub \$64,$len
- cmovc %rax,$inp
-
- ################################################################
- # Now we accumulate (inp[0:1]+hash)*r^4
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- vpmuludq $H0,$T4,$T0 # h0*r0
- vpmuludq $H1,$T4,$T1 # h1*r0
- vpaddq $T0,$D0,$D0
- vpaddq $T1,$D1,$D1
- vmovdqa -0x80(%r11),$T2 # r1^4
- vpmuludq $H2,$T4,$T0 # h2*r0
- vpmuludq $H3,$T4,$T1 # h3*r0
- vpaddq $T0,$D2,$D2
- vpaddq $T1,$D3,$D3
- vpmuludq $H4,$T4,$T4 # h4*r0
- vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
- vpaddq $T4,$D4,$D4
-
- vpaddq $T0,$D0,$D0 # d0 += h4*s1
- vpmuludq $H2,$T2,$T1 # h2*r1
- vpmuludq $H3,$T2,$T0 # h3*r1
- vpaddq $T1,$D3,$D3 # d3 += h2*r1
- vmovdqa -0x60(%r11),$T3 # r2^4
- vpaddq $T0,$D4,$D4 # d4 += h3*r1
- vpmuludq $H1,$T2,$T1 # h1*r1
- vpmuludq $H0,$T2,$T2 # h0*r1
- vpaddq $T1,$D2,$D2 # d2 += h1*r1
- vpaddq $T2,$D1,$D1 # d1 += h0*r1
-
- vmovdqa -0x50(%r11),$T4 # s2^4
- vpmuludq $H2,$T3,$T0 # h2*r2
- vpmuludq $H1,$T3,$T1 # h1*r2
- vpaddq $T0,$D4,$D4 # d4 += h2*r2
- vpaddq $T1,$D3,$D3 # d3 += h1*r2
- vmovdqa -0x40(%r11),$T2 # r3^4
- vpmuludq $H0,$T3,$T3 # h0*r2
- vpmuludq $H4,$T4,$T0 # h4*s2
- vpaddq $T3,$D2,$D2 # d2 += h0*r2
- vpaddq $T0,$D1,$D1 # d1 += h4*s2
- vmovdqa -0x30(%r11),$T3 # s3^4
- vpmuludq $H3,$T4,$T4 # h3*s2
- vpmuludq $H1,$T2,$T1 # h1*r3
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
-
- vmovdqa -0x10(%r11),$T4 # s4^4
- vpaddq $T1,$D4,$D4 # d4 += h1*r3
- vpmuludq $H0,$T2,$T2 # h0*r3
- vpmuludq $H4,$T3,$T0 # h4*s3
- vpaddq $T2,$D3,$D3 # d3 += h0*r3
- vpaddq $T0,$D2,$D2 # d2 += h4*s3
- vmovdqu 16*2($inp),$T0 # load input
- vpmuludq $H3,$T3,$T2 # h3*s3
- vpmuludq $H2,$T3,$T3 # h2*s3
- vpaddq $T2,$D1,$D1 # d1 += h3*s3
- vmovdqu 16*3($inp),$T1 #
- vpaddq $T3,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $H2,$T4,$H2 # h2*s4
- vpmuludq $H3,$T4,$H3 # h3*s4
- vpsrldq \$6,$T0,$T2 # splat input
- vpaddq $H2,$D1,$D1 # d1 += h2*s4
- vpmuludq $H4,$T4,$H4 # h4*s4
- vpsrldq \$6,$T1,$T3 #
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
- vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
- vpmuludq $H1,$T4,$H0
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpunpcklqdq $T3,$T2,$T3 # 2:3
-
- #vpsrlq \$40,$T4,$T4 # 4
- vpsrldq \$`40/8`,$T4,$T4 # 4
- vpsrlq \$26,$T0,$T1
- vmovdqa 0x00(%rsp),$D4 # preload r0^2
- vpand $MASK,$T0,$T0 # 0
- vpsrlq \$4,$T3,$T2
- vpand $MASK,$T1,$T1 # 1
- vpand 0(%rcx),$T4,$T4 # .Lmask24
- vpsrlq \$30,$T3,$T3
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- ################################################################
- # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- # and P. Schwabe
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D0
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D0,$H0,$H0
- vpsllq \$2,$D0,$D0
- vpaddq $D0,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- ja .Loop_avx
-
-.Lskip_loop_avx:
- ################################################################
- # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
- add \$32,$len
- jnz .Long_tail_avx
-
- vpaddq $H2,$T2,$T2
- vpaddq $H0,$T0,$T0
- vpaddq $H1,$T1,$T1
- vpaddq $H3,$T3,$T3
- vpaddq $H4,$T4,$T4
-
-.Long_tail_avx:
- vmovdqa $H2,0x20(%r11)
- vmovdqa $H0,0x00(%r11)
- vmovdqa $H1,0x10(%r11)
- vmovdqa $H3,0x30(%r11)
- vmovdqa $H4,0x40(%r11)
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- vpmuludq $T2,$D4,$D2 # d2 = h2*r0
- vpmuludq $T0,$D4,$D0 # d0 = h0*r0
- vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
- vpmuludq $T1,$D4,$D1 # d1 = h1*r0
- vpmuludq $T3,$D4,$D3 # d3 = h3*r0
- vpmuludq $T4,$D4,$D4 # d4 = h4*r0
-
- vpmuludq $T3,$H2,$H0 # h3*r1
- vpaddq $H0,$D4,$D4 # d4 += h3*r1
- vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
- vpmuludq $T2,$H2,$H1 # h2*r1
- vpaddq $H1,$D3,$D3 # d3 += h2*r1
- vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
- vpmuludq $T1,$H2,$H0 # h1*r1
- vpaddq $H0,$D2,$D2 # d2 += h1*r1
- vpmuludq $T0,$H2,$H2 # h0*r1
- vpaddq $H2,$D1,$D1 # d1 += h0*r1
- vpmuludq $T4,$H3,$H3 # h4*s1
- vpaddq $H3,$D0,$D0 # d0 += h4*s1
-
- vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
- vpmuludq $T2,$H4,$H1 # h2*r2
- vpaddq $H1,$D4,$D4 # d4 += h2*r2
- vpmuludq $T1,$H4,$H0 # h1*r2
- vpaddq $H0,$D3,$D3 # d3 += h1*r2
- vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
- vpmuludq $T0,$H4,$H4 # h0*r2
- vpaddq $H4,$D2,$D2 # d2 += h0*r2
- vpmuludq $T4,$H2,$H1 # h4*s2
- vpaddq $H1,$D1,$D1 # d1 += h4*s2
- vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
- vpmuludq $T3,$H2,$H2 # h3*s2
- vpaddq $H2,$D0,$D0 # d0 += h3*s2
-
- vpmuludq $T1,$H3,$H0 # h1*r3
- vpaddq $H0,$D4,$D4 # d4 += h1*r3
- vpmuludq $T0,$H3,$H3 # h0*r3
- vpaddq $H3,$D3,$D3 # d3 += h0*r3
- vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
- vpmuludq $T4,$H4,$H1 # h4*s3
- vpaddq $H1,$D2,$D2 # d2 += h4*s3
- vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
- vpmuludq $T3,$H4,$H0 # h3*s3
- vpaddq $H0,$D1,$D1 # d1 += h3*s3
- vpmuludq $T2,$H4,$H4 # h2*s3
- vpaddq $H4,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $T0,$H2,$H2 # h0*r4
- vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
- vpmuludq $T4,$H3,$H1 # h4*s4
- vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
- vpmuludq $T3,$H3,$H0 # h3*s4
- vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
- vpmuludq $T2,$H3,$H1 # h2*s4
- vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
- vpmuludq $T1,$H3,$H3 # h1*s4
- vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
-
- jz .Lshort_tail_avx
-
- vmovdqu 16*0($inp),$H0 # load input
- vmovdqu 16*1($inp),$H1
-
- vpsrldq \$6,$H0,$H2 # splat input
- vpsrldq \$6,$H1,$H3
- vpunpckhqdq $H1,$H0,$H4 # 4
- vpunpcklqdq $H1,$H0,$H0 # 0:1
- vpunpcklqdq $H3,$H2,$H3 # 2:3
-
- vpsrlq \$40,$H4,$H4 # 4
- vpsrlq \$26,$H0,$H1
- vpand $MASK,$H0,$H0 # 0
- vpsrlq \$4,$H3,$H2
- vpand $MASK,$H1,$H1 # 1
- vpsrlq \$30,$H3,$H3
- vpand $MASK,$H2,$H2 # 2
- vpand $MASK,$H3,$H3 # 3
- vpor 32(%rcx),$H4,$H4 # padbit, yes, always
-
- vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
- vpaddq 0x00(%r11),$H0,$H0
- vpaddq 0x10(%r11),$H1,$H1
- vpaddq 0x20(%r11),$H2,$H2
- vpaddq 0x30(%r11),$H3,$H3
- vpaddq 0x40(%r11),$H4,$H4
-
- ################################################################
- # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
-
- vpmuludq $H0,$T4,$T0 # h0*r0
- vpaddq $T0,$D0,$D0 # d0 += h0*r0
- vpmuludq $H1,$T4,$T1 # h1*r0
- vpaddq $T1,$D1,$D1 # d1 += h1*r0
- vpmuludq $H2,$T4,$T0 # h2*r0
- vpaddq $T0,$D2,$D2 # d2 += h2*r0
- vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
- vpmuludq $H3,$T4,$T1 # h3*r0
- vpaddq $T1,$D3,$D3 # d3 += h3*r0
- vpmuludq $H4,$T4,$T4 # h4*r0
- vpaddq $T4,$D4,$D4 # d4 += h4*r0
-
- vpmuludq $H3,$T2,$T0 # h3*r1
- vpaddq $T0,$D4,$D4 # d4 += h3*r1
- vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
- vpmuludq $H2,$T2,$T1 # h2*r1
- vpaddq $T1,$D3,$D3 # d3 += h2*r1
- vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
- vpmuludq $H1,$T2,$T0 # h1*r1
- vpaddq $T0,$D2,$D2 # d2 += h1*r1
- vpmuludq $H0,$T2,$T2 # h0*r1
- vpaddq $T2,$D1,$D1 # d1 += h0*r1
- vpmuludq $H4,$T3,$T3 # h4*s1
- vpaddq $T3,$D0,$D0 # d0 += h4*s1
-
- vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
- vpmuludq $H2,$T4,$T1 # h2*r2
- vpaddq $T1,$D4,$D4 # d4 += h2*r2
- vpmuludq $H1,$T4,$T0 # h1*r2
- vpaddq $T0,$D3,$D3 # d3 += h1*r2
- vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
- vpmuludq $H0,$T4,$T4 # h0*r2
- vpaddq $T4,$D2,$D2 # d2 += h0*r2
- vpmuludq $H4,$T2,$T1 # h4*s2
- vpaddq $T1,$D1,$D1 # d1 += h4*s2
- vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
- vpmuludq $H3,$T2,$T2 # h3*s2
- vpaddq $T2,$D0,$D0 # d0 += h3*s2
-
- vpmuludq $H1,$T3,$T0 # h1*r3
- vpaddq $T0,$D4,$D4 # d4 += h1*r3
- vpmuludq $H0,$T3,$T3 # h0*r3
- vpaddq $T3,$D3,$D3 # d3 += h0*r3
- vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
- vpmuludq $H4,$T4,$T1 # h4*s3
- vpaddq $T1,$D2,$D2 # d2 += h4*s3
- vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
- vpmuludq $H3,$T4,$T0 # h3*s3
- vpaddq $T0,$D1,$D1 # d1 += h3*s3
- vpmuludq $H2,$T4,$T4 # h2*s3
- vpaddq $T4,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $H0,$T2,$T2 # h0*r4
- vpaddq $T2,$D4,$D4 # d4 += h0*r4
- vpmuludq $H4,$T3,$T1 # h4*s4
- vpaddq $T1,$D3,$D3 # d3 += h4*s4
- vpmuludq $H3,$T3,$T0 # h3*s4
- vpaddq $T0,$D2,$D2 # d2 += h3*s4
- vpmuludq $H2,$T3,$T1 # h2*s4
- vpaddq $T1,$D1,$D1 # d1 += h2*s4
- vpmuludq $H1,$T3,$T3 # h1*s4
- vpaddq $T3,$D0,$D0 # d0 += h1*s4
-
-.Lshort_tail_avx:
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$D4,$T4
- vpsrldq \$8,$D3,$T3
- vpsrldq \$8,$D1,$T1
- vpsrldq \$8,$D0,$T0
- vpsrldq \$8,$D2,$T2
- vpaddq $T3,$D3,$D3
- vpaddq $T4,$D4,$D4
- vpaddq $T0,$D0,$D0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$D2,$D2
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$D3,$H3
- vpand $MASK,$D3,$D3
- vpaddq $H3,$D4,$D4 # h3 -> h4
-
- vpsrlq \$26,$D0,$H0
- vpand $MASK,$D0,$D0
- vpaddq $H0,$D1,$D1 # h0 -> h1
-
- vpsrlq \$26,$D4,$H4
- vpand $MASK,$D4,$D4
-
- vpsrlq \$26,$D1,$H1
- vpand $MASK,$D1,$D1
- vpaddq $H1,$D2,$D2 # h1 -> h2
-
- vpaddq $H4,$D0,$D0
- vpsllq \$2,$H4,$H4
- vpaddq $H4,$D0,$D0 # h4 -> h0
-
- vpsrlq \$26,$D2,$H2
- vpand $MASK,$D2,$D2
- vpaddq $H2,$D3,$D3 # h2 -> h3
-
- vpsrlq \$26,$D0,$H0
- vpand $MASK,$D0,$D0
- vpaddq $H0,$D1,$D1 # h0 -> h1
-
- vpsrlq \$26,$D3,$H3
- vpand $MASK,$D3,$D3
- vpaddq $H3,$D4,$D4 # h3 -> h4
-
- vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
- vmovd $D1,`4*1-48-64`($ctx)
- vmovd $D2,`4*2-48-64`($ctx)
- vmovd $D3,`4*3-48-64`($ctx)
- vmovd $D4,`4*4-48-64`($ctx)
-___
-$code.=<<___ if ($win64);
- vmovdqa 0x50(%r11),%xmm6
- vmovdqa 0x60(%r11),%xmm7
- vmovdqa 0x70(%r11),%xmm8
- vmovdqa 0x80(%r11),%xmm9
- vmovdqa 0x90(%r11),%xmm10
- vmovdqa 0xa0(%r11),%xmm11
- vmovdqa 0xb0(%r11),%xmm12
- vmovdqa 0xc0(%r11),%xmm13
- vmovdqa 0xd0(%r11),%xmm14
- vmovdqa 0xe0(%r11),%xmm15
- lea 0xf8(%r11),%rsp
-.Ldo_avx_epilogue:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- vzeroupper
- RET
-.cfi_endproc
-___
-&end_function("poly1305_blocks_avx");
-
-&declare_function("poly1305_emit_avx", 32, 3);
-$code.=<<___;
- cmpl \$0,20($ctx) # is_base2_26?
- je .Lemit
-
- mov 0($ctx),%eax # load hash value base 2^26
- mov 4($ctx),%ecx
- mov 8($ctx),%r8d
- mov 12($ctx),%r11d
- mov 16($ctx),%r10d
-
- shl \$26,%rcx # base 2^26 -> base 2^64
- mov %r8,%r9
- shl \$52,%r8
- add %rcx,%rax
- shr \$12,%r9
- add %rax,%r8 # h0
- adc \$0,%r9
-
- shl \$14,%r11
- mov %r10,%rax
- shr \$24,%r10
- add %r11,%r9
- shl \$40,%rax
- add %rax,%r9 # h1
- adc \$0,%r10 # h2
-
- mov %r10,%rax # could be partially reduced, so reduce
- mov %r10,%rcx
- and \$3,%r10
- shr \$2,%rax
- and \$-4,%rcx
- add %rcx,%rax
- add %rax,%r8
- adc \$0,%r9
- adc \$0,%r10
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- RET
-___
-&end_function("poly1305_emit_avx");
-
-if ($avx>1) {
-
-my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
- map("%ymm$_",(0..15));
-my $S4=$MASK;
-
-sub poly1305_blocks_avxN {
- my ($avx512) = @_;
- my $suffix = $avx512 ? "_avx512" : "";
-$code.=<<___;
-.cfi_startproc
- mov 20($ctx),%r8d # is_base2_26
- cmp \$128,$len
- jae .Lblocks_avx2$suffix
- test %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx2$suffix:
- and \$-16,$len
- jz .Lno_data_avx2$suffix
-
- vzeroupper
-
- test %r8d,%r8d
- jz .Lbase2_64_avx2$suffix
-
- test \$63,$len
- jz .Leven_avx2$suffix
-
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lblocks_avx2_body$suffix:
-
- mov $len,%r15 # reassign $len
-
- mov 0($ctx),$d1 # load hash value
- mov 8($ctx),$d2
- mov 16($ctx),$h2#d
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- ################################# base 2^26 -> base 2^64
- mov $d1#d,$h0#d
- and \$`-1*(1<<31)`,$d1
- mov $d2,$r1 # borrow $r1
- mov $d2#d,$h1#d
- and \$`-1*(1<<31)`,$d2
-
- shr \$6,$d1
- shl \$52,$r1
- add $d1,$h0
- shr \$12,$h1
- shr \$18,$d2
- add $r1,$h0
- adc $d2,$h1
-
- mov $h2,$d1
- shl \$40,$d1
- shr \$24,$h2
- add $d1,$h1
- adc \$0,$h2 # can be partially reduced...
-
- mov \$-4,$d2 # ... so reduce
- mov $h2,$d1
- and $h2,$d2
- shr \$2,$d1
- and \$3,$h2
- add $d2,$d1 # =*5
- add $d1,$h0
- adc \$0,$h1
- adc \$0,$h2
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
-.Lbase2_26_pre_avx2$suffix:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
- mov $r1,%rax
-
- test \$63,%r15
- jnz .Lbase2_26_pre_avx2$suffix
-
- test $padbit,$padbit # if $padbit is zero,
- jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
-
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$r0
- mov $h1,$r1
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$r0
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $r0,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$r1
- and \$0x3ffffff,$h1 # h[3]
- or $r1,$h2 # h[4]
-
- test %r15,%r15
- jz .Lstore_base2_26_avx2$suffix
-
- vmovd %rax#d,%x#$H0
- vmovd %rdx#d,%x#$H1
- vmovd $h0#d,%x#$H2
- vmovd $h1#d,%x#$H3
- vmovd $h2#d,%x#$H4
- jmp .Lproceed_avx2$suffix
-
-.align 32
-.Lstore_base2_64_avx2$suffix:
- mov $h0,0($ctx)
- mov $h1,8($ctx)
- mov $h2,16($ctx) # note that is_base2_26 is zeroed
- jmp .Ldone_avx2$suffix
-
-.align 16
-.Lstore_base2_26_avx2$suffix:
- mov %rax#d,0($ctx) # store hash value base 2^26
- mov %rdx#d,4($ctx)
- mov $h0#d,8($ctx)
- mov $h1#d,12($ctx)
- mov $h2#d,16($ctx)
-.align 16
-.Ldone_avx2$suffix:
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lno_data_avx2$suffix:
-.Lblocks_avx2_epilogue$suffix:
- RET
-.cfi_endproc
-
-.align 32
-.Lbase2_64_avx2$suffix:
-.cfi_startproc
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lbase2_64_avx2_body$suffix:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2#d
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- test \$63,$len
- jz .Linit_avx2$suffix
-
-.Lbase2_64_pre_avx2$suffix:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
- mov $r1,%rax
-
- test \$63,%r15
- jnz .Lbase2_64_pre_avx2$suffix
-
-.Linit_avx2$suffix:
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$d1
- mov $h1,$d2
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$d1
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $d1,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$d2
- and \$0x3ffffff,$h1 # h[3]
- or $d2,$h2 # h[4]
-
- vmovd %rax#d,%x#$H0
- vmovd %rdx#d,%x#$H1
- vmovd $h0#d,%x#$H2
- vmovd $h1#d,%x#$H3
- vmovd $h2#d,%x#$H4
- movl \$1,20($ctx) # set is_base2_26
-
- call __poly1305_init_avx
-
-.Lproceed_avx2$suffix:
- mov %r15,$len # restore $len
-___
-$code.=<<___ if (!$kernel);
- mov OPENSSL_ia32cap_P+8(%rip),%r9d
- mov \$`(1<<31|1<<30|1<<16)`,%r11d
-___
-$code.=<<___;
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lbase2_64_avx2_epilogue$suffix:
- jmp .Ldo_avx2$suffix
-.cfi_endproc
-
-.align 32
-.Leven_avx2$suffix:
-.cfi_startproc
-___
-$code.=<<___ if (!$kernel);
- mov OPENSSL_ia32cap_P+8(%rip),%r9d
-___
-$code.=<<___;
- vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
- vmovd 4*1($ctx),%x#$H1
- vmovd 4*2($ctx),%x#$H2
- vmovd 4*3($ctx),%x#$H3
- vmovd 4*4($ctx),%x#$H4
-
-.Ldo_avx2$suffix:
-___
-$code.=<<___ if (!$kernel && $avx>2);
- cmp \$512,$len
- jb .Lskip_avx512
- and %r11d,%r9d
- test \$`1<<16`,%r9d # check for AVX512F
- jnz .Lblocks_avx512
-.Lskip_avx512$suffix:
-___
-$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
- cmp \$512,$len
- jae .Lblocks_avx512
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- sub \$0x128,%rsp
-___
-$code.=<<___ if ($win64);
- lea 8(%rsp),%r10
- sub \$0x1c8,%rsp
- vmovdqa %xmm6,-0xb0(%r10)
- vmovdqa %xmm7,-0xa0(%r10)
- vmovdqa %xmm8,-0x90(%r10)
- vmovdqa %xmm9,-0x80(%r10)
- vmovdqa %xmm10,-0x70(%r10)
- vmovdqa %xmm11,-0x60(%r10)
- vmovdqa %xmm12,-0x50(%r10)
- vmovdqa %xmm13,-0x40(%r10)
- vmovdqa %xmm14,-0x30(%r10)
- vmovdqa %xmm15,-0x20(%r10)
-.Ldo_avx2_body$suffix:
-___
-$code.=<<___;
- lea .Lconst(%rip),%rcx
- lea 48+64($ctx),$ctx # size optimization
- vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
-
- # expand and copy pre-calculated table to stack
- vmovdqu `16*0-64`($ctx),%x#$T2
- and \$-512,%rsp
- vmovdqu `16*1-64`($ctx),%x#$T3
- vmovdqu `16*2-64`($ctx),%x#$T4
- vmovdqu `16*3-64`($ctx),%x#$D0
- vmovdqu `16*4-64`($ctx),%x#$D1
- vmovdqu `16*5-64`($ctx),%x#$D2
- lea 0x90(%rsp),%rax # size optimization
- vmovdqu `16*6-64`($ctx),%x#$D3
- vpermd $T2,$T0,$T2 # 00003412 -> 14243444
- vmovdqu `16*7-64`($ctx),%x#$D4
- vpermd $T3,$T0,$T3
- vmovdqu `16*8-64`($ctx),%x#$MASK
- vpermd $T4,$T0,$T4
- vmovdqa $T2,0x00(%rsp)
- vpermd $D0,$T0,$D0
- vmovdqa $T3,0x20-0x90(%rax)
- vpermd $D1,$T0,$D1
- vmovdqa $T4,0x40-0x90(%rax)
- vpermd $D2,$T0,$D2
- vmovdqa $D0,0x60-0x90(%rax)
- vpermd $D3,$T0,$D3
- vmovdqa $D1,0x80-0x90(%rax)
- vpermd $D4,$T0,$D4
- vmovdqa $D2,0xa0-0x90(%rax)
- vpermd $MASK,$T0,$MASK
- vmovdqa $D3,0xc0-0x90(%rax)
- vmovdqa $D4,0xe0-0x90(%rax)
- vmovdqa $MASK,0x100-0x90(%rax)
- vmovdqa 64(%rcx),$MASK # .Lmask26
-
- ################################################################
- # load input
- vmovdqu 16*0($inp),%x#$T0
- vmovdqu 16*1($inp),%x#$T1
- vinserti128 \$1,16*2($inp),$T0,$T0
- vinserti128 \$1,16*3($inp),$T1,$T1
- lea 16*4($inp),$inp
-
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpunpcklqdq $T3,$T2,$T2 # 2:3
- vpunpcklqdq $T1,$T0,$T0 # 0:1
-
- vpsrlq \$30,$T2,$T3
- vpsrlq \$4,$T2,$T2
- vpsrlq \$26,$T0,$T1
- vpsrlq \$40,$T4,$T4 # 4
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T0,$T0 # 0
- vpand $MASK,$T1,$T1 # 1
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- vpaddq $H2,$T2,$H2 # accumulate input
- sub \$64,$len
- jz .Ltail_avx2$suffix
- jmp .Loop_avx2$suffix
-
-.align 32
-.Loop_avx2$suffix:
- ################################################################
- # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
- # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
- # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
- # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
- # \________/\__________/
- ################################################################
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
- vmovdqa `32*0`(%rsp),$T0 # r0^4
- vpaddq $H1,$T1,$H1
- vmovdqa `32*1`(%rsp),$T1 # r1^4
- vpaddq $H3,$T3,$H3
- vmovdqa `32*3`(%rsp),$T2 # r2^4
- vpaddq $H4,$T4,$H4
- vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
- vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
- # however, as h2 is "chronologically" first one available pull
- # corresponding operations up, so it's
- #
- # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
- # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
- # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
-
- vpmuludq $H2,$T0,$D2 # d2 = h2*r0
- vpmuludq $H2,$T1,$D3 # d3 = h2*r1
- vpmuludq $H2,$T2,$D4 # d4 = h2*r2
- vpmuludq $H2,$T3,$D0 # d0 = h2*s3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
-
- vpmuludq $H0,$T1,$T4 # h0*r1
- vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
- vpaddq $T4,$D1,$D1 # d1 += h0*r1
- vpaddq $H2,$D2,$D2 # d2 += h1*r1
- vpmuludq $H3,$T1,$T4 # h3*r1
- vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
- vpaddq $T4,$D4,$D4 # d4 += h3*r1
- vpaddq $H2,$D0,$D0 # d0 += h4*s1
- vmovdqa `32*4-0x90`(%rax),$T1 # s2
-
- vpmuludq $H0,$T0,$T4 # h0*r0
- vpmuludq $H1,$T0,$H2 # h1*r0
- vpaddq $T4,$D0,$D0 # d0 += h0*r0
- vpaddq $H2,$D1,$D1 # d1 += h1*r0
- vpmuludq $H3,$T0,$T4 # h3*r0
- vpmuludq $H4,$T0,$H2 # h4*r0
- vmovdqu 16*0($inp),%x#$T0 # load input
- vpaddq $T4,$D3,$D3 # d3 += h3*r0
- vpaddq $H2,$D4,$D4 # d4 += h4*r0
- vinserti128 \$1,16*2($inp),$T0,$T0
-
- vpmuludq $H3,$T1,$T4 # h3*s2
- vpmuludq $H4,$T1,$H2 # h4*s2
- vmovdqu 16*1($inp),%x#$T1
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
- vpaddq $H2,$D1,$D1 # d1 += h4*s2
- vmovdqa `32*5-0x90`(%rax),$H2 # r3
- vpmuludq $H1,$T2,$T4 # h1*r2
- vpmuludq $H0,$T2,$T2 # h0*r2
- vpaddq $T4,$D3,$D3 # d3 += h1*r2
- vpaddq $T2,$D2,$D2 # d2 += h0*r2
- vinserti128 \$1,16*3($inp),$T1,$T1
- lea 16*4($inp),$inp
-
- vpmuludq $H1,$H2,$T4 # h1*r3
- vpmuludq $H0,$H2,$H2 # h0*r3
- vpsrldq \$6,$T0,$T2 # splat input
- vpaddq $T4,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $H3,$T3,$T4 # h3*s3
- vpmuludq $H4,$T3,$H2 # h4*s3
- vpsrldq \$6,$T1,$T3
- vpaddq $T4,$D1,$D1 # d1 += h3*s3
- vpaddq $H2,$D2,$D2 # d2 += h4*s3
- vpunpckhqdq $T1,$T0,$T4 # 4
-
- vpmuludq $H3,$S4,$H3 # h3*s4
- vpmuludq $H4,$S4,$H4 # h4*s4
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
- vpunpcklqdq $T3,$T2,$T3 # 2:3
- vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
- vpmuludq $H1,$S4,$H0 # h1*s4
- vmovdqa 64(%rcx),$MASK # .Lmask26
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- ################################################################
- # lazy reduction (interleaved with tail of input splat)
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$4,$T3,$T2
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpand $MASK,$T2,$T2 # 2
- vpsrlq \$26,$T0,$T1
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpaddq $T2,$H2,$H2 # modulo-scheduled
- vpsrlq \$30,$T3,$T3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$40,$T4,$T4 # 4
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpand $MASK,$T0,$T0 # 0
- vpand $MASK,$T1,$T1 # 1
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- sub \$64,$len
- jnz .Loop_avx2$suffix
-
- .byte 0x66,0x90
-.Ltail_avx2$suffix:
- ################################################################
- # while above multiplications were by r^4 in all lanes, in last
- # iteration we multiply least significant lane by r^4 and most
- # significant one by r, so copy of above except that references
- # to the precomputed table are displaced by 4...
-
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
- vmovdqu `32*0+4`(%rsp),$T0 # r0^4
- vpaddq $H1,$T1,$H1
- vmovdqu `32*1+4`(%rsp),$T1 # r1^4
- vpaddq $H3,$T3,$H3
- vmovdqu `32*3+4`(%rsp),$T2 # r2^4
- vpaddq $H4,$T4,$H4
- vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
- vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
-
- vpmuludq $H2,$T0,$D2 # d2 = h2*r0
- vpmuludq $H2,$T1,$D3 # d3 = h2*r1
- vpmuludq $H2,$T2,$D4 # d4 = h2*r2
- vpmuludq $H2,$T3,$D0 # d0 = h2*s3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
-
- vpmuludq $H0,$T1,$T4 # h0*r1
- vpmuludq $H1,$T1,$H2 # h1*r1
- vpaddq $T4,$D1,$D1 # d1 += h0*r1
- vpaddq $H2,$D2,$D2 # d2 += h1*r1
- vpmuludq $H3,$T1,$T4 # h3*r1
- vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
- vpaddq $T4,$D4,$D4 # d4 += h3*r1
- vpaddq $H2,$D0,$D0 # d0 += h4*s1
-
- vpmuludq $H0,$T0,$T4 # h0*r0
- vpmuludq $H1,$T0,$H2 # h1*r0
- vpaddq $T4,$D0,$D0 # d0 += h0*r0
- vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
- vpaddq $H2,$D1,$D1 # d1 += h1*r0
- vpmuludq $H3,$T0,$T4 # h3*r0
- vpmuludq $H4,$T0,$H2 # h4*r0
- vpaddq $T4,$D3,$D3 # d3 += h3*r0
- vpaddq $H2,$D4,$D4 # d4 += h4*r0
-
- vpmuludq $H3,$T1,$T4 # h3*s2
- vpmuludq $H4,$T1,$H2 # h4*s2
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
- vpaddq $H2,$D1,$D1 # d1 += h4*s2
- vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
- vpmuludq $H1,$T2,$T4 # h1*r2
- vpmuludq $H0,$T2,$T2 # h0*r2
- vpaddq $T4,$D3,$D3 # d3 += h1*r2
- vpaddq $T2,$D2,$D2 # d2 += h0*r2
-
- vpmuludq $H1,$H2,$T4 # h1*r3
- vpmuludq $H0,$H2,$H2 # h0*r3
- vpaddq $T4,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $H3,$T3,$T4 # h3*s3
- vpmuludq $H4,$T3,$H2 # h4*s3
- vpaddq $T4,$D1,$D1 # d1 += h3*s3
- vpaddq $H2,$D2,$D2 # d2 += h4*s3
-
- vpmuludq $H3,$S4,$H3 # h3*s4
- vpmuludq $H4,$S4,$H4 # h4*s4
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
- vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
- vpmuludq $H1,$S4,$H0 # h1*s4
- vmovdqa 64(%rcx),$MASK # .Lmask26
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$D1,$T1
- vpsrldq \$8,$H2,$T2
- vpsrldq \$8,$H3,$T3
- vpsrldq \$8,$H4,$T4
- vpsrldq \$8,$H0,$T0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$H2,$H2
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
- vpaddq $T0,$H0,$H0
-
- vpermq \$0x2,$H3,$T3
- vpermq \$0x2,$H4,$T4
- vpermq \$0x2,$H0,$T0
- vpermq \$0x2,$D1,$T1
- vpermq \$0x2,$H2,$T2
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$H2,$H2
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
- vmovd %x#$H1,`4*1-48-64`($ctx)
- vmovd %x#$H2,`4*2-48-64`($ctx)
- vmovd %x#$H3,`4*3-48-64`($ctx)
- vmovd %x#$H4,`4*4-48-64`($ctx)
-___
-$code.=<<___ if ($win64);
- vmovdqa -0xb0(%r10),%xmm6
- vmovdqa -0xa0(%r10),%xmm7
- vmovdqa -0x90(%r10),%xmm8
- vmovdqa -0x80(%r10),%xmm9
- vmovdqa -0x70(%r10),%xmm10
- vmovdqa -0x60(%r10),%xmm11
- vmovdqa -0x50(%r10),%xmm12
- vmovdqa -0x40(%r10),%xmm13
- vmovdqa -0x30(%r10),%xmm14
- vmovdqa -0x20(%r10),%xmm15
- lea -8(%r10),%rsp
-.Ldo_avx2_epilogue$suffix:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- vzeroupper
- RET
-.cfi_endproc
-___
-if($avx > 2 && $avx512) {
-my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
-my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
-my $PADBIT="%zmm30";
-
-map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
-map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
-map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
-map(s/%y/%z/,($MASK));
-
-$code.=<<___;
-.cfi_startproc
-.Lblocks_avx512:
- mov \$15,%eax
- kmovw %eax,%k2
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- sub \$0x128,%rsp
-___
-$code.=<<___ if ($win64);
- lea 8(%rsp),%r10
- sub \$0x1c8,%rsp
- vmovdqa %xmm6,-0xb0(%r10)
- vmovdqa %xmm7,-0xa0(%r10)
- vmovdqa %xmm8,-0x90(%r10)
- vmovdqa %xmm9,-0x80(%r10)
- vmovdqa %xmm10,-0x70(%r10)
- vmovdqa %xmm11,-0x60(%r10)
- vmovdqa %xmm12,-0x50(%r10)
- vmovdqa %xmm13,-0x40(%r10)
- vmovdqa %xmm14,-0x30(%r10)
- vmovdqa %xmm15,-0x20(%r10)
-.Ldo_avx512_body:
-___
-$code.=<<___;
- lea .Lconst(%rip),%rcx
- lea 48+64($ctx),$ctx # size optimization
- vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
-
- # expand pre-calculated table
- vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
- and \$-512,%rsp
- vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
- mov \$0x20,%rax
- vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
- vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
- vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
- vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
- vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
- vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
- vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
- vpermd $D0,$T2,$R0 # 00003412 -> 14243444
- vpbroadcastq 64(%rcx),$MASK # .Lmask26
- vpermd $D1,$T2,$R1
- vpermd $T0,$T2,$S1
- vpermd $D2,$T2,$R2
- vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
- vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
- vpermd $T1,$T2,$S2
- vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
- vpsrlq \$32,$R1,$T1
- vpermd $D3,$T2,$R3
- vmovdqa64 $S1,0x40(%rsp){%k2}
- vpermd $T3,$T2,$S3
- vpermd $D4,$T2,$R4
- vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
- vpermd $T4,$T2,$S4
- vmovdqa64 $S2,0x80(%rsp){%k2}
- vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
- vmovdqa64 $S3,0xc0(%rsp){%k2}
- vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
- vmovdqa64 $S4,0x100(%rsp){%k2}
-
- ################################################################
- # calculate 5th through 8th powers of the key
- #
- # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
- # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
- # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
- # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
- # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
-
- vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
- vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
- vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
- vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
- vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
- vpsrlq \$32,$R2,$T2
-
- vpmuludq $T1,$S4,$M0
- vpmuludq $T1,$R0,$M1
- vpmuludq $T1,$R1,$M2
- vpmuludq $T1,$R2,$M3
- vpmuludq $T1,$R3,$M4
- vpsrlq \$32,$R3,$T3
- vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
- vpaddq $M1,$D1,$D1 # d1 += r1'*r0
- vpaddq $M2,$D2,$D2 # d2 += r1'*r1
- vpaddq $M3,$D3,$D3 # d3 += r1'*r2
- vpaddq $M4,$D4,$D4 # d4 += r1'*r3
-
- vpmuludq $T2,$S3,$M0
- vpmuludq $T2,$S4,$M1
- vpmuludq $T2,$R1,$M3
- vpmuludq $T2,$R2,$M4
- vpmuludq $T2,$R0,$M2
- vpsrlq \$32,$R4,$T4
- vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
- vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
- vpaddq $M3,$D3,$D3 # d3 += r2'*r1
- vpaddq $M4,$D4,$D4 # d4 += r2'*r2
- vpaddq $M2,$D2,$D2 # d2 += r2'*r0
-
- vpmuludq $T3,$S2,$M0
- vpmuludq $T3,$R0,$M3
- vpmuludq $T3,$R1,$M4
- vpmuludq $T3,$S3,$M1
- vpmuludq $T3,$S4,$M2
- vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
- vpaddq $M3,$D3,$D3 # d3 += r3'*r0
- vpaddq $M4,$D4,$D4 # d4 += r3'*r1
- vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
- vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
-
- vpmuludq $T4,$S4,$M3
- vpmuludq $T4,$R0,$M4
- vpmuludq $T4,$S1,$M0
- vpmuludq $T4,$S2,$M1
- vpmuludq $T4,$S3,$M2
- vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4
- vpaddq $M4,$D4,$D4 # d4 += r2'*r0
- vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1
- vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2
- vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3
-
- ################################################################
- # load input
- vmovdqu64 16*0($inp),%z#$T3
- vmovdqu64 16*4($inp),%z#$T4
- lea 16*8($inp),$inp
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$D3,$M3
- vpandq $MASK,$D3,$D3
- vpaddq $M3,$D4,$D4 # d3 -> d4
-
- vpsrlq \$26,$D0,$M0
- vpandq $MASK,$D0,$D0
- vpaddq $M0,$D1,$D1 # d0 -> d1
-
- vpsrlq \$26,$D4,$M4
- vpandq $MASK,$D4,$D4
-
- vpsrlq \$26,$D1,$M1
- vpandq $MASK,$D1,$D1
- vpaddq $M1,$D2,$D2 # d1 -> d2
-
- vpaddq $M4,$D0,$D0
- vpsllq \$2,$M4,$M4
- vpaddq $M4,$D0,$D0 # d4 -> d0
-
- vpsrlq \$26,$D2,$M2
- vpandq $MASK,$D2,$D2
- vpaddq $M2,$D3,$D3 # d2 -> d3
-
- vpsrlq \$26,$D0,$M0
- vpandq $MASK,$D0,$D0
- vpaddq $M0,$D1,$D1 # d0 -> d1
-
- vpsrlq \$26,$D3,$M3
- vpandq $MASK,$D3,$D3
- vpaddq $M3,$D4,$D4 # d3 -> d4
-
- ################################################################
- # at this point we have 14243444 in $R0-$S4 and 05060708 in
- # $D0-$D4, ...
-
- vpunpcklqdq $T4,$T3,$T0 # transpose input
- vpunpckhqdq $T4,$T3,$T4
-
- # ... since input 64-bit lanes are ordered as 73625140, we could
- # "vperm" it to 76543210 (here and in each loop iteration), *or*
- # we could just flow along, hence the goal for $R0-$S4 is
- # 1858286838784888 ...
-
- vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
- mov \$0x7777,%eax
- kmovw %eax,%k1
-
- vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
- vpermd $R1,$M0,$R1
- vpermd $R2,$M0,$R2
- vpermd $R3,$M0,$R3
- vpermd $R4,$M0,$R4
-
- vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
- vpermd $D1,$M0,${R1}{%k1}
- vpermd $D2,$M0,${R2}{%k1}
- vpermd $D3,$M0,${R3}{%k1}
- vpermd $D4,$M0,${R4}{%k1}
-
- vpslld \$2,$R1,$S1 # *5
- vpslld \$2,$R2,$S2
- vpslld \$2,$R3,$S3
- vpslld \$2,$R4,$S4
- vpaddd $R1,$S1,$S1
- vpaddd $R2,$S2,$S2
- vpaddd $R3,$S3,$S3
- vpaddd $R4,$S4,$S4
-
- vpbroadcastq 32(%rcx),$PADBIT # .L129
-
- vpsrlq \$52,$T0,$T2 # splat input
- vpsllq \$12,$T4,$T3
- vporq $T3,$T2,$T2
- vpsrlq \$26,$T0,$T1
- vpsrlq \$14,$T4,$T3
- vpsrlq \$40,$T4,$T4 # 4
- vpandq $MASK,$T2,$T2 # 2
- vpandq $MASK,$T0,$T0 # 0
- #vpandq $MASK,$T1,$T1 # 1
- #vpandq $MASK,$T3,$T3 # 3
- #vporq $PADBIT,$T4,$T4 # padbit, yes, always
-
- vpaddq $H2,$T2,$H2 # accumulate input
- sub \$192,$len
- jbe .Ltail_avx512
- jmp .Loop_avx512
-
-.align 32
-.Loop_avx512:
- ################################################################
- # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
- # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
- # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
- # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
- # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
- # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
- # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
- # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
- # \________/\___________/
- ################################################################
- #vpaddq $H2,$T2,$H2 # accumulate input
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
- # however, as h2 is "chronologically" first one available pull
- # corresponding operations up, so it's
- #
- # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
- # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
- # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
- # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
- # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
-
- vpmuludq $H2,$R1,$D3 # d3 = h2*r1
- vpaddq $H0,$T0,$H0
- vpmuludq $H2,$R2,$D4 # d4 = h2*r2
- vpandq $MASK,$T1,$T1 # 1
- vpmuludq $H2,$S3,$D0 # d0 = h2*s3
- vpandq $MASK,$T3,$T3 # 3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
- vporq $PADBIT,$T4,$T4 # padbit, yes, always
- vpmuludq $H2,$R0,$D2 # d2 = h2*r0
- vpaddq $H1,$T1,$H1 # accumulate input
- vpaddq $H3,$T3,$H3
- vpaddq $H4,$T4,$H4
-
- vmovdqu64 16*0($inp),$T3 # load input
- vmovdqu64 16*4($inp),$T4
- lea 16*8($inp),$inp
- vpmuludq $H0,$R3,$M3
- vpmuludq $H0,$R4,$M4
- vpmuludq $H0,$R0,$M0
- vpmuludq $H0,$R1,$M1
- vpaddq $M3,$D3,$D3 # d3 += h0*r3
- vpaddq $M4,$D4,$D4 # d4 += h0*r4
- vpaddq $M0,$D0,$D0 # d0 += h0*r0
- vpaddq $M1,$D1,$D1 # d1 += h0*r1
-
- vpmuludq $H1,$R2,$M3
- vpmuludq $H1,$R3,$M4
- vpmuludq $H1,$S4,$M0
- vpmuludq $H0,$R2,$M2
- vpaddq $M3,$D3,$D3 # d3 += h1*r2
- vpaddq $M4,$D4,$D4 # d4 += h1*r3
- vpaddq $M0,$D0,$D0 # d0 += h1*s4
- vpaddq $M2,$D2,$D2 # d2 += h0*r2
-
- vpunpcklqdq $T4,$T3,$T0 # transpose input
- vpunpckhqdq $T4,$T3,$T4
-
- vpmuludq $H3,$R0,$M3
- vpmuludq $H3,$R1,$M4
- vpmuludq $H1,$R0,$M1
- vpmuludq $H1,$R1,$M2
- vpaddq $M3,$D3,$D3 # d3 += h3*r0
- vpaddq $M4,$D4,$D4 # d4 += h3*r1
- vpaddq $M1,$D1,$D1 # d1 += h1*r0
- vpaddq $M2,$D2,$D2 # d2 += h1*r1
-
- vpmuludq $H4,$S4,$M3
- vpmuludq $H4,$R0,$M4
- vpmuludq $H3,$S2,$M0
- vpmuludq $H3,$S3,$M1
- vpaddq $M3,$D3,$D3 # d3 += h4*s4
- vpmuludq $H3,$S4,$M2
- vpaddq $M4,$D4,$D4 # d4 += h4*r0
- vpaddq $M0,$D0,$D0 # d0 += h3*s2
- vpaddq $M1,$D1,$D1 # d1 += h3*s3
- vpaddq $M2,$D2,$D2 # d2 += h3*s4
-
- vpmuludq $H4,$S1,$M0
- vpmuludq $H4,$S2,$M1
- vpmuludq $H4,$S3,$M2
- vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
- vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2
- vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3
-
- ################################################################
- # lazy reduction (interleaved with input splat)
-
- vpsrlq \$52,$T0,$T2 # splat input
- vpsllq \$12,$T4,$T3
-
- vpsrlq \$26,$D3,$H3
- vpandq $MASK,$D3,$D3
- vpaddq $H3,$D4,$H4 # h3 -> h4
-
- vporq $T3,$T2,$T2
-
- vpsrlq \$26,$H0,$D0
- vpandq $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpandq $MASK,$T2,$T2 # 2
-
- vpsrlq \$26,$H4,$D4
- vpandq $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpandq $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpaddq $T2,$H2,$H2 # modulo-scheduled
- vpsrlq \$26,$T0,$T1
-
- vpsrlq \$26,$H2,$D2
- vpandq $MASK,$H2,$H2
- vpaddq $D2,$D3,$H3 # h2 -> h3
-
- vpsrlq \$14,$T4,$T3
-
- vpsrlq \$26,$H0,$D0
- vpandq $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$40,$T4,$T4 # 4
-
- vpsrlq \$26,$H3,$D3
- vpandq $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpandq $MASK,$T0,$T0 # 0
- #vpandq $MASK,$T1,$T1 # 1
- #vpandq $MASK,$T3,$T3 # 3
- #vporq $PADBIT,$T4,$T4 # padbit, yes, always
-
- sub \$128,$len
- ja .Loop_avx512
-
-.Ltail_avx512:
- ################################################################
- # while above multiplications were by r^8 in all lanes, in last
- # iteration we multiply least significant lane by r^8 and most
- # significant one by r, that's why table gets shifted...
-
- vpsrlq \$32,$R0,$R0 # 0105020603070408
- vpsrlq \$32,$R1,$R1
- vpsrlq \$32,$R2,$R2
- vpsrlq \$32,$S3,$S3
- vpsrlq \$32,$S4,$S4
- vpsrlq \$32,$R3,$R3
- vpsrlq \$32,$R4,$R4
- vpsrlq \$32,$S1,$S1
- vpsrlq \$32,$S2,$S2
-
- ################################################################
- # load either next or last 64 byte of input
- lea ($inp,$len),$inp
-
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
-
- vpmuludq $H2,$R1,$D3 # d3 = h2*r1
- vpmuludq $H2,$R2,$D4 # d4 = h2*r2
- vpmuludq $H2,$S3,$D0 # d0 = h2*s3
- vpandq $MASK,$T1,$T1 # 1
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
- vpandq $MASK,$T3,$T3 # 3
- vpmuludq $H2,$R0,$D2 # d2 = h2*r0
- vporq $PADBIT,$T4,$T4 # padbit, yes, always
- vpaddq $H1,$T1,$H1 # accumulate input
- vpaddq $H3,$T3,$H3
- vpaddq $H4,$T4,$H4
-
- vmovdqu 16*0($inp),%x#$T0
- vpmuludq $H0,$R3,$M3
- vpmuludq $H0,$R4,$M4
- vpmuludq $H0,$R0,$M0
- vpmuludq $H0,$R1,$M1
- vpaddq $M3,$D3,$D3 # d3 += h0*r3
- vpaddq $M4,$D4,$D4 # d4 += h0*r4
- vpaddq $M0,$D0,$D0 # d0 += h0*r0
- vpaddq $M1,$D1,$D1 # d1 += h0*r1
-
- vmovdqu 16*1($inp),%x#$T1
- vpmuludq $H1,$R2,$M3
- vpmuludq $H1,$R3,$M4
- vpmuludq $H1,$S4,$M0
- vpmuludq $H0,$R2,$M2
- vpaddq $M3,$D3,$D3 # d3 += h1*r2
- vpaddq $M4,$D4,$D4 # d4 += h1*r3
- vpaddq $M0,$D0,$D0 # d0 += h1*s4
- vpaddq $M2,$D2,$D2 # d2 += h0*r2
-
- vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
- vpmuludq $H3,$R0,$M3
- vpmuludq $H3,$R1,$M4
- vpmuludq $H1,$R0,$M1
- vpmuludq $H1,$R1,$M2
- vpaddq $M3,$D3,$D3 # d3 += h3*r0
- vpaddq $M4,$D4,$D4 # d4 += h3*r1
- vpaddq $M1,$D1,$D1 # d1 += h1*r0
- vpaddq $M2,$D2,$D2 # d2 += h1*r1
-
- vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
- vpmuludq $H4,$S4,$M3
- vpmuludq $H4,$R0,$M4
- vpmuludq $H3,$S2,$M0
- vpmuludq $H3,$S3,$M1
- vpmuludq $H3,$S4,$M2
- vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
- vpaddq $M4,$D4,$D4 # d4 += h4*r0
- vpaddq $M0,$D0,$D0 # d0 += h3*s2
- vpaddq $M1,$D1,$D1 # d1 += h3*s3
- vpaddq $M2,$D2,$D2 # d2 += h3*s4
-
- vpmuludq $H4,$S1,$M0
- vpmuludq $H4,$S2,$M1
- vpmuludq $H4,$S3,$M2
- vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
- vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2
- vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3
-
- ################################################################
- # horizontal addition
-
- mov \$1,%eax
- vpermq \$0xb1,$H3,$D3
- vpermq \$0xb1,$D4,$H4
- vpermq \$0xb1,$H0,$D0
- vpermq \$0xb1,$H1,$D1
- vpermq \$0xb1,$H2,$D2
- vpaddq $D3,$H3,$H3
- vpaddq $D4,$H4,$H4
- vpaddq $D0,$H0,$H0
- vpaddq $D1,$H1,$H1
- vpaddq $D2,$H2,$H2
-
- kmovw %eax,%k3
- vpermq \$0x2,$H3,$D3
- vpermq \$0x2,$H4,$D4
- vpermq \$0x2,$H0,$D0
- vpermq \$0x2,$H1,$D1
- vpermq \$0x2,$H2,$D2
- vpaddq $D3,$H3,$H3
- vpaddq $D4,$H4,$H4
- vpaddq $D0,$H0,$H0
- vpaddq $D1,$H1,$H1
- vpaddq $D2,$H2,$H2
-
- vextracti64x4 \$0x1,$H3,%y#$D3
- vextracti64x4 \$0x1,$H4,%y#$D4
- vextracti64x4 \$0x1,$H0,%y#$D0
- vextracti64x4 \$0x1,$H1,%y#$D1
- vextracti64x4 \$0x1,$H2,%y#$D2
- vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
- vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
- vpaddq $D0,$H0,${H0}{%k3}{z}
- vpaddq $D1,$H1,${H1}{%k3}{z}
- vpaddq $D2,$H2,${H2}{%k3}{z}
-___
-map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
-map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
-$code.=<<___;
- ################################################################
- # lazy reduction (interleaved with input splat)
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpunpcklqdq $T3,$T2,$T2 # 2:3
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpsrlq \$30,$T2,$T3
- vpsrlq \$4,$T2,$T2
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpsrlq \$26,$T0,$T1
- vpsrlq \$40,$T4,$T4 # 4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T0,$T0 # 0
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
- vpand $MASK,$T1,$T1 # 1
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
- add \$64,$len
- jnz .Ltail_avx2$suffix
-
- vpsubq $T2,$H2,$H2 # undo input accumulation
- vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
- vmovd %x#$H1,`4*1-48-64`($ctx)
- vmovd %x#$H2,`4*2-48-64`($ctx)
- vmovd %x#$H3,`4*3-48-64`($ctx)
- vmovd %x#$H4,`4*4-48-64`($ctx)
- vzeroall
-___
-$code.=<<___ if ($win64);
- movdqa -0xb0(%r10),%xmm6
- movdqa -0xa0(%r10),%xmm7
- movdqa -0x90(%r10),%xmm8
- movdqa -0x80(%r10),%xmm9
- movdqa -0x70(%r10),%xmm10
- movdqa -0x60(%r10),%xmm11
- movdqa -0x50(%r10),%xmm12
- movdqa -0x40(%r10),%xmm13
- movdqa -0x30(%r10),%xmm14
- movdqa -0x20(%r10),%xmm15
- lea -8(%r10),%rsp
-.Ldo_avx512_epilogue:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- RET
-.cfi_endproc
-___
-
-}
-
-}
-
-&declare_function("poly1305_blocks_avx2", 32, 4);
-poly1305_blocks_avxN(0);
-&end_function("poly1305_blocks_avx2");
-
-#######################################################################
-if ($avx>2) {
-# On entry we have input length divisible by 64. But since inner loop
-# processes 128 bytes per iteration, cases when length is not divisible
-# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
-# reason stack layout is kept identical to poly1305_blocks_avx2. If not
-# for this tail, we wouldn't have to even allocate stack frame...
-
-&declare_function("poly1305_blocks_avx512", 32, 4);
-poly1305_blocks_avxN(1);
-&end_function("poly1305_blocks_avx512");
-
-if (!$kernel && $avx>3) {
-########################################################################
-# VPMADD52 version using 2^44 radix.
-#
-# One can argue that base 2^52 would be more natural. Well, even though
-# some operations would be more natural, one has to recognize couple of
-# things. Base 2^52 doesn't provide advantage over base 2^44 if you look
-# at amount of multiply-n-accumulate operations. Secondly, it makes it
-# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
-# reference implementations], which means that more such operations
-# would have to be performed in inner loop, which in turn makes critical
-# path longer. In other words, even though base 2^44 reduction might
-# look less elegant, overall critical path is actually shorter...
-
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int64 h[3]; # current hash value base 2^44
-# unsigned __int64 s[2]; # key value*20 base 2^44
-# unsigned __int64 r[3]; # key value base 2^44
-# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
-# # r^n positions reflect
-# # placement in register, not
-# # memory, R[3] is R[1]*20
-
-$code.=<<___;
-.type poly1305_init_base2_44,\@function,3
-.align 32
-poly1305_init_base2_44:
- xor %eax,%eax
- mov %rax,0($ctx) # initialize hash value
- mov %rax,8($ctx)
- mov %rax,16($ctx)
-
-.Linit_base2_44:
- lea poly1305_blocks_vpmadd52(%rip),%r10
- lea poly1305_emit_base2_44(%rip),%r11
-
- mov \$0x0ffffffc0fffffff,%rax
- mov \$0x0ffffffc0ffffffc,%rcx
- and 0($inp),%rax
- mov \$0x00000fffffffffff,%r8
- and 8($inp),%rcx
- mov \$0x00000fffffffffff,%r9
- and %rax,%r8
- shrd \$44,%rcx,%rax
- mov %r8,40($ctx) # r0
- and %r9,%rax
- shr \$24,%rcx
- mov %rax,48($ctx) # r1
- lea (%rax,%rax,4),%rax # *5
- mov %rcx,56($ctx) # r2
- shl \$2,%rax # magic <<2
- lea (%rcx,%rcx,4),%rcx # *5
- shl \$2,%rcx # magic <<2
- mov %rax,24($ctx) # s1
- mov %rcx,32($ctx) # s2
- movq \$-1,64($ctx) # write impossible value
-___
-$code.=<<___ if ($flavour !~ /elf32/);
- mov %r10,0(%rdx)
- mov %r11,8(%rdx)
-___
-$code.=<<___ if ($flavour =~ /elf32/);
- mov %r10d,0(%rdx)
- mov %r11d,4(%rdx)
-___
-$code.=<<___;
- mov \$1,%eax
- RET
-.size poly1305_init_base2_44,.-poly1305_init_base2_44
-___
-{
-my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
-my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
-my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
-
-$code.=<<___;
-.type poly1305_blocks_vpmadd52,\@function,4
-.align 32
-poly1305_blocks_vpmadd52:
- shr \$4,$len
- jz .Lno_data_vpmadd52 # too short
-
- shl \$40,$padbit
- mov 64($ctx),%r8 # peek on power of the key
-
- # if powers of the key are not calculated yet, process up to 3
- # blocks with this single-block subroutine, otherwise ensure that
- # length is divisible by 2 blocks and pass the rest down to next
- # subroutine...
-
- mov \$3,%rax
- mov \$1,%r10
- cmp \$4,$len # is input long
- cmovae %r10,%rax
- test %r8,%r8 # is power value impossible?
- cmovns %r10,%rax
-
- and $len,%rax # is input of favourable length?
- jz .Lblocks_vpmadd52_4x
-
- sub %rax,$len
- mov \$7,%r10d
- mov \$1,%r11d
- kmovw %r10d,%k7
- lea .L2_44_inp_permd(%rip),%r10
- kmovw %r11d,%k1
-
- vmovq $padbit,%x#$PAD
- vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
- vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
- vpermq \$0xcf,$PAD,$PAD
- vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
-
- vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
- vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
- vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
- vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
-
- vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
- vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
-
- jmp .Loop_vpmadd52
-
-.align 32
-.Loop_vpmadd52:
- vmovdqu32 0($inp),%x#$T0 # load input as ----3210
- lea 16($inp),$inp
-
- vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
- vpsrlvq $inp_shift,$T0,$T0
- vpandq $reduc_mask,$T0,$T0
- vporq $PAD,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo # accumulate input
-
- vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
- vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
- vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
-
- vpxord $Dlo,$Dlo,$Dlo
- vpxord $Dhi,$Dhi,$Dhi
-
- vpmadd52luq $r2r1r0,$H0,$Dlo
- vpmadd52huq $r2r1r0,$H0,$Dhi
-
- vpmadd52luq $r1r0s2,$H1,$Dlo
- vpmadd52huq $r1r0s2,$H1,$Dhi
-
- vpmadd52luq $r0s2s1,$H2,$Dlo
- vpmadd52huq $r0s2s1,$H2,$Dhi
-
- vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
- vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
- vpandq $reduc_mask,$Dlo,$Dlo
-
- vpaddq $T0,$Dhi,$Dhi
-
- vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
-
- vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
-
- vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
- vpandq $reduc_mask,$Dlo,$Dlo
-
- vpermq \$0b10010011,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo
-
- vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
-
- vpaddq $T0,$Dlo,$Dlo
- vpsllq \$2,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo
-
- dec %rax # len-=16
- jnz .Loop_vpmadd52
-
- vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
-
- test $len,$len
- jnz .Lblocks_vpmadd52_4x
-
-.Lno_data_vpmadd52:
- RET
-.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
-___
-}
-{
-########################################################################
-# As implied by its name 4x subroutine processes 4 blocks in parallel
-# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power
-# and is handled in 256-bit %ymm registers.
-
-my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
-my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
-my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
-
-$code.=<<___;
-.type poly1305_blocks_vpmadd52_4x,\@function,4
-.align 32
-poly1305_blocks_vpmadd52_4x:
- shr \$4,$len
- jz .Lno_data_vpmadd52_4x # too short
-
- shl \$40,$padbit
- mov 64($ctx),%r8 # peek on power of the key
-
-.Lblocks_vpmadd52_4x:
- vpbroadcastq $padbit,$PAD
-
- vmovdqa64 .Lx_mask44(%rip),$mask44
- mov \$5,%eax
- vmovdqa64 .Lx_mask42(%rip),$mask42
- kmovw %eax,%k1 # used in 2x path
-
- test %r8,%r8 # is power value impossible?
- js .Linit_vpmadd52 # if it is, then init R[4]
-
- vmovq 0($ctx),%x#$H0 # load current hash value
- vmovq 8($ctx),%x#$H1
- vmovq 16($ctx),%x#$H2
-
- test \$3,$len # is length 4*n+2?
- jnz .Lblocks_vpmadd52_2x_do
-
-.Lblocks_vpmadd52_4x_do:
- vpbroadcastq 64($ctx),$R0 # load 4th power of the key
- vpbroadcastq 96($ctx),$R1
- vpbroadcastq 128($ctx),$R2
- vpbroadcastq 160($ctx),$S1
-
-.Lblocks_vpmadd52_4x_key_loaded:
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S2,$S2
-
- test \$7,$len # is len 8*n?
- jz .Lblocks_vpmadd52_8x
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*2($inp),$T3
- lea 16*4($inp),$inp
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
-
- # at this point 64-bit lanes are ordered as 3-1-2-0
-
- vpsrlq \$24,$T3,$T2 # splat the data
- vporq $PAD,$T2,$T2
- vpaddq $T2,$H2,$H2 # accumulate input
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- sub \$4,$len
- jz .Ltail_vpmadd52_4x
- jmp .Loop_vpmadd52_4x
- ud2
-
-.align 32
-.Linit_vpmadd52:
- vmovq 24($ctx),%x#$S1 # load key
- vmovq 56($ctx),%x#$H2
- vmovq 32($ctx),%x#$S2
- vmovq 40($ctx),%x#$R0
- vmovq 48($ctx),%x#$R1
-
- vmovdqa $R0,$H0
- vmovdqa $R1,$H1
- vmovdqa $H2,$R2
-
- mov \$2,%eax
-
-.Lmul_init_vpmadd52:
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- dec %eax
- jz .Ldone_init_vpmadd52
-
- vpunpcklqdq $R1,$H1,$R1 # 1,2
- vpbroadcastq %x#$H1,%x#$H1 # 2,2
- vpunpcklqdq $R2,$H2,$R2
- vpbroadcastq %x#$H2,%x#$H2
- vpunpcklqdq $R0,$H0,$R0
- vpbroadcastq %x#$H0,%x#$H0
-
- vpsllq \$2,$R1,$S1 # S1 = R1*5*4
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R1,$S1,$S1
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S1,$S1
- vpsllq \$2,$S2,$S2
-
- jmp .Lmul_init_vpmadd52
- ud2
-
-.align 32
-.Ldone_init_vpmadd52:
- vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
- vinserti128 \$1,%x#$R2,$H2,$R2
- vinserti128 \$1,%x#$R0,$H0,$R0
-
- vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
- vpermq \$0b11011000,$R2,$R2
- vpermq \$0b11011000,$R0,$R0
-
- vpsllq \$2,$R1,$S1 # S1 = R1*5*4
- vpaddq $R1,$S1,$S1
- vpsllq \$2,$S1,$S1
-
- vmovq 0($ctx),%x#$H0 # load current hash value
- vmovq 8($ctx),%x#$H1
- vmovq 16($ctx),%x#$H2
-
- test \$3,$len # is length 4*n+2?
- jnz .Ldone_init_vpmadd52_2x
-
- vmovdqu64 $R0,64($ctx) # save key powers
- vpbroadcastq %x#$R0,$R0 # broadcast 4th power
- vmovdqu64 $R1,96($ctx)
- vpbroadcastq %x#$R1,$R1
- vmovdqu64 $R2,128($ctx)
- vpbroadcastq %x#$R2,$R2
- vmovdqu64 $S1,160($ctx)
- vpbroadcastq %x#$S1,$S1
-
- jmp .Lblocks_vpmadd52_4x_key_loaded
- ud2
-
-.align 32
-.Ldone_init_vpmadd52_2x:
- vmovdqu64 $R0,64($ctx) # save key powers
- vpsrldq \$8,$R0,$R0 # 0-1-0-2
- vmovdqu64 $R1,96($ctx)
- vpsrldq \$8,$R1,$R1
- vmovdqu64 $R2,128($ctx)
- vpsrldq \$8,$R2,$R2
- vmovdqu64 $S1,160($ctx)
- vpsrldq \$8,$S1,$S1
- jmp .Lblocks_vpmadd52_2x_key_loaded
- ud2
-
-.align 32
-.Lblocks_vpmadd52_2x_do:
- vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
- vmovdqu64 160+8($ctx),${S1}{%k1}{z}
- vmovdqu64 64+8($ctx),${R0}{%k1}{z}
- vmovdqu64 96+8($ctx),${R1}{%k1}{z}
-
-.Lblocks_vpmadd52_2x_key_loaded:
- vmovdqu64 16*0($inp),$T2 # load data
- vpxorq $T3,$T3,$T3
- lea 16*2($inp),$inp
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
-
- # at this point 64-bit lanes are ordered as x-1-x-0
-
- vpsrlq \$24,$T3,$T2 # splat the data
- vporq $PAD,$T2,$T2
- vpaddq $T2,$H2,$H2 # accumulate input
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- jmp .Ltail_vpmadd52_2x
- ud2
-
-.align 32
-.Loop_vpmadd52_4x:
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*2($inp),$T3
- lea 16*4($inp),$inp
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # partial reduction (interleaved with data splat)
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpsrlq \$24,$T3,$T2
- vporq $PAD,$T2,$T2
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- sub \$4,$len # len-=64
- jnz .Loop_vpmadd52_4x
-
-.Ltail_vpmadd52_4x:
- vmovdqu64 128($ctx),$R2 # load all key powers
- vmovdqu64 160($ctx),$S1
- vmovdqu64 64($ctx),$R0
- vmovdqu64 96($ctx),$R1
-
-.Ltail_vpmadd52_2x:
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S2,$S2
-
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # horizontal addition
-
- mov \$1,%eax
- kmovw %eax,%k1
- vpsrldq \$8,$D0lo,$T0
- vpsrldq \$8,$D0hi,$H0
- vpsrldq \$8,$D1lo,$T1
- vpsrldq \$8,$D1hi,$H1
- vpaddq $T0,$D0lo,$D0lo
- vpaddq $H0,$D0hi,$D0hi
- vpsrldq \$8,$D2lo,$T2
- vpsrldq \$8,$D2hi,$H2
- vpaddq $T1,$D1lo,$D1lo
- vpaddq $H1,$D1hi,$D1hi
- vpermq \$0x2,$D0lo,$T0
- vpermq \$0x2,$D0hi,$H0
- vpaddq $T2,$D2lo,$D2lo
- vpaddq $H2,$D2hi,$D2hi
-
- vpermq \$0x2,$D1lo,$T1
- vpermq \$0x2,$D1hi,$H1
- vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
- vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
- vpermq \$0x2,$D2lo,$T2
- vpermq \$0x2,$D2hi,$H2
- vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
- vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
- vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
- vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
- # at this point $len is
- # either 4*n+2 or 0...
- sub \$2,$len # len-=32
- ja .Lblocks_vpmadd52_4x_do
-
- vmovq %x#$H0,0($ctx)
- vmovq %x#$H1,8($ctx)
- vmovq %x#$H2,16($ctx)
- vzeroall
-
-.Lno_data_vpmadd52_4x:
- RET
-.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
-___
-}
-{
-########################################################################
-# As implied by its name 8x subroutine processes 8 blocks in parallel...
-# This is intermediate version, as it's used only in cases when input
-# length is either 8*n, 8*n+1 or 8*n+2...
-
-my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
-my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
-my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
-my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
-
-$code.=<<___;
-.type poly1305_blocks_vpmadd52_8x,\@function,4
-.align 32
-poly1305_blocks_vpmadd52_8x:
- shr \$4,$len
- jz .Lno_data_vpmadd52_8x # too short
-
- shl \$40,$padbit
- mov 64($ctx),%r8 # peek on power of the key
-
- vmovdqa64 .Lx_mask44(%rip),$mask44
- vmovdqa64 .Lx_mask42(%rip),$mask42
-
- test %r8,%r8 # is power value impossible?
- js .Linit_vpmadd52 # if it is, then init R[4]
-
- vmovq 0($ctx),%x#$H0 # load current hash value
- vmovq 8($ctx),%x#$H1
- vmovq 16($ctx),%x#$H2
-
-.Lblocks_vpmadd52_8x:
- ################################################################
- # fist we calculate more key powers
-
- vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
- vmovdqu64 160($ctx),$S1
- vmovdqu64 64($ctx),$R0
- vmovdqu64 96($ctx),$R1
-
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S2,$S2
-
- vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
- vpbroadcastq %x#$R0,$RR0
- vpbroadcastq %x#$R1,$RR1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $RR2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $RR2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $RR2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $RR2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $RR2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $RR2,$R0,$D2hi
-
- vpmadd52luq $RR0,$R0,$D0lo
- vpmadd52huq $RR0,$R0,$D0hi
- vpmadd52luq $RR0,$R1,$D1lo
- vpmadd52huq $RR0,$R1,$D1hi
- vpmadd52luq $RR0,$R2,$D2lo
- vpmadd52huq $RR0,$R2,$D2hi
-
- vpmadd52luq $RR1,$S2,$D0lo
- vpmadd52huq $RR1,$S2,$D0hi
- vpmadd52luq $RR1,$R0,$D1lo
- vpmadd52huq $RR1,$R0,$D1hi
- vpmadd52luq $RR1,$R1,$D2lo
- vpmadd52huq $RR1,$R1,$D2hi
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$RR0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$RR1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$RR2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$RR0,$RR0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$RR0,$RR0
-
- vpsrlq \$44,$RR0,$tmp # additional step
- vpandq $mask44,$RR0,$RR0
-
- vpaddq $tmp,$RR1,$RR1
-
- ################################################################
- # At this point Rx holds 1324 powers, RRx - 5768, and the goal
- # is 15263748, which reflects how data is loaded...
-
- vpunpcklqdq $R2,$RR2,$T2 # 3748
- vpunpckhqdq $R2,$RR2,$R2 # 1526
- vpunpcklqdq $R0,$RR0,$T0
- vpunpckhqdq $R0,$RR0,$R0
- vpunpcklqdq $R1,$RR1,$T1
- vpunpckhqdq $R1,$RR1,$R1
-___
-######## switch to %zmm
-map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
-map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
-map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
-map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
-
-$code.=<<___;
- vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
- vshufi64x2 \$0x44,$R0,$T0,$RR0
- vshufi64x2 \$0x44,$R1,$T1,$RR1
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*4($inp),$T3
- lea 16*8($inp),$inp
-
- vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
- vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
- vpaddq $RR2,$SS2,$SS2
- vpaddq $RR1,$SS1,$SS1
- vpsllq \$2,$SS2,$SS2
- vpsllq \$2,$SS1,$SS1
-
- vpbroadcastq $padbit,$PAD
- vpbroadcastq %x#$mask44,$mask44
- vpbroadcastq %x#$mask42,$mask42
-
- vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
- vpbroadcastq %x#$SS2,$S2
- vpbroadcastq %x#$RR0,$R0
- vpbroadcastq %x#$RR1,$R1
- vpbroadcastq %x#$RR2,$R2
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
-
- # at this point 64-bit lanes are ordered as 73625140
-
- vpsrlq \$24,$T3,$T2 # splat the data
- vporq $PAD,$T2,$T2
- vpaddq $T2,$H2,$H2 # accumulate input
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- sub \$8,$len
- jz .Ltail_vpmadd52_8x
- jmp .Loop_vpmadd52_8x
-
-.align 32
-.Loop_vpmadd52_8x:
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*4($inp),$T3
- lea 16*8($inp),$inp
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # partial reduction (interleaved with data splat)
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpsrlq \$24,$T3,$T2
- vporq $PAD,$T2,$T2
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- sub \$8,$len # len-=128
- jnz .Loop_vpmadd52_8x
-
-.Ltail_vpmadd52_8x:
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$SS1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$SS1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$SS2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$SS2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$RR0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$RR0,$D2hi
-
- vpmadd52luq $H0,$RR0,$D0lo
- vpmadd52huq $H0,$RR0,$D0hi
- vpmadd52luq $H0,$RR1,$D1lo
- vpmadd52huq $H0,$RR1,$D1hi
- vpmadd52luq $H0,$RR2,$D2lo
- vpmadd52huq $H0,$RR2,$D2hi
-
- vpmadd52luq $H1,$SS2,$D0lo
- vpmadd52huq $H1,$SS2,$D0hi
- vpmadd52luq $H1,$RR0,$D1lo
- vpmadd52huq $H1,$RR0,$D1hi
- vpmadd52luq $H1,$RR1,$D2lo
- vpmadd52huq $H1,$RR1,$D2hi
-
- ################################################################
- # horizontal addition
-
- mov \$1,%eax
- kmovw %eax,%k1
- vpsrldq \$8,$D0lo,$T0
- vpsrldq \$8,$D0hi,$H0
- vpsrldq \$8,$D1lo,$T1
- vpsrldq \$8,$D1hi,$H1
- vpaddq $T0,$D0lo,$D0lo
- vpaddq $H0,$D0hi,$D0hi
- vpsrldq \$8,$D2lo,$T2
- vpsrldq \$8,$D2hi,$H2
- vpaddq $T1,$D1lo,$D1lo
- vpaddq $H1,$D1hi,$D1hi
- vpermq \$0x2,$D0lo,$T0
- vpermq \$0x2,$D0hi,$H0
- vpaddq $T2,$D2lo,$D2lo
- vpaddq $H2,$D2hi,$D2hi
-
- vpermq \$0x2,$D1lo,$T1
- vpermq \$0x2,$D1hi,$H1
- vpaddq $T0,$D0lo,$D0lo
- vpaddq $H0,$D0hi,$D0hi
- vpermq \$0x2,$D2lo,$T2
- vpermq \$0x2,$D2hi,$H2
- vpaddq $T1,$D1lo,$D1lo
- vpaddq $H1,$D1hi,$D1hi
- vextracti64x4 \$1,$D0lo,%y#$T0
- vextracti64x4 \$1,$D0hi,%y#$H0
- vpaddq $T2,$D2lo,$D2lo
- vpaddq $H2,$D2hi,$D2hi
-
- vextracti64x4 \$1,$D1lo,%y#$T1
- vextracti64x4 \$1,$D1hi,%y#$H1
- vextracti64x4 \$1,$D2lo,%y#$T2
- vextracti64x4 \$1,$D2hi,%y#$H2
-___
-######## switch back to %ymm
-map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
-map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
-map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
-
-$code.=<<___;
- vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
- vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
- vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
- vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
- vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
- vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- ################################################################
-
- vmovq %x#$H0,0($ctx)
- vmovq %x#$H1,8($ctx)
- vmovq %x#$H2,16($ctx)
- vzeroall
-
-.Lno_data_vpmadd52_8x:
- RET
-.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
-___
-}
-$code.=<<___;
-.type poly1305_emit_base2_44,\@function,3
-.align 32
-poly1305_emit_base2_44:
- mov 0($ctx),%r8 # load hash value
- mov 8($ctx),%r9
- mov 16($ctx),%r10
-
- mov %r9,%rax
- shr \$20,%r9
- shl \$44,%rax
- mov %r10,%rcx
- shr \$40,%r10
- shl \$24,%rcx
-
- add %rax,%r8
- adc %rcx,%r9
- adc \$0,%r10
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- RET
-.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
-___
-} } }
-}
-
-if (!$kernel)
-{ # chacha20-poly1305 helpers
-my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
- ("%rdi","%rsi","%rdx","%rcx"); # Unix order
-$code.=<<___;
-.globl xor128_encrypt_n_pad
-.type xor128_encrypt_n_pad,\@abi-omnipotent
-.align 16
-xor128_encrypt_n_pad:
- sub $otp,$inp
- sub $otp,$out
- mov $len,%r10 # put len aside
- shr \$4,$len # len / 16
- jz .Ltail_enc
- nop
-.Loop_enc_xmm:
- movdqu ($inp,$otp),%xmm0
- pxor ($otp),%xmm0
- movdqu %xmm0,($out,$otp)
- movdqa %xmm0,($otp)
- lea 16($otp),$otp
- dec $len
- jnz .Loop_enc_xmm
-
- and \$15,%r10 # len % 16
- jz .Ldone_enc
-
-.Ltail_enc:
- mov \$16,$len
- sub %r10,$len
- xor %eax,%eax
-.Loop_enc_byte:
- mov ($inp,$otp),%al
- xor ($otp),%al
- mov %al,($out,$otp)
- mov %al,($otp)
- lea 1($otp),$otp
- dec %r10
- jnz .Loop_enc_byte
-
- xor %eax,%eax
-.Loop_enc_pad:
- mov %al,($otp)
- lea 1($otp),$otp
- dec $len
- jnz .Loop_enc_pad
-
-.Ldone_enc:
- mov $otp,%rax
- RET
-.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
-
-.globl xor128_decrypt_n_pad
-.type xor128_decrypt_n_pad,\@abi-omnipotent
-.align 16
-xor128_decrypt_n_pad:
- sub $otp,$inp
- sub $otp,$out
- mov $len,%r10 # put len aside
- shr \$4,$len # len / 16
- jz .Ltail_dec
- nop
-.Loop_dec_xmm:
- movdqu ($inp,$otp),%xmm0
- movdqa ($otp),%xmm1
- pxor %xmm0,%xmm1
- movdqu %xmm1,($out,$otp)
- movdqa %xmm0,($otp)
- lea 16($otp),$otp
- dec $len
- jnz .Loop_dec_xmm
-
- pxor %xmm1,%xmm1
- and \$15,%r10 # len % 16
- jz .Ldone_dec
-
-.Ltail_dec:
- mov \$16,$len
- sub %r10,$len
- xor %eax,%eax
- xor %r11d,%r11d
-.Loop_dec_byte:
- mov ($inp,$otp),%r11b
- mov ($otp),%al
- xor %r11b,%al
- mov %al,($out,$otp)
- mov %r11b,($otp)
- lea 1($otp),$otp
- dec %r10
- jnz .Loop_dec_byte
-
- xor %eax,%eax
-.Loop_dec_pad:
- mov %al,($otp)
- lea 1($otp),$otp
- dec $len
- jnz .Loop_dec_pad
-
-.Ldone_dec:
- mov $otp,%rax
- RET
-.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
-___
-}
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-# CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern __imp_RtlVirtualUnwind
-.type se_handler,\@abi-omnipotent
-.align 16
-se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lcommon_seh_tail
-
- lea 48(%rax),%rax
-
- mov -8(%rax),%rbx
- mov -16(%rax),%rbp
- mov -24(%rax),%r12
- mov -32(%rax),%r13
- mov -40(%rax),%r14
- mov -48(%rax),%r15
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
- mov %r15,240($context) # restore context->R14
-
- jmp .Lcommon_seh_tail
-.size se_handler,.-se_handler
-
-.type avx_handler,\@abi-omnipotent
-.align 16
-avx_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lcommon_seh_tail
-
- mov 208($context),%rax # pull context->R11
-
- lea 0x50(%rax),%rsi
- lea 0xf8(%rax),%rax
- lea 512($context),%rdi # &context.Xmm6
- mov \$20,%ecx
- .long 0xa548f3fc # cld; rep movsq
-
-.Lcommon_seh_tail:
- mov 8(%rax),%rdi
- mov 16(%rax),%rsi
- mov %rax,152($context) # restore context->Rsp
- mov %rsi,168($context) # restore context->Rsi
- mov %rdi,176($context) # restore context->Rdi
-
- mov 40($disp),%rdi # disp->ContextRecord
- mov $context,%rsi # context
- mov \$154,%ecx # sizeof(CONTEXT)
- .long 0xa548f3fc # cld; rep movsq
-
- mov $disp,%rsi
- xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
- mov 40(%rsi),%r10 # disp->ContextRecord
- lea 56(%rsi),%r11 # &disp->HandlerData
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
- mov %r10,32(%rsp) # arg5
- mov %r11,40(%rsp) # arg6
- mov %r12,48(%rsp) # arg7
- mov %rcx,56(%rsp) # arg8, (NULL)
- call *__imp_RtlVirtualUnwind(%rip)
-
- mov \$1,%eax # ExceptionContinueSearch
- add \$64,%rsp
- popfq
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- pop %rdi
- pop %rsi
- RET
-.size avx_handler,.-avx_handler
-
-.section .pdata
-.align 4
- .rva .LSEH_begin_poly1305_block_init_arch
- .rva .LSEH_end_poly1305_block_init_arch
- .rva .LSEH_info_poly1305_block_init_arch
-
- .rva .LSEH_begin_poly1305_blocks_x86_64
- .rva .LSEH_end_poly1305_blocks_x86_64
- .rva .LSEH_info_poly1305_blocks_x86_64
-
- .rva .LSEH_begin_poly1305_emit_x86_64
- .rva .LSEH_end_poly1305_emit_x86_64
- .rva .LSEH_info_poly1305_emit_x86_64
-___
-$code.=<<___ if ($avx);
- .rva .LSEH_begin_poly1305_blocks_avx
- .rva .Lbase2_64_avx
- .rva .LSEH_info_poly1305_blocks_avx_1
-
- .rva .Lbase2_64_avx
- .rva .Leven_avx
- .rva .LSEH_info_poly1305_blocks_avx_2
-
- .rva .Leven_avx
- .rva .LSEH_end_poly1305_blocks_avx
- .rva .LSEH_info_poly1305_blocks_avx_3
-
- .rva .LSEH_begin_poly1305_emit_avx
- .rva .LSEH_end_poly1305_emit_avx
- .rva .LSEH_info_poly1305_emit_avx
-___
-$code.=<<___ if ($avx>1);
- .rva .LSEH_begin_poly1305_blocks_avx2
- .rva .Lbase2_64_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_1
-
- .rva .Lbase2_64_avx2
- .rva .Leven_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_2
-
- .rva .Leven_avx2
- .rva .LSEH_end_poly1305_blocks_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_3
-___
-$code.=<<___ if ($avx>2);
- .rva .LSEH_begin_poly1305_blocks_avx512
- .rva .LSEH_end_poly1305_blocks_avx512
- .rva .LSEH_info_poly1305_blocks_avx512
-___
-$code.=<<___;
-.section .xdata
-.align 8
-.LSEH_info_poly1305_block_init_arch:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch
-
-.LSEH_info_poly1305_blocks_x86_64:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_body,.Lblocks_epilogue
-
-.LSEH_info_poly1305_emit_x86_64:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
-___
-$code.=<<___ if ($avx);
-.LSEH_info_poly1305_blocks_avx_1:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx_2:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx_3:
- .byte 9,0,0,0
- .rva avx_handler
- .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_emit_avx:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
-___
-$code.=<<___ if ($avx>1);
-.LSEH_info_poly1305_blocks_avx2_1:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx2_2:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx2_3:
- .byte 9,0,0,0
- .rva avx_handler
- .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
-___
-$code.=<<___ if ($avx>2);
-.LSEH_info_poly1305_blocks_avx512:
- .byte 9,0,0,0
- .rva avx_handler
- .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
-___
-}
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-foreach (split('\n',$code)) {
- s/\`([^\`]*)\`/eval($1)/ge;
- s/%r([a-z]+)#d/%e$1/g;
- s/%r([0-9]+)#d/%r$1d/g;
- s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
-
- if ($kernel) {
- s/(^\.type.*),[0-9]+$/\1/;
- s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
- next if /^\.cfi.*/;
- }
-
- print $_,"\n";
-}
-close STDOUT;
diff --git a/arch/x86/lib/crypto/poly1305_glue.c b/arch/x86/lib/crypto/poly1305_glue.c
deleted file mode 100644
index b7e78a583e07..000000000000
--- a/arch/x86/lib/crypto/poly1305_glue.c
+++ /dev/null
@@ -1,129 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <asm/cpu_device_id.h>
-#include <asm/fpu/api.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-#include <linux/unaligned.h>
-
-struct poly1305_arch_internal {
- union {
- struct {
- u32 h[5];
- u32 is_base2_26;
- };
- u64 hs[3];
- };
- u64 r[2];
- u64 pad;
- struct { u32 r2, r1, r4, r3; } rn[9];
-};
-
-asmlinkage void poly1305_block_init_arch(
- struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx,
- const u8 *inp,
- const size_t len, const u32 padbit);
-asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx,
- u8 mac[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx,
- u8 mac[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx,
- const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx,
- const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx,
- const u8 *inp,
- const size_t len, const u32 padbit);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
-
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp,
- unsigned int len, u32 padbit)
-{
- struct poly1305_arch_internal *ctx =
- container_of(&state->h.h, struct poly1305_arch_internal, h);
-
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
- SZ_4K % POLY1305_BLOCK_SIZE);
-
- if (!static_branch_likely(&poly1305_use_avx)) {
- poly1305_blocks_x86_64(ctx, inp, len, padbit);
- return;
- }
-
- do {
- const unsigned int bytes = min(len, SZ_4K);
-
- kernel_fpu_begin();
- if (static_branch_likely(&poly1305_use_avx512))
- poly1305_blocks_avx512(ctx, inp, bytes, padbit);
- else if (static_branch_likely(&poly1305_use_avx2))
- poly1305_blocks_avx2(ctx, inp, bytes, padbit);
- else
- poly1305_blocks_avx(ctx, inp, bytes, padbit);
- kernel_fpu_end();
-
- len -= bytes;
- inp += bytes;
- } while (len);
-}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-
-void poly1305_emit_arch(const struct poly1305_state *ctx,
- u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4])
-{
- if (!static_branch_likely(&poly1305_use_avx))
- poly1305_emit_x86_64(ctx, mac, nonce);
- else
- poly1305_emit_avx(ctx, mac, nonce);
-}
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
- return static_key_enabled(&poly1305_use_avx);
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init poly1305_simd_mod_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_AVX) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
- static_branch_enable(&poly1305_use_avx);
- if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
- static_branch_enable(&poly1305_use_avx2);
- if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
- /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
- boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X)
- static_branch_enable(&poly1305_use_avx512);
- return 0;
-}
-subsys_initcall(poly1305_simd_mod_init);
-
-static void __exit poly1305_simd_mod_exit(void)
-{
-}
-module_exit(poly1305_simd_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
-MODULE_DESCRIPTION("Poly1305 authenticator");
diff --git a/arch/x86/lib/crypto/sha256-avx-asm.S b/arch/x86/lib/crypto/sha256-avx-asm.S
deleted file mode 100644
index 0d7b2c3e45d9..000000000000
--- a/arch/x86/lib/crypto/sha256-avx-asm.S
+++ /dev/null
@@ -1,499 +0,0 @@
-########################################################################
-# Implement fast SHA-256 with AVX1 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-# This code schedules 1 block at a time, with 4 lanes per block
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-## assume buffers not aligned
-#define VMOVDQ vmovdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-
-.macro MY_ROR p1 p2
- shld $(32-(\p1)), \p2, \p2
-.endm
-
-################################
-
-# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
-# Load xmm with mem and byte swap each dword
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- VMOVDQ \p2, \p1
- vpshufb \p3, \p1, \p1
-.endm
-
-################################
-
-X0 = %xmm4
-X1 = %xmm5
-X2 = %xmm6
-X3 = %xmm7
-
-XTMP0 = %xmm0
-XTMP1 = %xmm1
-XTMP2 = %xmm2
-XTMP3 = %xmm3
-XTMP4 = %xmm8
-XFER = %xmm9
-XTMP5 = %xmm11
-
-SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %xmm13
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-
-SRND = %rsi # clobbers INP
-c = %ecx
-d = %r8d
-e = %edx
-TBL = %r12
-a = %eax
-b = %ebx
-
-f = %r9d
-g = %r10d
-h = %r11d
-
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_XFER_SIZE = 16
-_XMM_SAVE_SIZE = 0
-
-_INP_END = 0
-_INP = _INP_END + _INP_END_SIZE
-_XFER = _INP + _INP_SIZE
-_XMM_SAVE = _XFER + _XFER_SIZE
-STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- ## compute s0 four at a time and s1 two at a time
- ## compute W[-16] + W[-7] 4 at a time
-
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor a, y1 # y1 = a ^ (a >> (22-13)
- xor g, y2 # y2 = f^g
- vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- ## compute s0
- vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpsrld $7, XTMP1, XTMP2
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpslld $(32-7), XTMP1, XTMP3
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- vpsrld $18, XTMP1, XTMP2 #
- xor a, y1 # y1 = a ^ (a >> (22-13)
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- vpslld $(32-18), XTMP1, XTMP1
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- vpxor XTMP1, XTMP3, XTMP3 #
- add y0, y2 # y2 = S1 + CH
- add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- ## compute low s1
- vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13)
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
- xor g, y2 # y2 = f^g
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA}
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- vpxor XTMP3, XTMP2, XTMP2 #
- add y0, y2 # y2 = S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- ## compute high s1
- vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
- xor a, y1 # y1 = a ^ (a >> (22-13)
- xor g, y2 # y2 = f^g
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC}
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- vpxor XTMP3, XTMP2, XTMP2
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-## input is [rsp + _XFER + %1 * 4]
-.macro DO_ROUND round
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- xor e, y0 # y0 = e ^ (e >> (25-11))
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13)
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- and e, y2 # y2 = (f^g)&e
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- add y0, y2 # y2 = S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- offset = \round * 4 + _XFER #
- add offset(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
-.endm
-
-########################################################################
-## void sha256_transform_avx(u32 state[SHA256_STATE_WORDS],
-## const u8 *data, size_t nblocks);
-########################################################################
-.text
-SYM_FUNC_START(sha256_transform_avx)
- ANNOTATE_NOENDBR # since this is called only via static_call
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbp
- movq %rsp, %rbp
-
- subq $STACK_SIZE, %rsp # allocate stack space
- and $~15, %rsp # align stack pointer
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- add INP, NUM_BLKS # pointer to end of data
- mov NUM_BLKS, _INP_END(%rsp)
-
- ## load initial digest
- mov 4*0(CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-.Lloop0:
- lea K256(%rip), TBL
-
- ## byte swap first 16 dwords
- COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
-
- mov INP, _INP(%rsp)
-
- ## schedule 48 input dwords, by doing 3 rounds of 16 each
- mov $3, SRND
-.align 16
-.Lloop1:
- vpaddd (TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 1*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 2*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 3*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- add $4*16, TBL
- FOUR_ROUNDS_AND_SCHED
-
- sub $1, SRND
- jne .Lloop1
-
- mov $2, SRND
-.Lloop2:
- vpaddd (TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- vpaddd 1*16(TBL), X1, XFER
- vmovdqa XFER, _XFER(%rsp)
- add $2*16, TBL
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- vmovdqa X2, X0
- vmovdqa X3, X1
-
- sub $1, SRND
- jne .Lloop2
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- mov _INP(%rsp), INP
- add $64, INP
- cmp _INP_END(%rsp), INP
- jne .Lloop0
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- popq %rbp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- RET
-SYM_FUNC_END(sha256_transform_avx)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
-
-.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
-.align 16
-# shuffle xBxA -> 00BA
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
-.align 16
-# shuffle xDxC -> DC00
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/arch/x86/lib/crypto/sha256-avx2-asm.S b/arch/x86/lib/crypto/sha256-avx2-asm.S
deleted file mode 100644
index 25d3380321ec..000000000000
--- a/arch/x86/lib/crypto/sha256-avx2-asm.S
+++ /dev/null
@@ -1,774 +0,0 @@
-########################################################################
-# Implement fast SHA-256 with AVX2 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-# This code schedules 2 blocks at a time, with 4 lanes per block
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-## assume buffers not aligned
-#define VMOVDQ vmovdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-################################
-
-X0 = %ymm4
-X1 = %ymm5
-X2 = %ymm6
-X3 = %ymm7
-
-# XMM versions of above
-XWORD0 = %xmm4
-XWORD1 = %xmm5
-XWORD2 = %xmm6
-XWORD3 = %xmm7
-
-XTMP0 = %ymm0
-XTMP1 = %ymm1
-XTMP2 = %ymm2
-XTMP3 = %ymm3
-XTMP4 = %ymm8
-XFER = %ymm9
-XTMP5 = %ymm11
-
-SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %ymm13
-
-X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-c = %ecx
-d = %r8d
-e = %edx # clobbers NUM_BLKS
-y3 = %esi # clobbers INP
-
-SRND = CTX # SRND is same register as CTX
-
-a = %eax
-b = %ebx
-f = %r9d
-g = %r10d
-h = %r11d
-old_h = %r11d
-
-T1 = %r12d
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
-_XMM_SAVE_SIZE = 0
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_CTX_SIZE = 8
-
-_XFER = 0
-_XMM_SAVE = _XFER + _XFER_SIZE
-_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
-_INP = _INP_END + _INP_END_SIZE
-_CTX = _INP + _INP_SIZE
-STACK_SIZE = _CTX + _CTX_SIZE
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
- X_ = X0
- X0 = X1
- X1 = X2
- X2 = X3
- X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
- old_h = h
- TMP_ = h
- h = g
- g = f
- f = e
- e = d
- d = c
- c = b
- b = a
- a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED disp
-################################### RND N + 0 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
-
- addl \disp(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
- vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
- vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
-
- and e, y2 # y2 = (f^g)&e # CH
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- add h, d # d = k + w + h + d # --
-
- and b, y3 # y3 = (a|c)&b # MAJA
- vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
-
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- vpsrld $7, XTMP1, XTMP2
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
-
- add y0, y2 # y2 = S1 + CH # --
- vpslld $(32-7), XTMP1, XTMP3
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
-
- vpsrld $18, XTMP1, XTMP2
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
-
- ROTATE_ARGS
-
-################################### RND N + 1 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- offset = \disp + 1*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
-
- vpslld $(32-18), XTMP1, XTMP1
- and b, y3 # y3 = (a|c)&b # MAJA
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
-
- vpxor XTMP1, XTMP3, XTMP3
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
- vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
-
-
- ROTATE_ARGS
-
-################################### RND N + 2 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- offset = \disp + 2*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
-
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- or c, y3 # y3 = a|c # MAJA
- mov f, y2 # y2 = f # CH
- xor g, y2 # y2 = f^g # CH
-
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
- and e, y2 # y2 = (f^g)&e # CH
-
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- vpxor XTMP3, XTMP2, XTMP2
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a ,T1 # T1 = (a >> 2) # S0
- vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
- vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
-
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1,h # h = k + w + h + S0 # --
- add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3,h # h = t1 + S0 + MAJ # --
-
-
- ROTATE_ARGS
-
-################################### RND N + 3 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- offset = \disp + 3*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpxor XTMP3, XTMP2, XTMP2
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- add y0, y2 # y2 = S1 + CH # --
-
- vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
-
- vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
-
- add y1, h # h = k + w + h + S0 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-.macro DO_4ROUNDS disp
-################################### RND N + 0 ###########################
-
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- addl \disp(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 1 ###########################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*1 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 2 ##############################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*2 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 3 ###########################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*3 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
-
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3, h # h = t1 + S0 + MAJ # --
-
- ROTATE_ARGS
-
-.endm
-
-########################################################################
-## void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS],
-## const u8 *data, size_t nblocks);
-########################################################################
-.text
-SYM_FUNC_START(sha256_transform_rorx)
- ANNOTATE_NOENDBR # since this is called only via static_call
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
- push %rbp
- mov %rsp, %rbp
-
- subq $STACK_SIZE, %rsp
- and $-32, %rsp # align rsp to 32 byte boundary
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
- mov NUM_BLKS, _INP_END(%rsp)
-
- cmp NUM_BLKS, INP
- je .Lonly_one_block
-
- ## load initial digest
- mov (CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-
- mov CTX, _CTX(%rsp)
-
-.Lloop0:
- ## Load first 16 dwords from two blocks
- VMOVDQ 0*32(INP),XTMP0
- VMOVDQ 1*32(INP),XTMP1
- VMOVDQ 2*32(INP),XTMP2
- VMOVDQ 3*32(INP),XTMP3
-
- ## byte swap data
- vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
- vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
- vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
- vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
-
- ## transpose data into high/low halves
- vperm2i128 $0x20, XTMP2, XTMP0, X0
- vperm2i128 $0x31, XTMP2, XTMP0, X1
- vperm2i128 $0x20, XTMP3, XTMP1, X2
- vperm2i128 $0x31, XTMP3, XTMP1, X3
-
-.Llast_block_enter:
- add $64, INP
- mov INP, _INP(%rsp)
-
- ## schedule 48 input dwords, by doing 3 rounds of 12 each
- xor SRND, SRND
-
-.align 16
-.Lloop1:
- leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 0*32)
-
- leaq K256+1*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 1*32)
-
- leaq K256+2*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 2*32)
-
- leaq K256+3*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 3*32)
-
- add $4*32, SRND
- cmp $3*4*32, SRND
- jb .Lloop1
-
-.Lloop2:
- ## Do last 16 rounds with no scheduling
- leaq K256+0*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- DO_4ROUNDS (_XFER + 0*32)
-
- leaq K256+1*32(%rip), INP
- vpaddd (INP, SRND), X1, XFER
- vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- DO_4ROUNDS (_XFER + 1*32)
- add $2*32, SRND
-
- vmovdqa X2, X0
- vmovdqa X3, X1
-
- cmp $4*4*32, SRND
- jb .Lloop2
-
- mov _CTX(%rsp), CTX
- mov _INP(%rsp), INP
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- cmp _INP_END(%rsp), INP
- ja .Ldone_hash
-
- #### Do second block using previously scheduled results
- xor SRND, SRND
-.align 16
-.Lloop3:
- DO_4ROUNDS (_XFER + 0*32 + 16)
- DO_4ROUNDS (_XFER + 1*32 + 16)
- add $2*32, SRND
- cmp $4*4*32, SRND
- jb .Lloop3
-
- mov _CTX(%rsp), CTX
- mov _INP(%rsp), INP
- add $64, INP
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- cmp _INP_END(%rsp), INP
- jb .Lloop0
- ja .Ldone_hash
-
-.Ldo_last_block:
- VMOVDQ 0*16(INP),XWORD0
- VMOVDQ 1*16(INP),XWORD1
- VMOVDQ 2*16(INP),XWORD2
- VMOVDQ 3*16(INP),XWORD3
-
- vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
- vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
- vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
- vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
-
- jmp .Llast_block_enter
-
-.Lonly_one_block:
-
- ## load initial digest
- mov (4*0)(CTX),a
- mov (4*1)(CTX),b
- mov (4*2)(CTX),c
- mov (4*3)(CTX),d
- mov (4*4)(CTX),e
- mov (4*5)(CTX),f
- mov (4*6)(CTX),g
- mov (4*7)(CTX),h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-
- mov CTX, _CTX(%rsp)
- jmp .Ldo_last_block
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- pop %rbp
-
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- vzeroupper
- RET
-SYM_FUNC_END(sha256_transform_rorx)
-
-.section .rodata.cst512.K256, "aM", @progbits, 512
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
-.align 32
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
-
-# shuffle xBxA -> 00BA
-.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
-.align 32
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-# shuffle xDxC -> DC00
-.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
-.align 32
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/arch/x86/lib/crypto/sha256-ni-asm.S b/arch/x86/lib/crypto/sha256-ni-asm.S
deleted file mode 100644
index d3548206cf3d..000000000000
--- a/arch/x86/lib/crypto/sha256-ni-asm.S
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Intel SHA Extensions optimized implementation of a SHA-256 update function
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Sean Gulley <sean.m.gulley@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-#define STATE_PTR %rdi /* 1st arg */
-#define DATA_PTR %rsi /* 2nd arg */
-#define NUM_BLKS %rdx /* 3rd arg */
-
-#define SHA256CONSTANTS %rax
-
-#define MSG %xmm0 /* sha256rnds2 implicit operand */
-#define STATE0 %xmm1
-#define STATE1 %xmm2
-#define MSG0 %xmm3
-#define MSG1 %xmm4
-#define MSG2 %xmm5
-#define MSG3 %xmm6
-#define TMP %xmm7
-
-#define SHUF_MASK %xmm8
-
-#define ABEF_SAVE %xmm9
-#define CDGH_SAVE %xmm10
-
-.macro do_4rounds i, m0, m1, m2, m3
-.if \i < 16
- movdqu \i*4(DATA_PTR), \m0
- pshufb SHUF_MASK, \m0
-.endif
- movdqa (\i-32)*4(SHA256CONSTANTS), MSG
- paddd \m0, MSG
- sha256rnds2 STATE0, STATE1
-.if \i >= 12 && \i < 60
- movdqa \m0, TMP
- palignr $4, \m3, TMP
- paddd TMP, \m1
- sha256msg2 \m0, \m1
-.endif
- punpckhqdq MSG, MSG
- sha256rnds2 STATE1, STATE0
-.if \i >= 4 && \i < 52
- sha256msg1 \m0, \m3
-.endif
-.endm
-
-/*
- * Intel SHA Extensions optimized implementation of a SHA-256 block function
- *
- * This function takes a pointer to the current SHA-256 state, a pointer to the
- * input data, and the number of 64-byte blocks to process. Once all blocks
- * have been processed, the state is updated with the new state. This function
- * only processes complete blocks. State initialization, buffering of partial
- * blocks, and digest finalization is expected to be handled elsewhere.
- *
- * void sha256_ni_transform(u32 state[SHA256_STATE_WORDS],
- * const u8 *data, size_t nblocks);
- */
-.text
-SYM_FUNC_START(sha256_ni_transform)
- ANNOTATE_NOENDBR # since this is called only via static_call
-
- shl $6, NUM_BLKS /* convert to bytes */
- jz .Ldone_hash
- add DATA_PTR, NUM_BLKS /* pointer to end of data */
-
- /*
- * load initial hash values
- * Need to reorder these appropriately
- * DCBA, HGFE -> ABEF, CDGH
- */
- movdqu 0*16(STATE_PTR), STATE0 /* DCBA */
- movdqu 1*16(STATE_PTR), STATE1 /* HGFE */
-
- movdqa STATE0, TMP
- punpcklqdq STATE1, STATE0 /* FEBA */
- punpckhqdq TMP, STATE1 /* DCHG */
- pshufd $0x1B, STATE0, STATE0 /* ABEF */
- pshufd $0xB1, STATE1, STATE1 /* CDGH */
-
- movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
- lea K256+32*4(%rip), SHA256CONSTANTS
-
-.Lloop0:
- /* Save hash values for addition after rounds */
- movdqa STATE0, ABEF_SAVE
- movdqa STATE1, CDGH_SAVE
-
-.irp i, 0, 16, 32, 48
- do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3
- do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0
- do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1
- do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2
-.endr
-
- /* Add current hash values with previously saved */
- paddd ABEF_SAVE, STATE0
- paddd CDGH_SAVE, STATE1
-
- /* Increment data pointer and loop if more to process */
- add $64, DATA_PTR
- cmp NUM_BLKS, DATA_PTR
- jne .Lloop0
-
- /* Write hash values back in the correct order */
- movdqa STATE0, TMP
- punpcklqdq STATE1, STATE0 /* GHEF */
- punpckhqdq TMP, STATE1 /* ABCD */
- pshufd $0xB1, STATE0, STATE0 /* HGFE */
- pshufd $0x1B, STATE1, STATE1 /* DCBA */
-
- movdqu STATE1, 0*16(STATE_PTR)
- movdqu STATE0, 1*16(STATE_PTR)
-
-.Ldone_hash:
-
- RET
-SYM_FUNC_END(sha256_ni_transform)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/arch/x86/lib/crypto/sha256-ssse3-asm.S b/arch/x86/lib/crypto/sha256-ssse3-asm.S
deleted file mode 100644
index 7f24a4cdcb25..000000000000
--- a/arch/x86/lib/crypto/sha256-ssse3-asm.S
+++ /dev/null
@@ -1,511 +0,0 @@
-########################################################################
-# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-## assume buffers not aligned
-#define MOVDQ movdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-################################
-
-# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
-# Load xmm with mem and byte swap each dword
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- MOVDQ \p2, \p1
- pshufb \p3, \p1
-.endm
-
-################################
-
-X0 = %xmm4
-X1 = %xmm5
-X2 = %xmm6
-X3 = %xmm7
-
-XTMP0 = %xmm0
-XTMP1 = %xmm1
-XTMP2 = %xmm2
-XTMP3 = %xmm3
-XTMP4 = %xmm8
-XFER = %xmm9
-
-SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %xmm12
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-
-SRND = %rsi # clobbers INP
-c = %ecx
-d = %r8d
-e = %edx
-TBL = %r12
-a = %eax
-b = %ebx
-
-f = %r9d
-g = %r10d
-h = %r11d
-
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_XFER_SIZE = 16
-_XMM_SAVE_SIZE = 0
-
-_INP_END = 0
-_INP = _INP_END + _INP_END_SIZE
-_XFER = _INP + _INP_SIZE
-_XMM_SAVE = _XFER + _XFER_SIZE
-STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- ## compute s0 four at a time and s1 two at a time
- ## compute W[-16] + W[-7] 4 at a time
- movdqa X3, XTMP0
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- palignr $4, X2, XTMP0 # XTMP0 = W[-7]
- ror $(22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- movdqa X1, XTMP1
- xor a, y1 # y1 = a ^ (a >> (22-13)
- xor g, y2 # y2 = f^g
- paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- ## compute s0
- palignr $4, X0, XTMP1 # XTMP1 = W[-15]
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH
- movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pslld $(32-7), XTMP1 #
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- psrld $7, XTMP2 #
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- #
- ROTATE_ARGS #
- movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
- ror $(25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(22-13), y1 # y1 = a >> (22-13)
- pslld $(32-18), XTMP3 #
- xor a, y1 # y1 = a ^ (a >> (22-13)
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- psrld $18, XTMP2 #
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- pxor XTMP3, XTMP1
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
- add y0, y2 # y2 = S1 + CH
- add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pxor XTMP4, XTMP1 # XTMP1 = s0
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- ## compute low s1
- pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
-
- ROTATE_ARGS
- movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- ror $(25-11), y0 # y0 = e >> (25-11)
- movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
- xor e, y0 # y0 = e ^ (e >> (25-11))
- ror $(22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13)
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
- xor g, y2 # y2 = f^g
- psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- pxor XTMP3, XTMP2
- add y0, y2 # y2 = S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- ## compute high s1
- pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA}
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- #
- ROTATE_ARGS #
- movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
- ror $(22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
- xor a, y1 # y1 = a ^ (a >> (22-13)
- xor g, y2 # y2 = f^g
- psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25
- and e, y2 # y2 = (f^g)&e
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- pxor XTMP3, XTMP2 #
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2
- add y0, y2 # y2 = S1 + CH
- add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- pxor XTMP2, X0 # X0 = s1 {xDxC}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
-
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-## input is [rsp + _XFER + %1 * 4]
-.macro DO_ROUND round
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- xor e, y0 # y0 = e ^ (e >> (25-11))
- ror $(22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13)
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- and e, y2 # y2 = (f^g)&e
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- add y0, y2 # y2 = S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- offset = \round * 4 + _XFER
- add offset(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
-.endm
-
-########################################################################
-## void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS],
-## const u8 *data, size_t nblocks);
-########################################################################
-.text
-SYM_FUNC_START(sha256_transform_ssse3)
- ANNOTATE_NOENDBR # since this is called only via static_call
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbp
- mov %rsp, %rbp
-
- subq $STACK_SIZE, %rsp
- and $~15, %rsp
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- add INP, NUM_BLKS
- mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
-
- ## load initial digest
- mov 4*0(CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- movdqa _SHUF_00BA(%rip), SHUF_00BA
- movdqa _SHUF_DC00(%rip), SHUF_DC00
-
-.Lloop0:
- lea K256(%rip), TBL
-
- ## byte swap first 16 dwords
- COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
-
- mov INP, _INP(%rsp)
-
- ## schedule 48 input dwords, by doing 3 rounds of 16 each
- mov $3, SRND
-.align 16
-.Lloop1:
- movdqa (TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 1*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 2*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 3*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- add $4*16, TBL
- FOUR_ROUNDS_AND_SCHED
-
- sub $1, SRND
- jne .Lloop1
-
- mov $2, SRND
-.Lloop2:
- paddd (TBL), X0
- movdqa X0, _XFER(%rsp)
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
- paddd 1*16(TBL), X1
- movdqa X1, _XFER(%rsp)
- add $2*16, TBL
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- movdqa X2, X0
- movdqa X3, X1
-
- sub $1, SRND
- jne .Lloop2
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- mov _INP(%rsp), INP
- add $64, INP
- cmp _INP_END(%rsp), INP
- jne .Lloop0
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- popq %rbp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
-
- RET
-SYM_FUNC_END(sha256_transform_ssse3)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
-
-.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
-.align 16
-# shuffle xBxA -> 00BA
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
-.align 16
-# shuffle xDxC -> DC00
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/arch/x86/lib/crypto/sha256.c b/arch/x86/lib/crypto/sha256.c
deleted file mode 100644
index 80380f8fdcee..000000000000
--- a/arch/x86/lib/crypto/sha256.c
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized for x86_64
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/fpu/api.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/static_call.h>
-
-asmlinkage void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage void sha256_transform_avx(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage void sha256_ni_transform(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86);
-
-DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3);
-
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- if (static_branch_likely(&have_sha256_x86)) {
- kernel_fpu_begin();
- static_call(sha256_blocks_x86)(state, data, nblocks);
- kernel_fpu_end();
- } else {
- sha256_blocks_generic(state, data, nblocks);
- }
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
-
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- sha256_blocks_generic(state, data, nblocks);
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-bool sha256_is_arch_optimized(void)
-{
- return static_key_enabled(&have_sha256_x86);
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init sha256_x86_mod_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
- static_call_update(sha256_blocks_x86, sha256_ni_transform);
- } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE |
- XFEATURE_MASK_YMM, NULL) &&
- boot_cpu_has(X86_FEATURE_AVX)) {
- if (boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_BMI2))
- static_call_update(sha256_blocks_x86,
- sha256_transform_rorx);
- else
- static_call_update(sha256_blocks_x86,
- sha256_transform_avx);
- } else if (!boot_cpu_has(X86_FEATURE_SSSE3)) {
- return 0;
- }
- static_branch_enable(&have_sha256_x86);
- return 0;
-}
-subsys_initcall(sha256_x86_mod_init);
-
-static void __exit sha256_x86_mod_exit(void)
-{
-}
-module_exit(sha256_x86_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 optimized for x86_64");
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index fdb6cab524f0..76e33bd7c556 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -805,7 +805,7 @@ kernel_physical_mapping_change(unsigned long paddr_start,
}
#ifndef CONFIG_NUMA
-static inline void x86_numa_init(void)
+static __always_inline void x86_numa_init(void)
{
memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
}
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index ebdfd7b84feb..e0a607a14e7e 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -35,7 +35,7 @@ targets += purgatory.ro purgatory.chk
PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel
PURGATORY_CFLAGS := -mcmodel=small -ffreestanding -fno-zero-initialized-in-bss -g0
PURGATORY_CFLAGS += -fpic -fvisibility=hidden
-PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING
+PURGATORY_CFLAGS += $(DISABLE_KSTACK_ERASE) -DDISABLE_BRANCH_PROFILING
PURGATORY_CFLAGS += -fno-stack-protector
# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
index aea47e793963..655139dd0532 100644
--- a/arch/x86/purgatory/purgatory.c
+++ b/arch/x86/purgatory/purgatory.c
@@ -25,7 +25,7 @@ static int verify_sha256_digest(void)
{
struct kexec_sha_region *ptr, *end;
u8 digest[SHA256_DIGEST_SIZE];
- struct sha256_state sctx;
+ struct sha256_ctx sctx;
sha256_init(&sctx);
end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions);
diff --git a/arch/x86/um/ptrace.c b/arch/x86/um/ptrace.c
index fae8aabad10f..2635ca2595a3 100644
--- a/arch/x86/um/ptrace.c
+++ b/arch/x86/um/ptrace.c
@@ -236,7 +236,7 @@ static int generic_fpregs_set(struct task_struct *target,
static struct user_regset uml_regsets[] __ro_after_init = {
[REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(struct user_regs_struct) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
@@ -246,7 +246,7 @@ static struct user_regset uml_regsets[] __ro_after_init = {
#ifdef CONFIG_X86_32
/* Old FP registers, they are needed in signal frames */
[REGSET_FP_LEGACY] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_i387_ia32_struct) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
@@ -257,10 +257,10 @@ static struct user_regset uml_regsets[] __ro_after_init = {
#endif
[REGSET_FP] = {
#ifdef CONFIG_X86_32
- .core_note_type = NT_PRXFPREG,
+ USER_REGSET_NOTE_TYPE(PRXFPREG),
.n = sizeof(struct user32_fxsr_struct) / sizeof(long),
#else
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_i387_struct) / sizeof(long),
#endif
.size = sizeof(long),
@@ -270,7 +270,7 @@ static struct user_regset uml_regsets[] __ro_after_init = {
.set = generic_fpregs_set,
},
[REGSET_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
+ USER_REGSET_NOTE_TYPE(X86_XSTATE),
.size = sizeof(long),
.align = sizeof(long),
.active = generic_fpregs_active,
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index cc5dba738389..13fe45dea296 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -3,7 +3,6 @@ generated-y += syscall_table.h
generic-y += extable.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
-generic-y += param.h
generic-y += parport.h
generic-y += qrwlock.h
generic-y += qspinlock.h
diff --git a/arch/xtensa/include/uapi/asm/param.h b/arch/xtensa/include/uapi/asm/param.h
deleted file mode 100644
index e6feb4ee0590..000000000000
--- a/arch/xtensa/include/uapi/asm/param.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * include/asm-xtensa/param.h
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 2001 - 2005 Tensilica Inc.
- */
-
-#ifndef _UAPI_XTENSA_PARAM_H
-#define _UAPI_XTENSA_PARAM_H
-
-#ifndef __KERNEL__
-# define HZ 100
-#endif
-
-#define EXEC_PAGESIZE 4096
-
-#ifndef NGROUPS
-#define NGROUPS 32
-#endif
-
-#ifndef NOGROUP
-#define NOGROUP (-1)
-#endif
-
-#define MAXHOSTNAMELEN 64 /* max length of hostname */
-
-#endif /* _UAPI_XTENSA_PARAM_H */
diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c
index 9056cd1a8302..ff0600a0584c 100644
--- a/arch/xtensa/kernel/ptrace.c
+++ b/arch/xtensa/kernel/ptrace.c
@@ -193,7 +193,7 @@ enum xtensa_regset {
static const struct user_regset xtensa_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(struct user_pt_regs) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -201,7 +201,7 @@ static const struct user_regset xtensa_regsets[] = {
.set = gpr_set,
},
[REGSET_TIE] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(elf_xtregs_t) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index f657a77314f8..374e4cb788d8 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -438,3 +438,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr