Diffstat (limited to 'arch/arm/lib')
26 files changed, 1308 insertions, 354 deletions
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 6d2ba454f25b..007874320937 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -36,15 +36,18 @@ else lib-y += io-readsw-armv4.o io-writesw-armv4.o endif -ifeq ($(CONFIG_ARCH_RPC),y) - AFLAGS_delay-loop.o += -march=armv4 -endif - $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S $(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S ifeq ($(CONFIG_KERNEL_MODE_NEON),y) - NEON_FLAGS := -march=armv7-a -mfloat-abi=softfp -mfpu=neon - CFLAGS_xor-neon.o += $(NEON_FLAGS) + CFLAGS_xor-neon.o += $(CC_FLAGS_FPU) obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o endif + +obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o + +obj-$(CONFIG_CRC32_ARCH) += crc32-arm.o +crc32-arm-y := crc32-glue.o crc32-core.o + +obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm.o +crc-t10dif-arm-y := crc-t10dif-glue.o crc-t10dif-core.o diff --git a/arch/arm/lib/backtrace-clang.S b/arch/arm/lib/backtrace-clang.S index 2ff375144b55..290c52a60fc6 100644 --- a/arch/arm/lib/backtrace-clang.S +++ b/arch/arm/lib/backtrace-clang.S @@ -17,6 +17,7 @@ #define sv_pc r6 #define mask r7 #define sv_lr r8 +#define loglvl r9 ENTRY(c_backtrace) @@ -99,6 +100,7 @@ ENDPROC(c_backtrace) @ to ensure 8 byte alignment movs frame, r0 @ if frame pointer is zero beq no_frame @ we have no stack frames + mov loglvl, r2 tst r1, #0x10 @ 26 or 32-bit mode? moveq mask, #0xfc000003 movne mask, #0 @ mask for 32-bit @@ -142,7 +144,7 @@ for_each_frame: tst frame, mask @ Check for address exceptions */ 1003: ldr sv_lr, [sv_fp, #4] @ get saved lr from next frame - ldr r0, [sv_lr, #-4] @ get call instruction +1004: ldr r0, [sv_lr, #-4] @ get call instruction ldr r3, .Lopcode+4 and r2, r3, r0 @ is this a bl call teq r2, r3 @@ -162,11 +164,12 @@ finished_setup: /* * Print the function (sv_pc) and where it was called from (sv_lr). */ -1004: mov r0, sv_pc + mov r0, sv_pc mov r1, sv_lr mov r2, frame bic r1, r1, mask @ mask PC/LR for the mode + mov r3, loglvl bl dump_backtrace_entry /* @@ -183,6 +186,7 @@ finished_setup: ldr r0, [frame] @ locals are stored in @ the preceding frame subeq r0, r0, #4 + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers /* @@ -193,11 +197,20 @@ finished_setup: cmp sv_fp, frame @ next frame must be mov frame, sv_fp @ above the current frame +#ifdef CONFIG_IRQSTACKS + @ + @ Kernel stacks may be discontiguous in memory. If the next + @ frame is below the previous frame, accept it as long as it + @ lives in kernel memory. + @ + cmpls sv_fp, #PAGE_OFFSET +#endif bhi for_each_frame 1006: adr r0, .Lbad - mov r1, frame - bl printk + mov r1, loglvl + mov r2, frame + bl _printk no_frame: ldmfd sp!, {r4 - r9, fp, pc} ENDPROC(c_backtrace) .pushsection __ex_table,"a" @@ -205,11 +218,11 @@ ENDPROC(c_backtrace) .long 1001b, 1006b .long 1002b, 1006b .long 1003b, 1006b - .long 1004b, 1006b + .long 1004b, finished_setup .long 1005b, 1006b .popsection -.Lbad: .asciz "Backtrace aborted due to bad frame pointer <%p>\n" +.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n" .align .Lopcode: .word 0xe92d4800 >> 11 @ stmfd sp!, {... 
fp, lr} .word 0x0b000000 @ bl if these bits are set diff --git a/arch/arm/lib/backtrace.S b/arch/arm/lib/backtrace.S index 582925238d65..293a2716bd20 100644 --- a/arch/arm/lib/backtrace.S +++ b/arch/arm/lib/backtrace.S @@ -18,6 +18,7 @@ #define sv_pc r6 #define mask r7 #define offset r8 +#define loglvl r9 ENTRY(c_backtrace) @@ -25,9 +26,10 @@ ENTRY(c_backtrace) ret lr ENDPROC(c_backtrace) #else - stmfd sp!, {r4 - r8, lr} @ Save an extra register so we have a location... + stmfd sp!, {r4 - r9, lr} @ Save an extra register so we have a location... movs frame, r0 @ if frame pointer is zero beq no_frame @ we have no stack frames + mov loglvl, r2 tst r1, #0x10 @ 26 or 32-bit mode? ARM( moveq mask, #0xfc000003 ) @@ -73,6 +75,7 @@ for_each_frame: tst frame, mask @ Check for address exceptions ldr r1, [frame, #-4] @ get saved lr mov r2, frame bic r1, r1, mask @ mask PC/LR for the mode + mov r3, loglvl bl dump_backtrace_entry ldr r1, [sv_pc, #-4] @ if stmfd sp!, {args} exists, @@ -80,12 +83,14 @@ for_each_frame: tst frame, mask @ Check for address exceptions teq r3, r1, lsr #11 ldreq r0, [frame, #-8] @ get sp subeq r0, r0, #4 @ point at the last arg + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers 1004: ldr r1, [sv_pc, #0] @ if stmfd sp!, {..., fp, ip, lr, pc} ldr r3, .Ldsi @ instruction exists, teq r3, r1, lsr #11 subeq r0, frame, #16 + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers teq sv_fp, #0 @ zero saved fp means @@ -93,12 +98,21 @@ for_each_frame: tst frame, mask @ Check for address exceptions cmp sv_fp, frame @ next frame must be mov frame, sv_fp @ above the current frame +#ifdef CONFIG_IRQSTACKS + @ + @ Kernel stacks may be discontiguous in memory. If the next + @ frame is below the previous frame, accept it as long as it + @ lives in kernel memory. + @ + cmpls sv_fp, #PAGE_OFFSET +#endif bhi for_each_frame 1006: adr r0, .Lbad - mov r1, frame - bl printk -no_frame: ldmfd sp!, {r4 - r8, pc} + mov r1, loglvl + mov r2, frame + bl _printk +no_frame: ldmfd sp!, {r4 - r9, pc} ENDPROC(c_backtrace) .pushsection __ex_table,"a" @@ -109,7 +123,7 @@ ENDPROC(c_backtrace) .long 1004b, 1006b .popsection -.Lbad: .asciz "Backtrace aborted due to bad frame pointer <%p>\n" +.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n" .align .Ldsi: .word 0xe92dd800 >> 11 @ stmfd sp!, {... 
fp, ip, lr, pc} .word 0xe92d0000 >> 11 @ stmfd sp!, {} diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h index 95bd35991288..f069d1b2318e 100644 --- a/arch/arm/lib/bitops.h +++ b/arch/arm/lib/bitops.h @@ -28,7 +28,7 @@ UNWIND( .fnend ) ENDPROC(\name ) .endm - .macro testop, name, instr, store + .macro __testop, name, instr, store, barrier ENTRY( \name ) UNWIND( .fnstart ) ands ip, r1, #3 @@ -38,7 +38,7 @@ UNWIND( .fnstart ) mov r0, r0, lsr #5 add r1, r1, r0, lsl #2 @ Get word offset mov r3, r2, lsl r3 @ create mask - smp_dmb + \barrier #if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP) .arch_extension mp ALT_SMP(W(pldw) [r1]) @@ -50,13 +50,21 @@ UNWIND( .fnstart ) strex ip, r2, [r1] cmp ip, #0 bne 1b - smp_dmb + \barrier cmp r0, #0 movne r0, #1 2: bx lr UNWIND( .fnend ) ENDPROC(\name ) .endm + + .macro testop, name, instr, store + __testop \name, \instr, \store, smp_dmb + .endm + + .macro sync_testop, name, instr, store + __testop \name, \instr, \store, __smp_dmb + .endm #else .macro bitop, name, instr ENTRY( \name ) diff --git a/arch/arm/lib/call_with_stack.S b/arch/arm/lib/call_with_stack.S index 28b0341ae786..5030d4e8d126 100644 --- a/arch/arm/lib/call_with_stack.S +++ b/arch/arm/lib/call_with_stack.S @@ -8,25 +8,44 @@ #include <linux/linkage.h> #include <asm/assembler.h> +#include <asm/unwind.h> /* * void call_with_stack(void (*fn)(void *), void *arg, void *sp) * * Change the stack to that pointed at by sp, then invoke fn(arg) with * the new stack. + * + * The sequence below follows the APCS frame convention for frame pointer + * unwinding, and implements the unwinder annotations needed by the EABI + * unwinder. */ -ENTRY(call_with_stack) - str sp, [r2, #-4]! - str lr, [r2, #-4]! +ENTRY(call_with_stack) +#if defined(CONFIG_UNWINDER_FRAME_POINTER) && defined(CONFIG_CC_IS_GCC) + mov ip, sp + push {fp, ip, lr, pc} + sub fp, ip, #4 +#else +UNWIND( .fnstart ) +UNWIND( .save {fpreg, lr} ) + push {fpreg, lr} +UNWIND( .setfp fpreg, sp ) + mov fpreg, sp +#endif mov sp, r2 mov r2, r0 mov r0, r1 - badr lr, 1f - ret r2 + bl_r r2 -1: ldr lr, [sp] - ldr sp, [sp, #4] - ret lr +#if defined(CONFIG_UNWINDER_FRAME_POINTER) && defined(CONFIG_CC_IS_GCC) + ldmdb fp, {fp, sp, pc} +#else + mov sp, fpreg + pop {fpreg, pc} +UNWIND( .fnend ) +#endif + .globl call_with_stack_end +call_with_stack_end: ENDPROC(call_with_stack) diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S index 95b2e1ce559c..270de7debd0f 100644 --- a/arch/arm/lib/copy_from_user.S +++ b/arch/arm/lib/copy_from_user.S @@ -91,26 +91,22 @@ strb\cond \reg, [\ptr], #1 .endm - .macro enter reg1 reg2 + .macro enter regs:vararg mov r3, #0 - stmdb sp!, {r0, r2, r3, \reg1, \reg2} +UNWIND( .save {r0, r2, r3, \regs} ) + stmdb sp!, {r0, r2, r3, \regs} .endm - .macro usave reg1 reg2 - UNWIND( .save {r0, r2, r3, \reg1, \reg2} ) - .endm - - .macro exit reg1 reg2 + .macro exit regs:vararg add sp, sp, #8 - ldmfd sp!, {r0, \reg1, \reg2} + ldmfd sp!, {r0, \regs} .endm .text ENTRY(arm_copy_from_user) #ifdef CONFIG_CPU_SPECTRE - get_thread_info r3 - ldr r3, [r3, #TI_ADDR_LIMIT] + ldr r3, =TASK_SIZE uaccess_mask_range_ptr r1, r2, r3, ip #endif @@ -118,7 +114,7 @@ ENTRY(arm_copy_from_user) ENDPROC(arm_copy_from_user) - .pushsection .fixup,"ax" + .pushsection .text.fixup,"ax" .align 0 copy_abort_preamble ldmfd sp!, {r1, r2, r3} diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S index 810a805d36dc..8fbafb074fe9 100644 --- a/arch/arm/lib/copy_template.S +++ b/arch/arm/lib/copy_template.S @@ -69,13 +69,10 @@ * than one 
32bit instruction in Thumb-2) */ - - UNWIND( .fnstart ) - enter r4, lr - UNWIND( .fnend ) - UNWIND( .fnstart ) - usave r4, lr @ in first stmdb block + enter r4, UNWIND(fpreg,) lr + UNWIND( .setfp fpreg, sp ) + UNWIND( mov fpreg, sp ) subs r2, r2, #4 blt 8f @@ -86,12 +83,7 @@ bne 10f 1: subs r2, r2, #(28) - stmfd sp!, {r5 - r8} - UNWIND( .fnend ) - - UNWIND( .fnstart ) - usave r4, lr - UNWIND( .save {r5 - r8} ) @ in second stmfd block + stmfd sp!, {r5, r6, r8, r9} blt 5f CALGN( ands ip, r0, #31 ) @@ -110,9 +102,9 @@ PLD( pld [r1, #92] ) 3: PLD( pld [r1, #124] ) -4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f +4: ldr8w r1, r3, r4, r5, r6, r8, r9, ip, lr, abort=20f subs r2, r2, #32 - str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + str8w r0, r3, r4, r5, r6, r8, r9, ip, lr, abort=20f bge 3b PLD( cmn r2, #96 ) PLD( bge 4b ) @@ -132,8 +124,8 @@ ldr1w r1, r4, abort=20f ldr1w r1, r5, abort=20f ldr1w r1, r6, abort=20f - ldr1w r1, r7, abort=20f ldr1w r1, r8, abort=20f + ldr1w r1, r9, abort=20f ldr1w r1, lr, abort=20f #if LDR1W_SHIFT < STR1W_SHIFT @@ -150,17 +142,14 @@ str1w r0, r4, abort=20f str1w r0, r5, abort=20f str1w r0, r6, abort=20f - str1w r0, r7, abort=20f str1w r0, r8, abort=20f + str1w r0, r9, abort=20f str1w r0, lr, abort=20f CALGN( bcs 2b ) -7: ldmfd sp!, {r5 - r8} - UNWIND( .fnend ) @ end of second stmfd block +7: ldmfd sp!, {r5, r6, r8, r9} - UNWIND( .fnstart ) - usave r4, lr @ still in first stmdb block 8: movs r2, r2, lsl #31 ldr1b r1, r3, ne, abort=21f ldr1b r1, r4, cs, abort=21f @@ -169,7 +158,7 @@ str1b r0, r4, cs, abort=21f str1b r0, ip, cs, abort=21f - exit r4, pc + exit r4, UNWIND(fpreg,) pc 9: rsb ip, ip, #4 cmp ip, #2 @@ -189,13 +178,10 @@ ldr1w r1, lr, abort=21f beq 17f bgt 18f - UNWIND( .fnend ) .macro forward_copy_shift pull push - UNWIND( .fnstart ) - usave r4, lr @ still in first stmdb block subs r2, r2, #28 blt 14f @@ -205,12 +191,8 @@ CALGN( subcc r2, r2, ip ) CALGN( bcc 15f ) -11: stmfd sp!, {r5 - r9} - UNWIND( .fnend ) +11: stmfd sp!, {r5, r6, r8 - r10} - UNWIND( .fnstart ) - usave r4, lr - UNWIND( .save {r5 - r9} ) @ in new second stmfd block PLD( pld [r1, #0] ) PLD( subs r2, r2, #96 ) PLD( pld [r1, #28] ) @@ -219,35 +201,32 @@ PLD( pld [r1, #92] ) 12: PLD( pld [r1, #124] ) -13: ldr4w r1, r4, r5, r6, r7, abort=19f +13: ldr4w r1, r4, r5, r6, r8, abort=19f mov r3, lr, lspull #\pull subs r2, r2, #32 - ldr4w r1, r8, r9, ip, lr, abort=19f + ldr4w r1, r9, r10, ip, lr, abort=19f orr r3, r3, r4, lspush #\push mov r4, r4, lspull #\pull orr r4, r4, r5, lspush #\push mov r5, r5, lspull #\pull orr r5, r5, r6, lspush #\push mov r6, r6, lspull #\pull - orr r6, r6, r7, lspush #\push - mov r7, r7, lspull #\pull - orr r7, r7, r8, lspush #\push + orr r6, r6, r8, lspush #\push mov r8, r8, lspull #\pull orr r8, r8, r9, lspush #\push mov r9, r9, lspull #\pull - orr r9, r9, ip, lspush #\push + orr r9, r9, r10, lspush #\push + mov r10, r10, lspull #\pull + orr r10, r10, ip, lspush #\push mov ip, ip, lspull #\pull orr ip, ip, lr, lspush #\push - str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, abort=19f + str8w r0, r3, r4, r5, r6, r8, r9, r10, ip, abort=19f bge 12b PLD( cmn r2, #96 ) PLD( bge 13b ) - ldmfd sp!, {r5 - r9} - UNWIND( .fnend ) @ end of the second stmfd block + ldmfd sp!, {r5, r6, r8 - r10} - UNWIND( .fnstart ) - usave r4, lr @ still in first stmdb block 14: ands ip, r2, #28 beq 16f @@ -262,7 +241,6 @@ 16: sub r1, r1, #(\push / 8) b 8b - UNWIND( .fnend ) .endm @@ -273,6 +251,7 @@ 18: forward_copy_shift pull=24 push=8 + UNWIND( .fnend ) /* * Abort preamble and completion 
macros. @@ -282,13 +261,13 @@ */ .macro copy_abort_preamble -19: ldmfd sp!, {r5 - r9} +19: ldmfd sp!, {r5, r6, r8 - r10} b 21f -20: ldmfd sp!, {r5 - r8} +20: ldmfd sp!, {r5, r6, r8, r9} 21: .endm .macro copy_abort_end - ldmfd sp!, {r4, pc} + ldmfd sp!, {r4, UNWIND(fpreg,) pc} .endm diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S index ebfe4cb3d912..fac49e57cc0b 100644 --- a/arch/arm/lib/copy_to_user.S +++ b/arch/arm/lib/copy_to_user.S @@ -90,18 +90,15 @@ strusr \reg, \ptr, 1, \cond, abort=\abort .endm - .macro enter reg1 reg2 + .macro enter regs:vararg mov r3, #0 - stmdb sp!, {r0, r2, r3, \reg1, \reg2} +UNWIND( .save {r0, r2, r3, \regs} ) + stmdb sp!, {r0, r2, r3, \regs} .endm - .macro usave reg1 reg2 - UNWIND( .save {r0, r2, r3, \reg1, \reg2} ) - .endm - - .macro exit reg1 reg2 + .macro exit regs:vararg add sp, sp, #8 - ldmfd sp!, {r0, \reg1, \reg2} + ldmfd sp!, {r0, \regs} .endm .text @@ -109,8 +106,7 @@ ENTRY(__copy_to_user_std) WEAK(arm_copy_to_user) #ifdef CONFIG_CPU_SPECTRE - get_thread_info r3 - ldr r3, [r3, #TI_ADDR_LIMIT] + ldr r3, =TASK_SIZE uaccess_mask_range_ptr r0, r2, r3, ip #endif diff --git a/arch/arm/lib/crc-t10dif-core.S b/arch/arm/lib/crc-t10dif-core.S new file mode 100644 index 000000000000..2bbf2df9c1e2 --- /dev/null +++ b/arch/arm/lib/crc-t10dif-core.S @@ -0,0 +1,468 @@ +// +// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions +// +// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> +// Copyright (C) 2019 Google LLC <ebiggers@google.com> +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License version 2 as +// published by the Free Software Foundation. +// + +// Derived from the x86 version: +// +// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +// +// Copyright (c) 2013, Intel Corporation +// +// Authors: +// Erdinc Ozturk <erdinc.ozturk@intel.com> +// Vinodh Gopal <vinodh.gopal@intel.com> +// James Guilford <james.guilford@intel.com> +// Tim Chen <tim.c.chen@linux.intel.com> +// +// This software is available to you under a choice of one of two +// licenses. You may choose to be licensed under the terms of the GNU +// General Public License (GPL) Version 2, available from the file +// COPYING in the main directory of this source tree, or the +// OpenIB.org BSD license below: +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// * Neither the name of the Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// +// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Reference paper titled "Fast CRC Computation for Generic +// Polynomials Using PCLMULQDQ Instruction" +// URL: http://www.intel.com/content/dam/www/public/us/en/documents +// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +// + +#include <linux/linkage.h> +#include <asm/assembler.h> + +#ifdef CONFIG_CPU_ENDIAN_BE8 +#define CPU_LE(code...) +#else +#define CPU_LE(code...) code +#endif + + .text + .arch armv8-a + .fpu crypto-neon-fp-armv8 + + init_crc .req r0 + buf .req r1 + len .req r2 + + fold_consts_ptr .req ip + + q0l .req d0 + q0h .req d1 + q1l .req d2 + q1h .req d3 + q2l .req d4 + q2h .req d5 + q3l .req d6 + q3h .req d7 + q4l .req d8 + q4h .req d9 + q5l .req d10 + q5h .req d11 + q6l .req d12 + q6h .req d13 + q7l .req d14 + q7h .req d15 + q8l .req d16 + q8h .req d17 + q9l .req d18 + q9h .req d19 + q10l .req d20 + q10h .req d21 + q11l .req d22 + q11h .req d23 + q12l .req d24 + q12h .req d25 + + FOLD_CONSTS .req q10 + FOLD_CONST_L .req q10l + FOLD_CONST_H .req q10h + + /* + * Pairwise long polynomial multiplication of two 16-bit values + * + * { w0, w1 }, { y0, y1 } + * + * by two 64-bit values + * + * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 } + * + * where each vector element is a byte, ordered from least to most + * significant. The resulting 80-bit vectors are XOR'ed together. + * + * This can be implemented using 8x8 long polynomial multiplication, by + * reorganizing the input so that each pairwise 8x8 multiplication + * produces one of the terms from the decomposition below, and + * combining the results of each rank and shifting them into place. + * + * Rank + * 0 w0*x0 ^ | y0*z0 ^ + * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^ + * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^ + * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^ + * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^ + * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^ + * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^ + * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^ + * 8 w1*x7 << 64 | y1*z7 << 64 + * + * The inputs can be reorganized into + * + * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 } + * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 } + * + * and after performing 8x8->16 bit long polynomial multiplication of + * each of the halves of the first vector with those of the second one, + * we obtain the following four vectors of 16-bit elements: + * + * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 } + * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 } + * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 } + * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 } + * + * Results b and c can be XORed together, as the vector elements have + * matching ranks. 
Then, the final XOR can be pulled forward, and + * applied between the halves of each of the remaining three vectors, + * which are then shifted into place, and XORed together to produce the + * final 80-bit result. + */ + .macro pmull16x64_p8, v16, v64 + vext.8 q11, \v64, \v64, #1 + vld1.64 {q12}, [r4, :128] + vuzp.8 q11, \v64 + vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24 + vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25 + bl __pmull16x64_p8 + veor \v64, q12, q14 + .endm + +__pmull16x64_p8: + vmull.p8 q13, d23, d24 + vmull.p8 q14, d23, d25 + vmull.p8 q15, d22, d24 + vmull.p8 q12, d22, d25 + + veor q14, q14, q15 + veor d24, d24, d25 + veor d26, d26, d27 + veor d28, d28, d29 + vmov.i32 d25, #0 + vmov.i32 d29, #0 + vext.8 q12, q12, q12, #14 + vext.8 q14, q14, q14, #15 + veor d24, d24, d26 + bx lr +ENDPROC(__pmull16x64_p8) + + .macro pmull16x64_p64, v16, v64 + vmull.p64 q11, \v64\()l, \v16\()_L + vmull.p64 \v64, \v64\()h, \v16\()_H + veor \v64, \v64, q11 + .endm + + // Fold reg1, reg2 into the next 32 data bytes, storing the result back + // into reg1, reg2. + .macro fold_32_bytes, reg1, reg2, p + vld1.64 {q8-q9}, [buf]! + + pmull16x64_\p FOLD_CONST, \reg1 + pmull16x64_\p FOLD_CONST, \reg2 + +CPU_LE( vrev64.8 q8, q8 ) +CPU_LE( vrev64.8 q9, q9 ) + vswp q8l, q8h + vswp q9l, q9h + + veor.8 \reg1, \reg1, q8 + veor.8 \reg2, \reg2, q9 + .endm + + // Fold src_reg into dst_reg, optionally loading the next fold constants + .macro fold_16_bytes, src_reg, dst_reg, p, load_next_consts + pmull16x64_\p FOLD_CONST, \src_reg + .ifnb \load_next_consts + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + .endif + veor.8 \dst_reg, \dst_reg, \src_reg + .endm + + .macro crct10dif, p + // For sizes less than 256 bytes, we can't fold 128 bytes at a time. + cmp len, #256 + blt .Lless_than_256_bytes\@ + + mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts + + // Load the first 128 data bytes. Byte swapping is necessary to make + // the bit order match the polynomial coefficient order. + vld1.64 {q0-q1}, [buf]! + vld1.64 {q2-q3}, [buf]! + vld1.64 {q4-q5}, [buf]! + vld1.64 {q6-q7}, [buf]! +CPU_LE( vrev64.8 q0, q0 ) +CPU_LE( vrev64.8 q1, q1 ) +CPU_LE( vrev64.8 q2, q2 ) +CPU_LE( vrev64.8 q3, q3 ) +CPU_LE( vrev64.8 q4, q4 ) +CPU_LE( vrev64.8 q5, q5 ) +CPU_LE( vrev64.8 q6, q6 ) +CPU_LE( vrev64.8 q7, q7 ) + vswp q0l, q0h + vswp q1l, q1h + vswp q2l, q2h + vswp q3l, q3h + vswp q4l, q4h + vswp q5l, q5h + vswp q6l, q6h + vswp q7l, q7h + + // XOR the first 16 data *bits* with the initial CRC value. + vmov.i8 q8h, #0 + vmov.u16 q8h[3], init_crc + veor q0h, q0h, q8h + + // Load the constants for folding across 128 bytes. + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + + // Subtract 128 for the 128 data bytes just consumed. Subtract another + // 128 to simplify the termination condition of the following loop. + sub len, len, #256 + + // While >= 128 data bytes remain (not counting q0-q7), fold the 128 + // bytes q0-q7 into them, storing the result back into q0-q7. +.Lfold_128_bytes_loop\@: + fold_32_bytes q0, q1, \p + fold_32_bytes q2, q3, \p + fold_32_bytes q4, q5, \p + fold_32_bytes q6, q7, \p + subs len, len, #128 + bge .Lfold_128_bytes_loop\@ + + // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7. + + // Fold across 64 bytes. + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + fold_16_bytes q0, q4, \p + fold_16_bytes q1, q5, \p + fold_16_bytes q2, q6, \p + fold_16_bytes q3, q7, \p, 1 + // Fold across 32 bytes. + fold_16_bytes q4, q6, \p + fold_16_bytes q5, q7, \p, 1 + // Fold across 16 bytes. 
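For reference, the rank decomposition described above can be modelled in plain C: each VMULL.P8 corresponds to an 8x8 carry-less multiply, and the byte-shifted XORs rebuild the 80-bit product. This is an illustrative sketch only; clmul8() and clmul16x64() are made-up names, not part of the patch.

#include <stdint.h>
#include <string.h>

/* 8x8 -> 16 bit carry-less multiply, the primitive that VMULL.P8 provides. */
static uint16_t clmul8(uint8_t a, uint8_t b)
{
	uint16_t r = 0;

	for (int i = 0; i < 8; i++)
		if (b & (1 << i))
			r ^= (uint16_t)a << i;
	return r;
}

/*
 * 16 x 64 -> 80 bit carry-less multiply built from 8x8 products, following
 * the decomposition above: with w = w0 | w1 << 8, each byte xi of x
 * contributes w0*xi at byte offset i (rank i) and w1*xi at offset i + 1.
 */
static void clmul16x64(uint16_t w, uint64_t x, uint8_t out[10])
{
	uint8_t w0 = w & 0xff, w1 = w >> 8;

	memset(out, 0, 10);
	for (int i = 0; i < 8; i++) {
		uint8_t xi = x >> (8 * i);
		uint16_t lo = clmul8(w0, xi);	/* rank i     */
		uint16_t hi = clmul8(w1, xi);	/* rank i + 1 */

		out[i]     ^= lo & 0xff;
		out[i + 1] ^= (lo >> 8) ^ (hi & 0xff);
		out[i + 2] ^= hi >> 8;
	}
}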
+ fold_16_bytes q6, q7, \p + + // Add 128 to get the correct number of data bytes remaining in 0...127 + // (not counting q7), following the previous extra subtraction by 128. + // Then subtract 16 to simplify the termination condition of the + // following loop. + adds len, len, #(128-16) + + // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7 + // into them, storing the result back into q7. + blt .Lfold_16_bytes_loop_done\@ +.Lfold_16_bytes_loop\@: + pmull16x64_\p FOLD_CONST, q7 + vld1.64 {q0}, [buf]! +CPU_LE( vrev64.8 q0, q0 ) + vswp q0l, q0h + veor.8 q7, q7, q0 + subs len, len, #16 + bge .Lfold_16_bytes_loop\@ + +.Lfold_16_bytes_loop_done\@: + // Add 16 to get the correct number of data bytes remaining in 0...15 + // (not counting q7), following the previous extra subtraction by 16. + adds len, len, #16 + beq .Lreduce_final_16_bytes\@ + +.Lhandle_partial_segment\@: + // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first + // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To + // do this without needing a fold constant for each possible 'len', + // redivide the bytes into a first chunk of 'len' bytes and a second + // chunk of 16 bytes, then fold the first chunk into the second. + + // q0 = last 16 original data bytes + add buf, buf, len + sub buf, buf, #16 + vld1.64 {q0}, [buf] +CPU_LE( vrev64.8 q0, q0 ) + vswp q0l, q0h + + // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes. + mov_l r1, .Lbyteshift_table + 16 + sub r1, r1, len + vld1.8 {q2}, [r1] + vtbl.8 q1l, {q7l-q7h}, q2l + vtbl.8 q1h, {q7l-q7h}, q2h + + // q3 = first chunk: q7 right-shifted by '16-len' bytes. + vmov.i8 q3, #0x80 + veor.8 q2, q2, q3 + vtbl.8 q3l, {q7l-q7h}, q2l + vtbl.8 q3h, {q7l-q7h}, q2h + + // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes. + vshr.s8 q2, q2, #7 + + // q2 = second chunk: 'len' bytes from q0 (low-order bytes), + // then '16-len' bytes from q1 (high-order bytes). + vbsl.8 q2, q1, q0 + + // Fold the first chunk into the second chunk, storing the result in q7. + pmull16x64_\p FOLD_CONST, q3 + veor.8 q7, q3, q2 + b .Lreduce_final_16_bytes\@ + +.Lless_than_256_bytes\@: + // Checksumming a buffer of length 16...255 bytes + + mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts + + // Load the first 16 data bytes. + vld1.64 {q7}, [buf]! +CPU_LE( vrev64.8 q7, q7 ) + vswp q7l, q7h + + // XOR the first 16 data *bits* with the initial CRC value. + vmov.i8 q0h, #0 + vmov.u16 q0h[3], init_crc + veor.8 q7h, q7h, q0h + + // Load the fold-across-16-bytes constants. + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + + cmp len, #16 + beq .Lreduce_final_16_bytes\@ // len == 16 + subs len, len, #32 + addlt len, len, #16 + blt .Lhandle_partial_segment\@ // 17 <= len <= 31 + b .Lfold_16_bytes_loop\@ // 32 <= len <= 255 + +.Lreduce_final_16_bytes\@: + .endm + +// +// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len); +// +// Assumes len >= 16. +// +ENTRY(crc_t10dif_pmull64) + crct10dif p64 + + // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC. + + // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + + // Fold the high 64 bits into the low 64 bits, while also multiplying by + // x^64. This produces a 128-bit value congruent to x^64 * M(x) and + // whose low 48 bits are 0. 
+ vmull.p64 q0, q7h, FOLD_CONST_H // high bits * x^48 * (x^80 mod G(x)) + veor.8 q0h, q0h, q7l // + low bits * x^64 + + // Fold the high 32 bits into the low 96 bits. This produces a 96-bit + // value congruent to x^64 * M(x) and whose low 48 bits are 0. + vmov.i8 q1, #0 + vmov s4, s3 // extract high 32 bits + vmov s3, s5 // zero high 32 bits + vmull.p64 q1, q1l, FOLD_CONST_L // high 32 bits * x^48 * (x^48 mod G(x)) + veor.8 q0, q0, q1 // + low bits + + // Load G(x) and floor(x^48 / G(x)). + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128] + + // Use Barrett reduction to compute the final CRC value. + vmull.p64 q1, q0h, FOLD_CONST_H // high 32 bits * floor(x^48 / G(x)) + vshr.u64 q1l, q1l, #32 // /= x^32 + vmull.p64 q1, q1l, FOLD_CONST_L // *= G(x) + vshr.u64 q0l, q0l, #48 + veor.8 q0l, q0l, q1l // + low 16 nonzero bits + // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0. + + vmov.u16 r0, q0l[0] + bx lr +ENDPROC(crc_t10dif_pmull64) + +ENTRY(crc_t10dif_pmull8) + push {r4, lr} + mov_l r4, .L16x64perm + + crct10dif p8 + +CPU_LE( vrev64.8 q7, q7 ) + vswp q7l, q7h + vst1.64 {q7}, [r3, :128] + pop {r4, pc} +ENDPROC(crc_t10dif_pmull8) + + .section ".rodata", "a" + .align 4 + +// Fold constants precomputed from the polynomial 0x18bb7 +// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 +.Lfold_across_128_bytes_consts: + .quad 0x0000000000006123 // x^(8*128) mod G(x) + .quad 0x0000000000002295 // x^(8*128+64) mod G(x) +// .Lfold_across_64_bytes_consts: + .quad 0x0000000000001069 // x^(4*128) mod G(x) + .quad 0x000000000000dd31 // x^(4*128+64) mod G(x) +// .Lfold_across_32_bytes_consts: + .quad 0x000000000000857d // x^(2*128) mod G(x) + .quad 0x0000000000007acc // x^(2*128+64) mod G(x) +.Lfold_across_16_bytes_consts: + .quad 0x000000000000a010 // x^(1*128) mod G(x) + .quad 0x0000000000001faa // x^(1*128+64) mod G(x) +// .Lfinal_fold_consts: + .quad 0x1368000000000000 // x^48 * (x^48 mod G(x)) + .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x)) +// .Lbarrett_reduction_consts: + .quad 0x0000000000018bb7 // G(x) + .quad 0x00000001f65a57f8 // floor(x^48 / G(x)) + +// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - +// len] is the index vector to shift left by 'len' bytes, and is also {0x80, +// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes. 
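All of the fold and Barrett-reduction constants above are derived from the same generator polynomial G(x) = 0x18bb7. A bit-at-a-time model of the CRC the accelerated code computes (an illustrative sketch only; crc_t10dif_bitwise() is a made-up name, and its result should match the kernel's crc_t10dif_generic()):

#include <stddef.h>
#include <stdint.h>

/* MSB-first CRC-16 with polynomial 0x8bb7 (G(x) with the implicit x^16
 * term dropped), zero initial value, no reflection. */
static uint16_t crc_t10dif_bitwise(uint16_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= (uint16_t)*p++ << 8;
		for (int i = 0; i < 8; i++)
			crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
					     : crc << 1;
	}
	return crc;
}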
+.Lbyteshift_table: + .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 + .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 + +.L16x64perm: + .quad 0x808080800000000, 0x909090901010101 diff --git a/arch/arm/lib/crc-t10dif-glue.c b/arch/arm/lib/crc-t10dif-glue.c new file mode 100644 index 000000000000..6efad3d78284 --- /dev/null +++ b/arch/arm/lib/crc-t10dif-glue.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/crc-t10dif.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> + +#include <crypto/internal/simd.h> + +#include <asm/neon.h> +#include <asm/simd.h> + +static DEFINE_STATIC_KEY_FALSE(have_neon); +static DEFINE_STATIC_KEY_FALSE(have_pmull); + +#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U + +asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len); +asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len, + u8 out[16]); + +u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) +{ + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { + if (static_branch_likely(&have_pmull)) { + if (crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc_t10dif_pmull64(crc, data, length); + kernel_neon_end(); + return crc; + } + } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && + static_branch_likely(&have_neon) && + crypto_simd_usable()) { + u8 buf[16] __aligned(16); + + kernel_neon_begin(); + crc_t10dif_pmull8(crc, data, length, buf); + kernel_neon_end(); + + return crc_t10dif_generic(0, buf, sizeof(buf)); + } + } + return crc_t10dif_generic(crc, data, length); +} +EXPORT_SYMBOL(crc_t10dif_arch); + +static int __init crc_t10dif_arm_init(void) +{ + if (elf_hwcap & HWCAP_NEON) { + static_branch_enable(&have_neon); + if (elf_hwcap2 & HWCAP2_PMULL) + static_branch_enable(&have_pmull); + } + return 0; +} +arch_initcall(crc_t10dif_arm_init); + +static void __exit crc_t10dif_arm_exit(void) +{ +} +module_exit(crc_t10dif_arm_exit); + +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_DESCRIPTION("Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/arm/lib/crc32-core.S b/arch/arm/lib/crc32-core.S new file mode 100644 index 000000000000..6f674f30c70b --- /dev/null +++ b/arch/arm/lib/crc32-core.S @@ -0,0 +1,306 @@ +/* + * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 + * calculation. + * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) + * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found + * at: + * https://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2B: Instruction Set Reference, N-Z + * + * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com> + * Alexander Boyko <Alexander_Boyko@xyratex.com> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .align 6 + .arch armv8-a + .arch_extension crc + .fpu crypto-neon-fp-armv8 + +.Lcrc32_constants: + /* + * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 + * #define CONSTANT_R1 0x154442bd4LL + * + * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 + * #define CONSTANT_R2 0x1c6e41596LL + */ + .quad 0x0000000154442bd4 + .quad 0x00000001c6e41596 + + /* + * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 + * #define CONSTANT_R3 0x1751997d0LL + * + * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e + * #define CONSTANT_R4 0x0ccaa009eLL + */ + .quad 0x00000001751997d0 + .quad 0x00000000ccaa009e + + /* + * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 + * #define CONSTANT_R5 0x163cd6124LL + */ + .quad 0x0000000163cd6124 + .quad 0x00000000FFFFFFFF + + /* + * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL + * + * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` + * = 0x1F7011641LL + * #define CONSTANT_RU 0x1F7011641LL + */ + .quad 0x00000001DB710641 + .quad 0x00000001F7011641 + +.Lcrc32c_constants: + .quad 0x00000000740eef02 + .quad 0x000000009e4addf8 + .quad 0x00000000f20c0dfe + .quad 0x000000014cd00bd6 + .quad 0x00000000dd45aab8 + .quad 0x00000000FFFFFFFF + .quad 0x0000000105ec76f0 + .quad 0x00000000dea713f1 + + dCONSTANTl .req d0 + dCONSTANTh .req d1 + qCONSTANT .req q0 + + BUF .req r0 + LEN .req r1 + CRC .req r2 + + qzr .req q9 + + /** + * Calculate crc32 + * BUF - buffer + * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63 + * CRC - initial crc32 + * return %eax crc32 + * uint crc32_pmull_le(unsigned char const *buffer, + * size_t len, uint crc32) + */ +SYM_FUNC_START(crc32_pmull_le) + adr r3, .Lcrc32_constants + b 0f +SYM_FUNC_END(crc32_pmull_le) + +SYM_FUNC_START(crc32c_pmull_le) + adr r3, .Lcrc32c_constants + +0: bic LEN, LEN, #15 + vld1.8 {q1-q2}, [BUF, :128]! + vld1.8 {q3-q4}, [BUF, :128]! + vmov.i8 qzr, #0 + vmov.i8 qCONSTANT, #0 + vmov.32 dCONSTANTl[0], CRC + veor.8 d2, d2, dCONSTANTl + sub LEN, LEN, #0x40 + cmp LEN, #0x40 + blt less_64 + + vld1.64 {qCONSTANT}, [r3] + +loop_64: /* 64 bytes Full cache line folding */ + sub LEN, LEN, #0x40 + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q6, d5, dCONSTANTh + vmull.p64 q7, d7, dCONSTANTh + vmull.p64 q8, d9, dCONSTANTh + + vmull.p64 q1, d2, dCONSTANTl + vmull.p64 q2, d4, dCONSTANTl + vmull.p64 q3, d6, dCONSTANTl + vmull.p64 q4, d8, dCONSTANTl + + veor.8 q1, q1, q5 + vld1.8 {q5}, [BUF, :128]! + veor.8 q2, q2, q6 + vld1.8 {q6}, [BUF, :128]! + veor.8 q3, q3, q7 + vld1.8 {q7}, [BUF, :128]! + veor.8 q4, q4, q8 + vld1.8 {q8}, [BUF, :128]! 
+ + veor.8 q1, q1, q5 + veor.8 q2, q2, q6 + veor.8 q3, q3, q7 + veor.8 q4, q4, q8 + + cmp LEN, #0x40 + bge loop_64 + +less_64: /* Folding cache line into 128bit */ + vldr dCONSTANTl, [r3, #16] + vldr dCONSTANTh, [r3, #24] + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q2 + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q3 + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q4 + + teq LEN, #0 + beq fold_64 + +loop_16: /* Folding rest buffer into 128bit */ + subs LEN, LEN, #0x10 + + vld1.8 {q2}, [BUF, :128]! + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q2 + + bne loop_16 + +fold_64: + /* perform the last 64 bit fold, also adds 32 zeroes + * to the input stream */ + vmull.p64 q2, d2, dCONSTANTh + vext.8 q1, q1, qzr, #8 + veor.8 q1, q1, q2 + + /* final 32-bit fold */ + vldr dCONSTANTl, [r3, #32] + vldr d6, [r3, #40] + vmov.i8 d7, #0 + + vext.8 q2, q1, qzr, #4 + vand.8 d2, d2, d6 + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q2 + + /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ + vldr dCONSTANTl, [r3, #48] + vldr dCONSTANTh, [r3, #56] + + vand.8 q2, q1, q3 + vext.8 q2, qzr, q2, #8 + vmull.p64 q2, d5, dCONSTANTh + vand.8 q2, q2, q3 + vmull.p64 q2, d4, dCONSTANTl + veor.8 q1, q1, q2 + vmov r0, s5 + + bx lr +SYM_FUNC_END(crc32c_pmull_le) + + .macro __crc32, c + subs ip, r2, #8 + bmi .Ltail\c + + tst r1, #3 + bne .Lunaligned\c + + teq ip, #0 +.Laligned8\c: + ldrd r2, r3, [r1], #8 +ARM_BE8(rev r2, r2 ) +ARM_BE8(rev r3, r3 ) + crc32\c\()w r0, r0, r2 + crc32\c\()w r0, r0, r3 + bxeq lr + subs ip, ip, #8 + bpl .Laligned8\c + +.Ltail\c: + tst ip, #4 + beq 2f + ldr r3, [r1], #4 +ARM_BE8(rev r3, r3 ) + crc32\c\()w r0, r0, r3 + +2: tst ip, #2 + beq 1f + ldrh r3, [r1], #2 +ARM_BE8(rev16 r3, r3 ) + crc32\c\()h r0, r0, r3 + +1: tst ip, #1 + bxeq lr + ldrb r3, [r1] + crc32\c\()b r0, r0, r3 + bx lr + +.Lunaligned\c: + tst r1, #1 + beq 2f + ldrb r3, [r1], #1 + subs r2, r2, #1 + crc32\c\()b r0, r0, r3 + + tst r1, #2 + beq 0f +2: ldrh r3, [r1], #2 + subs r2, r2, #2 +ARM_BE8(rev16 r3, r3 ) + crc32\c\()h r0, r0, r3 + +0: subs ip, r2, #8 + bpl .Laligned8\c + b .Ltail\c + .endm + + .align 5 +SYM_FUNC_START(crc32_armv8_le) + __crc32 +SYM_FUNC_END(crc32_armv8_le) + + .align 5 +SYM_FUNC_START(crc32c_armv8_le) + __crc32 c +SYM_FUNC_END(crc32c_armv8_le) diff --git a/arch/arm/lib/crc32-glue.c b/arch/arm/lib/crc32-glue.c new file mode 100644 index 000000000000..4340351dbde8 --- /dev/null +++ b/arch/arm/lib/crc32-glue.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/cpufeature.h> +#include <linux/crc32.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> + +#include <crypto/internal/simd.h> + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> + +static DEFINE_STATIC_KEY_FALSE(have_crc32); +static DEFINE_STATIC_KEY_FALSE(have_pmull); + +#define PMULL_MIN_LEN 64 /* min size of buffer for pmull functions */ + +asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc); +asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len); + +asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc); +asmlinkage u32 
crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len); + +static u32 crc32_le_scalar(u32 crc, const u8 *p, size_t len) +{ + if (static_branch_likely(&have_crc32)) + return crc32_armv8_le(crc, p, len); + return crc32_le_base(crc, p, len); +} + +u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (len >= PMULL_MIN_LEN + 15 && + static_branch_likely(&have_pmull) && crypto_simd_usable()) { + size_t n = -(uintptr_t)p & 15; + + /* align p to 16-byte boundary */ + if (n) { + crc = crc32_le_scalar(crc, p, n); + p += n; + len -= n; + } + n = round_down(len, 16); + kernel_neon_begin(); + crc = crc32_pmull_le(p, n, crc); + kernel_neon_end(); + p += n; + len -= n; + } + return crc32_le_scalar(crc, p, len); +} +EXPORT_SYMBOL(crc32_le_arch); + +static u32 crc32c_scalar(u32 crc, const u8 *p, size_t len) +{ + if (static_branch_likely(&have_crc32)) + return crc32c_armv8_le(crc, p, len); + return crc32c_base(crc, p, len); +} + +u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + if (len >= PMULL_MIN_LEN + 15 && + static_branch_likely(&have_pmull) && crypto_simd_usable()) { + size_t n = -(uintptr_t)p & 15; + + /* align p to 16-byte boundary */ + if (n) { + crc = crc32c_scalar(crc, p, n); + p += n; + len -= n; + } + n = round_down(len, 16); + kernel_neon_begin(); + crc = crc32c_pmull_le(p, n, crc); + kernel_neon_end(); + p += n; + len -= n; + } + return crc32c_scalar(crc, p, len); +} +EXPORT_SYMBOL(crc32c_arch); + +u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) +{ + return crc32_be_base(crc, p, len); +} +EXPORT_SYMBOL(crc32_be_arch); + +static int __init crc32_arm_init(void) +{ + if (elf_hwcap2 & HWCAP2_CRC32) + static_branch_enable(&have_crc32); + if (elf_hwcap2 & HWCAP2_PMULL) + static_branch_enable(&have_pmull); + return 0; +} +arch_initcall(crc32_arm_init); + +static void __exit crc32_arm_exit(void) +{ +} +module_exit(crc32_arm_exit); + +u32 crc32_optimizations(void) +{ + if (elf_hwcap2 & (HWCAP2_CRC32 | HWCAP2_PMULL)) + return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION; + return 0; +} +EXPORT_SYMBOL(crc32_optimizations); + +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_DESCRIPTION("Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/arm/lib/csumpartialcopy.S b/arch/arm/lib/csumpartialcopy.S index 184d97254a7a..1ca6aadd649c 100644 --- a/arch/arm/lib/csumpartialcopy.S +++ b/arch/arm/lib/csumpartialcopy.S @@ -9,8 +9,8 @@ .text -/* Function: __u32 csum_partial_copy_nocheck(const char *src, char *dst, int len, __u32 sum) - * Params : r0 = src, r1 = dst, r2 = len, r3 = checksum +/* Function: __u32 csum_partial_copy_nocheck(const char *src, char *dst, int len) + * Params : r0 = src, r1 = dst, r2 = len * Returns : r0 = new checksum */ diff --git a/arch/arm/lib/csumpartialcopygeneric.S b/arch/arm/lib/csumpartialcopygeneric.S index 0b706a39a677..0fd5c10e90a7 100644 --- a/arch/arm/lib/csumpartialcopygeneric.S +++ b/arch/arm/lib/csumpartialcopygeneric.S @@ -86,6 +86,7 @@ sum .req r3 FN_ENTRY save_regs + mov sum, #-1 cmp len, #8 @ Ensure that we have at least blo .Lless8 @ 8 bytes to copy. 
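The head/bulk/tail split performed by crc32_le_arch() and crc32c_arch() above can be illustrated with concrete numbers. This is a standalone userspace sketch, not kernel code; the address and length are made up:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uintptr_t p = 0x20003;		/* unaligned buffer start           */
	size_t len = 100;		/* >= PMULL_MIN_LEN + 15, so the    */
					/* PMULL path is worth taking       */
	size_t head = -p & 15;		/* 13 bytes to reach 0x20010        */
	size_t bulk = (len - head) & ~(size_t)15; /* 80 bytes for PMULL     */
	size_t tail = len - head - bulk;	  /* 7 bytes, scalar CRC    */

	printf("head=%zu bulk=%zu tail=%zu\n", head, bulk, tail);
	return 0;
}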
diff --git a/arch/arm/lib/csumpartialcopyuser.S b/arch/arm/lib/csumpartialcopyuser.S index 6bd3a93eaa3c..c289bde04743 100644 --- a/arch/arm/lib/csumpartialcopyuser.S +++ b/arch/arm/lib/csumpartialcopyuser.S @@ -13,7 +13,8 @@ .text -#ifdef CONFIG_CPU_SW_DOMAIN_PAN +#if defined(CONFIG_CPU_SW_DOMAIN_PAN) + .macro save_regs mrc p15, 0, ip, c3, c0, 0 stmfd sp!, {r1, r2, r4 - r8, ip, lr} @@ -25,7 +26,23 @@ mcr p15, 0, ip, c3, c0, 0 ret lr .endm + +#elif defined(CONFIG_CPU_TTBR0_PAN) + + .macro save_regs + mrc p15, 0, ip, c2, c0, 2 @ read TTBCR + stmfd sp!, {r1, r2, r4 - r8, ip, lr} + uaccess_enable ip + .endm + + .macro load_regs + ldmfd sp!, {r1, r2, r4 - r8, ip, lr} + mcr p15, 0, ip, c2, c0, 2 @ restore TTBCR + ret lr + .endm + #else + .macro save_regs stmfd sp!, {r1, r2, r4 - r8, lr} .endm @@ -33,6 +50,7 @@ .macro load_regs ldmfd sp!, {r1, r2, r4 - r8, pc} .endm + #endif .macro load1b, reg1 @@ -62,9 +80,9 @@ /* * unsigned int - * csum_partial_copy_from_user(const char *src, char *dst, int len, int sum, int *err_ptr) - * r0 = src, r1 = dst, r2 = len, r3 = sum, [sp] = *err_ptr - * Returns : r0 = checksum, [[sp, #0], #0] = 0 or -EFAULT + * csum_partial_copy_from_user(const char *src, char *dst, int len) + * r0 = src, r1 = dst, r2 = len + * Returns : r0 = checksum or 0 */ #define FN_ENTRY ENTRY(csum_partial_copy_from_user) @@ -73,25 +91,11 @@ #include "csumpartialcopygeneric.S" /* - * FIXME: minor buglet here - * We don't return the checksum for the data present in the buffer. To do - * so properly, we would have to add in whatever registers were loaded before - * the fault, which, with the current asm above is not predictable. + * We report fault by returning 0 csum - impossible in normal case, since + * we start with 0xffffffff for initial sum. */ .pushsection .text.fixup,"ax" .align 4 -9001: mov r4, #-EFAULT -#ifdef CONFIG_CPU_SW_DOMAIN_PAN - ldr r5, [sp, #9*4] @ *err_ptr -#else - ldr r5, [sp, #8*4] @ *err_ptr -#endif - str r4, [r5] - ldmia sp, {r1, r2} @ retrieve dst, len - add r2, r2, r1 - mov r0, #0 @ zero the buffer -9002: teq r2, r1 - strbne r0, [r1], #1 - bne 9002b +9001: mov r0, #0 load_regs .popsection diff --git a/arch/arm/lib/delay-loop.S b/arch/arm/lib/delay-loop.S index 3ccade0f8130..33b08ca1c242 100644 --- a/arch/arm/lib/delay-loop.S +++ b/arch/arm/lib/delay-loop.S @@ -5,9 +5,14 @@ * Copyright (C) 1995, 1996 Russell King */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/assembler.h> #include <asm/delay.h> +#ifdef CONFIG_ARCH_RPC + .arch armv4 +#endif + .text .LC0: .word loops_per_jiffy @@ -20,21 +25,26 @@ * HZ <= 1000 */ -ENTRY(__loop_udelay) +SYM_TYPED_FUNC_START(__loop_udelay) ldr r2, .LC1 mul r0, r2, r0 @ r0 = delay_us * UDELAY_MULT -ENTRY(__loop_const_udelay) @ 0 <= r0 <= 0xfffffaf0 + b __loop_const_udelay +SYM_FUNC_END(__loop_udelay) + +SYM_TYPED_FUNC_START(__loop_const_udelay) @ 0 <= r0 <= 0xfffffaf0 ldr r2, .LC0 ldr r2, [r2] umull r1, r0, r2, r0 @ r0-r1 = r0 * loops_per_jiffy adds r1, r1, #0xffffffff @ rounding up ... 
adcs r0, r0, r0 @ and right shift by 31 reteq lr + b __loop_delay +SYM_FUNC_END(__loop_const_udelay) .align 3 @ Delay routine -ENTRY(__loop_delay) +SYM_TYPED_FUNC_START(__loop_delay) subs r0, r0, #1 #if 0 retls lr @@ -54,6 +64,4 @@ ENTRY(__loop_delay) #endif bhi __loop_delay ret lr -ENDPROC(__loop_udelay) -ENDPROC(__loop_const_udelay) -ENDPROC(__loop_delay) +SYM_FUNC_END(__loop_delay) diff --git a/arch/arm/lib/error-inject.c b/arch/arm/lib/error-inject.c new file mode 100644 index 000000000000..5a5b405792ba --- /dev/null +++ b/arch/arm/lib/error-inject.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/error-injection.h> +#include <linux/kprobes.h> + +void override_function_with_return(struct pt_regs *regs) +{ + instruction_pointer_set(regs, regs->ARM_lr); +} +NOKPROBE_SYMBOL(override_function_with_return); diff --git a/arch/arm/lib/findbit.S b/arch/arm/lib/findbit.S index b5e8b9ae4c7d..b7ac2d3c0748 100644 --- a/arch/arm/lib/findbit.S +++ b/arch/arm/lib/findbit.S @@ -12,182 +12,128 @@ */ #include <linux/linkage.h> #include <asm/assembler.h> +#include <asm/unwind.h> .text -/* - * Purpose : Find a 'zero' bit - * Prototype: int find_first_zero_bit(void *addr, unsigned int maxbit); - */ -ENTRY(_find_first_zero_bit_le) - teq r1, #0 - beq 3f - mov r2, #0 -1: - ARM( ldrb r3, [r0, r2, lsr #3] ) - THUMB( lsr r3, r2, #3 ) - THUMB( ldrb r3, [r0, r3] ) - eors r3, r3, #0xff @ invert bits - bne .L_found @ any now set - found zero bit - add r2, r2, #8 @ next bit pointer -2: cmp r2, r1 @ any more? - blo 1b -3: mov r0, r1 @ no free bits - ret lr -ENDPROC(_find_first_zero_bit_le) +#ifdef __ARMEB__ +#define SWAB_ENDIAN le +#else +#define SWAB_ENDIAN be +#endif -/* - * Purpose : Find next 'zero' bit - * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset) - */ -ENTRY(_find_next_zero_bit_le) + .macro find_first, endian, set, name +ENTRY(_find_first_\name\()bit_\endian) + UNWIND( .fnstart) teq r1, #0 - beq 3b - ands ip, r2, #7 - beq 1b @ If new byte, goto old routine - ARM( ldrb r3, [r0, r2, lsr #3] ) - THUMB( lsr r3, r2, #3 ) - THUMB( ldrb r3, [r0, r3] ) - eor r3, r3, #0xff @ now looking for a 1 bit - movs r3, r3, lsr ip @ shift off unused bits - bne .L_found - orr r2, r2, #7 @ if zero, then no bits here - add r2, r2, #1 @ align bit pointer - b 2b @ loop for next bit -ENDPROC(_find_next_zero_bit_le) - -/* - * Purpose : Find a 'one' bit - * Prototype: int find_first_bit(const unsigned long *addr, unsigned int maxbit); - */ -ENTRY(_find_first_bit_le) - teq r1, #0 beq 3f mov r2, #0 -1: - ARM( ldrb r3, [r0, r2, lsr #3] ) - THUMB( lsr r3, r2, #3 ) - THUMB( ldrb r3, [r0, r3] ) - movs r3, r3 - bne .L_found @ any now set - found zero bit - add r2, r2, #8 @ next bit pointer +1: ldr r3, [r0], #4 + .ifeq \set + mvns r3, r3 @ invert/test bits + .else + movs r3, r3 @ test bits + .endif + .ifc \endian, SWAB_ENDIAN + bne .L_found_swab + .else + bne .L_found @ found the bit? + .endif + add r2, r2, #32 @ next index 2: cmp r2, r1 @ any more? 
blo 1b -3: mov r0, r1 @ no free bits +3: mov r0, r1 @ no more bits ret lr -ENDPROC(_find_first_bit_le) + UNWIND( .fnend) +ENDPROC(_find_first_\name\()bit_\endian) + .endm -/* - * Purpose : Find next 'one' bit - * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset) - */ -ENTRY(_find_next_bit_le) - teq r1, #0 - beq 3b - ands ip, r2, #7 - beq 1b @ If new byte, goto old routine - ARM( ldrb r3, [r0, r2, lsr #3] ) - THUMB( lsr r3, r2, #3 ) - THUMB( ldrb r3, [r0, r3] ) + .macro find_next, endian, set, name +ENTRY(_find_next_\name\()bit_\endian) + UNWIND( .fnstart) + cmp r2, r1 + bhs 3b + mov ip, r2, lsr #5 @ word index + add r0, r0, ip, lsl #2 + ands ip, r2, #31 @ bit position + beq 1b + ldr r3, [r0], #4 + .ifeq \set + mvn r3, r3 @ invert bits + .endif + .ifc \endian, SWAB_ENDIAN + rev_l r3, ip + .if .Lrev_l_uses_tmp + @ we need to recompute ip because rev_l will have overwritten + @ it. + and ip, r2, #31 @ bit position + .endif + .endif movs r3, r3, lsr ip @ shift off unused bits bne .L_found - orr r2, r2, #7 @ if zero, then no bits here + orr r2, r2, #31 @ no zero bits add r2, r2, #1 @ align bit pointer b 2b @ loop for next bit -ENDPROC(_find_next_bit_le) + UNWIND( .fnend) +ENDPROC(_find_next_\name\()bit_\endian) + .endm -#ifdef __ARMEB__ + .macro find_bit, endian, set, name + find_first \endian, \set, \name + find_next \endian, \set, \name + .endm -ENTRY(_find_first_zero_bit_be) - teq r1, #0 - beq 3f - mov r2, #0 -1: eor r3, r2, #0x18 @ big endian byte ordering - ARM( ldrb r3, [r0, r3, lsr #3] ) - THUMB( lsr r3, #3 ) - THUMB( ldrb r3, [r0, r3] ) - eors r3, r3, #0xff @ invert bits - bne .L_found @ any now set - found zero bit - add r2, r2, #8 @ next bit pointer -2: cmp r2, r1 @ any more? - blo 1b -3: mov r0, r1 @ no free bits - ret lr -ENDPROC(_find_first_zero_bit_be) +/* _find_first_zero_bit_le and _find_next_zero_bit_le */ + find_bit le, 0, zero_ -ENTRY(_find_next_zero_bit_be) - teq r1, #0 - beq 3b - ands ip, r2, #7 - beq 1b @ If new byte, goto old routine - eor r3, r2, #0x18 @ big endian byte ordering - ARM( ldrb r3, [r0, r3, lsr #3] ) - THUMB( lsr r3, #3 ) - THUMB( ldrb r3, [r0, r3] ) - eor r3, r3, #0xff @ now looking for a 1 bit - movs r3, r3, lsr ip @ shift off unused bits - bne .L_found - orr r2, r2, #7 @ if zero, then no bits here - add r2, r2, #1 @ align bit pointer - b 2b @ loop for next bit -ENDPROC(_find_next_zero_bit_be) +/* _find_first_bit_le and _find_next_bit_le */ + find_bit le, 1 -ENTRY(_find_first_bit_be) - teq r1, #0 - beq 3f - mov r2, #0 -1: eor r3, r2, #0x18 @ big endian byte ordering - ARM( ldrb r3, [r0, r3, lsr #3] ) - THUMB( lsr r3, #3 ) - THUMB( ldrb r3, [r0, r3] ) - movs r3, r3 - bne .L_found @ any now set - found zero bit - add r2, r2, #8 @ next bit pointer -2: cmp r2, r1 @ any more? - blo 1b -3: mov r0, r1 @ no free bits - ret lr -ENDPROC(_find_first_bit_be) +#ifdef __ARMEB__ -ENTRY(_find_next_bit_be) - teq r1, #0 - beq 3b - ands ip, r2, #7 - beq 1b @ If new byte, goto old routine - eor r3, r2, #0x18 @ big endian byte ordering - ARM( ldrb r3, [r0, r3, lsr #3] ) - THUMB( lsr r3, #3 ) - THUMB( ldrb r3, [r0, r3] ) - movs r3, r3, lsr ip @ shift off unused bits - bne .L_found - orr r2, r2, #7 @ if zero, then no bits here - add r2, r2, #1 @ align bit pointer - b 2b @ loop for next bit -ENDPROC(_find_next_bit_be) +/* _find_first_zero_bit_be and _find_next_zero_bit_be */ + find_bit be, 0, zero_ + +/* _find_first_bit_be and _find_next_bit_be */ + find_bit be, 1 #endif /* * One or more bits in the LSB of r3 are assumed to be set. 
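The .L_found code that follows turns a word with at least one set bit into a bit index: ARMv7 uses rbit + clz, ARMv5 uses clz on the isolated lowest bit, and older cores fall back to a 16/8/4/2/1 binary search. A C model of that fallback (illustrative only; lowest_set_bit() is a made-up name):

#include <stdint.h>

/* Offset of the least significant set bit in a non-zero word; the assembly
 * adds this to the running bit index in r2 and clamps to maxbit. */
static unsigned int lowest_set_bit(uint32_t r3)
{
	unsigned int bit = 0;

	if (!(r3 & 0xffff)) { bit += 16; r3 >>= 16; }
	if (!(r3 & 0x00ff)) { bit += 8;  r3 >>= 8;  }
	if (!(r3 & 0x000f)) { bit += 4;  r3 >>= 4;  }
	if (!(r3 & 0x0003)) { bit += 2;  r3 >>= 2;  }
	if (!(r3 & 0x0001)) { bit += 1; }
	return bit;		/* same result as rbit + clz on ARMv7 */
}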
*/ +.L_found_swab: + UNWIND( .fnstart) + rev_l r3, ip .L_found: -#if __LINUX_ARM_ARCH__ >= 5 +#if __LINUX_ARM_ARCH__ >= 7 + rbit r3, r3 @ reverse bits + clz r3, r3 @ count high zero bits + add r0, r2, r3 @ add offset of first set bit +#elif __LINUX_ARM_ARCH__ >= 5 rsb r0, r3, #0 - and r3, r3, r0 - clz r3, r3 - rsb r3, r3, #31 - add r0, r2, r3 + and r3, r3, r0 @ mask out lowest bit set + clz r3, r3 @ count high zero bits + rsb r3, r3, #31 @ offset of first set bit + add r0, r2, r3 @ add offset of first set bit #else - tst r3, #0x0f + mov ip, #~0 + tst r3, ip, lsr #16 @ test bits 0-15 + addeq r2, r2, #16 + moveq r3, r3, lsr #16 + tst r3, #0x00ff + addeq r2, r2, #8 + moveq r3, r3, lsr #8 + tst r3, #0x000f addeq r2, r2, #4 - movne r3, r3, lsl #4 - tst r3, #0x30 + moveq r3, r3, lsr #4 + tst r3, #0x0003 addeq r2, r2, #2 - movne r3, r3, lsl #2 - tst r3, #0x40 + moveq r3, r3, lsr #2 + tst r3, #0x0001 addeq r2, r2, #1 mov r0, r2 #endif cmp r1, r0 @ Clamp to maxbit movlo r0, r1 ret lr - + UNWIND( .fnend) diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S index 09a333153dc6..90f2b645aa0d 100644 --- a/arch/arm/lib/memcpy.S +++ b/arch/arm/lib/memcpy.S @@ -42,26 +42,25 @@ strb\cond \reg, [\ptr], #1 .endm - .macro enter reg1 reg2 - stmdb sp!, {r0, \reg1, \reg2} + .macro enter regs:vararg +UNWIND( .save {r0, \regs} ) + stmdb sp!, {r0, \regs} .endm - .macro usave reg1 reg2 - UNWIND( .save {r0, \reg1, \reg2} ) - .endm - - .macro exit reg1 reg2 - ldmfd sp!, {r0, \reg1, \reg2} + .macro exit regs:vararg + ldmfd sp!, {r0, \regs} .endm .text /* Prototype: void *memcpy(void *dest, const void *src, size_t n); */ +ENTRY(__memcpy) ENTRY(mmiocpy) -ENTRY(memcpy) +WEAK(memcpy) #include "copy_template.S" ENDPROC(memcpy) ENDPROC(mmiocpy) +ENDPROC(__memcpy) diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S index b50e5770fb44..6410554039fd 100644 --- a/arch/arm/lib/memmove.S +++ b/arch/arm/lib/memmove.S @@ -24,18 +24,20 @@ * occurring in the opposite direction. */ -ENTRY(memmove) +ENTRY(__memmove) +WEAK(memmove) UNWIND( .fnstart ) subs ip, r0, r1 cmphi r2, ip - bls memcpy - - stmfd sp!, {r0, r4, lr} + bls __memcpy UNWIND( .fnend ) UNWIND( .fnstart ) - UNWIND( .save {r0, r4, lr} ) @ in first stmfd block + UNWIND( .save {r0, r4, fpreg, lr} ) + stmfd sp!, {r0, r4, UNWIND(fpreg,) lr} + UNWIND( .setfp fpreg, sp ) + UNWIND( mov fpreg, sp ) add r1, r1, r2 add r0, r0, r2 subs r2, r2, #4 @@ -47,12 +49,7 @@ ENTRY(memmove) bne 10f 1: subs r2, r2, #(28) - stmfd sp!, {r5 - r8} - UNWIND( .fnend ) - - UNWIND( .fnstart ) - UNWIND( .save {r0, r4, lr} ) - UNWIND( .save {r5 - r8} ) @ in second stmfd block + stmfd sp!, {r5, r6, r8, r9} blt 5f CALGN( ands ip, r0, #31 ) @@ -71,9 +68,9 @@ ENTRY(memmove) PLD( pld [r1, #-96] ) 3: PLD( pld [r1, #-128] ) -4: ldmdb r1!, {r3, r4, r5, r6, r7, r8, ip, lr} +4: ldmdb r1!, {r3, r4, r5, r6, r8, r9, ip, lr} subs r2, r2, #32 - stmdb r0!, {r3, r4, r5, r6, r7, r8, ip, lr} + stmdb r0!, {r3, r4, r5, r6, r8, r9, ip, lr} bge 3b PLD( cmn r2, #96 ) PLD( bge 4b ) @@ -87,8 +84,8 @@ ENTRY(memmove) W(ldr) r4, [r1, #-4]! W(ldr) r5, [r1, #-4]! W(ldr) r6, [r1, #-4]! - W(ldr) r7, [r1, #-4]! W(ldr) r8, [r1, #-4]! + W(ldr) r9, [r1, #-4]! W(ldr) lr, [r1, #-4]! add pc, pc, ip @@ -98,17 +95,13 @@ ENTRY(memmove) W(str) r4, [r0, #-4]! W(str) r5, [r0, #-4]! W(str) r6, [r0, #-4]! - W(str) r7, [r0, #-4]! W(str) r8, [r0, #-4]! + W(str) r9, [r0, #-4]! W(str) lr, [r0, #-4]! 
CALGN( bcs 2b ) -7: ldmfd sp!, {r5 - r8} - UNWIND( .fnend ) @ end of second stmfd block - - UNWIND( .fnstart ) - UNWIND( .save {r0, r4, lr} ) @ still in first stmfd block +7: ldmfd sp!, {r5, r6, r8, r9} 8: movs r2, r2, lsl #31 ldrbne r3, [r1, #-1]! @@ -117,7 +110,7 @@ ENTRY(memmove) strbne r3, [r0, #-1]! strbcs r4, [r0, #-1]! strbcs ip, [r0, #-1] - ldmfd sp!, {r0, r4, pc} + ldmfd sp!, {r0, r4, UNWIND(fpreg,) pc} 9: cmp ip, #2 ldrbgt r3, [r1, #-1]! @@ -136,13 +129,10 @@ ENTRY(memmove) ldr r3, [r1, #0] beq 17f blt 18f - UNWIND( .fnend ) .macro backward_copy_shift push pull - UNWIND( .fnstart ) - UNWIND( .save {r0, r4, lr} ) @ still in first stmfd block subs r2, r2, #28 blt 14f @@ -151,12 +141,7 @@ ENTRY(memmove) CALGN( subcc r2, r2, ip ) CALGN( bcc 15f ) -11: stmfd sp!, {r5 - r9} - UNWIND( .fnend ) - - UNWIND( .fnstart ) - UNWIND( .save {r0, r4, lr} ) - UNWIND( .save {r5 - r9} ) @ in new second stmfd block +11: stmfd sp!, {r5, r6, r8 - r10} PLD( pld [r1, #-4] ) PLD( subs r2, r2, #96 ) @@ -166,35 +151,31 @@ ENTRY(memmove) PLD( pld [r1, #-96] ) 12: PLD( pld [r1, #-128] ) -13: ldmdb r1!, {r7, r8, r9, ip} +13: ldmdb r1!, {r8, r9, r10, ip} mov lr, r3, lspush #\push subs r2, r2, #32 ldmdb r1!, {r3, r4, r5, r6} orr lr, lr, ip, lspull #\pull mov ip, ip, lspush #\push - orr ip, ip, r9, lspull #\pull + orr ip, ip, r10, lspull #\pull + mov r10, r10, lspush #\push + orr r10, r10, r9, lspull #\pull mov r9, r9, lspush #\push orr r9, r9, r8, lspull #\pull mov r8, r8, lspush #\push - orr r8, r8, r7, lspull #\pull - mov r7, r7, lspush #\push - orr r7, r7, r6, lspull #\pull + orr r8, r8, r6, lspull #\pull mov r6, r6, lspush #\push orr r6, r6, r5, lspull #\pull mov r5, r5, lspush #\push orr r5, r5, r4, lspull #\pull mov r4, r4, lspush #\push orr r4, r4, r3, lspull #\pull - stmdb r0!, {r4 - r9, ip, lr} + stmdb r0!, {r4 - r6, r8 - r10, ip, lr} bge 12b PLD( cmn r2, #96 ) PLD( bge 13b ) - ldmfd sp!, {r5 - r9} - UNWIND( .fnend ) @ end of the second stmfd block - - UNWIND( .fnstart ) - UNWIND( .save {r0, r4, lr} ) @ still in first stmfd block + ldmfd sp!, {r5, r6, r8 - r10} 14: ands ip, r2, #28 beq 16f @@ -210,7 +191,6 @@ ENTRY(memmove) 16: add r1, r1, #(\pull / 8) b 8b - UNWIND( .fnend ) .endm @@ -221,4 +201,6 @@ ENTRY(memmove) 18: backward_copy_shift push=24 pull=8 + UNWIND( .fnend ) ENDPROC(memmove) +ENDPROC(__memmove) diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S index 6ca4535c47fb..de75ae4d5ab4 100644 --- a/arch/arm/lib/memset.S +++ b/arch/arm/lib/memset.S @@ -13,9 +13,11 @@ .text .align 5 +ENTRY(__memset) ENTRY(mmioset) -ENTRY(memset) +WEAK(memset) UNWIND( .fnstart ) + and r1, r1, #255 @ cast to unsigned char ands r3, r0, #3 @ 1 unaligned? mov ip, r0 @ preserve r0 as return value bne 6f @ 1 @@ -27,16 +29,16 @@ UNWIND( .fnstart ) mov r3, r1 7: cmp r2, #16 blt 4f +UNWIND( .fnend ) #if ! CALGN(1)+0 /* * We need 2 extra registers for this loop - use r8 and the LR */ - stmfd sp!, {r8, lr} -UNWIND( .fnend ) UNWIND( .fnstart ) UNWIND( .save {r8, lr} ) + stmfd sp!, {r8, lr} mov r8, r1 mov lr, r3 @@ -65,10 +67,9 @@ UNWIND( .fnend ) * whole cache lines at once. 
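The "and r1, r1, #255 @ cast to unsigned char" added to memset() above matches the C semantics of the value argument, which is converted to unsigned char before the fill. A standalone userspace sketch of the behaviour being preserved (illustrative only):

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char buf[4];

	/* The value 0x104 is truncated to unsigned char, so the buffer is
	 * filled with 0x04, not 0x104 or 0x00. */
	memset(buf, 0x104, sizeof(buf));
	printf("%02x %02x %02x %02x\n", buf[0], buf[1], buf[2], buf[3]);
	return 0;
}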
*/ - stmfd sp!, {r4-r8, lr} -UNWIND( .fnend ) UNWIND( .fnstart ) UNWIND( .save {r4-r8, lr} ) + stmfd sp!, {r4-r8, lr} mov r4, r1 mov r5, r3 mov r6, r1 @@ -132,6 +133,7 @@ UNWIND( .fnstart ) UNWIND( .fnend ) ENDPROC(memset) ENDPROC(mmioset) +ENDPROC(__memset) ENTRY(__memset32) UNWIND( .fnstart ) diff --git a/arch/arm/lib/testchangebit.S b/arch/arm/lib/testchangebit.S index 4ebecc67e6e0..f13fe9bc2399 100644 --- a/arch/arm/lib/testchangebit.S +++ b/arch/arm/lib/testchangebit.S @@ -10,3 +10,7 @@ .text testop _test_and_change_bit, eor, str + +#if __LINUX_ARM_ARCH__ >= 6 +sync_testop _sync_test_and_change_bit, eor, str +#endif diff --git a/arch/arm/lib/testclearbit.S b/arch/arm/lib/testclearbit.S index 009afa0f5b4a..4d2c5ca620eb 100644 --- a/arch/arm/lib/testclearbit.S +++ b/arch/arm/lib/testclearbit.S @@ -10,3 +10,7 @@ .text testop _test_and_clear_bit, bicne, strne + +#if __LINUX_ARM_ARCH__ >= 6 +sync_testop _sync_test_and_clear_bit, bicne, strne +#endif diff --git a/arch/arm/lib/testsetbit.S b/arch/arm/lib/testsetbit.S index f3192e55acc8..649dbab65d8d 100644 --- a/arch/arm/lib/testsetbit.S +++ b/arch/arm/lib/testsetbit.S @@ -10,3 +10,7 @@ .text testop _test_and_set_bit, orreq, streq + +#if __LINUX_ARM_ARCH__ >= 6 +sync_testop _sync_test_and_set_bit, orreq, streq +#endif diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c index c9450982a155..c0ac7796d775 100644 --- a/arch/arm/lib/uaccess_with_memcpy.c +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -24,6 +24,7 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) { unsigned long addr = (unsigned long)_addr; pgd_t *pgd; + p4d_t *p4d; pmd_t *pmd; pte_t *pte; pud_t *pud; @@ -33,7 +34,11 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd))) return 0; - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (unlikely(p4d_none(*p4d) || p4d_bad(*p4d))) + return 0; + + pud = pud_offset(p4d, addr); if (unlikely(pud_none(*pud) || pud_bad(*pud))) return 0; @@ -51,10 +56,10 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) * to see that it's still huge and whether or not we will * need to fault on write. 
*/ - if (unlikely(pmd_thp_or_huge(*pmd))) { + if (unlikely(pmd_leaf(*pmd))) { ptl = &current->mm->page_table_lock; spin_lock(ptl); - if (unlikely(!pmd_thp_or_huge(*pmd) + if (unlikely(!pmd_leaf(*pmd) || pmd_hugewillfault(*pmd))) { spin_unlock(ptl); return 0; @@ -69,6 +74,9 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) return 0; pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl); + if (unlikely(!pte)) + return 0; + if (unlikely(!pte_present(*pte) || !pte_young(*pte) || !pte_write(*pte) || !pte_dirty(*pte))) { pte_unmap_unlock(pte, ptl); @@ -87,16 +95,11 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) unsigned long ua_flags; int atomic; - if (uaccess_kernel()) { - memcpy((void *)to, from, n); - return 0; - } - /* the mmap semaphore is taken only if not in an atomic context */ atomic = faulthandler_disabled(); if (!atomic) - down_read(&current->mm->mmap_sem); + mmap_read_lock(current->mm); while (n) { pte_t *pte; spinlock_t *ptl; @@ -104,11 +107,11 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) while (!pin_page_for_write(to, &pte, &ptl)) { if (!atomic) - up_read(&current->mm->mmap_sem); + mmap_read_unlock(current->mm); if (__put_user(0, (char __user *)to)) goto out; if (!atomic) - down_read(&current->mm->mmap_sem); + mmap_read_lock(current->mm); } tocopy = (~(unsigned long)to & ~PAGE_MASK) + 1; @@ -116,7 +119,7 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) tocopy = n; ua_flags = uaccess_save_and_enable(); - memcpy((void *)to, from, tocopy); + __memcpy((void *)to, from, tocopy); uaccess_restore(ua_flags); to += tocopy; from += tocopy; @@ -128,7 +131,7 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) spin_unlock(ptl); } if (!atomic) - up_read(&current->mm->mmap_sem); + mmap_read_unlock(current->mm); out: return n; @@ -160,22 +163,17 @@ __clear_user_memset(void __user *addr, unsigned long n) { unsigned long ua_flags; - if (uaccess_kernel()) { - memset((void *)addr, 0, n); - return 0; - } - - down_read(&current->mm->mmap_sem); + mmap_read_lock(current->mm); while (n) { pte_t *pte; spinlock_t *ptl; int tocopy; while (!pin_page_for_write(addr, &pte, &ptl)) { - up_read(&current->mm->mmap_sem); + mmap_read_unlock(current->mm); if (__put_user(0, (char __user *)addr)) goto out; - down_read(&current->mm->mmap_sem); + mmap_read_lock(current->mm); } tocopy = (~(unsigned long)addr & ~PAGE_MASK) + 1; @@ -183,7 +181,7 @@ __clear_user_memset(void __user *addr, unsigned long n) tocopy = n; ua_flags = uaccess_save_and_enable(); - memset((void *)addr, 0, tocopy); + __memset((void *)addr, 0, tocopy); uaccess_restore(ua_flags); addr += tocopy; n -= tocopy; @@ -193,7 +191,7 @@ __clear_user_memset(void __user *addr, unsigned long n) else spin_unlock(ptl); } - up_read(&current->mm->mmap_sem); + mmap_read_unlock(current->mm); out: return n; @@ -242,7 +240,7 @@ static int __init test_size_treshold(void) if (!dst_page) goto no_dst; kernel_ptr = page_address(src_page); - user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__P010)); + user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__PAGE_COPY)); if (!user_ptr) goto no_vmap; diff --git a/arch/arm/lib/xor-neon.c b/arch/arm/lib/xor-neon.c index b99dd8e1c93f..cf57fca97908 100644 --- a/arch/arm/lib/xor-neon.c +++ b/arch/arm/lib/xor-neon.c @@ -8,6 +8,7 @@ #include <linux/raid/xor.h> #include <linux/module.h> +MODULE_DESCRIPTION("NEON accelerated XOR implementation"); MODULE_LICENSE("GPL"); #ifndef __ARM_NEON__ @@ -17,17 +18,11 @@ MODULE_LICENSE("GPL"); /* * Pull in the 
reference implementations while instructing GCC (through * -ftree-vectorize) to attempt to exploit implicit parallelism and emit - * NEON instructions. + * NEON instructions. Clang does this by default at O2 so no pragma is + * needed. */ -#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#ifdef CONFIG_CC_IS_GCC #pragma GCC optimize "tree-vectorize" -#else -/* - * While older versions of GCC do not generate incorrect code, they fail to - * recognize the parallel nature of these functions, and emit plain ARM code, - * which is known to be slower than the optimized ARM code in asm-arm/xor.h. - */ -#warning This code requires at least version 4.6 of GCC #endif #pragma GCC diagnostic ignored "-Wunused-variable"
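
The xor-neon.c hunk above leans on compiler auto-vectorization rather than hand-written NEON intrinsics: the plain scalar reference loops are compiled with GCC's "tree-vectorize" pragma (Clang already auto-vectorizes at -O2), letting the compiler emit NEON loads and stores when the FPU flags allow it. A minimal stand-alone sketch of that pattern follows; it assumes a NEON-capable -O2 build, and the xor_into() function name is illustrative rather than the kernel's xor_block template.

#include <stddef.h>
#include <stdint.h>

#ifndef __clang__
#pragma GCC optimize "tree-vectorize"	/* GCC needs the hint; Clang vectorizes at -O2 */
#endif

/*
 * XOR one source buffer into a destination, written as a plain scalar
 * loop so an auto-vectorizing compiler targeting NEON (e.g. built with
 * -mfpu=neon -mfloat-abi=softfp) can turn it into vector loads/stores.
 */
void xor_into(uint32_t *restrict dst, const uint32_t *restrict src, size_t words)
{
	size_t i;

	for (i = 0; i < words; i++)
		dst[i] ^= src[i];
}

Without the pragma, older GCC compiles the same loop to scalar ARM code, which is the situation the removed version check and #warning used to guard against.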
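The find-bit assembly at the top of this section takes a similar shortcut on ARMv7: reversing the bit order with rbit and then counting leading zeros with clz yields the index of the least significant set bit in two instructions. A hedged C sketch of that idiom, assuming an ARMv7 target and a GNU-style compiler; the rbit_clz_ffs() helper name is made up for illustration.

#include <stdint.h>

/*
 * Index (0..31) of the least significant set bit of a non-zero word,
 * mirroring the ARMv7 rbit + clz sequence used in the assembly above.
 */
static inline unsigned int rbit_clz_ffs(uint32_t word)
{
#if defined(__ARM_ARCH) && __ARM_ARCH >= 7
	uint32_t rev;

	__asm__ ("rbit %0, %1" : "=r" (rev) : "r" (word));	/* reverse bit order */
	return __builtin_clz(rev);				/* count high zero bits */
#else
	return __builtin_ctz(word);				/* portable fallback */
#endif
}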