summaryrefslogtreecommitdiff
path: root/arch/powerpc/lib/checksum_64.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/lib/checksum_64.S')
-rw-r--r--arch/powerpc/lib/checksum_64.S189
1 files changed, 89 insertions, 100 deletions
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index 167f72555d60..d53d8f09a2c2 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -1,77 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* This file contains assembly-language implementations
* of IP-style 1's complement checksum routines.
*
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
*/
+#include <linux/export.h>
#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
/*
- * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
- * len is in words and is always >= 5.
- *
- * In practice len == 5, but this is not guaranteed. So this code does not
- * attempt to use doubleword instructions.
- */
-_GLOBAL(ip_fast_csum)
- lwz r0,0(r3)
- lwzu r5,4(r3)
- addic. r4,r4,-2
- addc r0,r0,r5
- mtctr r4
- blelr-
-1: lwzu r4,4(r3)
- adde r0,r0,r4
- bdnz 1b
- addze r0,r0 /* add in final carry */
- rldicl r4,r0,32,0 /* fold two 32-bit halves together */
- add r0,r0,r4
- srdi r0,r0,32
- rlwinm r3,r0,16,0,31 /* fold two halves together */
- add r3,r0,r3
- not r3,r3
- srwi r3,r3,16
- blr
-
-/*
- * Compute checksum of TCP or UDP pseudo-header:
- * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
- * No real gain trying to do this specially for 64 bit, but
- * the 32 bit addition may spill into the upper bits of
- * the doubleword so we still must fold it down from 64.
- */
-_GLOBAL(csum_tcpudp_magic)
- rlwimi r5,r6,16,0,15 /* put proto in upper half of len */
- addc r0,r3,r4 /* add 4 32-bit words together */
- adde r0,r0,r5
- adde r0,r0,r7
- rldicl r4,r0,32,0 /* fold 64 bit value */
- add r0,r4,r0
- srdi r0,r0,32
- rlwinm r3,r0,16,0,31 /* fold two halves together */
- add r3,r0,r3
- not r3,r3
- srwi r3,r3,16
- blr
-
-/*
* Computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit).
*
- * csum_partial(r3=buff, r4=len, r5=sum)
+ * __csum_partial(r3=buff, r4=len, r5=sum)
*/
-_GLOBAL(csum_partial)
+_GLOBAL(__csum_partial)
addic r0,r5,0 /* clear carry */
srdi. r6,r4,3 /* less than 8 bytes? */
@@ -83,7 +32,7 @@ _GLOBAL(csum_partial)
* work to calculate the correct checksum, we ignore that case
* and take the potential slowdown of unaligned loads.
*/
- rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */
+ rldicl. r6,r3,64-1,64-2 /* r6 = (r3 >> 1) & 0x3 */
beq .Lcsum_aligned
li r7,4
@@ -122,9 +71,9 @@ _GLOBAL(csum_partial)
ld r11,24(r3)
/*
- * On POWER6 and POWER7 back to back addes take 2 cycles because of
- * the XER dependency. This means the fastest this loop can go is
- * 16 cycles per iteration. The scheduling of the loop below has
+ * On POWER6 and POWER7 back to back adde instructions take 2 cycles
+ * because of the XER dependency. This means the fastest this loop can
+ * go is 16 cycles per iteration. The scheduling of the loop below has
* been shown to hit this on both POWER6 and POWER7.
*/
.align 5
@@ -215,8 +164,12 @@ _GLOBAL(csum_partial)
beq .Lcsum_finish
lbz r6,0(r3)
+#ifdef __BIG_ENDIAN__
sldi r9,r6,8 /* Pad the byte out to 16 bits */
adde r0,r0,r9
+#else
+ adde r0,r0,r6
+#endif
.Lcsum_finish:
addze r0,r0 /* add in final carry */
@@ -224,34 +177,38 @@ _GLOBAL(csum_partial)
add r3,r4,r0
srdi r3,r3,32
blr
+EXPORT_SYMBOL(__csum_partial)
- .macro source
+ .macro srcnr
100:
- .section __ex_table,"a"
- .align 3
- .llong 100b,.Lsrc_error
- .previous
+ EX_TABLE(100b,.Lerror_nr)
.endm
- .macro dest
+ .macro source
+150:
+ EX_TABLE(150b,.Lerror)
+ .endm
+
+ .macro dstnr
200:
- .section __ex_table,"a"
- .align 3
- .llong 200b,.Ldest_error
- .previous
+ EX_TABLE(200b,.Lerror_nr)
+ .endm
+
+ .macro dest
+250:
+ EX_TABLE(250b,.Lerror)
.endm
/*
* Computes the checksum of a memory block at src, length len,
- * and adds in "sum" (32-bit), while copying the block to dst.
- * If an access exception occurs on src or dst, it stores -EFAULT
- * to *src_err or *dst_err respectively. The caller must take any action
- * required in this case (zeroing memory, recalculating partial checksum etc).
+ * and adds in 0xffffffff (32-bit), while copying the block to dst.
+ * If an access exception occurs, it returns 0.
*
- * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
+ * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
*/
_GLOBAL(csum_partial_copy_generic)
+ li r6,-1
addic r0,r6,0 /* clear carry */
srdi. r6,r5,3 /* less than 8 bytes? */
@@ -266,19 +223,19 @@ _GLOBAL(csum_partial_copy_generic)
* If the source and destination are relatively unaligned we only
* align the source. This keeps things simple.
*/
- rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */
+ rldicl. r6,r3,64-1,64-2 /* r6 = (r3 >> 1) & 0x3 */
beq .Lcopy_aligned
- li r7,4
- sub r6,r7,r6
+ li r9,4
+ sub r6,r9,r6
mtctr r6
1:
-source; lhz r6,0(r3) /* align to doubleword */
+srcnr; lhz r6,0(r3) /* align to doubleword */
subi r5,r5,2
addi r3,r3,2
adde r0,r0,r6
-dest; sth r6,0(r4)
+dstnr; sth r6,0(r4)
addi r4,r4,2
bdnz 1b
@@ -307,9 +264,9 @@ source; ld r10,16(r3)
source; ld r11,24(r3)
/*
- * On POWER6 and POWER7 back to back addes take 2 cycles because of
- * the XER dependency. This means the fastest this loop can go is
- * 16 cycles per iteration. The scheduling of the loop below has
+ * On POWER6 and POWER7 back to back adde instructions take 2 cycles
+ * because of the XER dependency. This means the fastest this loop can
+ * go is 16 cycles per iteration. The scheduling of the loop below has
* been shown to hit this on both POWER6 and POWER7.
*/
.align 5
@@ -392,10 +349,10 @@ dest; std r16,56(r4)
mtctr r6
3:
-source; ld r6,0(r3)
+srcnr; ld r6,0(r3)
addi r3,r3,8
adde r0,r0,r6
-dest; std r6,0(r4)
+dstnr; std r6,0(r4)
addi r4,r4,8
bdnz 3b
@@ -405,10 +362,10 @@ dest; std r6,0(r4)
srdi. r6,r5,2
beq .Lcopy_tail_halfword
-source; lwz r6,0(r3)
+srcnr; lwz r6,0(r3)
addi r3,r3,4
adde r0,r0,r6
-dest; stw r6,0(r4)
+dstnr; stw r6,0(r4)
addi r4,r4,4
subi r5,r5,4
@@ -416,10 +373,10 @@ dest; stw r6,0(r4)
srdi. r6,r5,1
beq .Lcopy_tail_byte
-source; lhz r6,0(r3)
+srcnr; lhz r6,0(r3)
addi r3,r3,2
adde r0,r0,r6
-dest; sth r6,0(r4)
+dstnr; sth r6,0(r4)
addi r4,r4,2
subi r5,r5,2
@@ -427,10 +384,14 @@ dest; sth r6,0(r4)
andi. r6,r5,1
beq .Lcopy_finish
-source; lbz r6,0(r3)
+srcnr; lbz r6,0(r3)
+#ifdef __BIG_ENDIAN__
sldi r9,r6,8 /* Pad the byte out to 16 bits */
adde r0,r0,r9
-dest; stb r6,0(r4)
+#else
+ adde r0,r0,r6
+#endif
+dstnr; stb r6,0(r4)
.Lcopy_finish:
addze r0,r0 /* add in final carry */
@@ -439,16 +400,44 @@ dest; stb r6,0(r4)
srdi r3,r3,32
blr
-.Lsrc_error:
- cmpdi 0,r7,0
- beqlr
- li r6,-EFAULT
- stw r6,0(r7)
+.Lerror:
+ ld r14,STK_REG(R14)(r1)
+ ld r15,STK_REG(R15)(r1)
+ ld r16,STK_REG(R16)(r1)
+ addi r1,r1,STACKFRAMESIZE
+.Lerror_nr:
+ li r3,0
blr
-.Ldest_error:
- cmpdi 0,r8,0
- beqlr
- li r6,-EFAULT
- stw r6,0(r8)
+EXPORT_SYMBOL(csum_partial_copy_generic)
+
+/*
+ * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ * const struct in6_addr *daddr,
+ * __u32 len, __u8 proto, __wsum sum)
+ */
+
+_GLOBAL(csum_ipv6_magic)
+ ld r8, 0(r3)
+ ld r9, 8(r3)
+ add r5, r5, r6
+ addc r0, r8, r9
+ ld r10, 0(r4)
+ ld r11, 8(r4)
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+ rotldi r5, r5, 8
+#endif
+ adde r0, r0, r10
+ add r5, r5, r7
+ adde r0, r0, r11
+ adde r0, r0, r5
+ addze r0, r0
+ rotldi r3, r0, 32 /* fold two 32 bit halves together */
+ add r3, r0, r3
+ srdi r0, r3, 32
+ rotlwi r3, r0, 16 /* fold two 16 bit halves together */
+ add r3, r0, r3
+ not r3, r3
+ rlwinm r3, r3, 16, 16, 31
blr
+EXPORT_SYMBOL(csum_ipv6_magic)