summaryrefslogtreecommitdiff
path: root/arch/parisc/lib/lusercopy.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/parisc/lib/lusercopy.S')
-rw-r--r--arch/parisc/lib/lusercopy.S395
1 files changed, 309 insertions, 86 deletions
diff --git a/arch/parisc/lib/lusercopy.S b/arch/parisc/lib/lusercopy.S
index 6f2d9355efe2..b428d29e45fb 100644
--- a/arch/parisc/lib/lusercopy.S
+++ b/arch/parisc/lib/lusercopy.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* User Space Access Routines
*
@@ -5,21 +6,8 @@
* Copyright (C) 2000 Richard Hirst <rhirst with parisc-linux.org>
* Copyright (C) 2001 Matthieu Delahaye <delahaym at esiee.fr>
* Copyright (C) 2003 Randolph Chung <tausq with parisc-linux.org>
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * Copyright (C) 2017 Helge Deller <deller@gmx.de>
+ * Copyright (C) 2017 John David Anglin <dave.anglin@bell.net>
*/
/*
@@ -40,100 +28,335 @@
#include <linux/linkage.h>
/*
- * get_sr gets the appropriate space value into
- * sr1 for kernel/user space access, depending
- * on the flag stored in the task structure.
- */
-
- .macro get_sr
- mfctl %cr30,%r1
- ldw TI_SEGMENT(%r1),%r22
- mfsp %sr3,%r1
- or,<> %r22,%r0,%r0
- copy %r0,%r1
- mtsp %r1,%sr1
- .endm
-
- .macro fixup_branch lbl
- ldil L%\lbl, %r1
- ldo R%\lbl(%r1), %r1
- bv %r0(%r1)
- .endm
-
- /*
* unsigned long lclear_user(void *to, unsigned long n)
*
* Returns 0 for success.
* otherwise, returns number of bytes not transferred.
*/
-ENTRY(lclear_user)
- .proc
- .callinfo NO_CALLS
- .entry
+ENTRY_CFI(lclear_user) /* zero %r25 bytes starting at (%sr3,%r26); %r28 returns the number of bytes left unwritten */
comib,=,n 0,%r25,$lclu_done
- get_sr
$lclu_loop:
addib,<> -1,%r25,$lclu_loop
-1: stbs,ma %r0,1(%sr1,%r26)
+1: stbs,ma %r0,1(%sr3,%r26) /* delay slot of the addib: store one zero byte, post-increment %r26 */
$lclu_done:
bv %r0(%r2)
copy %r25,%r28
- .exit
-ENDPROC(lclear_user)
- .section .fixup,"ax"
-2: fixup_branch $lclu_done
- ldo 1(%r25),%r25
- .previous
+2: b $lclu_done /* fixup path for a fault on the store at 1b */
+ ldo 1(%r25),%r25 /* delay slot: re-count the byte whose store faulted (addib had already decremented %r25) */
+
+ ASM_EXCEPTIONTABLE_ENTRY(1b,2b) /* presumably registers 2b as the fault fixup for 1b; the macro is defined outside this file */
+ENDPROC_CFI(lclear_user)
+
+
+/*
+ * unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
+ *
+ * Inputs:
+ * - sr1 already contains space of source region
+ * - sr2 already contains space of destination region
+ *
+ * Returns:
+ * - number of bytes that could not be copied.
+ * On success, this will be zero.
+ *
+ * This code is based on a C-implementation of a copy routine written by
+ * Randolph Chung, which in turn was derived from the glibc.
+ *
+ * Several strategies are used to get the best performance under various
+ * conditions. In the optimal case, we copy by loops that copy 32- or 16-bytes
+ * at a time using general registers. Unaligned copies are handled either by
+ * aligning the destination and then using shift-and-write method, or in a few
+ * cases by falling back to a byte-at-a-time copy.
+ *
+ * Testing with various alignments and buffer sizes shows that this code is
+ * often >10x faster than a simple byte-at-a-time copy, even for strangely
+ * aligned operands. It is interesting to note that the glibc version of memcpy
+ * (written in C) is actually quite fast already. This routine is able to beat
+ * it by 30-40% for aligned copies because of the loop unrolling, but in some
+ * cases the glibc version is still slightly faster. This lends more
+ * credibility to the idea that gcc generates very good code if we are careful.
+ *
+ * Possible optimizations:
+ * - add cache prefetching
+ * - try not to use the post-increment address modifiers; they may create
+ * additional interlocks. Assumption is that those were only efficient on old
+ * machines (pre PA8000 processors)
+ */
+
+ dst = arg0 /* destination address (dstp), advanced as bytes are stored */
+ src = arg1 /* source address (srcp), advanced as bytes are loaded */
+ len = arg2 /* number of bytes still to copy */
+ end = arg3 /* scratch: set to dst + len on entry to pa_memcpy */
+ t1 = r19 /* t1-t4: temporaries holding data in flight */
+ t2 = r20
+ t3 = r21
+ t4 = r22
+ srcspc = sr1 /* space register used for every source access */
+ dstspc = sr2 /* space register used for every destination access */
+
+ t0 = r1 /* scratch for alignment tests, loop limits and shifted words */
+ a1 = t1 /* a0-a3: second names for t1-t4, used by the unaligned shift-and-store path */
+ a2 = t2
+ a3 = t3
+ a0 = t4
+
+ save_src = ret0 /* src/dst/len snapshot taken at .Lcopy_dstaligned; ret0 is overwritten with the result at .Lcopy_done */
+ save_dst = ret1
+ save_len = r31
+
+ENTRY_CFI(pa_memcpy) /* copy len bytes from (srcspc,src) to (dstspc,dst); ret0 returns the number of bytes not copied */
+ /* end = dst + len, one past the last destination byte; .Lcopy_done subtracts dst from it */
+ add dst,len,end
+
+ /* short copy with less than 16 bytes? Then go straight to the byte loop. */
+ cmpib,COND(>>=),n 15,len,.Lbyte_loop
+
+ /* same alignment? (do the low two address bits of src and dst agree?) */
+ xor src,dst,t0
+ extru t0,31,2,t1 /* t1 = low two bits of (src ^ dst) */
+ cmpib,<>,n 0,t1,.Lunaligned_copy
+
+#ifdef CONFIG_64BIT
+ /* only do 64-bit copies if src and dst can become 8-byte aligned together (low three bits of src ^ dst are zero). */
+ extru t0,31,3,t1
+ cmpib,<>,n 0,t1,.Lalign_loop32
+
+ /* copy single bytes until dst is 64-bit aligned */
+.Lalign_loop64:
+ extru dst,31,3,t1
+ cmpib,=,n 0,t1,.Lcopy_loop_16_start
+20: ldb,ma 1(srcspc,src),t1
+21: stb,ma t1,1(dstspc,dst)
+ b .Lalign_loop64
+ ldo -1(len),len /* branch delay slot: one byte fewer to copy */
- .section __ex_table,"aw"
- ASM_ULONG_INSN 1b,2b
- .previous
+ ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
- .procend
+.Lcopy_loop_16_start:
+ ldi 31,t0 /* loop limit: the 32-byte loop runs only while len > 31 */
+.Lcopy_loop_16:
+ cmpb,COND(>>=),n t0,len,.Lword_loop
+
+10: ldd 0(srcspc,src),t1 /* each pass moves 32 bytes as two 16-byte halves */
+11: ldd 8(srcspc,src),t2
+ ldo 16(src),src
+12: std,ma t1,8(dstspc,dst)
+13: std,ma t2,8(dstspc,dst)
+14: ldd 0(srcspc,src),t1
+15: ldd 8(srcspc,src),t2
+ ldo 16(src),src
+16: std,ma t1,8(dstspc,dst)
+17: std,ma t2,8(dstspc,dst)
+
+ ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy16_fault) /* second load of a pair faulted: .Lcopy16_fault still stores t1 from the first load */
+ ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy16_fault)
+ ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)
+
+ b .Lcopy_loop_16
+ ldo -32(len),len /* branch delay slot: 32 bytes were copied in this pass */
+
+.Lword_loop: /* copy 32-bit words until 3 or fewer bytes remain */
+ cmpib,COND(>>=),n 3,len,.Lbyte_loop
+20: ldw,ma 4(srcspc,src),t1
+21: stw,ma t1,4(dstspc,dst)
+ b .Lword_loop
+ ldo -4(len),len /* branch delay slot */
+
+ ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
+
+#endif /* CONFIG_64BIT */
+
+ /* copy single bytes until dst is 32-bit aligned */
+.Lalign_loop32:
+ extru dst,31,2,t1
+ cmpib,=,n 0,t1,.Lcopy_loop_8
+20: ldb,ma 1(srcspc,src),t1
+21: stb,ma t1,1(dstspc,dst)
+ b .Lalign_loop32
+ ldo -1(len),len /* branch delay slot */
+
+ ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
+
+
+.Lcopy_loop_8: /* copy 16 bytes per pass as four 32-bit words; leave when 15 or fewer bytes remain */
+ cmpib,COND(>>=),n 15,len,.Lbyte_loop
+
+10: ldw 0(srcspc,src),t1
+11: ldw 4(srcspc,src),t2
+12: stw,ma t1,4(dstspc,dst)
+13: stw,ma t2,4(dstspc,dst)
+14: ldw 8(srcspc,src),t1
+15: ldw 12(srcspc,src),t2
+ ldo 16(src),src /* src advances once per pass; the loads above use fixed offsets 0..12 */
+16: stw,ma t1,4(dstspc,dst)
+17: stw,ma t2,4(dstspc,dst)
+
+ ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy8_fault) /* second load of a pair faulted: .Lcopy8_fault still stores t1 from the first load */
+ ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy8_fault)
+ ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)
+
+ b .Lcopy_loop_8
+ ldo -16(len),len /* branch delay slot: 16 bytes were copied in this pass */
+
+.Lbyte_loop: /* copy the remaining len bytes one at a time */
+ cmpclr,COND(<>) len,%r0,%r0 /* len != 0 nullifies the branch below, so we leave only when len == 0 */
+ b,n .Lcopy_done
+20: ldb 0(srcspc,src),t1
+ ldo 1(src),src
+21: stb,ma t1,1(dstspc,dst)
+ b .Lbyte_loop
+ ldo -1(len),len /* branch delay slot */
+
+ ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
+
+.Lcopy_done: /* common exit, also the target of most fault fixups */
+ bv %r0(%r2)
+ sub end,dst,ret0 /* delay slot: ret0 = end - dst = bytes not copied (0 on success) */
+
+
+ /* src and dst are not aligned the same way. */
+ /* need to go the hard way */
+.Lunaligned_copy:
+ /* copy single bytes until dst is 32bit-word-aligned */
+ extru dst,31,2,t1
+ cmpib,=,n 0,t1,.Lcopy_dstaligned
+20: ldb 0(srcspc,src),t1
+ ldo 1(src),src
+21: stb,ma t1,1(dstspc,dst)
+ b .Lunaligned_copy
+ ldo -1(len),len /* branch delay slot */
+
+ ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
+ ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
+
+.Lcopy_dstaligned:
+
+ /* save src, dst and len; .Lcda_finish recomputes the byte-granular state from them */
+ copy src,save_src
+ copy dst,save_dst
+ copy len,save_len
+
+ /* len is converted from a byte count to the number of whole 32-bit words to copy */
+ SHRREG len,2,len
/*
- * long lstrnlen_user(char *s, long n)
- *
- * Returns 0 if exception before zero byte or reaching N,
- * N+1 if N would be exceeded,
- * else strlen + 1 (i.e. includes zero byte).
+ * Copy from a not-aligned src to an aligned dst using shifts.
+ * Handles 4 words per loop.
 */
-ENTRY(lstrnlen_user)
- .proc
- .callinfo NO_CALLS
- .entry
- comib,= 0,%r25,$lslen_nzero
- copy %r26,%r24
- get_sr
-1: ldbs,ma 1(%sr1,%r26),%r1
-$lslen_loop:
- comib,=,n 0,%r1,$lslen_done
- addib,<> -1,%r25,$lslen_loop
-2: ldbs,ma 1(%sr1,%r26),%r1
-$lslen_done:
- bv %r0(%r2)
- sub %r26,%r24,%r28
- .exit
+ depw,z src,28,2,t0 /* t0 = (src & 3) * 8: misalignment of src in bits */
+ subi 32,t0,t0 /* t0 = 32 - misalignment in bits */
+ mtsar t0 /* shift amount used by every shrpw below */
+ extru len,31,2,t0 /* t0 = word count modulo 4; selects the entry point into the unrolled loop */
+ cmpib,= 2,t0,.Lcase2
+ /* Make src aligned by rounding it down (this is the delay slot of the cmpib above, so it runs on every path). */
+ depi 0,31,2,src
+
+ cmpiclr,<> 3,t0,%r0 /* t0 != 3 nullifies the branch to .Lcase3 */
+ b,n .Lcase3
+ cmpiclr,<> 1,t0,%r0 /* t0 != 1 nullifies the branch to .Lcase1 */
+ b,n .Lcase1
+.Lcase0: /* word count is a multiple of four */
+ cmpb,COND(=) %r0,len,.Lcda_finish /* no whole words to copy: go finish with the byte loop */
+ nop
+
+1: ldw,ma 4(srcspc,src), a3
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
+1: ldw,ma 4(srcspc,src), a0
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
+ b,n .Ldo3
+.Lcase1: /* word count modulo 4 is one */
+1: ldw,ma 4(srcspc,src), a2
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
+1: ldw,ma 4(srcspc,src), a3
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
+ ldo -1(len),len
+ cmpb,COND(=),n %r0,len,.Ldo0 /* exactly one word: only the final store at .Ldo0 is needed */
+.Ldo4: /* each .LdoN stage loads the next source word and stores one word built from the previous two */
+1: ldw,ma 4(srcspc,src), a0
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
+ shrpw a2, a3, %sar, t0
+1: stw,ma t0, 4(dstspc,dst)
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
+.Ldo3:
+1: ldw,ma 4(srcspc,src), a1
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
+ shrpw a3, a0, %sar, t0
+1: stw,ma t0, 4(dstspc,dst)
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
+.Ldo2:
+1: ldw,ma 4(srcspc,src), a2
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
+ shrpw a0, a1, %sar, t0
+1: stw,ma t0, 4(dstspc,dst)
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
+.Ldo1:
+1: ldw,ma 4(srcspc,src), a3
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
+ shrpw a1, a2, %sar, t0
+1: stw,ma t0, 4(dstspc,dst)
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
+ ldo -4(len),len
+ cmpb,COND(<>) %r0,len,.Ldo4 /* more words left: run another pass */
+ nop
+.Ldo0: /* store the last word, built from the two words already loaded */
+ shrpw a2, a3, %sar, t0
+1: stw,ma t0, 4(dstspc,dst)
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
+
+.Lcda_rdfault: /* a load in the shift path faulted; shares the code of .Lcda_finish */
+.Lcda_finish:
+ /* calculate new src, dst and len from the saved values and the bytes stored so far, then jump to byte-copy loop */
+ sub dst,save_dst,t0 /* t0 = bytes stored since .Lcopy_dstaligned */
+ add save_src,t0,src
+ b .Lbyte_loop
+ sub save_len,t0,len /* branch delay slot: len = bytes still to copy */
-$lslen_nzero:
- b $lslen_done
- ldo 1(%r26),%r26 /* special case for N == 0 */
-ENDPROC(lstrnlen_user)
+.Lcase3: /* word count modulo 4 is three */
+1: ldw,ma 4(srcspc,src), a0
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
+1: ldw,ma 4(srcspc,src), a1
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
+ b .Ldo2
+ ldo 1(len),len /* branch delay slot: round the word count up to a multiple of four for the check after .Ldo1 */
+.Lcase2: /* word count modulo 4 is two */
+1: ldw,ma 4(srcspc,src), a1
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
+1: ldw,ma 4(srcspc,src), a2
+ ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
+ b .Ldo1
+ ldo 2(len),len /* branch delay slot: round the word count up to a multiple of four for the check after .Ldo1 */
- .section .fixup,"ax"
-3: fixup_branch $lslen_done
- copy %r24,%r26 /* reset r26 so 0 is returned on fault */
- .previous
- .section __ex_table,"aw"
- ASM_ULONG_INSN 1b,3b
- ASM_ULONG_INSN 2b,3b
- .previous
+ /* fault exception fixup handlers: reached when the second load of a pair faults; they store the first, already loaded value before exiting */
+#ifdef CONFIG_64BIT
+.Lcopy16_fault:
+ b .Lcopy_done
+10: std,ma t1,8(dstspc,dst) /* branch delay slot: store the doubleword that was loaded successfully */
+ ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
+#endif
- .procend
+.Lcopy8_fault:
+ b .Lcopy_done
+10: stw,ma t1,4(dstspc,dst) /* branch delay slot: store the word that was loaded successfully */
+ ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
+ENDPROC_CFI(pa_memcpy)
.end