author    Ard Biesheuvel <ardb@kernel.org>  2022-10-20 15:54:33 +0200
committer Ard Biesheuvel <ardb@kernel.org>  2023-09-11 08:13:17 +0000
commit    cf8e8658100d4eae80ce9b21f7a81cb024dd5057 (patch)
tree      31d3b640bebf97c33d354768fc44dfd532c2df81 /arch/ia64/lib
parent    a0334bf78b95532cec54f56b53e8ae1bfe7e1ca1 (diff)
arch: Remove Itanium (IA-64) architecture
The Itanium architecture is obsolete, and an informal survey [0] reveals
that any residual use of Itanium hardware in production is mostly HP-UX
or OpenVMS based. The use of Linux on Itanium appears to be limited to
enthusiasts that occasionally boot a fresh Linux kernel to see whether
things are still working as intended, and perhaps to churn out some
distro packages that are rarely used in practice.

None of the original companies behind Itanium still produce or support
any hardware or software for the architecture, and it is listed as
'Orphaned' in the MAINTAINERS file, as apparently, none of the engineers
that contributed on behalf of those companies (nor anyone else, for that
matter) have been willing to support or maintain the architecture
upstream or even be responsible for applying the odd fix. The Intel
firmware team removed all IA-64 support from the Tianocore/EDK2
reference implementation of EFI in 2018. (Itanium is the original
architecture for which EFI was developed, and the way Linux supports it
deviates significantly from other architectures.) Some distros, such as
Debian and Gentoo, still maintain [unofficial] ia64 ports, but many
dropped support years ago.

While the argument is being made [1] that there is a 'for the common
good' angle to being able to build and run existing projects such as
the Grid Community Toolkit [2] on Itanium for interoperability testing,
the fact remains that none of those projects are known to be deployed
on Linux/ia64, and very few people actually have access to such a
system in the first place. Even if there were imaginable ways in which
Linux/ia64 could be put to good use today, what matters is whether
anyone is actually doing that, and this does not appear to be the case.

There are no emulators widely available, so boot testing Itanium is
generally infeasible for ordinary contributors. GCC still supports
IA-64, but its compile farm [3] no longer has any IA-64 machines. GLIBC
would like to get rid of IA-64 [4] too, because it would permit some
overdue code cleanups. In summary, the benefits to the ecosystem of
having IA-64 be part of it are mostly theoretical, whereas the
maintenance overhead of keeping it supported is real.

So let's rip off the band-aid, and remove the IA-64 arch code entirely.
This follows the timeline proposed by the Debian/ia64 maintainer [5],
which removes support in a controlled manner, leaving IA-64 in a known
good state in the most recent LTS release. Other projects will follow
once the kernel support is removed.

[0] https://lore.kernel.org/all/CAMj1kXFCMh_578jniKpUtx_j8ByHnt=s7S+yQ+vGbKt9ud7+kQ@mail.gmail.com/
[1] https://lore.kernel.org/all/0075883c-7c51-00f5-2c2d-5119c1820410@web.de/
[2] https://gridcf.org/gct-docs/latest/index.html
[3] https://cfarm.tetaneutral.net/machines/list/
[4] https://lore.kernel.org/all/87bkiilpc4.fsf@mid.deneb.enyo.de/
[5] https://lore.kernel.org/all/ff58a3e76e5102c94bb5946d99187b358def688a.camel@physik.fu-berlin.de/

Acked-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Diffstat (limited to 'arch/ia64/lib')
-rw-r--r--  arch/ia64/lib/Makefile                48
-rw-r--r--  arch/ia64/lib/checksum.c             102
-rw-r--r--  arch/ia64/lib/clear_page.S            79
-rw-r--r--  arch/ia64/lib/clear_user.S           212
-rw-r--r--  arch/ia64/lib/copy_page.S            101
-rw-r--r--  arch/ia64/lib/copy_page_mck.S        188
-rw-r--r--  arch/ia64/lib/copy_user.S            613
-rw-r--r--  arch/ia64/lib/csum_partial_copy.c     98
-rw-r--r--  arch/ia64/lib/do_csum.S              324
-rw-r--r--  arch/ia64/lib/flush.S                119
-rw-r--r--  arch/ia64/lib/idiv32.S                86
-rw-r--r--  arch/ia64/lib/idiv64.S                83
-rw-r--r--  arch/ia64/lib/io.c                    51
-rw-r--r--  arch/ia64/lib/ip_fast_csum.S         148
-rw-r--r--  arch/ia64/lib/memcpy.S               304
-rw-r--r--  arch/ia64/lib/memcpy_mck.S           659
-rw-r--r--  arch/ia64/lib/memset.S               365
-rw-r--r--  arch/ia64/lib/strlen.S               195
-rw-r--r--  arch/ia64/lib/strncpy_from_user.S     47
-rw-r--r--  arch/ia64/lib/strnlen_user.S          48
-rw-r--r--  arch/ia64/lib/xor.S                  181
21 files changed, 0 insertions, 4051 deletions
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
deleted file mode 100644
index 081fcba01dc0..000000000000
--- a/arch/ia64/lib/Makefile
+++ /dev/null
@@ -1,48 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for ia64-specific library routines.
-#
-
-lib-y := io.o __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
- __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \
- checksum.o clear_page.o csum_partial_copy.o \
- clear_user.o strncpy_from_user.o strnlen_user.o \
- flush.o ip_fast_csum.o do_csum.o \
- memset.o strlen.o xor.o
-
-lib-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o
-lib-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o
-
-AFLAGS___divdi3.o =
-AFLAGS___udivdi3.o = -DUNSIGNED
-AFLAGS___moddi3.o = -DMODULO
-AFLAGS___umoddi3.o = -DUNSIGNED -DMODULO
-
-AFLAGS___divsi3.o =
-AFLAGS___udivsi3.o = -DUNSIGNED
-AFLAGS___modsi3.o = -DMODULO
-AFLAGS___umodsi3.o = -DUNSIGNED -DMODULO
-
-$(obj)/__divdi3.o: $(src)/idiv64.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__udivdi3.o: $(src)/idiv64.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__moddi3.o: $(src)/idiv64.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__umoddi3.o: $(src)/idiv64.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__divsi3.o: $(src)/idiv32.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__udivsi3.o: $(src)/idiv32.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__modsi3.o: $(src)/idiv32.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__umodsi3.o: $(src)/idiv32.S FORCE
- $(call if_changed_rule,as_o_S)
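
The AFLAGS_*.o lines and the per-object rules above build eight
division/modulo helpers from just two assembly sources: each object
recompiles idiv64.S or idiv32.S with a different -DUNSIGNED/-DMODULO
combination. A minimal C sketch of the same single-source pattern (the
file and function names here are illustrative, not taken from idiv64.S):

/* div.c: compiled four ways, e.g.
 *   cc -c -o __divdi3.o div.c
 *   cc -c -DUNSIGNED -DMODULO -o __umoddi3.o div.c
 */
#ifdef UNSIGNED
typedef unsigned long long int_t;
#else
typedef long long int_t;
#endif

int_t helper(int_t a, int_t b)
{
#ifdef MODULO
	return a % b;		/* __moddi3 / __umoddi3 flavor */
#else
	return a / b;		/* __divdi3 / __udivdi3 flavor */
#endif
}
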
diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c
deleted file mode 100644
index d26517fe3500..000000000000
--- a/arch/ia64/lib/checksum.c
+++ /dev/null
@@ -1,102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Network checksum routines
- *
- * Copyright (C) 1999, 2003 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * Most of the code comes from arch/alpha/lib/checksum.c
- *
- * This file contains network checksum routines that are better done
- * in an architecture-specific manner for speed.
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/byteorder.h>
-
-static inline unsigned short
-from64to16 (unsigned long x)
-{
- /* add up 32-bit words for 33 bits */
- x = (x & 0xffffffff) + (x >> 32);
- /* add up 16-bit and 17-bit words for 17+c bits */
- x = (x & 0xffff) + (x >> 16);
- /* add up 16-bit and 2-bit for 16+c bit */
- x = (x & 0xffff) + (x >> 16);
- /* add up carry.. */
- x = (x & 0xffff) + (x >> 16);
- return x;
-}
-
-/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented.
- */
-__sum16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
- __u8 proto, __wsum sum)
-{
- return (__force __sum16)~from64to16(
- (__force u64)saddr + (__force u64)daddr +
- (__force u64)sum + ((len + proto) << 8));
-}
-
-EXPORT_SYMBOL(csum_tcpudp_magic);
-
-__wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
- __u8 proto, __wsum sum)
-{
- unsigned long result;
-
- result = (__force u64)saddr + (__force u64)daddr +
- (__force u64)sum + ((len + proto) << 8);
-
- /* Fold down to 32-bits so we don't lose in the typedef-less network stack. */
- /* 64 to 33 */
- result = (result & 0xffffffff) + (result >> 32);
- /* 33 to 32 */
- result = (result & 0xffffffff) + (result >> 32);
- return (__force __wsum)result;
-}
-EXPORT_SYMBOL(csum_tcpudp_nofold);
-
-extern unsigned long do_csum (const unsigned char *, long);
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 32-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
- u64 result = do_csum(buff, len);
-
- /* add in old sum, and carry.. */
- result += (__force u32)sum;
- /* 32+c bits -> 32 bits */
- result = (result & 0xffffffff) + (result >> 32);
- return (__force __wsum)result;
-}
-
-EXPORT_SYMBOL(csum_partial);
-
-/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
- */
-__sum16 ip_compute_csum (const void *buff, int len)
-{
- return (__force __sum16)~do_csum(buff,len);
-}
-
-EXPORT_SYMBOL(ip_compute_csum);
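
The from64to16() folding above never loses a carry: each step adds the
high part of the running sum back into the low part, which preserves the
value modulo 0xffff. A stand-alone sketch (plain C, nothing assumed from
the kernel) that checks the folds against the closed-form
end-around-carry reduction:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* same folding steps as from64to16() above */
static uint16_t fold64to16(uint64_t x)
{
	x = (x & 0xffffffffULL) + (x >> 32);	/* 64 -> 33 bits */
	x = (x & 0xffff) + (x >> 16);		/* 33 -> 17+c bits */
	x = (x & 0xffff) + (x >> 16);		/* -> 16+c bits */
	x = (x & 0xffff) + (x >> 16);		/* absorb the last carry */
	return (uint16_t)x;
}

int main(void)
{
	const uint64_t samples[] = {
		0, 1, 0xffff, 0x10000, 0xfffffffeULL,
		0x123456789abcdef0ULL, ~0ULL,
	};
	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		uint64_t x = samples[i];
		/* sums are congruent mod 0xffff; nonzero multiples fold
		 * to 0xffff rather than 0 */
		uint16_t want = x ? (uint16_t)((x - 1) % 0xffff + 1) : 0;
		assert(fold64to16(x) == want);
	}
	puts("folding matches the mod-0xffff closed form");
	return 0;
}
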
diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S
deleted file mode 100644
index ba0dd2538fa5..000000000000
--- a/arch/ia64/lib/clear_page.S
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1999-2002 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- * David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
- *
- * 1/06/01 davidm Tuned for Itanium.
- * 2/12/02 kchen Tuned for both Itanium and McKinley
- * 3/08/02 davidm Some more tweaking
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#ifdef CONFIG_ITANIUM
-# define L3_LINE_SIZE 64 // Itanium L3 line size
-# define PREFETCH_LINES 9 // magic number
-#else
-# define L3_LINE_SIZE 128 // McKinley L3 line size
-# define PREFETCH_LINES 12 // magic number
-#endif
-
-#define saved_lc r2
-#define dst_fetch r3
-#define dst1 r8
-#define dst2 r9
-#define dst3 r10
-#define dst4 r11
-
-#define dst_last r31
-
-GLOBAL_ENTRY(clear_page)
- .prologue
- .regstk 1,0,0,0
- mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until
- .save ar.lc, saved_lc
- mov saved_lc = ar.lc
-
- .body
- mov ar.lc = (PREFETCH_LINES - 1)
- mov dst_fetch = in0
- adds dst1 = 16, in0
- adds dst2 = 32, in0
- ;;
-.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
- adds dst3 = 48, in0 // executing this multiple times is harmless
- br.cloop.sptk.few .fetch
- ;;
- addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
- mov ar.lc = r16 // one L3 line per iteration
- adds dst4 = 64, in0
- ;;
-#ifdef CONFIG_ITANIUM
- // Optimized for Itanium
-1: stf.spill.nta [dst1] = f0, 64
- stf.spill.nta [dst2] = f0, 64
- cmp.lt p8,p0=dst_fetch, dst_last
- ;;
-#else
- // Optimized for McKinley
-1: stf.spill.nta [dst1] = f0, 64
- stf.spill.nta [dst2] = f0, 64
- stf.spill.nta [dst3] = f0, 64
- stf.spill.nta [dst4] = f0, 128
- cmp.lt p8,p0=dst_fetch, dst_last
- ;;
- stf.spill.nta [dst1] = f0, 64
- stf.spill.nta [dst2] = f0, 64
-#endif
- stf.spill.nta [dst3] = f0, 64
-(p8) stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
- br.cloop.sptk.few 1b
- ;;
- mov ar.lc = saved_lc // restore lc
- br.ret.sptk.many rp
-END(clear_page)
-EXPORT_SYMBOL(clear_page)
diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S
deleted file mode 100644
index 1d9e45ccf8e5..000000000000
--- a/arch/ia64/lib/clear_user.S
+++ /dev/null
@@ -1,212 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This routine clears to zero a linear memory buffer in user space.
- *
- * Inputs:
- * in0: address of buffer
- * in1: length of buffer in bytes
- * Outputs:
- * r8: number of bytes that didn't get cleared due to a fault
- *
- * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-// arguments
-//
-#define buf r32
-#define len r33
-
-//
-// local registers
-//
-#define cnt r16
-#define buf2 r17
-#define saved_lc r18
-#define saved_pfs r19
-#define tmp r20
-#define len2 r21
-#define len3 r22
-
-//
-// Theory of operations:
-// - we check whether or not the buffer is small, i.e., less than 17
-// bytes, in which case we do the byte by byte loop.
-//
-// - Otherwise we go progressively from 1-byte stores to 8-byte stores in
-// the head part, the body is a 16-byte store loop, and we finish with the
-// tail for the last 15 bytes.
-// The good point about this breakdown is that the long buffer handling
-// contains only 2 branches.
-//
-// The reason for not using shifting & masking for both the head and the
-// tail is to stay semantically correct. This routine is not supposed
-// to write bytes outside of the buffer. While most of the time this would
-// be ok, we can't tolerate a mistake. A classical example is the case
-// of multithreaded code where the extra bytes touched are actually owned
-// by another thread which runs concurrently to ours. Another, less likely,
-// example is with device drivers where reading an I/O mapped location may
-// have side effects (same thing for writing).
-//
-
-GLOBAL_ENTRY(__do_clear_user)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,2,0,0,0
- cmp.eq p6,p0=r0,len // check for zero length
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc // preserve ar.lc (slow)
- .body
- ;; // avoid WAW on CFM
- adds tmp=-1,len // br.ctop is repeat/until
- mov ret0=len // return value is length at this point
-(p6) br.ret.spnt.many rp
- ;;
- cmp.lt p6,p0=16,len // if len > 16 then long memset
- mov ar.lc=tmp // initialize lc for small count
-(p6) br.cond.dptk .long_do_clear
- ;; // WAR on ar.lc
- //
- // worst case 16 iterations, avg 8 iterations
- //
-	// We could have played with the predicates to use the extra
-	// M slot for 2 stores/iteration, but the cost of initializing
-	// the various counters, compared to how long the loop is supposed
-	// to last on average, does not make this solution viable.
- //
-1:
- EX( .Lexit1, st1 [buf]=r0,1 )
- adds len=-1,len // countdown length using len
- br.cloop.dptk 1b
- ;; // avoid RAW on ar.lc
- //
	// .Lexit1: comes from the byte by byte loop
- // len contains bytes left
-.Lexit1:
- mov ret0=len // faster than using ar.lc
- mov ar.lc=saved_lc
- br.ret.sptk.many rp // end of short clear_user
-
-
- //
- // At this point we know we have more than 16 bytes to copy
- // so we focus on alignment (no branches required)
- //
- // The use of len/len2 for countdown of the number of bytes left
- // instead of ret0 is due to the fact that the exception code
- // changes the values of r8.
- //
-.long_do_clear:
- tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear)
- ;;
- EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned
-(p6) adds len=-1,len;; // sync because buf is modified
- tbit.nz p6,p0=buf,1
- ;;
- EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned
-(p6) adds len=-2,len;;
- tbit.nz p6,p0=buf,2
- ;;
- EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned
-(p6) adds len=-4,len;;
- tbit.nz p6,p0=buf,3
- ;;
- EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned
-(p6) adds len=-8,len;;
- shr.u cnt=len,4 // number of 128-bit (2x64bit) words
- ;;
- cmp.eq p6,p0=r0,cnt
- adds tmp=-1,cnt
-(p6) br.cond.dpnt .dotail // we have less than 16 bytes left
- ;;
- adds buf2=8,buf // setup second base pointer
- mov ar.lc=tmp
- ;;
-
- //
- // 16bytes/iteration core loop
- //
- // The second store can never generate a fault because
- // we come into the loop only when we are 16-byte aligned.
- // This means that if we cross a page then it will always be
- // in the first store and never in the second.
- //
- //
	// We need to keep track of the remaining length. A possible (optimistic)
	// way would be to use ar.lc and derive how many bytes were left by
	// computing left = 16*ar.lc + 16. This would avoid the addition at
	// every iteration.
	// However we need to keep the synchronization point. A template
	// M;;MB does not exist and thus we can keep the addition at no
	// extra cycle cost (it uses a nop slot anyway). It also simplifies the
	// (unlikely) error recovery code.
- //
-
-2: EX(.Lexit3, st8 [buf]=r0,16 )
- ;; // needed to get len correct when error
- st8 [buf2]=r0,16
- adds len=-16,len
- br.cloop.dptk 2b
- ;;
- mov ar.lc=saved_lc
- //
- // tail correction based on len only
- //
- // We alternate the use of len3,len2 to allow parallelism and correct
- // error handling. We also reuse p6/p7 to return correct value.
- // The addition of len2/len3 does not cost anything more compared to
- // the regular memset as we had empty slots.
- //
-.dotail:
- mov len2=len // for parallelization of error handling
- mov len3=len
- tbit.nz p6,p0=len,3
- ;;
- EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes
-(p6) adds len3=-8,len2
- tbit.nz p7,p6=len,2
- ;;
- EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes
-(p7) adds len2=-4,len3
- tbit.nz p6,p7=len,1
- ;;
- EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes
-(p6) adds len3=-2,len2
- tbit.nz p7,p6=len,0
- ;;
- EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left
- mov ret0=r0 // success
- br.ret.sptk.many rp // end of most likely path
-
- //
- // Outlined error handling code
- //
-
- //
- // .Lexit3: comes from core loop, need restore pr/lc
- // len contains bytes left
- //
- //
- // .Lexit2:
- // if p6 -> coming from st8 or st2 : len2 contains what's left
- // if p7 -> coming from st4 or st1 : len3 contains what's left
	// We must restore lc/pr even though they might not have been used.
-.Lexit2:
- .pred.rel "mutex", p6, p7
-(p6) mov len=len2
-(p7) mov len=len3
- ;;
- //
	// .Lexit3: comes from head, need not restore pr/lc
- // len contains bytes left
- //
-.Lexit3:
- mov ret0=len
- mov ar.lc=saved_lc
- br.ret.sptk.many rp
-END(__do_clear_user)
-EXPORT_SYMBOL(__do_clear_user)
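
The head/body/tail scheme described in the theory-of-operations comment
maps to roughly the following C. This is only a sketch: the real routine
also software-pipelines the body as two 8-byte stores per iteration and
must report how many bytes survive a fault. What it shares with the
assembly is the key property of never storing outside [buf, buf+len):

#include <stddef.h>
#include <stdint.h>

static void clear_exact(void *buf, size_t len)
{
	uint8_t *p = buf;

	if (len < 16) {				/* short case: byte loop */
		while (len--)
			*p++ = 0;
		return;
	}
	/* head: widen the stores only once the address is aligned */
	if ((uintptr_t)p & 1) { *p = 0;             p += 1; len -= 1; }
	if ((uintptr_t)p & 2) { *(uint16_t *)p = 0; p += 2; len -= 2; }
	if ((uintptr_t)p & 4) { *(uint32_t *)p = 0; p += 4; len -= 4; }
	/* body: wide stores */
	while (len >= 8)      { *(uint64_t *)p = 0; p += 8; len -= 8; }
	/* tail: finish byte-exactly */
	while (len--)
		*p++ = 0;
}
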
diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S
deleted file mode 100644
index c0a0e6b2af00..000000000000
--- a/arch/ia64/lib/copy_page.S
+++ /dev/null
@@ -1,101 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard copy_page() function
- *
- * Inputs:
- * in0: address of target page
- * in1: address of source page
- * Output:
- * no return value
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- * David Mosberger <davidm@hpl.hp.com>
- *
- * 4/06/01 davidm Tuned to make it perform well both for cached and uncached copies.
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define PIPE_DEPTH 3
-#define EPI p[PIPE_DEPTH-1]
-
-#define lcount r16
-#define saved_pr r17
-#define saved_lc r18
-#define saved_pfs r19
-#define src1 r20
-#define src2 r21
-#define tgt1 r22
-#define tgt2 r23
-#define srcf r24
-#define tgtf r25
-#define tgt_last r26
-
-#define Nrot ((8*PIPE_DEPTH+7)&~7)
-
-GLOBAL_ENTRY(copy_page)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
-
- .rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
- t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
- .rotp p[PIPE_DEPTH]
-
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc
- mov ar.ec=PIPE_DEPTH
-
- mov lcount=PAGE_SIZE/64-1
- .save pr, saved_pr
- mov saved_pr=pr
- mov pr.rot=1<<16
-
- .body
-
- mov src1=in1
- adds src2=8,in1
- mov tgt_last = PAGE_SIZE
- ;;
- adds tgt2=8,in0
- add srcf=512,in1
- mov ar.lc=lcount
- mov tgt1=in0
- add tgtf=512,in0
- add tgt_last = tgt_last, in0
- ;;
-1:
-(p[0]) ld8 t1[0]=[src1],16
-(EPI) st8 [tgt1]=t1[PIPE_DEPTH-1],16
-(p[0]) ld8 t2[0]=[src2],16
-(EPI) st8 [tgt2]=t2[PIPE_DEPTH-1],16
- cmp.ltu p6,p0 = tgtf, tgt_last
- ;;
-(p[0]) ld8 t3[0]=[src1],16
-(EPI) st8 [tgt1]=t3[PIPE_DEPTH-1],16
-(p[0]) ld8 t4[0]=[src2],16
-(EPI) st8 [tgt2]=t4[PIPE_DEPTH-1],16
- ;;
-(p[0]) ld8 t5[0]=[src1],16
-(EPI) st8 [tgt1]=t5[PIPE_DEPTH-1],16
-(p[0]) ld8 t6[0]=[src2],16
-(EPI) st8 [tgt2]=t6[PIPE_DEPTH-1],16
- ;;
-(p[0]) ld8 t7[0]=[src1],16
-(EPI) st8 [tgt1]=t7[PIPE_DEPTH-1],16
-(p[0]) ld8 t8[0]=[src2],16
-(EPI) st8 [tgt2]=t8[PIPE_DEPTH-1],16
-
-(p6) lfetch [srcf], 64
-(p6) lfetch [tgtf], 64
- br.ctop.sptk.few 1b
- ;;
- mov pr=saved_pr,0xffffffffffff0000 // restore predicates
- mov ar.pfs=saved_pfs
- mov ar.lc=saved_lc
- br.ret.sptk.many rp
-END(copy_page)
-EXPORT_SYMBOL(copy_page)
diff --git a/arch/ia64/lib/copy_page_mck.S b/arch/ia64/lib/copy_page_mck.S
deleted file mode 100644
index 5e8bb4b4b535..000000000000
--- a/arch/ia64/lib/copy_page_mck.S
+++ /dev/null
@@ -1,188 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * McKinley-optimized version of copy_page().
- *
- * Copyright (C) 2002 Hewlett-Packard Co
- * David Mosberger <davidm@hpl.hp.com>
- *
- * Inputs:
- * in0: address of target page
- * in1: address of source page
- * Output:
- * no return value
- *
- * General idea:
- * - use regular loads and stores to prefetch data to avoid consuming M-slot just for
- * lfetches => good for in-cache performance
- * - avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
- * cycle
- *
- * Principle of operation:
- * First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
- * To avoid secondary misses in L2, we prefetch both source and destination with a line-size
- * of 128 bytes. When both of these lines are in the L2 and the first half of the
- * source line is in L1, we start copying the remaining words. The second half of the
- * source line is prefetched in an earlier iteration, so that by the time we start
- * accessing it, it's also present in the L1.
- *
- * We use a software-pipelined loop to control the overall operation. The pipeline
- * has 2*PREFETCH_DIST+K stages. The first PREFETCH_DIST stages are used for prefetching
- * source cache-lines. The second PREFETCH_DIST stages are used for prefetching destination
- * cache-lines, the last K stages are used to copy the cache-line words not copied by
- * the prefetches. The four relevant points in the pipeline are called A, B, C, D:
- * p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
- * should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
- * into L1D and p[D] is TRUE if a cacheline needs to be copied.
- *
- * This all sounds very complicated, but thanks to the modulo-scheduled loop support,
- * the resulting code is very regular and quite easy to follow (once you get the idea).
- *
- * As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
- * as the separate .prefetch_loop. Logically, this loop performs exactly like the
- * main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
- * so that each loop iteration is faster (again, good for cached case).
- *
- * When reading the code, it helps to keep the following picture in mind:
- *
- * word 0 word 1
- * +------+------+---
- * | v[x] | t1 | ^
- * | t2 | t3 | |
- * | t4 | t5 | |
- * | t6 | t7 | | 128 bytes
- * | n[y] | t9 | | (L2 cache line)
- * | t10 | t11 | |
- * | t12 | t13 | |
- * | t14 | t15 | v
- * +------+------+---
- *
- * Here, v[x] is copied by the (memory) prefetch. n[y] is loaded at p[C]
- * to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
- * an order that avoids bank conflicts.
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define PREFETCH_DIST 8 // McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
-
-#define src0 r2
-#define src1 r3
-#define dst0 r9
-#define dst1 r10
-#define src_pre_mem r11
-#define dst_pre_mem r14
-#define src_pre_l2 r15
-#define dst_pre_l2 r16
-#define t1 r17
-#define t2 r18
-#define t3 r19
-#define t4 r20
-#define t5 t1 // alias!
-#define t6 t2 // alias!
-#define t7 t3 // alias!
-#define t9 t5 // alias!
-#define t10 t4 // alias!
-#define t11 t7 // alias!
-#define t12 t6 // alias!
-#define t14 t10 // alias!
-#define t13 r21
-#define t15 r22
-
-#define saved_lc r23
-#define saved_pr r24
-
-#define A 0
-#define B (PREFETCH_DIST)
-#define C (B + PREFETCH_DIST)
-#define D (C + 3)
-#define N (D + 1)
-#define Nrot ((N + 7) & ~7)
-
-GLOBAL_ENTRY(copy_page)
- .prologue
- alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
-
- .rotr v[2*PREFETCH_DIST], n[D-C+1]
- .rotp p[N]
-
- .save ar.lc, saved_lc
- mov saved_lc = ar.lc
- .save pr, saved_pr
- mov saved_pr = pr
- .body
-
- mov src_pre_mem = in1
- mov pr.rot = 0x10000
- mov ar.ec = 1 // special unrolled loop
-
- mov dst_pre_mem = in0
- mov ar.lc = 2*PREFETCH_DIST - 1
-
- add src_pre_l2 = 8*8, in1
- add dst_pre_l2 = 8*8, in0
- add src0 = 8, in1 // first t1 src
- add src1 = 3*8, in1 // first t3 src
- add dst0 = 8, in0 // first t1 dst
- add dst1 = 3*8, in0 // first t3 dst
- mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
- nop.m 0
- nop.i 0
- ;;
- // same as .line_copy loop, but with all predicated-off instructions removed:
-.prefetch_loop:
-(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0
-(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2
- br.ctop.sptk .prefetch_loop
- ;;
- cmp.eq p16, p0 = r0, r0 // reset p16 to 1 (br.ctop cleared it to zero)
- mov ar.lc = t1 // with 64KB pages, t1 is too big to fit in 8 bits!
- mov ar.ec = N // # of stages in pipeline
- ;;
-.line_copy:
-(p[D]) ld8 t2 = [src0], 3*8 // M0
-(p[D]) ld8 t4 = [src1], 3*8 // M1
-(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 prefetch dst from memory
-(p[D]) st8 [dst_pre_l2] = n[D-C], 128 // M3 prefetch dst from L2
- ;;
-(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 prefetch src from memory
-(p[C]) ld8 n[0] = [src_pre_l2], 128 // M1 prefetch src from L2
-(p[D]) st8 [dst0] = t1, 8 // M2
-(p[D]) st8 [dst1] = t3, 8 // M3
- ;;
-(p[D]) ld8 t5 = [src0], 8
-(p[D]) ld8 t7 = [src1], 3*8
-(p[D]) st8 [dst0] = t2, 3*8
-(p[D]) st8 [dst1] = t4, 3*8
- ;;
-(p[D]) ld8 t6 = [src0], 3*8
-(p[D]) ld8 t10 = [src1], 8
-(p[D]) st8 [dst0] = t5, 8
-(p[D]) st8 [dst1] = t7, 3*8
- ;;
-(p[D]) ld8 t9 = [src0], 3*8
-(p[D]) ld8 t11 = [src1], 3*8
-(p[D]) st8 [dst0] = t6, 3*8
-(p[D]) st8 [dst1] = t10, 8
- ;;
-(p[D]) ld8 t12 = [src0], 8
-(p[D]) ld8 t14 = [src1], 8
-(p[D]) st8 [dst0] = t9, 3*8
-(p[D]) st8 [dst1] = t11, 3*8
- ;;
-(p[D]) ld8 t13 = [src0], 4*8
-(p[D]) ld8 t15 = [src1], 4*8
-(p[D]) st8 [dst0] = t12, 8
-(p[D]) st8 [dst1] = t14, 8
- ;;
-(p[D-1])ld8 t1 = [src0], 8
-(p[D-1])ld8 t3 = [src1], 8
-(p[D]) st8 [dst0] = t13, 4*8
-(p[D]) st8 [dst1] = t15, 4*8
- br.ctop.sptk .line_copy
- ;;
- mov ar.lc = saved_lc
- mov pr = saved_pr, -1
- br.ret.sptk.many rp
-END(copy_page)
-EXPORT_SYMBOL(copy_page)
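
A rough C analogue of the PREFETCH_DIST pipeline above: keep the copy
position a fixed number of cache lines behind the prefetch position, so
a line is already resident by the time it is copied. This is a sketch
under the line-size assumptions stated in the header; __builtin_prefetch
stands in for the ld8/st8-based prefetch stages, and the staggered
L2-to-L1 fetch (the n[y] load) is omitted:

#include <stddef.h>
#include <string.h>

#define LINE	128	/* L2 line size assumed above */
#define DIST	8	/* PREFETCH_DIST: lines kept in flight */

static void copy_page_sketch(void *dst, const void *src, size_t size)
{
	const char *s = src;
	char *d = dst;

	for (size_t off = 0; off < size; off += LINE) {
		if (off + DIST * LINE < size) {
			__builtin_prefetch(s + off + DIST * LINE, 0); /* read */
			__builtin_prefetch(d + off + DIST * LINE, 1); /* write */
		}
		memcpy(d + off, s + off, LINE);
	}
}
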
diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S
deleted file mode 100644
index 8daab72cfe77..000000000000
--- a/arch/ia64/lib/copy_user.S
+++ /dev/null
@@ -1,613 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the copy_user() routine.
- * It is used to copy data across the kernel/user boundary.
- *
- * The source and destination are always on opposite sides of
- * the boundary. When reading from user space we must catch
- * faults on loads. When writing to user space we must catch
- * errors on stores. Note that because of the nature of the copy
- * we don't need to worry about overlapping regions.
- *
- *
- * Inputs:
- * in0 address of source buffer
- * in1 address of destination buffer
- * in2 number of bytes to copy
- *
- * Outputs:
- * ret0 0 in case of success. The number of bytes NOT copied in
- * case of error.
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * Fixme:
- *	- handle the case where we have more than 16 bytes and the
- *	  alignments are different.
- * - more benchmarking
- * - fix extraneous stop bit introduced by the EX() macro.
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-// Tuneable parameters
-//
-#define COPY_BREAK 16 // we do byte copy below (must be >=16)
-#define PIPE_DEPTH 21 // pipe depth
-
-#define EPI p[PIPE_DEPTH-1]
-
-//
-// arguments
-//
-#define dst in0
-#define src in1
-#define len in2
-
-//
-// local registers
-//
-#define t1 r2 // rshift in bytes
-#define t2 r3 // lshift in bytes
-#define rshift r14 // right shift in bits
-#define lshift r15 // left shift in bits
-#define word1 r16
-#define word2 r17
-#define cnt r18
-#define len2 r19
-#define saved_lc r20
-#define saved_pr r21
-#define tmp r22
-#define val r23
-#define src1 r24
-#define dst1 r25
-#define src2 r26
-#define dst2 r27
-#define len1 r28
-#define enddst r29
-#define endsrc r30
-#define saved_pfs r31
-
-GLOBAL_ENTRY(__copy_user)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
-
- .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
- .rotp p[PIPE_DEPTH]
-
- adds len2=-1,len // br.ctop is repeat/until
- mov ret0=r0
-
- ;; // RAW of cfm when len=0
- cmp.eq p8,p0=r0,len // check for zero length
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc // preserve ar.lc (slow)
-(p8)	br.ret.spnt.many rp		// empty memcpy()
- ;;
	add enddst=dst,len	// first byte after end of destination
	add endsrc=src,len	// first byte after end of source
- .save pr, saved_pr
- mov saved_pr=pr // preserve predicates
-
- .body
-
- mov dst1=dst // copy because of rotation
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
-
- mov src1=src // copy because of rotation
- mov ar.lc=len2 // initialize lc for small count
- cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy
-
- xor tmp=src,dst // same alignment test prepare
-(p10) br.cond.dptk .long_copy_user
- ;; // RAW pr.rot/p16 ?
- //
- // Now we do the byte by byte loop with software pipeline
- //
- // p7 is necessarily false by now
-1:
- EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
- EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
- br.ctop.dptk.few 1b
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.pfs=saved_pfs // restore ar.ec
- br.ret.sptk.many rp // end of short memcpy
-
- //
- // Not 8-byte aligned
- //
-.diff_align_copy_user:
- // At this point we know we have more than 16 bytes to copy
- // and also that src and dest do _not_ have the same alignment.
- and src2=0x7,src1 // src offset
- and dst2=0x7,dst1 // dst offset
- ;;
- // The basic idea is that we copy byte-by-byte at the head so
- // that we can reach 8-byte alignment for both src1 and dst1.
- // Then copy the body using software pipelined 8-byte copy,
- // shifting the two back-to-back words right and left, then copy
- // the tail by copying byte-by-byte.
- //
- // Fault handling. If the byte-by-byte at the head fails on the
	// load, then restart and finish the pipeline by copying zeros
	// to dst1. Then copy zeros for the rest of dst1.
	// If the 8-byte software pipeline fails on the load, do the same as
	// failure_in3 does. If the byte-by-byte at the tail fails, it is
	// handled simply by failure_in_pipe1.
	//
	// The p14 case means the source has more bytes in its first word
	// (by the shifted part), whereas in the p15 case some bytes from
	// the second source word must be copied into the tail of the first
	// destination word.
- //
-
- //
- // Optimization. If dst1 is 8-byte aligned (quite common), we don't need
- // to copy the head to dst1, to start 8-byte copy software pipeline.
- // We know src1 is not 8-byte aligned in this case.
- //
- cmp.eq p14,p15=r0,dst2
-(p15) br.cond.spnt 1f
- ;;
- sub t1=8,src2
- mov t2=src2
- ;;
- shl rshift=t2,3
- sub len1=len,t1 // set len1
- ;;
- sub lshift=64,rshift
- ;;
- br.cond.spnt .word_copy_user
- ;;
-1:
- cmp.leu p14,p15=src2,dst2
- sub t1=dst2,src2
- ;;
- .pred.rel "mutex", p14, p15
-(p14) sub word1=8,src2 // (8 - src offset)
-(p15) sub t1=r0,t1 // absolute value
-(p15) sub word1=8,dst2 // (8 - dst offset)
- ;;
- // For the case p14, we don't need to copy the shifted part to
- // the 1st word of destination.
- sub t2=8,t1
-(p14) sub word1=word1,t1
- ;;
- sub len1=len,word1 // resulting len
-(p15) shl rshift=t1,3 // in bits
-(p14) shl rshift=t2,3
- ;;
-(p14) sub len1=len1,t1
- adds cnt=-1,word1
- ;;
- sub lshift=64,rshift
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
- mov ar.lc=cnt
- ;;
-2:
- EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
- EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
- br.ctop.dptk.few 2b
- ;;
- clrrrb
- ;;
-.word_copy_user:
- cmp.gtu p9,p0=16,len1
-(p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy
- ;;
- shr.u cnt=len1,3 // number of 64-bit words
- ;;
- adds cnt=-1,cnt
- ;;
- .pred.rel "mutex", p14, p15
-(p14) sub src1=src1,t2
-(p15) sub src1=src1,t1
- //
- // Now both src1 and dst1 point to an 8-byte aligned address. And
- // we have more than 8 bytes to copy.
- //
- mov ar.lc=cnt
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
- ;;
-3:
- //
	// The pipeline consists of 3 stages:
- // 1 (p16): Load a word from src1
- // 2 (EPI_1): Shift right pair, saving to tmp
- // 3 (EPI): Store tmp to dst1
- //
- // To make it simple, use at least 2 (p16) loops to set up val1[n]
- // because we need 2 back-to-back val1[] to get tmp.
- // Note that this implies EPI_2 must be p18 or greater.
- //
-
-#define EPI_1 p[PIPE_DEPTH-2]
-#define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift
-#define CASE(pred, shift) \
- (pred) br.cond.spnt .copy_user_bit##shift
-#define BODY(rshift) \
-.copy_user_bit##rshift: \
-1: \
- EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \
-(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
- EX(3f,(p16) ld8 val1[1]=[src1],8); \
-(p16) mov val1[0]=r0; \
- br.ctop.dptk 1b; \
- ;; \
- br.cond.sptk.many .diff_align_do_tail; \
-2: \
-(EPI) st8 [dst1]=tmp,8; \
-(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
-3: \
-(p16) mov val1[1]=r0; \
-(p16) mov val1[0]=r0; \
- br.ctop.dptk 2b; \
- ;; \
- br.cond.sptk.many .failure_in2
-
- //
- // Since the instruction 'shrp' requires a fixed 128-bit value
- // specifying the bits to shift, we need to provide 7 cases
- // below.
- //
- SWITCH(p6, 8)
- SWITCH(p7, 16)
- SWITCH(p8, 24)
- SWITCH(p9, 32)
- SWITCH(p10, 40)
- SWITCH(p11, 48)
- SWITCH(p12, 56)
- ;;
- CASE(p6, 8)
- CASE(p7, 16)
- CASE(p8, 24)
- CASE(p9, 32)
- CASE(p10, 40)
- CASE(p11, 48)
- CASE(p12, 56)
- ;;
- BODY(8)
- BODY(16)
- BODY(24)
- BODY(32)
- BODY(40)
- BODY(48)
- BODY(56)
- ;;
-.diff_align_do_tail:
- .pred.rel "mutex", p14, p15
-(p14) sub src1=src1,t1
-(p14) adds dst1=-8,dst1
-(p15) sub dst1=dst1,t1
- ;;
-4:
- // Tail correction.
- //
	// The problem with this pipelined loop is that the last word is not
	// loaded, and thus part of the last word written is not correct.
- // To fix that, we simply copy the tail byte by byte.
-
- sub len1=endsrc,src1,1
- clrrrb
- ;;
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
- mov ar.lc=len1
- ;;
-5:
- EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
- EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
- br.ctop.dptk.few 5b
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
	// Beginning of long memcpy (i.e. > 16 bytes)
- //
-.long_copy_user:
- tbit.nz p6,p7=src1,0 // odd alignment
- and tmp=7,tmp
- ;;
- cmp.eq p10,p8=r0,tmp
- mov len1=len // copy because of rotation
-(p8) br.cond.dpnt .diff_align_copy_user
- ;;
- // At this point we know we have more than 16 bytes to copy
- // and also that both src and dest have the same alignment
- // which may not be the one we want. So for now we must move
- // forward slowly until we reach 16byte alignment: no need to
- // worry about reaching the end of buffer.
- //
- EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned
-(p6) adds len1=-1,len1;;
- tbit.nz p7,p0=src1,1
- ;;
- EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned
-(p7) adds len1=-2,len1;;
- tbit.nz p8,p0=src1,2
- ;;
- //
- // Stop bit not required after ld4 because if we fail on ld4
- // we have never executed the ld1, therefore st1 is not executed.
- //
- EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned
- ;;
- EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
- tbit.nz p9,p0=src1,3
- ;;
- //
- // Stop bit not required after ld8 because if we fail on ld8
- // we have never executed the ld2, therefore st2 is not executed.
- //
- EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned
- EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
-(p8) adds len1=-4,len1
- ;;
- EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
-(p9) adds len1=-8,len1;;
- shr.u cnt=len1,4 // number of 128-bit (2x64bit) words
- ;;
- EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
- tbit.nz p6,p0=len1,3
- cmp.eq p7,p0=r0,cnt
- adds tmp=-1,cnt // br.ctop is repeat/until
-(p7) br.cond.dpnt .dotail // we have less than 16 bytes left
- ;;
- adds src2=8,src1
- adds dst2=8,dst1
- mov ar.lc=tmp
- ;;
- //
- // 16bytes/iteration
- //
-2:
- EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
-(p16) ld8 val2[0]=[src2],16
-
- EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
-(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
- br.ctop.dptk 2b
- ;; // RAW on src1 when fall through from loop
- //
- // Tail correction based on len only
- //
- // No matter where we come from (loop or test) the src1 pointer
- // is 16 byte aligned AND we have less than 16 bytes to copy.
- //
-.dotail:
- EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes
- tbit.nz p7,p0=len1,2
- ;;
- EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes
- tbit.nz p8,p0=len1,1
- ;;
- EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes
- tbit.nz p9,p0=len1,0
- ;;
- EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
- ;;
- EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left
- mov ar.lc=saved_lc
- ;;
- EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
- mov pr=saved_pr,0xffffffffffff0000
- ;;
- EX(.failure_out, (p8) st2 [dst1]=val2[0],2)
- mov ar.pfs=saved_pfs
- ;;
- EX(.failure_out, (p9) st1 [dst1]=val2[1])
- br.ret.sptk.many rp
-
-
- //
- // Here we handle the case where the byte by byte copy fails
- // on the load.
- // Several factors make the zeroing of the rest of the buffer kind of
- // tricky:
- // - the pipeline: loads/stores are not in sync (pipeline)
- //
- // In the same loop iteration, the dst1 pointer does not directly
- // reflect where the faulty load was.
- //
- // - pipeline effect
	// When you get a fault on load, you may have valid data from
	// previous loads, not yet stored, in transit. Such data must be
	// stored normally before moving on to zeroing the rest.
- //
- // - single/multi dispersal independence.
- //
- // solution:
- // - we don't disrupt the pipeline, i.e. data in transit in
	// the software pipeline will eventually be moved to memory.
- // We simply replace the load with a simple mov and keep the
- // pipeline going. We can't really do this inline because
- // p16 is always reset to 1 when lc > 0.
- //
-.failure_in_pipe1:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
-1:
-(p16) mov val1[0]=r0
-(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
- br.ctop.dptk 1b
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
- // This is the case where the byte by byte copy fails on the load
- // when we copy the head. We need to finish the pipeline and copy
- // zeros for the rest of the destination. Since this happens
- // at the top we still need to fill the body and tail.
-.failure_in_pipe2:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
-2:
-(p16) mov val1[0]=r0
-(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
- br.ctop.dptk 2b
- ;;
- sub len=enddst,dst1,1 // precompute len
- br.cond.dptk.many .failure_in1bis
- ;;
-
- //
- // Here we handle the head & tail part when we check for alignment.
- // The following code handles only the load failures. The
	// main difficulty comes from the fact that loads/stores are
- // scheduled. So when you fail on a load, the stores corresponding
- // to previous successful loads must be executed.
- //
- // However some simplifications are possible given the way
- // things work.
- //
- // 1) HEAD
- // Theory of operation:
- //
- // Page A | Page B
- // ---------|-----
- // 1|8 x
- // 1 2|8 x
- // 4|8 x
- // 1 4|8 x
- // 2 4|8 x
- // 1 2 4|8 x
- // |1
- // |2 x
- // |4 x
- //
- // page_size >= 4k (2^12). (x means 4, 2, 1)
- // Here we suppose Page A exists and Page B does not.
- //
- // As we move towards eight byte alignment we may encounter faults.
- // The numbers on each page show the size of the load (current alignment).
- //
- // Key point:
- // - if you fail on 1, 2, 4 then you have never executed any smaller
- // size loads, e.g. failing ld4 means no ld1 nor ld2 executed
- // before.
- //
- // This allows us to simplify the cleanup code, because basically you
- // only have to worry about "pending" stores in the case of a failing
- // ld8(). Given the way the code is written today, this means only
- // worry about st2, st4. There we can use the information encapsulated
- // into the predicates.
- //
- // Other key point:
- // - if you fail on the ld8 in the head, it means you went straight
	// to it, i.e. 8-byte alignment within a nonexistent page.
	// Again this comes from the fact that if you crossed just for the ld8 then
	// you are 8-byte aligned but also 16-byte aligned, therefore you would
- // either go for the 16byte copy loop OR the ld8 in the tail part.
- // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
- // because it would mean you had 15bytes to copy in which case you
- // would have defaulted to the byte by byte copy.
- //
- //
- // 2) TAIL
	// Here we know we have less than 16 bytes AND we are either 8- or 16-byte
- // aligned.
- //
- // Key point:
- // This means that we either:
- // - are right on a page boundary
- // OR
- // - are at more than 16 bytes from a page boundary with
- // at most 15 bytes to copy: no chance of crossing.
- //
- // This allows us to assume that if we fail on a load we haven't possibly
- // executed any of the previous (tail) ones, so we don't need to do
- // any stores. For instance, if we fail on ld2, this means we had
- // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
- //
	// This means that we are in a situation similar to a fault in the
- // head part. That's nice!
- //
-.failure_in1:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
- sub len=endsrc,src1,1
- //
- // we know that ret0 can never be zero at this point
	// because we failed while trying to do a load, i.e. there is still
- // some work to do.
- // The failure_in1bis and length problem is taken care of at the
- // calling side.
- //
- ;;
-.failure_in1bis: // from (.failure_in3)
- mov ar.lc=len // Continue with a stupid byte store.
- ;;
-5:
- st1 [dst1]=r0,1
- br.cloop.dptk 5b
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
- // Here we simply restart the loop but instead
- // of doing loads we fill the pipeline with zeroes
- // We can't simply store r0 because we may have valid
- // data in transit in the pipeline.
- // ar.lc and ar.ec are setup correctly at this point
- //
- // we MUST use src1/endsrc here and not dst1/enddst because
- // of the pipeline effect.
- //
-.failure_in3:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
- ;;
-2:
-(p16) mov val1[0]=r0
-(p16) mov val2[0]=r0
-(EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16
-(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
- br.ctop.dptk 2b
- ;;
- cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
- sub len=enddst,dst1,1 // precompute len
-(p6) br.cond.dptk .failure_in1bis
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
-.failure_in2:
- sub ret0=endsrc,src1
- cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
- sub len=enddst,dst1,1 // precompute len
-(p6) br.cond.dptk .failure_in1bis
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
- // handling of failures on stores: that's the easy part
- //
-.failure_out:
- sub ret0=enddst,dst1
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
-
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-END(__copy_user)
-EXPORT_SYMBOL(__copy_user)
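
The shrp in each BODY() variant above is a funnel shift: it merges two
consecutive aligned source words into one aligned destination word when
source and destination disagree in alignment. A little-endian C sketch
(valid for the rshift values 8..56 actually used here; a shift count of
0 or 64 would be undefined in C):

#include <stdint.h>

/* equivalent of "shrp tmp = hi, lo, rshift": take the 128-bit pair
 * hi:lo, shift it right by rshift bits, keep the low 64 bits.  'lo' is
 * the earlier source word, 'hi' the one loaded after it. */
static uint64_t funnel(uint64_t hi, uint64_t lo, unsigned rshift)
{
	return (lo >> rshift) | (hi << (64 - rshift));
}
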
diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c
deleted file mode 100644
index 917e3138b277..000000000000
--- a/arch/ia64/lib/csum_partial_copy.c
+++ /dev/null
@@ -1,98 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Network Checksum & Copy routine
- *
- * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * Most of the code has been imported from Linux/Alpha
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-
-#include <net/checksum.h>
-
-/*
- * XXX Fixme: those 2 inlines are meant for debugging and will go away
- */
-static inline unsigned short
-from64to16(unsigned long x)
-{
- /* add up 32-bit words for 33 bits */
- x = (x & 0xffffffff) + (x >> 32);
- /* add up 16-bit and 17-bit words for 17+c bits */
- x = (x & 0xffff) + (x >> 16);
- /* add up 16-bit and 2-bit for 16+c bit */
- x = (x & 0xffff) + (x >> 16);
- /* add up carry.. */
- x = (x & 0xffff) + (x >> 16);
- return x;
-}
-
-static inline
-unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
-{
- int odd, count;
- unsigned long result = (unsigned long)psum;
-
- if (len <= 0)
- goto out;
- odd = 1 & (unsigned long) buff;
- if (odd) {
- result = *buff << 8;
- len--;
- buff++;
- }
- count = len >> 1; /* nr of 16-bit words.. */
- if (count) {
- if (2 & (unsigned long) buff) {
- result += *(unsigned short *) buff;
- count--;
- len -= 2;
- buff += 2;
- }
- count >>= 1; /* nr of 32-bit words.. */
- if (count) {
- if (4 & (unsigned long) buff) {
- result += *(unsigned int *) buff;
- count--;
- len -= 4;
- buff += 4;
- }
- count >>= 1; /* nr of 64-bit words.. */
- if (count) {
- unsigned long carry = 0;
- do {
- unsigned long w = *(unsigned long *) buff;
- count--;
- buff += 8;
- result += carry;
- result += w;
- carry = (w > result);
- } while (count);
- result += carry;
- result = (result & 0xffffffff) + (result >> 32);
- }
- if (len & 4) {
- result += *(unsigned int *) buff;
- buff += 4;
- }
- }
- if (len & 2) {
- result += *(unsigned short *) buff;
- buff += 2;
- }
- }
- if (len & 1)
- result += *buff;
-
- result = from64to16(result);
-
- if (odd)
- result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-
-out:
- return result;
-}
diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S
deleted file mode 100644
index 6004dad2597c..000000000000
--- a/arch/ia64/lib/do_csum.S
+++ /dev/null
@@ -1,324 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard do_csum() function
- *
- * Return: a 64bit quantity containing the 16bit Internet checksum
- *
- * Inputs:
- * in0: address of buffer to checksum (char *)
- * in1: length of the buffer (int)
- *
- * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * 02/04/22 Ken Chen <kenneth.w.chen@intel.com>
- * Data locality study on the checksum buffer.
- * More optimization cleanup - remove excessive stop bits.
- * 02/04/08 David Mosberger <davidm@hpl.hp.com>
- * More cleanup and tuning.
- * 01/04/18 Jun Nakajima <jun.nakajima@intel.com>
- *		Clean up and optimize the software pipeline, loading two
- * back-to-back 8-byte words per loop. Clean up the initialization
- * for the loop. Support the cases where load latency = 1 or 2.
- * Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
- */
-
-#include <asm/asmmacro.h>
-
-//
-// Theory of operations:
-// The goal is to go as quickly as possible to the point where
-// we can checksum 16 bytes/loop. Before reaching that point we must
-// take care of incorrect alignment of first byte.
-//
-// The code hereafter also takes care of the "tail" part of the buffer
-// before entering the core loop, if any. The checksum is a sum so it
-// allows us to commute operations. So we do the "head" and "tail"
-// first to finish at full speed in the body. Once we get the head and
-// tail values, we feed them into the pipeline, very handy initialization.
-//
-// Of course we deal with the special case where the whole buffer fits
-// into one 8 byte word. In this case we have only one entry in the pipeline.
-//
-// We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
-// possible load latency and also to accommodate for head and tail.
-//
-// The end of the function deals with folding the checksum from 64bits
-// down to 16bits taking care of the carry.
-//
-// This version avoids synchronization in the core loop by also using a
-// pipeline for the accumulation of the checksum in resultx[] (x=1,2).
-//
-// wordx[] (x=1,2)
-// |---|
-// | | 0 : new value loaded in pipeline
-// |---|
-// | | - : in transit data
-// |---|
-// | | LOAD_LATENCY : current value to add to checksum
-// |---|
-// | | LOAD_LATENCY+1 : previous value added to checksum
-// |---| (previous iteration)
-//
-// resultx[] (x=1,2)
-// |---|
-// | | 0 : initial value
-// |---|
-// | | LOAD_LATENCY-1 : new checksum
-// |---|
-// | | LOAD_LATENCY : previous value of checksum
-// |---|
-// | | LOAD_LATENCY+1 : final checksum when out of the loop
-// |---|
-//
-//
-// See RFC1071 "Computing the Internet Checksum" for various techniques for
-// calculating the Internet checksum.
-//
-// NOT YET DONE:
-// - Maybe another algorithm which would take care of the folding at the
-// end in a different manner
-// - Work with people more knowledgeable than me on the network stack
-// to figure out if we could not split the function depending on the
-// type of packet or alignment we get. Like the ip_fast_csum() routine
-// where we know we have at least 20bytes worth of data to checksum.
-// - Do a better job of handling small packets.
-// - Note on prefetching: it was found that under various loads, i.e. ftp read/write,
-// nfs read/write, the L1 cache hit rate is at 60% and the L2 cache hit rate is at 99.8%
-// on the data that buffer points to (partly because the checksum is often preceded by
-// a copy_from_user()). This finding indicates that lfetch will not be beneficial since
-// the data is already in the cache.
-//
-
-#define saved_pfs r11
-#define hmask r16
-#define tmask r17
-#define first1 r18
-#define firstval r19
-#define firstoff r20
-#define last r21
-#define lastval r22
-#define lastoff r23
-#define saved_lc r24
-#define saved_pr r25
-#define tmp1 r26
-#define tmp2 r27
-#define tmp3 r28
-#define carry1 r29
-#define carry2 r30
-#define first2 r31
-
-#define buf in0
-#define len in1
-
-#define LOAD_LATENCY 2 // XXX fix me
-
-#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
-# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
-#endif
-
-#define PIPE_DEPTH (LOAD_LATENCY+2)
-#define ELD p[LOAD_LATENCY] // end of load
-#define ELD_1 p[LOAD_LATENCY+1] // and next stage
-
-// unsigned long do_csum(unsigned char *buf,long len)
-
-GLOBAL_ENTRY(do_csum)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,2,16,0,16
- .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
- .rotp p[PIPE_DEPTH], pC1[2], pC2[2]
- mov ret0=r0 // in case we have zero length
- cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len)
- ;;
- add tmp1=buf,len // last byte's address
- .save pr, saved_pr
- mov saved_pr=pr // preserve predicates (rotation)
-(p6) br.ret.spnt.many rp // return if zero or negative length
-
- mov hmask=-1 // initialize head mask
- tbit.nz p15,p0=buf,0 // is buf an odd address?
- and first1=-8,buf // 8-byte align down address of first1 element
-
- and firstoff=7,buf // how many bytes off for first1 element
- mov tmask=-1 // initialize tail mask
-
- ;;
- adds tmp2=-1,tmp1 // last-1
- and lastoff=7,tmp1 // how many bytes off for last element
- ;;
- sub tmp1=8,lastoff // complement to lastoff
- and last=-8,tmp2 // address of word containing last byte
- ;;
- sub tmp3=last,first1 // tmp3=distance from first1 to last
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc // save lc
- cmp.eq p8,p9=last,first1 // everything fits in one word ?
-
- ld8 firstval=[first1],8 // load, ahead of time, "first1" word
- and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
- shl tmp2=firstoff,3 // number of bits
- ;;
-(p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed
- shl tmp1=tmp1,3 // number of bits
-(p9) adds tmp3=-8,tmp3 // effectively loaded
- ;;
-(p8) mov lastval=r0 // we don't need lastval if first1==last
	shl hmask=hmask,tmp2	// build head mask, mask off bytes [0,firstoff)
	shr.u tmask=tmask,tmp1	// build tail mask, mask off bytes [lastoff,8)
- ;;
- .body
-#define count tmp3
-
-(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only
-(p9)	and word2[0]=lastval,tmask	// mask the last word as appropriate
- shr.u count=count,3 // how many 8-byte?
- ;;
- // If count is odd, finish this 8-byte word so that we can
- // load two back-to-back 8-byte words per loop thereafter.
- and word1[0]=firstval,hmask // and mask it as appropriate
- tbit.nz p10,p11=count,0 // if (count is odd)
- ;;
-(p8) mov result1[0]=word1[0]
-(p9) add result1[0]=word1[0],word2[0]
- ;;
- cmp.ltu p6,p0=result1[0],word1[0] // check the carry
- cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte
- ;;
-(p6) adds result1[0]=1,result1[0]
-(p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word)
-(p11) br.cond.dptk .do_csum16 // if (count is even)
-
- // Here count is odd.
- ld8 word1[1]=[first1],8 // load an 8-byte word
- cmp.eq p9,p10=1,count // if (count == 1)
- adds count=-1,count // loaded an 8-byte word
- ;;
- add result1[0]=result1[0],word1[1]
- ;;
- cmp.ltu p6,p0=result1[0],word1[1]
- ;;
-(p6) adds result1[0]=1,result1[0]
-(p9) br.cond.sptk .do_csum_exit // if (count == 1) exit
- // Fall through to calculate the checksum, feeding result1[0] as
- // the initial value in result1[0].
- //
- // Calculate the checksum loading two 8-byte words per loop.
- //
-.do_csum16:
- add first2=8,first1
- shr.u count=count,1 // we do 16 bytes per loop
- ;;
- adds count=-1,count
- mov carry1=r0
- mov carry2=r0
- brp.loop.imp 1f,2f
- ;;
- mov ar.ec=PIPE_DEPTH
- mov ar.lc=count // set lc
- mov pr.rot=1<<16
- // result1[0] must be initialized in advance.
- mov result2[0]=r0
- ;;
- .align 32
-1:
-(ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
-(pC1[1])adds carry1=1,carry1
-(ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
-(pC2[1])adds carry2=1,carry2
-(ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
-(ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
-2:
-(p[0]) ld8 word1[0]=[first1],16
-(p[0]) ld8 word2[0]=[first2],16
- br.ctop.sptk 1b
- ;;
- // Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
-(pC1[1])adds carry1=1,carry1 // since we miss the last one
-(pC2[1])adds carry2=1,carry2
- ;;
- add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
- add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
- ;;
- cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
- cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
- ;;
-(p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
-(p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
- ;;
- add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
- ;;
- cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
- ;;
-(p6) adds result1[0]=1,result1[0]
- ;;
-.do_csum_exit:
- //
- // now fold 64 into 16 bits taking care of carry
- // that's not very good because it has lots of sequentiality
- //
- mov tmp3=0xffff
- zxt4 tmp1=result1[0]
- shr.u tmp2=result1[0],32
- ;;
- add result1[0]=tmp1,tmp2
- ;;
- and tmp1=result1[0],tmp3
- shr.u tmp2=result1[0],16
- ;;
- add result1[0]=tmp1,tmp2
- ;;
- and tmp1=result1[0],tmp3
- shr.u tmp2=result1[0],16
- ;;
- add result1[0]=tmp1,tmp2
- ;;
- and tmp1=result1[0],tmp3
- shr.u tmp2=result1[0],16
- ;;
- add ret0=tmp1,tmp2
- mov pr=saved_pr,0xffffffffffff0000
- ;;
- // if buf was odd then swap bytes
- mov ar.pfs=saved_pfs // restore ar.ec
-(p15) mux1 ret0=ret0,@rev // reverse word
- ;;
- mov ar.lc=saved_lc
-(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
- br.ret.sptk.many rp
-
-// I (Jun Nakajima) wrote equivalent code (see below), but it was
-// not much better than the original. So the original is kept so that
-// someone else can take up the challenge.
-//
-// shr.u word1[0]=result1[0],32
-// zxt4 result1[0]=result1[0]
-// ;;
-// add result1[0]=result1[0],word1[0]
-// ;;
-// zxt2 result2[0]=result1[0]
-// extr.u word1[0]=result1[0],16,16
-// shr.u carry1=result1[0],32
-// ;;
-// add result2[0]=result2[0],word1[0]
-// ;;
-// add result2[0]=result2[0],carry1
-// ;;
-// extr.u ret0=result2[0],16,16
-// ;;
-// add ret0=ret0,result2[0]
-// ;;
-// zxt2 ret0=ret0
-// mov ar.pfs=saved_pfs // restore ar.ec
-// mov pr=saved_pr,0xffffffffffff0000
-// ;;
-// // if buf was odd then swap bytes
-// mov ar.lc=saved_lc
-//(p15) mux1 ret0=ret0,@rev // reverse word
-// ;;
-//(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
-// br.ret.sptk.many rp
-
-END(do_csum)
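
The hmask/tmask setup at the top of do_csum reads, in C, roughly as
below: the buffer is summed as aligned 8-byte words, and the masks strip
the bytes before the buffer's first byte and after its last. A
little-endian sketch; the names follow the register aliases above:

#include <stdint.h>

static uint64_t head_mask(uintptr_t buf)
{
	unsigned firstoff = buf & 7;	/* bytes to drop at the head */

	return ~0ULL << (firstoff * 8);
}

static uint64_t tail_mask(uintptr_t buf, long len)
{
	unsigned lastoff = (buf + len) & 7; /* bytes used in the last word */

	return lastoff ? ~0ULL >> ((8 - lastoff) * 8) : ~0ULL;
}
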
diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S
deleted file mode 100644
index f8e795fe45cb..000000000000
--- a/arch/ia64/lib/flush.S
+++ /dev/null
@@ -1,119 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Cache flushing routines.
- *
- * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co
- * David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 05/28/05 Zoltan Menyhart Dynamic stride size
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
- /*
- * flush_icache_range(start,end)
- *
- * Make i-cache(s) coherent with d-caches.
- *
- * Must deal with range from start to end-1 but nothing else (need to
- * be careful not to touch addresses that may be unmapped).
- *
- * Note: "in0" and "in1" are preserved for debugging purposes.
- */
- .section .kprobes.text,"ax"
-GLOBAL_ENTRY(flush_icache_range)
-
- .prologue
- alloc r2=ar.pfs,2,0,0,0
- movl r3=ia64_i_cache_stride_shift
- mov r21=1
- ;;
- ld8 r20=[r3] // r20: stride shift
- sub r22=in1,r0,1 // last byte address
- ;;
- shr.u r23=in0,r20 // start / (stride size)
- shr.u r22=r22,r20 // (last byte address) / (stride size)
- shl r21=r21,r20 // r21: stride size of the i-cache(s)
- ;;
- sub r8=r22,r23 // number of strides - 1
- shl r24=r23,r20 // r24: addresses for "fc.i" =
- // "start" rounded down to stride boundary
- .save ar.lc,r3
- mov r3=ar.lc // save ar.lc
- ;;
-
- .body
- mov ar.lc=r8
- ;;
- /*
- * 32 byte aligned loop, even number of (actually 2) bundles
- */
-.Loop: fc.i r24 // issuable on M0 only
- add r24=r21,r24 // we flush "stride size" bytes per iteration
- nop.i 0
- br.cloop.sptk.few .Loop
- ;;
- sync.i
- ;;
- srlz.i
- ;;
- mov ar.lc=r3 // restore ar.lc
- br.ret.sptk.many rp
-END(flush_icache_range)
-EXPORT_SYMBOL_GPL(flush_icache_range)
-
- /*
- * clflush_cache_range(start,size)
- *
- * Flush cache lines from start to start+size-1.
- *
- * Must deal with range from start to start+size-1 but nothing else
- * (need to be careful not to touch addresses that may be
- * unmapped).
- *
- * Note: "in0" and "in1" are preserved for debugging purposes.
- */
- .section .kprobes.text,"ax"
-GLOBAL_ENTRY(clflush_cache_range)
-
- .prologue
- alloc r2=ar.pfs,2,0,0,0
- movl r3=ia64_cache_stride_shift
- mov r21=1
- add r22=in1,in0
- ;;
- ld8 r20=[r3] // r20: stride shift
- sub r22=r22,r0,1 // last byte address
- ;;
- shr.u r23=in0,r20 // start / (stride size)
- shr.u r22=r22,r20 // (last byte address) / (stride size)
- shl r21=r21,r20 // r21: stride size of the cache(s)
- ;;
- sub r8=r22,r23 // number of strides - 1
- shl r24=r23,r20 // r24: addresses for "fc" =
- // "start" rounded down to stride
- // boundary
- .save ar.lc,r3
- mov r3=ar.lc // save ar.lc
- ;;
-
- .body
- mov ar.lc=r8
- ;;
- /*
- * 32 byte aligned loop, even number of (actually 2) bundles
- */
-.Loop_fc:
- fc r24 // issuable on M0 only
- add r24=r21,r24 // we flush "stride size" bytes per iteration
- nop.i 0
- br.cloop.sptk.few .Loop_fc
- ;;
- sync.i
- ;;
- srlz.i
- ;;
- mov ar.lc=r3 // restore ar.lc
- br.ret.sptk.many rp
-END(clflush_cache_range)
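
Both routines above share one loop shape: round the start down to a stride
boundary, compute how many strides cover [start, end-1], and touch one line
per stride. A hedged C sketch follows; flush_line() is a hypothetical
stand-in for the fc.i/fc instruction, and the trailing sync.i/srlz.i
serialization is omitted.

extern void flush_line(unsigned long addr);  /* hypothetical, stands in for fc.i/fc */

static void flush_range(unsigned long start, unsigned long end,
                        unsigned long stride_shift)
{
        unsigned long stride = 1UL << stride_shift;
        unsigned long addr = start & ~(stride - 1);          /* round down */
        unsigned long last = (end - 1) >> stride_shift;      /* last stride index */
        unsigned long n = last - (start >> stride_shift) + 1;

        while (n--) {
                flush_line(addr);    /* one fc.i (or fc) per stride */
                addr += stride;
        }
}
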
diff --git a/arch/ia64/lib/idiv32.S b/arch/ia64/lib/idiv32.S
deleted file mode 100644
index 83586fbc51ff..000000000000
--- a/arch/ia64/lib/idiv32.S
+++ /dev/null
@@ -1,86 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2000 Hewlett-Packard Co
- * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 32-bit integer division.
- *
- * This code is based on the application note entitled "Divide, Square Root
- * and Remainder Algorithms for the IA-64 Architecture". This document
- * is available as Intel document number 248725-002 or via the web at
- * http://developer.intel.com/software/opensource/numerics/
- *
- * For more details on the theory behind these algorithms, see "IA-64
- * and Elementary Functions" by Peter Markstein; HP Professional Books
- * (http://www.goodreads.com/book/show/2019887.Ia_64_and_Elementary_Functions)
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-#ifdef MODULO
-# define OP mod
-#else
-# define OP div
-#endif
-
-#ifdef UNSIGNED
-# define SGN u
-# define EXTEND zxt4
-# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b
-# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b
-#else
-# define SGN
-# define EXTEND sxt4
-# define INT_TO_FP(a,b) fcvt.xf a=b
-# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b
-#endif
-
-#define PASTE1(a,b) a##b
-#define PASTE(a,b) PASTE1(a,b)
-#define NAME PASTE(PASTE(__,SGN),PASTE(OP,si3))
-
-GLOBAL_ENTRY(NAME)
- .regstk 2,0,0,0
- // Transfer inputs to FP registers.
- mov r2 = 0xffdd // r2 = -34 + 65535 (fp reg format bias)
- EXTEND in0 = in0 // in0 = a
- EXTEND in1 = in1 // in1 = b
- ;;
- setf.sig f8 = in0
- setf.sig f9 = in1
-#ifdef MODULO
- sub in1 = r0, in1 // in1 = -b
-#endif
- ;;
- // Convert the inputs to FP, to avoid FP software-assist faults.
- INT_TO_FP(f8, f8)
- INT_TO_FP(f9, f9)
- ;;
- setf.exp f7 = r2 // f7 = 2^-34
- frcpa.s1 f6, p6 = f8, f9 // y0 = frcpa(b)
- ;;
-(p6) fmpy.s1 f8 = f8, f6 // q0 = a*y0
-(p6) fnma.s1 f6 = f9, f6, f1 // e0 = -b*y0 + 1
- ;;
-#ifdef MODULO
- setf.sig f9 = in1 // f9 = -b
-#endif
-(p6) fma.s1 f8 = f6, f8, f8 // q1 = e0*q0 + q0
-(p6) fma.s1 f6 = f6, f6, f7 // e1 = e0*e0 + 2^-34
- ;;
-#ifdef MODULO
- setf.sig f7 = in0
-#endif
-(p6) fma.s1 f6 = f6, f8, f8 // q2 = e1*q1 + q1
- ;;
- FP_TO_INT(f6, f6) // q = trunc(q2)
- ;;
-#ifdef MODULO
- xma.l f6 = f6, f9, f7 // r = q*(-b) + a
- ;;
-#endif
- getf.sig r8 = f6 // transfer result to result register
- br.ret.sptk.many rp
-END(NAME)
-EXPORT_SYMBOL(NAME)
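
The sequence above is the Newton-Raphson reciprocal refinement from the
cited Intel application note. A rough C rendering of the signed-division
build follows; recip_approx() is a hypothetical stand-in for frcpa (good to
roughly 8 bits), and without IA-64's fused multiply-adds this is only an
approximation of the register-file computation, not a drop-in replacement.

extern double recip_approx(double b);   /* hypothetical stand-in for frcpa */

static int divsi3_sketch(int a, int b)  /* assumes b != 0 */
{
        double fa = (double)a, fb = (double)b;
        double y0 = recip_approx(fb);           /* y0 ~= 1/b            */
        double q0 = fa * y0;                    /* q0 = a*y0            */
        double e0 = 1.0 - fb * y0;              /* e0 = -b*y0 + 1       */
        double q1 = q0 + e0 * q0;               /* first refinement     */
        double e1 = e0 * e0 + 0x1p-34;          /* e1 = e0^2 + 2^-34    */
        double q2 = q1 + e1 * q1;               /* second refinement    */

        return (int)q2;                         /* q = trunc(q2)        */
}

The 2^-34 bias compensates for worst-case rounding error so that truncating
q2 yields the exact 32-bit quotient; the special-case handling frcpa
performs (e.g. b == 0, signalled via p6) is omitted from the sketch.
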
diff --git a/arch/ia64/lib/idiv64.S b/arch/ia64/lib/idiv64.S
deleted file mode 100644
index 5c9113691f72..000000000000
--- a/arch/ia64/lib/idiv64.S
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1999-2000 Hewlett-Packard Co
- * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 64-bit integer division.
- *
- * This code is based on the application note entitled "Divide, Square Root
- * and Remainder Algorithms for the IA-64 Architecture". This document
- * is available as Intel document number 248725-002 or via the web at
- * http://developer.intel.com/software/opensource/numerics/
- *
- * For more details on the theory behind these algorithms, see "IA-64
- * and Elementary Functions" by Peter Markstein; HP Professional Books
- * (http://www.goodreads.com/book/show/2019887.Ia_64_and_Elementary_Functions)
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-#ifdef MODULO
-# define OP mod
-#else
-# define OP div
-#endif
-
-#ifdef UNSIGNED
-# define SGN u
-# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b
-# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b
-#else
-# define SGN
-# define INT_TO_FP(a,b) fcvt.xf a=b
-# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b
-#endif
-
-#define PASTE1(a,b) a##b
-#define PASTE(a,b) PASTE1(a,b)
-#define NAME PASTE(PASTE(__,SGN),PASTE(OP,di3))
-
-GLOBAL_ENTRY(NAME)
- .regstk 2,0,0,0
- // Transfer inputs to FP registers.
- setf.sig f8 = in0
- setf.sig f9 = in1
- ;;
- // Convert the inputs to FP, to avoid FP software-assist faults.
- INT_TO_FP(f8, f8)
- INT_TO_FP(f9, f9)
- ;;
- frcpa.s1 f11, p6 = f8, f9 // y0 = frcpa(b)
- ;;
-(p6) fmpy.s1 f7 = f8, f11 // q0 = a*y0
-(p6) fnma.s1 f6 = f9, f11, f1 // e0 = -b*y0 + 1
- ;;
-(p6) fma.s1 f10 = f7, f6, f7 // q1 = q0*e0 + q0
-(p6) fmpy.s1 f7 = f6, f6 // e1 = e0*e0
- ;;
-#ifdef MODULO
- sub in1 = r0, in1 // in1 = -b
-#endif
-(p6) fma.s1 f10 = f10, f7, f10 // q2 = q1*e1 + q1
-(p6) fma.s1 f6 = f11, f6, f11 // y1 = y0*e0 + y0
- ;;
-(p6) fma.s1 f6 = f6, f7, f6 // y2 = y1*e1 + y1
-(p6) fnma.s1 f7 = f9, f10, f8 // r = -b*q2 + a
- ;;
-#ifdef MODULO
- setf.sig f8 = in0 // f8 = a
- setf.sig f9 = in1 // f9 = -b
-#endif
-(p6) fma.s1 f11 = f7, f6, f10 // q3 = r*y2 + q2
- ;;
- FP_TO_INT(f11, f11) // q = trunc(q3)
- ;;
-#ifdef MODULO
- xma.l f11 = f11, f9, f8 // r = q*(-b) + a
- ;;
-#endif
- getf.sig r8 = f11 // transfer result to result register
- br.ret.sptk.many rp
-END(NAME)
-EXPORT_SYMBOL(NAME)
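
The 64-bit variant needs one more refinement step (q3) than idiv32.S
because frcpa's initial estimate carries only about 8 valid bits and each
step roughly doubles the precision. For the MODULO build, the final xma.l
then recovers the remainder from the truncated quotient; in C the identity
is simply (sketch):

static long long mod_from_quot(long long a, long long b, long long q)
{
        return a - q * b;       /* r = q*(-b) + a */
}
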
diff --git a/arch/ia64/lib/io.c b/arch/ia64/lib/io.c
deleted file mode 100644
index c3e02462ed16..000000000000
--- a/arch/ia64/lib/io.c
+++ /dev/null
@@ -1,51 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/module.h>
-#include <linux/types.h>
-
-#include <asm/io.h>
-
-/*
- * Copy data from IO memory space to "real" memory space.
- * This needs to be optimized.
- */
-void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
-{
- char *dst = to;
-
- while (count) {
- count--;
- *dst++ = readb(from++);
- }
-}
-EXPORT_SYMBOL(memcpy_fromio);
-
-/*
- * Copy data from "real" memory space to IO memory space.
- * This needs to be optimized.
- */
-void memcpy_toio(volatile void __iomem *to, const void *from, long count)
-{
- const char *src = from;
-
- while (count) {
- count--;
- writeb(*src++, to++);
- }
-}
-EXPORT_SYMBOL(memcpy_toio);
-
-/*
- * "memset" on IO memory space.
- * This needs to be optimized.
- */
-void memset_io(volatile void __iomem *dst, int c, long count)
-{
- unsigned char ch = (char)(c & 0xff);
-
- while (count) {
- count--;
- writeb(ch, dst);
- dst++;
- }
-}
-EXPORT_SYMBOL(memset_io);
diff --git a/arch/ia64/lib/ip_fast_csum.S b/arch/ia64/lib/ip_fast_csum.S
deleted file mode 100644
index fcc0b812ce2e..000000000000
--- a/arch/ia64/lib/ip_fast_csum.S
+++ /dev/null
@@ -1,148 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Optimized version of the ip_fast_csum() function
- * Used for calculating IP header checksum
- *
- * Return: 16bit checksum, complemented
- *
- * Inputs:
- * in0: address of buffer to checksum (char *)
- * in1: length of the buffer (int)
- *
- * Copyright (C) 2002, 2006 Intel Corp.
- * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-/*
- * Since this function is most likely called with buf aligned on a 4-byte
- * boundary and 20 bytes in length, it can execute rather quickly compared
- * with calling the generic version of do_csum, which has lots of overhead
- * in handling various alignments and sizes. However, since no constraints
- * are put on the function's input arguments, cases where the alignment is
- * not 4-byte or the size is not 20 bytes are handled by the generic do_csum.
- */
-
-#define in0 r32
-#define in1 r33
-#define in2 r34
-#define in3 r35
-#define in4 r36
-#define ret0 r8
-
-GLOBAL_ENTRY(ip_fast_csum)
- .prologue
- .body
- cmp.ne p6,p7=5,in1 // size other than 20 bytes?
- and r14=3,in0 // is it aligned on 4-byte?
- add r15=4,in0 // second source pointer
- ;;
- cmp.ne.or.andcm p6,p7=r14,r0
- ;;
-(p7) ld4 r20=[in0],8
-(p7) ld4 r21=[r15],8
-(p6) br.spnt .generic
- ;;
- ld4 r22=[in0],8
- ld4 r23=[r15],8
- ;;
- ld4 r24=[in0]
- add r20=r20,r21
- add r22=r22,r23
- ;;
- add r20=r20,r22
- ;;
- add r20=r20,r24
- ;;
- shr.u ret0=r20,16 // now need to add the carry
- zxt2 r20=r20
- ;;
- add r20=ret0,r20
- ;;
- shr.u ret0=r20,16 // add carry again
- zxt2 r20=r20
- ;;
- add r20=ret0,r20
- ;;
- shr.u ret0=r20,16
- zxt2 r20=r20
- ;;
- add r20=ret0,r20
- mov r9=0xffff
- ;;
- andcm ret0=r9,r20
- .restore sp // reset frame state
- br.ret.sptk.many b0
- ;;
-
-.generic:
- .prologue
- .save ar.pfs, r35
- alloc r35=ar.pfs,2,2,2,0
- .save rp, r34
- mov r34=b0
- .body
- dep.z out1=in1,2,30
- mov out0=in0
- ;;
- br.call.sptk.many b0=do_csum
- ;;
- andcm ret0=-1,ret0
- mov ar.pfs=r35
- mov b0=r34
- br.ret.sptk.many b0
-END(ip_fast_csum)
-EXPORT_SYMBOL(ip_fast_csum)
-
-GLOBAL_ENTRY(csum_ipv6_magic)
- ld4 r20=[in0],4
- ld4 r21=[in1],4
- zxt4 in2=in2
- ;;
- ld4 r22=[in0],4
- ld4 r23=[in1],4
- dep r15=in3,in2,32,16
- ;;
- ld4 r24=[in0],4
- ld4 r25=[in1],4
- mux1 r15=r15,@rev
- add r16=r20,r21
- add r17=r22,r23
- zxt4 in4=in4
- ;;
- ld4 r26=[in0],4
- ld4 r27=[in1],4
- shr.u r15=r15,16
- add r18=r24,r25
- add r8=r16,r17
- ;;
- add r19=r26,r27
- add r8=r8,r18
- ;;
- add r8=r8,r19
- add r15=r15,in4
- ;;
- add r8=r8,r15
- ;;
- shr.u r10=r8,32 // now fold sum into short
- zxt4 r11=r8
- ;;
- add r8=r10,r11
- ;;
- shr.u r10=r8,16 // yeah, keep it rolling
- zxt2 r11=r8
- ;;
- add r8=r10,r11
- ;;
- shr.u r10=r8,16 // three times lucky
- zxt2 r11=r8
- ;;
- add r8=r10,r11
- mov r9=0xffff
- ;;
- andcm r8=r9,r8
- br.ret.sptk.many b0
-END(csum_ipv6_magic)
-EXPORT_SYMBOL(csum_ipv6_magic)
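
The aligned 20-byte fast path of ip_fast_csum boils down to this C sketch
(hypothetical helper name; the real routine falls back to do_csum for other
alignments and sizes, and the assembly unrolls the folds instead of looping):

static unsigned short iph_csum_sketch(const unsigned int *iph)
{
        unsigned long long sum = 0;
        int i;

        for (i = 0; i < 5; i++)         /* 20 bytes = 5 words */
                sum += iph[i];
        while (sum >> 16)               /* fold the carries into 16 bits */
                sum = (sum & 0xffff) + (sum >> 16);
        return (unsigned short)~sum;    /* complemented, as documented */
}
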
diff --git a/arch/ia64/lib/memcpy.S b/arch/ia64/lib/memcpy.S
deleted file mode 100644
index 35c9069a8345..000000000000
--- a/arch/ia64/lib/memcpy.S
+++ /dev/null
@@ -1,304 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard memcpy() function
- *
- * Inputs:
- * in0: destination address
- * in1: source address
- * in2: number of bytes to copy
- * Output:
- * no return value
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- * David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(memcpy)
-
-# define MEM_LAT 21 /* latency to memory */
-
-# define dst r2
-# define src r3
-# define retval r8
-# define saved_pfs r9
-# define saved_lc r10
-# define saved_pr r11
-# define cnt r16
-# define src2 r17
-# define t0 r18
-# define t1 r19
-# define t2 r20
-# define t3 r21
-# define t4 r22
-# define src_end r23
-
-# define N (MEM_LAT + 4)
-# define Nrot ((N + 7) & ~7)
-
- /*
- * First, check if everything (src, dst, len) is a multiple of eight. If
- * so, we handle everything with no taken branches (other than the loop
- * itself) and a small icache footprint. Otherwise, we jump off to
- * the more general copy routine handling arbitrary
- * sizes/alignment etc.
- */
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc
- or t0=in0,in1
- ;;
-
- or t0=t0,in2
- .save pr, saved_pr
- mov saved_pr=pr
-
- .body
-
- cmp.eq p6,p0=in2,r0 // zero length?
- mov retval=in0 // return dst
-(p6) br.ret.spnt.many rp // zero length, return immediately
- ;;
-
- mov dst=in0 // copy because of rotation
- shr.u cnt=in2,3 // number of 8-byte words to copy
- mov pr.rot=1<<16
- ;;
-
- adds cnt=-1,cnt // br.ctop is repeat/until
- cmp.gtu p7,p0=16,in2 // copying less than 16 bytes?
- mov ar.ec=N
- ;;
-
- and t0=0x7,t0
- mov ar.lc=cnt
- ;;
- cmp.ne p6,p0=t0,r0
-
- mov src=in1 // copy because of rotation
-(p7) br.cond.spnt.few .memcpy_short
-(p6) br.cond.spnt.few .memcpy_long
- ;;
- nop.m 0
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
- .rotr val[N]
- .rotp p[N]
- .align 32
-1: { .mib
-(p[0]) ld8 val[0]=[src],8
- nop.i 0
- brp.loop.imp 1b, 2f
-}
-2: { .mfb
-(p[N-1])st8 [dst]=val[N-1],8
- nop.f 0
- br.ctop.dptk.few 1b
-}
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,-1
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- /*
- * Small (<16 bytes) unaligned copying is done via a simple byte-at-a-time
- * copy loop. This performs relatively poorly on Itanium, but it doesn't
- * get used very often (gcc inlines small copies) and due to atomicity
- * issues, we want to avoid read-modify-write of entire words.
- */
- .align 32
-.memcpy_short:
- adds cnt=-1,in2 // br.ctop is repeat/until
- mov ar.ec=MEM_LAT
- brp.loop.imp 1f, 2f
- ;;
- mov ar.lc=cnt
- ;;
- nop.m 0
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
- nop.m 0
- ;;
- /*
- * It is faster to put a stop bit in the loop here because it makes
- * the pipeline shorter (and latency is what matters on short copies).
- */
- .align 32
-1: { .mib
-(p[0]) ld1 val[0]=[src],1
- nop.i 0
- brp.loop.imp 1b, 2f
-} ;;
-2: { .mfb
-(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
- nop.f 0
- br.ctop.dptk.few 1b
-} ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,-1
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- /*
- * Large (>= 16 bytes) copying is done in a fancy way. Latency isn't
- * an overriding concern here, but throughput is. We first do
- * sub-word copying until the destination is aligned, then we check
- * if the source is also aligned. If so, we do a simple load/store-loop
- * until there are less than 8 bytes left over and then we do the tail,
- * by storing the last few bytes using sub-word copying. If the source
- * is not aligned, we branch off to the non-congruent loop.
- *
- * stage: op:
- * 0 ld
- * :
- * MEM_LAT+3 shrp
- * MEM_LAT+4 st
- *
- * On Itanium, the pipeline itself runs without stalls. However, br.ctop
- * seems to introduce an unavoidable bubble in the pipeline so the overall
- * latency is 2 cycles/iteration. This gives us a _copy_ throughput
- * of 4 bytes/cycle. Still not bad.
- */
-# undef N
-# undef Nrot
-# define N (MEM_LAT + 5) /* number of stages */
-# define Nrot ((N+1 + 2 + 7) & ~7) /* number of rotating regs */
-
-#define LOG_LOOP_SIZE 6
-
-.memcpy_long:
- alloc t3=ar.pfs,3,Nrot,0,Nrot // resize register frame
- and t0=-8,src // t0 = src & ~7
- and t2=7,src // t2 = src & 7
- ;;
- ld8 t0=[t0] // t0 = 1st source word
- adds src2=7,src // src2 = (src + 7)
- sub t4=r0,dst // t4 = -dst
- ;;
- and src2=-8,src2 // src2 = (src + 7) & ~7
- shl t2=t2,3 // t2 = 8*(src & 7)
- shl t4=t4,3 // t4 = 8*(dst & 7)
- ;;
- ld8 t1=[src2] // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
- sub t3=64,t2 // t3 = 64-8*(src & 7)
- shr.u t0=t0,t2
- ;;
- add src_end=src,in2
- shl t1=t1,t3
- mov pr=t4,0x38 // (p5,p4,p3)=(dst & 7)
- ;;
- or t0=t0,t1
- mov cnt=r0
- adds src_end=-1,src_end
- ;;
-(p3) st1 [dst]=t0,1
-(p3) shr.u t0=t0,8
-(p3) adds cnt=1,cnt
- ;;
-(p4) st2 [dst]=t0,2
-(p4) shr.u t0=t0,16
-(p4) adds cnt=2,cnt
- ;;
-(p5) st4 [dst]=t0,4
-(p5) adds cnt=4,cnt
- and src_end=-8,src_end // src_end = last word of source buffer
- ;;
-
- // At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:
-
-1:{ add src=cnt,src // make src point to remainder of source buffer
- sub cnt=in2,cnt // cnt = number of bytes left to copy
- mov t4=ip
- } ;;
- and src2=-8,src // align source pointer
- adds t4=.memcpy_loops-1b,t4
- mov ar.ec=N
-
- and t0=7,src // t0 = src & 7
- shr.u t2=cnt,3 // t2 = number of 8-byte words left to copy
- shl cnt=cnt,3 // move bits 0-2 to 3-5
- ;;
-
- .rotr val[N+1], w[2]
- .rotp p[N]
-
- cmp.ne p6,p0=t0,r0 // is src aligned, too?
- shl t0=t0,LOG_LOOP_SIZE // t0 = 8*(src & 7)
- adds t2=-1,t2 // br.ctop is repeat/until
- ;;
- add t4=t0,t4
- mov pr=cnt,0x38 // set (p5,p4,p3) to # of last-word bytes to copy
- mov ar.lc=t2
- ;;
- nop.m 0
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
-(p6) ld8 val[1]=[src2],8 // prime the pump...
- mov b6=t4
- br.sptk.few b6
- ;;
-
-.memcpy_tail:
- // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
- // less than 8) and t0 contains the last few bytes of the src buffer:
-(p5) st4 [dst]=t0,4
-(p5) shr.u t0=t0,32
- mov ar.lc=saved_lc
- ;;
-(p4) st2 [dst]=t0,2
-(p4) shr.u t0=t0,16
- mov ar.pfs=saved_pfs
- ;;
-(p3) st1 [dst]=t0
- mov pr=saved_pr,-1
- br.ret.sptk.many rp
-
-///////////////////////////////////////////////////////
- .align 64
-
-#define COPY(shift,index) \
- 1: { .mib \
- (p[0]) ld8 val[0]=[src2],8; \
- (p[MEM_LAT+3]) shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift; \
- brp.loop.imp 1b, 2f \
- }; \
- 2: { .mfb \
- (p[MEM_LAT+4]) st8 [dst]=w[1],8; \
- nop.f 0; \
- br.ctop.dptk.few 1b; \
- }; \
- ;; \
- ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */ \
- ;; \
- shrp t0=val[N-1],val[N-index],shift; \
- br .memcpy_tail
-.memcpy_loops:
- COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
- COPY(8, 0)
- COPY(16, 0)
- COPY(24, 0)
- COPY(32, 0)
- COPY(40, 0)
- COPY(48, 0)
- COPY(56, 0)
-
-END(memcpy)
-EXPORT_SYMBOL(memcpy)
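
The COPY() jump-table entries above implement the classic shift-merge trick
for non-congruent src/dst alignment: each output word is stitched together
from two adjacent aligned source words with shrp. A little-endian C sketch,
assuming 0 < shift < 64 where shift = 8*(src & 7) (the shift == 0 case is
just the plain congruent loop, as COPY(0, 1) notes):

static void copy_shift_merge(unsigned long long *dst,
                             const unsigned long long *src_aligned,
                             unsigned int shift,        /* 8*(src & 7), nonzero */
                             unsigned long nwords)
{
        unsigned long long lo = src_aligned[0];
        unsigned long i;

        for (i = 0; i < nwords; i++) {
                unsigned long long hi = src_aligned[i + 1];
                /* shrp: top (64-shift) bits from lo, bottom shift bits from hi */
                dst[i] = (lo >> shift) | (hi << (64 - shift));
                lo = hi;
        }
}
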
diff --git a/arch/ia64/lib/memcpy_mck.S b/arch/ia64/lib/memcpy_mck.S
deleted file mode 100644
index c0d4362217ae..000000000000
--- a/arch/ia64/lib/memcpy_mck.S
+++ /dev/null
@@ -1,659 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Itanium 2-optimized version of memcpy and copy_user function
- *
- * Inputs:
- * in0: destination address
- * in1: source address
- * in2: number of bytes to copy
- * Output:
- * for memcpy: return dest
- * for copy_user: return 0 if success,
- * or number of bytes NOT copied if an error occurred.
- *
- * Copyright (C) 2002 Intel Corp.
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define EK(y...) EX(y)
-
-/* McKinley specific optimization */
-
-#define retval r8
-#define saved_pfs r31
-#define saved_lc r10
-#define saved_pr r11
-#define saved_in0 r14
-#define saved_in1 r15
-#define saved_in2 r16
-
-#define src0 r2
-#define src1 r3
-#define dst0 r17
-#define dst1 r18
-#define cnt r9
-
-/* r19-r30 are temp for each code section */
-#define PREFETCH_DIST 8
-#define src_pre_mem r19
-#define dst_pre_mem r20
-#define src_pre_l2 r21
-#define dst_pre_l2 r22
-#define t1 r23
-#define t2 r24
-#define t3 r25
-#define t4 r26
-#define t5 t1 // alias!
-#define t6 t2 // alias!
-#define t7 t3 // alias!
-#define n8 r27
-#define t9 t5 // alias!
-#define t10 t4 // alias!
-#define t11 t7 // alias!
-#define t12 t6 // alias!
-#define t14 t10 // alias!
-#define t13 r28
-#define t15 r29
-#define tmp r30
-
-/* defines for long_copy block */
-#define A 0
-#define B (PREFETCH_DIST)
-#define C (B + PREFETCH_DIST)
-#define D (C + 1)
-#define N (D + 1)
-#define Nrot ((N + 7) & ~7)
-
-/* alias */
-#define in0 r32
-#define in1 r33
-#define in2 r34
-
-GLOBAL_ENTRY(memcpy)
- and r28=0x7,in0
- and r29=0x7,in1
- mov f6=f0
- mov retval=in0
- br.cond.sptk .common_code
- ;;
-END(memcpy)
-EXPORT_SYMBOL(memcpy)
-GLOBAL_ENTRY(__copy_user)
- .prologue
-// check dest alignment
- and r28=0x7,in0
- and r29=0x7,in1
- mov f6=f1
- mov saved_in0=in0 // save dest pointer
- mov saved_in1=in1 // save src pointer
- mov retval=r0 // initialize return value
- ;;
-.common_code:
- cmp.gt p15,p0=8,in2 // check for small size
- cmp.ne p13,p0=0,r28 // check dest alignment
- cmp.ne p14,p0=0,r29 // check src alignment
- add src0=0,in1
- sub r30=8,r28 // for .align_dest
- mov saved_in2=in2 // save len
- ;;
- add dst0=0,in0
- add dst1=1,in0 // dest odd index
- cmp.le p6,p0 = 1,r30 // for .align_dest
-(p15) br.cond.dpnt .memcpy_short
-(p13) br.cond.dpnt .align_dest
-(p14) br.cond.dpnt .unaligned_src
- ;;
-
-// both dest and src are aligned on 8-byte boundary
-.aligned_src:
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
- .save pr, saved_pr
- mov saved_pr=pr
-
- shr.u cnt=in2,7 // this many cache lines
- ;;
- cmp.lt p6,p0=2*PREFETCH_DIST,cnt
- cmp.lt p7,p8=1,cnt
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc
- .body
- add cnt=-1,cnt
- add src_pre_mem=0,in1 // prefetch src pointer
- add dst_pre_mem=0,in0 // prefetch dest pointer
- ;;
-(p7) mov ar.lc=cnt // prefetch count
-(p8) mov ar.lc=r0
-(p6) br.cond.dpnt .long_copy
- ;;
-
-.prefetch:
- lfetch.fault [src_pre_mem], 128
- lfetch.fault.excl [dst_pre_mem], 128
- br.cloop.dptk.few .prefetch
- ;;
-
-.medium_copy:
- and tmp=31,in2 // copy length after the loop
- shr.u r29=in2,5 // number of 32-byte iterations
- add dst1=8,dst0 // 2nd dest pointer
- ;;
- add cnt=-1,r29 // ctop iteration adjustment
- cmp.eq p10,p0=r29,r0 // do we really need to loop?
- add src1=8,src0 // 2nd src pointer
- cmp.le p6,p0=8,tmp
- ;;
- cmp.le p7,p0=16,tmp
- mov ar.lc=cnt // loop setup
- cmp.eq p16,p17 = r0,r0
- mov ar.ec=2
-(p10) br.dpnt.few .aligned_src_tail
- ;;
- TEXT_ALIGN(32)
-1:
-EX(.ex_handler, (p16) ld8 r34=[src0],16)
-EK(.ex_handler, (p16) ld8 r38=[src1],16)
-EX(.ex_handler, (p17) st8 [dst0]=r33,16)
-EK(.ex_handler, (p17) st8 [dst1]=r37,16)
- ;;
-EX(.ex_handler, (p16) ld8 r32=[src0],16)
-EK(.ex_handler, (p16) ld8 r36=[src1],16)
-EX(.ex_handler, (p16) st8 [dst0]=r34,16)
-EK(.ex_handler, (p16) st8 [dst1]=r38,16)
- br.ctop.dptk.few 1b
- ;;
-
-.aligned_src_tail:
-EX(.ex_handler, (p6) ld8 t1=[src0])
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
-EX(.ex_hndlr_s, (p7) ld8 t2=[src1],8)
- cmp.le p8,p0=24,tmp
- and r21=-8,tmp
- ;;
-EX(.ex_hndlr_s, (p8) ld8 t3=[src1])
-EX(.ex_handler, (p6) st8 [dst0]=t1) // store byte 1
- and in2=7,tmp // remaining length
-EX(.ex_hndlr_d, (p7) st8 [dst1]=t2,8) // store byte 2
- add src0=src0,r21 // setting up src pointer
- add dst0=dst0,r21 // setting up dest pointer
- ;;
-EX(.ex_handler, (p8) st8 [dst1]=t3) // store byte 3
- mov pr=saved_pr,-1
- br.dptk.many .memcpy_short
- ;;
-
-/* code taken from copy_page_mck */
-.long_copy:
- .rotr v[2*PREFETCH_DIST]
- .rotp p[N]
-
- mov src_pre_mem = src0
- mov pr.rot = 0x10000
- mov ar.ec = 1 // special unrolled loop
-
- mov dst_pre_mem = dst0
-
- add src_pre_l2 = 8*8, src0
- add dst_pre_l2 = 8*8, dst0
- ;;
- add src0 = 8, src_pre_mem // first t1 src
- mov ar.lc = 2*PREFETCH_DIST - 1
- shr.u cnt=in2,7 // number of lines
- add src1 = 3*8, src_pre_mem // first t3 src
- add dst0 = 8, dst_pre_mem // first t1 dst
- add dst1 = 3*8, dst_pre_mem // first t3 dst
- ;;
- and tmp=127,in2 // remaining bytes after this block
- add cnt = -(2*PREFETCH_DIST) - 1, cnt
- // same as .line_copy loop, but with all predicated-off instructions removed:
-.prefetch_loop:
-EX(.ex_hndlr_lcpy_1, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0
-EK(.ex_hndlr_lcpy_1, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2
- br.ctop.sptk .prefetch_loop
- ;;
- cmp.eq p16, p0 = r0, r0 // reset p16 to 1
- mov ar.lc = cnt
- mov ar.ec = N // # of stages in pipeline
- ;;
-.line_copy:
-EX(.ex_handler, (p[D]) ld8 t2 = [src0], 3*8) // M0
-EK(.ex_handler, (p[D]) ld8 t4 = [src1], 3*8) // M1
-EX(.ex_handler_lcpy, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 prefetch dst from memory
-EK(.ex_handler_lcpy, (p[D]) st8 [dst_pre_l2] = n8, 128) // M3 prefetch dst from L2
- ;;
-EX(.ex_handler_lcpy, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0 prefetch src from memory
-EK(.ex_handler_lcpy, (p[C]) ld8 n8 = [src_pre_l2], 128) // M1 prefetch src from L2
-EX(.ex_handler, (p[D]) st8 [dst0] = t1, 8) // M2
-EK(.ex_handler, (p[D]) st8 [dst1] = t3, 8) // M3
- ;;
-EX(.ex_handler, (p[D]) ld8 t5 = [src0], 8)
-EK(.ex_handler, (p[D]) ld8 t7 = [src1], 3*8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t2, 3*8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t4, 3*8)
- ;;
-EX(.ex_handler, (p[D]) ld8 t6 = [src0], 3*8)
-EK(.ex_handler, (p[D]) ld8 t10 = [src1], 8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t5, 8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t7, 3*8)
- ;;
-EX(.ex_handler, (p[D]) ld8 t9 = [src0], 3*8)
-EK(.ex_handler, (p[D]) ld8 t11 = [src1], 3*8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t6, 3*8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t10, 8)
- ;;
-EX(.ex_handler, (p[D]) ld8 t12 = [src0], 8)
-EK(.ex_handler, (p[D]) ld8 t14 = [src1], 8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t9, 3*8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t11, 3*8)
- ;;
-EX(.ex_handler, (p[D]) ld8 t13 = [src0], 4*8)
-EK(.ex_handler, (p[D]) ld8 t15 = [src1], 4*8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t12, 8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t14, 8)
- ;;
-EX(.ex_handler, (p[C]) ld8 t1 = [src0], 8)
-EK(.ex_handler, (p[C]) ld8 t3 = [src1], 8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t13, 4*8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t15, 4*8)
- br.ctop.sptk .line_copy
- ;;
-
- add dst0=-8,dst0
- add src0=-8,src0
- mov in2=tmp
- .restore sp
- br.sptk.many .medium_copy
- ;;
-
-#define BLOCK_SIZE 128*32
-#define blocksize r23
-#define curlen r24
-
-// dest is on 8-byte boundary, src is not. We need to do
-// ld8-ld8, shrp, then st8. Max 8 bytes copied per cycle.
-.unaligned_src:
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,5,0,8
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc
- .save pr, saved_pr
- mov saved_pr=pr
- .body
-.4k_block:
- mov saved_in0=dst0 // need to save all input arguments
- mov saved_in2=in2
- mov blocksize=BLOCK_SIZE
- ;;
- cmp.lt p6,p7=blocksize,in2
- mov saved_in1=src0
- ;;
-(p6) mov in2=blocksize
- ;;
- shr.u r21=in2,7 // this many cache lines
- shr.u r22=in2,4 // number of 16-byte iterations
- and curlen=15,in2 // copy length after iteration
- and r30=7,src0 // source alignment
- ;;
- cmp.lt p7,p8=1,r21
- add cnt=-1,r21
- ;;
-
- add src_pre_mem=0,src0 // prefetch src pointer
- add dst_pre_mem=0,dst0 // prefetch dest pointer
- and src0=-8,src0 // 1st src pointer
-(p7) mov ar.lc = cnt
-(p8) mov ar.lc = r0
- ;;
- TEXT_ALIGN(32)
-1: lfetch.fault [src_pre_mem], 128
- lfetch.fault.excl [dst_pre_mem], 128
- br.cloop.dptk.few 1b
- ;;
-
- shladd dst1=r22,3,dst0 // 2nd dest pointer
- shladd src1=r22,3,src0 // 2nd src pointer
- cmp.eq p8,p9=r22,r0 // do we really need to loop?
- cmp.le p6,p7=8,curlen; // have at least 8 bytes remaining?
- add cnt=-1,r22 // ctop iteration adjustment
- ;;
-EX(.ex_handler, (p9) ld8 r33=[src0],8) // loop primer
-EK(.ex_handler, (p9) ld8 r37=[src1],8)
-(p8) br.dpnt.few .noloop
- ;;
-
-// The jump address is calculated based on src alignment. The COPYU
-// macro below needs to confine its size to a power of two, so an entry
-// can be calculated using shl instead of an expensive multiply. The
-// size is then hard coded by the following #define to match the
-// actual size. This makes it somewhat tedious when the COPYU macro
-// gets changed and this needs to be adjusted to match.
-#define LOOP_SIZE 6
-1:
- mov r29=ip // jmp_table thread
- mov ar.lc=cnt
- ;;
- add r29=.jump_table - 1b - (.jmp1-.jump_table), r29
- shl r28=r30, LOOP_SIZE // jmp_table thread
- mov ar.ec=2 // loop setup
- ;;
- add r29=r29,r28 // jmp_table thread
- cmp.eq p16,p17=r0,r0
- ;;
- mov b6=r29 // jmp_table thread
- ;;
- br.cond.sptk.few b6
-
-// for 8-15 byte case
-// We will skip the loop, but need to replicate the side effect
-// that the loop produces.
-.noloop:
-EX(.ex_handler, (p6) ld8 r37=[src1],8)
- add src0=8,src0
-(p6) shl r25=r30,3
- ;;
-EX(.ex_handler, (p6) ld8 r27=[src1])
-(p6) shr.u r28=r37,r25
-(p6) sub r26=64,r25
- ;;
-(p6) shl r27=r27,r26
- ;;
-(p6) or r21=r28,r27
-
-.unaligned_src_tail:
-/* check if we have more than blocksize to copy, if so go back */
- cmp.gt p8,p0=saved_in2,blocksize
- ;;
-(p8) add dst0=saved_in0,blocksize
-(p8) add src0=saved_in1,blocksize
-(p8) sub in2=saved_in2,blocksize
-(p8) br.dpnt .4k_block
- ;;
-
-/* we have up to 15 bytes to copy in the tail.
- * part of work is already done in the jump table code
- * we are at the following state.
- * src side:
- *
- * xxxxxx xx <----- r21 has xxxxxxxx already
- * -------- -------- --------
- * 0 8 16
- * ^
- * |
- * src1
- *
- * dst
- * -------- -------- --------
- * ^
- * |
- * dst1
- */
-EX(.ex_handler, (p6) st8 [dst1]=r21,8) // more than 8 bytes to copy
-(p6) add curlen=-8,curlen // update length
- mov ar.pfs=saved_pfs
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,-1
- mov in2=curlen // remaining length
- mov dst0=dst1 // dest pointer
- add src0=src1,r30 // forward by src alignment
- ;;
-
-// 7 bytes or fewer.
-.memcpy_short:
- cmp.le p8,p9 = 1,in2
- cmp.le p10,p11 = 2,in2
- cmp.le p12,p13 = 3,in2
- cmp.le p14,p15 = 4,in2
- add src1=1,src0 // second src pointer
- add dst1=1,dst0 // second dest pointer
- ;;
-
-EX(.ex_handler_short, (p8) ld1 t1=[src0],2)
-EK(.ex_handler_short, (p10) ld1 t2=[src1],2)
-(p9) br.ret.dpnt rp // 0 byte copy
- ;;
-
-EX(.ex_handler_short, (p8) st1 [dst0]=t1,2)
-EK(.ex_handler_short, (p10) st1 [dst1]=t2,2)
-(p11) br.ret.dpnt rp // 1 byte copy
-
-EX(.ex_handler_short, (p12) ld1 t3=[src0],2)
-EK(.ex_handler_short, (p14) ld1 t4=[src1],2)
-(p13) br.ret.dpnt rp // 2 byte copy
- ;;
-
- cmp.le p6,p7 = 5,in2
- cmp.le p8,p9 = 6,in2
- cmp.le p10,p11 = 7,in2
-
-EX(.ex_handler_short, (p12) st1 [dst0]=t3,2)
-EK(.ex_handler_short, (p14) st1 [dst1]=t4,2)
-(p15) br.ret.dpnt rp // 3 byte copy
- ;;
-
-EX(.ex_handler_short, (p6) ld1 t5=[src0],2)
-EK(.ex_handler_short, (p8) ld1 t6=[src1],2)
-(p7) br.ret.dpnt rp // 4 byte copy
- ;;
-
-EX(.ex_handler_short, (p6) st1 [dst0]=t5,2)
-EK(.ex_handler_short, (p8) st1 [dst1]=t6,2)
-(p9) br.ret.dptk rp // 5 byte copy
-
-EX(.ex_handler_short, (p10) ld1 t7=[src0],2)
-(p11) br.ret.dptk rp // 6 byte copy
- ;;
-
-EX(.ex_handler_short, (p10) st1 [dst0]=t7,2)
- br.ret.dptk rp // done all cases
-
-
-/* Align dest to the nearest 8-byte boundary. We know we have at
- * least 7 bytes to copy, enough to crawl to an 8-byte boundary.
- * The actual number of bytes to crawl depends on the dest alignment.
- * 7 bytes or fewer are taken care of at .memcpy_short.
-
- * src0 - source even index
- * src1 - source odd index
- * dst0 - dest even index
- * dst1 - dest odd index
- * r30 - distance to 8-byte boundary
- */
-
-.align_dest:
- add src1=1,in1 // source odd index
- cmp.le p7,p0 = 2,r30 // for .align_dest
- cmp.le p8,p0 = 3,r30 // for .align_dest
-EX(.ex_handler_short, (p6) ld1 t1=[src0],2)
- cmp.le p9,p0 = 4,r30 // for .align_dest
- cmp.le p10,p0 = 5,r30
- ;;
-EX(.ex_handler_short, (p7) ld1 t2=[src1],2)
-EK(.ex_handler_short, (p8) ld1 t3=[src0],2)
- cmp.le p11,p0 = 6,r30
-EX(.ex_handler_short, (p6) st1 [dst0] = t1,2)
- cmp.le p12,p0 = 7,r30
- ;;
-EX(.ex_handler_short, (p9) ld1 t4=[src1],2)
-EK(.ex_handler_short, (p10) ld1 t5=[src0],2)
-EX(.ex_handler_short, (p7) st1 [dst1] = t2,2)
-EK(.ex_handler_short, (p8) st1 [dst0] = t3,2)
- ;;
-EX(.ex_handler_short, (p11) ld1 t6=[src1],2)
-EK(.ex_handler_short, (p12) ld1 t7=[src0],2)
- cmp.eq p6,p7=r28,r29
-EX(.ex_handler_short, (p9) st1 [dst1] = t4,2)
-EK(.ex_handler_short, (p10) st1 [dst0] = t5,2)
- sub in2=in2,r30
- ;;
-EX(.ex_handler_short, (p11) st1 [dst1] = t6,2)
-EK(.ex_handler_short, (p12) st1 [dst0] = t7)
- add dst0=in0,r30 // setup arguments
- add src0=in1,r30
-(p6) br.cond.dptk .aligned_src
-(p7) br.cond.dpnt .unaligned_src
- ;;
-
-/* main loop body in jump table format */
-#define COPYU(shift) \
-1: \
-EX(.ex_handler, (p16) ld8 r32=[src0],8); /* 1 */ \
-EK(.ex_handler, (p16) ld8 r36=[src1],8); \
- (p17) shrp r35=r33,r34,shift;; /* 1 */ \
-EX(.ex_handler, (p6) ld8 r22=[src1]); /* common, prime for tail section */ \
- nop.m 0; \
- (p16) shrp r38=r36,r37,shift; \
-EX(.ex_handler, (p17) st8 [dst0]=r35,8); /* 1 */ \
-EK(.ex_handler, (p17) st8 [dst1]=r39,8); \
- br.ctop.dptk.few 1b;; \
- (p7) add src1=-8,src1; /* back out for <8 byte case */ \
- shrp r21=r22,r38,shift; /* speculative work */ \
- br.sptk.few .unaligned_src_tail /* branch out of jump table */ \
- ;;
- TEXT_ALIGN(32)
-.jump_table:
- COPYU(8) // unaligned cases
-.jmp1:
- COPYU(16)
- COPYU(24)
- COPYU(32)
- COPYU(40)
- COPYU(48)
- COPYU(56)
-
-#undef A
-#undef B
-#undef C
-#undef D
-
-/*
- * Due to lack of local tag support in the gcc 2.x assembler, it is not clear
- * which instruction failed in the bundle. The exception algorithm is that we
- * first figure out the faulting address, then detect whether any progress was
- * made on the copy; if so, we redo the copy from the last known copied
- * location up to the faulting address (exclusive). In the copy_from_user
- * case, the remaining bytes in the kernel buffer will be zeroed.
- *
- * Take copy_from_user as an example, in the code there are multiple loads
- * in a bundle and those multiple loads could span over two pages, the
- * faulting address is calculated as page_round_down(max(src0, src1)).
- * This is based on knowledge that if we can access one byte in a page, we
- * can access any byte in that page.
- *
- * predicate used in the exception handler:
- * p6-p7: direction
- * p10-p11: src faulting addr calculation
- * p12-p13: dst faulting addr calculation
- */
-
-#define A r19
-#define B r20
-#define C r21
-#define D r22
-#define F r28
-
-#define saved_retval loc0
-#define saved_rtlink loc1
-#define saved_pfs_stack loc2
-
-.ex_hndlr_s:
- add src0=8,src0
- br.sptk .ex_handler
- ;;
-.ex_hndlr_d:
- add dst0=8,dst0
- br.sptk .ex_handler
- ;;
-.ex_hndlr_lcpy_1:
- mov src1=src_pre_mem
- mov dst1=dst_pre_mem
- cmp.gtu p10,p11=src_pre_mem,saved_in1
- cmp.gtu p12,p13=dst_pre_mem,saved_in0
- ;;
-(p10) add src0=8,saved_in1
-(p11) mov src0=saved_in1
-(p12) add dst0=8,saved_in0
-(p13) mov dst0=saved_in0
- br.sptk .ex_handler
-.ex_handler_lcpy:
- // in the line_copy block, the preload addresses should always be
- // ahead of the other two src/dst pointers. Furthermore, src1/dst1
- // should always be ahead of src0/dst0.
- mov src1=src_pre_mem
- mov dst1=dst_pre_mem
-.ex_handler:
- mov pr=saved_pr,-1 // first restore pr, lc, and pfs
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- ;;
-.ex_handler_short: // faults in these sections didn't change pr, lc, or pfs
- cmp.ltu p6,p7=saved_in0, saved_in1 // get the copy direction
- cmp.ltu p10,p11=src0,src1
- cmp.ltu p12,p13=dst0,dst1
- fcmp.eq p8,p0=f6,f0 // is it memcpy?
- mov tmp = dst0
- ;;
-(p11) mov src1 = src0 // pick the larger of the two
-(p13) mov dst0 = dst1 // make dst0 the smaller one
-(p13) mov dst1 = tmp // and dst1 the larger one
- ;;
-(p6) dep F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
-(p7) dep F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
- ;;
-(p6) cmp.le p14,p0=dst0,saved_in0 // no progress has been made on store
-(p7) cmp.le p14,p0=src0,saved_in1 // no progress has been made on load
- mov retval=saved_in2
-(p8) ld1 tmp=[src1] // force an oops for memcpy call
-(p8) st1 [dst1]=r0 // force an oops for memcpy call
-(p14) br.ret.sptk.many rp
-
-/*
- * The remaining bytes to copy are calculated as:
- *
- * A = (faulting_addr - orig_src) -> len to faulting ld address
- * or
- * (faulting_addr - orig_dst) -> len to faulting st address
- * B = (cur_dst - orig_dst) -> len copied so far
- * C = A - B -> len need to be copied
- * D = orig_len - A -> len that needs to be left alone
- */
-(p6) sub A = F, saved_in0
-(p7) sub A = F, saved_in1
- clrrrb
- ;;
- alloc saved_pfs_stack=ar.pfs,3,3,3,0
- cmp.lt p8,p0=A,r0
- sub B = dst0, saved_in0 // how many bytes copied so far
- ;;
-(p8) mov A = 0; // A shouldn't be negative, cap it
- ;;
- sub C = A, B
- sub D = saved_in2, A
- ;;
- cmp.gt p8,p0=C,r0 // more than 1 byte?
- mov r8=0
- mov saved_retval = D
- mov saved_rtlink = b0
-
- add out0=saved_in0, B
- add out1=saved_in1, B
- mov out2=C
-(p8) br.call.sptk.few b0=__copy_user // recursive call
- ;;
-
- add saved_retval=saved_retval,r8 // above might return non-zero value
- ;;
-
- mov retval=saved_retval
- mov ar.pfs=saved_pfs_stack
- mov b0=saved_rtlink
- br.ret.sptk.many rp
-
-/* end of McKinley specific optimization */
-END(__copy_user)
-EXPORT_SYMBOL(__copy_user)
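
The A/B/C/D arithmetic in the exception handler is hard to follow in
predicated assembly, so here is a C sketch with illustrative names (F is
the faulting address rounded down to a page boundary, as in the code; the
caller only retries when retry_len > 0, matching the cmp.gt guard):

static void recover_sketch(unsigned long F,    /* faulting addr, page-rounded */
                           unsigned long orig, unsigned long cur,
                           unsigned long orig_len,
                           unsigned long *retry_off, unsigned long *retry_len,
                           unsigned long *not_copied)
{
        long A = (long)(F - orig);      /* len up to the faulting address */
        unsigned long B = cur - orig;   /* len copied so far */

        if (A < 0)
                A = 0;                  /* A shouldn't be negative; cap it */
        *retry_off = B;                 /* resume the copy at offset B */
        *retry_len = (unsigned long)A - B;              /* C = A - B */
        *not_copied = orig_len - (unsigned long)A;      /* D: left alone */
}
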
diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S
deleted file mode 100644
index 552c5c7e4d06..000000000000
--- a/arch/ia64/lib/memset.S
+++ /dev/null
@@ -1,365 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Optimized version of the standard memset() function.
-
- Copyright (c) 2002 Hewlett-Packard Co/CERN
- Sverre Jarp <Sverre.Jarp@cern.ch>
-
- Return: dest
-
- Inputs:
- in0: dest
- in1: value
- in2: count
-
- The algorithm is fairly straightforward: set byte by byte until we
- get to a 16B-aligned address, then loop on 128B chunks using an
- early store as prefetching, then loop on 32B chunks, then clear remaining
- words, and finally clear remaining bytes.
- Since a stf.spill f0 can store 16B in one go, we use this instruction
- to get peak speed when value = 0. */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#undef ret
-
-#define dest in0
-#define value in1
-#define cnt in2
-
-#define tmp r31
-#define save_lc r30
-#define ptr0 r29
-#define ptr1 r28
-#define ptr2 r27
-#define ptr3 r26
-#define ptr9 r24
-#define loopcnt r23
-#define linecnt r22
-#define bytecnt r21
-
-#define fvalue f6
-
-// This routine uses only scratch predicate registers (p6 - p15)
-#define p_scr p6 // default register for same-cycle branches
-#define p_nz p7
-#define p_zr p8
-#define p_unalgn p9
-#define p_y p11
-#define p_n p12
-#define p_yy p13
-#define p_nn p14
-
-#define MIN1 15
-#define MIN1P1HALF 8
-#define LINE_SIZE 128
-#define LSIZE_SH 7 // shift amount
-#define PREF_AHEAD 8
-
-GLOBAL_ENTRY(memset)
-{ .mmi
- .prologue
- alloc tmp = ar.pfs, 3, 0, 0, 0
- lfetch.nt1 [dest] //
- .save ar.lc, save_lc
- mov.i save_lc = ar.lc
- .body
-} { .mmi
- mov ret0 = dest // return value
- cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero
- cmp.eq p_scr, p0 = cnt, r0
-;; }
-{ .mmi
- and ptr2 = -(MIN1+1), dest // aligned address
- and tmp = MIN1, dest // prepare to check for correct alignment
- tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U)
-} { .mib
- mov ptr1 = dest
- mux1 value = value, @brcst // create 8 identical bytes in word
-(p_scr) br.ret.dpnt.many rp // return immediately if count = 0
-;; }
-{ .mib
- cmp.ne p_unalgn, p0 = tmp, r0 //
-} { .mib
- sub bytecnt = (MIN1+1), tmp // NB: # of bytes to move is 1 higher than loopcnt
- cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task?
-(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
-;; }
-{ .mmi
-(p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment
-(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ?
-;; }
-{ .mib
-(p_y) add cnt = -8, cnt //
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ?
-} { .mib
-(p_y) st8 [ptr2] = value,-4 //
-(p_n) add ptr2 = 4, ptr2 //
-;; }
-{ .mib
-(p_yy) add cnt = -4, cnt //
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ?
-} { .mib
-(p_yy) st4 [ptr2] = value,-2 //
-(p_nn) add ptr2 = 2, ptr2 //
-;; }
-{ .mmi
- mov tmp = LINE_SIZE+1 // for compare
-(p_y) add cnt = -2, cnt //
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ?
-} { .mmi
- setf.sig fvalue=value // transfer value to FLP side
-(p_y) st2 [ptr2] = value,-1 //
-(p_n) add ptr2 = 1, ptr2 //
-;; }
-
-{ .mmi
-(p_yy) st1 [ptr2] = value //
- cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task?
-} { .mbb
-(p_yy) add cnt = -1, cnt //
-(p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few
-;; }
-
-{ .mib
- nop.m 0
- shr.u linecnt = cnt, LSIZE_SH
-(p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill
-;; }
-
- TEXT_ALIGN(32) // --------------------- // L1A: store ahead into cache lines; fill later
-{ .mmi
- and tmp = -(LINE_SIZE), cnt // compute end of range
- mov ptr9 = ptr1 // used for prefetching
- and cnt = (LINE_SIZE-1), cnt // remainder
-} { .mmi
- mov loopcnt = PREF_AHEAD-1 // default prefetch loop
- cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
-;; }
-{ .mmi
-(p_scr) add loopcnt = -1, linecnt //
- add ptr2 = 8, ptr1 // start of stores (beyond prefetch stores)
- add ptr1 = tmp, ptr1 // first address beyond total range
-;; }
-{ .mmi
- add tmp = -1, linecnt // next loop count
- mov.i ar.lc = loopcnt //
-;; }
-.pref_l1a:
-{ .mib
- stf8 [ptr9] = fvalue, 128 // Do stores one cache line apart
- nop.i 0
- br.cloop.dptk.few .pref_l1a
-;; }
-{ .mmi
- add ptr0 = 16, ptr2 // Two stores in parallel
- mov.i ar.lc = tmp //
-;; }
-.l1ax:
- { .mmi
- stf8 [ptr2] = fvalue, 8
- stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 24
- stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 8
- stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 24
- stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 8
- stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 24
- stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 8
- stf8 [ptr0] = fvalue, 32
- cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
- ;; }
-{ .mmb
- stf8 [ptr2] = fvalue, 24
-(p_scr) stf8 [ptr9] = fvalue, 128
- br.cloop.dptk.few .l1ax
-;; }
-{ .mbb
- cmp.le p_scr, p0 = 8, cnt // just a few bytes left ?
-(p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2
- br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3
-;; }
-
- TEXT_ALIGN(32)
-.l1b: // ------------------------------------ // L1B: store ahead into cache lines; fill later
-{ .mmi
- and tmp = -(LINE_SIZE), cnt // compute end of range
- mov ptr9 = ptr1 // used for prefetching
- and cnt = (LINE_SIZE-1), cnt // remainder
-} { .mmi
- mov loopcnt = PREF_AHEAD-1 // default prefetch loop
- cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
-;; }
-{ .mmi
-(p_scr) add loopcnt = -1, linecnt
- add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores)
- add ptr1 = tmp, ptr1 // first address beyond total range
-;; }
-{ .mmi
- add tmp = -1, linecnt // next loop count
- mov.i ar.lc = loopcnt
-;; }
-.pref_l1b:
-{ .mib
- stf.spill [ptr9] = f0, 128 // Do stores one cache line apart
- nop.i 0
- br.cloop.dptk.few .pref_l1b
-;; }
-{ .mmi
- add ptr0 = 16, ptr2 // Two stores in parallel
- mov.i ar.lc = tmp
-;; }
-.l1bx:
- { .mmi
- stf.spill [ptr2] = f0, 32
- stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
- stf.spill [ptr2] = f0, 32
- stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
- stf.spill [ptr2] = f0, 32
- stf.spill [ptr0] = f0, 64
- cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
- ;; }
-{ .mmb
- stf.spill [ptr2] = f0, 32
-(p_scr) stf.spill [ptr9] = f0, 128
- br.cloop.dptk.few .l1bx
-;; }
-{ .mib
- cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
-(p_scr) br.cond.dpnt.many .move_bytes_from_alignment //
-;; }
-
-.fraction_of_line:
-{ .mib
- add ptr2 = 16, ptr1
- shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32
-;; }
-{ .mib
- cmp.eq p_scr, p0 = loopcnt, r0
- add loopcnt = -1, loopcnt
-(p_scr) br.cond.dpnt.many .store_words
-;; }
-{ .mib
- and cnt = 0x1f, cnt // compute the remaining cnt
- mov.i ar.lc = loopcnt
-;; }
- TEXT_ALIGN(32)
-.l2: // ------------------------------------ // L2A: store 32B in 2 cycles
-{ .mmb
- stf8 [ptr1] = fvalue, 8
- stf8 [ptr2] = fvalue, 8
-;; } { .mmb
- stf8 [ptr1] = fvalue, 24
- stf8 [ptr2] = fvalue, 24
- br.cloop.dptk.many .l2
-;; }
-.store_words:
-{ .mib
- cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
-(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch
-;; }
-
-{ .mmi
- stf8 [ptr1] = fvalue, 8 // store
- cmp.le p_y, p_n = 16, cnt
- add cnt = -8, cnt // subtract
-;; }
-{ .mmi
-(p_y) stf8 [ptr1] = fvalue, 8 // store
-(p_y) cmp.le.unc p_yy, p_nn = 16, cnt
-(p_y) add cnt = -8, cnt // subtract
-;; }
-{ .mmi // store
-(p_yy) stf8 [ptr1] = fvalue, 8
-(p_yy) add cnt = -8, cnt // subtract
-;; }
-
-.move_bytes_from_alignment:
-{ .mib
- cmp.eq p_scr, p0 = cnt, r0
- tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ?
-(p_scr) br.cond.dpnt.few .restore_and_exit
-;; }
-{ .mib
-(p_y) st4 [ptr1] = value,4
- tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ?
-;; }
-{ .mib
-(p_yy) st2 [ptr1] = value,2
- tbit.nz.unc p_y, p0 = cnt, 0 // should we terminate with a st1 ?
-;; }
-
-{ .mib
-(p_y) st1 [ptr1] = value
-;; }
-.restore_and_exit:
-{ .mib
- nop.m 0
- mov.i ar.lc = save_lc
- br.ret.sptk.many rp
-;; }
-
-.move_bytes_unaligned:
-{ .mmi
- .pred.rel "mutex",p_y, p_n
- .pred.rel "mutex",p_yy, p_nn
-(p_n) cmp.le p_yy, p_nn = 4, cnt
-(p_y) cmp.le p_yy, p_nn = 5, cnt
-(p_n) add ptr2 = 2, ptr1
-} { .mmi
-(p_y) add ptr2 = 3, ptr1
-(p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte [15, 14 (or less) left]
-(p_y) add cnt = -1, cnt
-;; }
-{ .mmi
-(p_yy) cmp.le.unc p_y, p0 = 8, cnt
- add ptr3 = ptr1, cnt // prepare last store
- mov.i ar.lc = save_lc
-} { .mmi
-(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
-(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [11, 10 (or less) left]
-(p_yy) add cnt = -4, cnt
-;; }
-{ .mmi
-(p_y) cmp.le.unc p_yy, p0 = 8, cnt
- add ptr3 = -1, ptr3 // last store
- tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ?
-} { .mmi
-(p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
-(p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [7, 6 (or less) left]
-(p_y) add cnt = -4, cnt
-;; }
-{ .mmi
-(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
-(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [3, 2 (or less) left]
- tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ?
-} { .mmi
-(p_yy) add cnt = -4, cnt
-;; }
-{ .mmb
-(p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes
-(p_y) st1 [ptr3] = value // fill last byte (using ptr3)
- br.ret.sptk.many rp
-}
-END(memset)
-EXPORT_SYMBOL(memset)
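
Collapsing the software pipelining and the separate stf8/stf.spill paths,
the staging described in the header comment looks roughly like this in C
(a sketch only: the 32-byte stage is folded into the word loop, and the
stf.spill shortcut for value == 0 is not modelled):

static void *memset_sketch(void *dest, int value, unsigned long cnt)
{
        unsigned char *p = dest;
        /* mux1 @brcst: replicate the byte across all 8 lanes */
        unsigned long long v8 = (unsigned char)value * 0x0101010101010101ULL;

        while (cnt && ((unsigned long)p & 15)) {        /* crawl to 16B */
                *p++ = (unsigned char)value;
                cnt--;
        }
        while (cnt >= 128) {                            /* full cache lines */
                unsigned long long *q = (void *)p;
                int i;
                for (i = 0; i < 16; i++)
                        q[i] = v8;
                p += 128;
                cnt -= 128;
        }
        while (cnt >= 8) {                              /* remaining words */
                *(unsigned long long *)(void *)p = v8;
                p += 8;
                cnt -= 8;
        }
        while (cnt--)                                   /* remaining bytes */
                *p++ = (unsigned char)value;
        return dest;
}
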
diff --git a/arch/ia64/lib/strlen.S b/arch/ia64/lib/strlen.S
deleted file mode 100644
index 1f4a46c15127..000000000000
--- a/arch/ia64/lib/strlen.S
+++ /dev/null
@@ -1,195 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard strlen() function
- *
- *
- * Inputs:
- * in0 address of string
- *
- * Outputs:
- * ret0 the number of characters in the string (0 if empty string)
- * does not count the \0
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * 09/24/99 S.Eranian add speculation recovery code
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-//
-// This is an enhanced version of the basic strlen. It includes a combination
-// of compute zero index (czx), parallel comparisons, speculative loads, and
-// loop unrolling using rotating registers.
-//
-// General Ideas about the algorithm:
-// The goal is to look at the string in chunks of 8 bytes,
-// so we need to do a few extra checks at the beginning because the
-// string may not be 8-byte aligned. In this case we load the 8-byte
-// quantity which includes the start of the string and mask the unused
-// bytes with 0xff to avoid confusing czx.
-// We use speculative loads and software pipelining to hide memory
-// latency and do read ahead safely. This way we defer any exception.
-//
-// Because we don't want the kernel to be relying on particular
-// settings of the DCR register, we provide recovery code in case
-// speculation fails. The recovery code is going to "redo" the work using
-// only normal loads. If we still get a fault then we generate a
-// kernel panic. Otherwise we return the strlen as usual.
-//
-// The fact that speculation may fail can be caused, for instance, by
-// the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
-// a NaT bit will be set if the translation is not present. The normal
-// load, on the other hand, will cause the translation to be inserted
-// if the mapping exists.
-//
-// It should be noted that we execute recovery code only when we need
-// to use the data that has been speculatively loaded: we don't execute
-// recovery code on pure read ahead data.
-//
-// Remarks:
-// - the cmp r0,r0 is used as a fast way to initialize a predicate
-// register to 1. This is required to make sure that we get the parallel
-// compare correct.
-//
-// - we don't use the epilogue counter to exit the loop but we need to set
-// it to zero beforehand.
-//
-// - after the loop we must test for NaT values because neither the
-// czx nor the cmp instruction raises a NaT consumption fault. We must be
-// careful not to look too far for a NaT we don't care about.
-// For instance we don't need to look at a NaT in val2 if the zero byte
-// was in val1.
-//
-// - Clearly performance tuning is required.
-//
-//
-//
-#define saved_pfs r11
-#define tmp r10
-#define base r16
-#define orig r17
-#define saved_pr r18
-#define src r19
-#define mask r20
-#define val r21
-#define val1 r22
-#define val2 r23
-
-GLOBAL_ENTRY(strlen)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8
-
- .rotr v[2], w[2] // declares our 4 aliases
-
- extr.u tmp=in0,0,3 // tmp=least significant 3 bits
- mov orig=in0 // keep track of initial byte address
- dep src=0,in0,0,3 // src=8byte-aligned in0 address
- .save pr, saved_pr
- mov saved_pr=pr // preserve predicates (rotation)
- ;;
-
- .body
-
- ld8 v[1]=[src],8 // must not speculate: can fail here
- shl tmp=tmp,3 // multiply by 8bits/byte
- mov mask=-1 // our mask
- ;;
- ld8.s w[1]=[src],8 // speculatively load next
- cmp.eq p6,p0=r0,r0 // sets p6 to true for cmp.and
- sub tmp=64,tmp // how many bits to shift our mask on the right
- ;;
- shr.u mask=mask,tmp // zero enough bits to hold v[1]'s valid part
- mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs)
- ;;
- add base=-16,src // keep track of aligned base
- or v[1]=v[1],mask // now we have a safe initial byte pattern
- ;;
-1:
- ld8.s v[0]=[src],8 // speculatively load next
- czx1.r val1=v[1] // search 0 byte from right
- czx1.r val2=w[1] // search 0 byte from right following 8bytes
- ;;
- ld8.s w[0]=[src],8 // speculatively load next to next
- cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8
- cmp.eq.and p6,p0=8,val2 // p6 = p6 and val2==8
-(p6) br.wtop.dptk 1b // loop until p6 == 0
- ;;
- //
- // We must try the recovery code iff
- // val1_is_nat || (val1==8 && val2_is_nat)
- //
- // XXX Fixme
- // - there must be a better way of doing the test
- //
- cmp.eq p8,p9=8,val1 // p8 = val1 had zero (disambiguate)
- tnat.nz p6,p7=val1 // test NaT on val1
-(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT
- ;;
- //
- // if we come here, p7 is true, i.e., initialized for cmp
- //
- cmp.eq.and p7,p0=8,val1 // val1==8?
- tnat.nz.and p7,p0=val2 // test NaT if val2
-(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT
- ;;
-(p8) mov val1=val2 // the other test got us out of the loop
-(p8) adds src=-16,src // correct position when 3 ahead
-(p9) adds src=-24,src // correct position when 4 ahead
- ;;
- sub ret0=src,orig // distance from base
- sub tmp=8,val1 // which byte in word
- mov pr=saved_pr,0xffffffffffff0000
- ;;
- sub ret0=ret0,tmp // adjust
- mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
- br.ret.sptk.many rp // end of normal execution
-
- //
- // Outlined recovery code when speculation failed
- //
- // This time we don't use speculation and rely on the normal exception
- // mechanism. That's why the loop is not as good as the previous one,
- // because read ahead is not possible.
- //
- // IMPORTANT:
- // Please note that in the case of strlen() as opposed to strlen_user()
- // we don't use the exception mechanism, as this function is not
- // supposed to fail. If that happens it means we have a bug and the
- // code will cause a kernel fault.
- //
- // XXX Fixme
- // - today we restart from the beginning of the string instead
- // of trying to continue where we left off.
- //
-.recover:
- ld8 val=[base],8 // will fail if unrecoverable fault
- ;;
- or val=val,mask // remask first bytes
- cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop
- ;;
- //
- // ar.ec is still zero here
- //
-2:
-(p6) ld8 val=[base],8 // will fail if unrecoverable fault
- ;;
- czx1.r val1=val // search 0 byte from right
- ;;
- cmp.eq p6,p0=8,val1 // val1==8 ?
-(p6) br.wtop.dptk 2b // loop until p6 == 0
- ;; // (avoid WAW on p63)
- sub ret0=base,orig // distance from base
- sub tmp=8,val1
- mov pr=saved_pr,0xffffffffffff0000
- ;;
- sub ret0=ret0,tmp // length=now - back -1
- mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
- br.ret.sptk.many rp // end of successful recovery code
-END(strlen)
-EXPORT_SYMBOL(strlen)
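
Ignoring speculation and the recovery path, the core scan above is the
familiar word-at-a-time zero-byte search. A C sketch using the classic bit
trick in place of czx1.r (assumes little-endian and an 8-byte-aligned start;
the initial masking done via v[1]|mask in the assembly is omitted):

static unsigned long strlen_sketch(const char *s)
{
        const unsigned long long *w = (const void *)s;  /* assumed aligned */
        unsigned long len = 0;
        unsigned long long x = *w++;

        /* nonzero iff some byte of x is zero (stand-in for czx1.r == 8) */
        while (!((x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL)) {
                len += 8;               /* no zero byte in this word */
                x = *w++;
        }
        while (x & 0xff) {              /* index of the first zero byte */
                x >>= 8;
                len++;
        }
        return len;
}
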
diff --git a/arch/ia64/lib/strncpy_from_user.S b/arch/ia64/lib/strncpy_from_user.S
deleted file mode 100644
index a287169bd953..000000000000
--- a/arch/ia64/lib/strncpy_from_user.S
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Just like strncpy() except that if a fault occurs during copying,
- * -EFAULT is returned.
- *
- * Inputs:
- * in0: address of destination buffer
- * in1: address of string to be copied
- * in2: length of buffer in bytes
- * Outputs:
- * r8: -EFAULT in case of fault or number of bytes copied if no fault
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 00/03/06 D. Mosberger Fixed to return proper return value (bug found
- * by Andreas Schwab <schwab@suse.de>).
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strncpy_from_user)
- alloc r2=ar.pfs,3,0,0,0
- mov r8=0
- mov r9=in1
- ;;
- add r10=in1,in2
- cmp.eq p6,p0=r0,in2
-(p6) br.ret.spnt.many rp
-
- // XXX braindead copy loop---this needs to be optimized
-.Loop1:
- EX(.Lexit, ld1 r8=[in1],1)
- ;;
- EX(.Lexit, st1 [in0]=r8,1)
- cmp.ne p6,p7=r8,r0
- ;;
-(p6) cmp.ne.unc p8,p0=in1,r10
-(p8) br.cond.dpnt.few .Loop1
- ;;
-(p6) mov r8=in2 // buffer filled up---return buffer length
-(p7) sub r8=in1,r9,1 // return string length (excluding NUL character)
-[.Lexit:]
- br.ret.sptk.many rp
-END(__strncpy_from_user)
-EXPORT_SYMBOL(__strncpy_from_user)
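
The return-value convention fixed in the 00/03/06 change is easier to see
in C (fault handling, which yields -EFAULT via the EX() annotations, is
ignored in this sketch):

static long strncpy_from_user_sketch(char *dst, const char *src, long n)
{
        long i;

        if (n == 0)
                return 0;
        for (i = 0; i < n; i++) {
                dst[i] = src[i];
                if (src[i] == '\0')
                        return i;       /* string length, NUL excluded */
        }
        return n;                       /* buffer filled up: buffer length */
}
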
diff --git a/arch/ia64/lib/strnlen_user.S b/arch/ia64/lib/strnlen_user.S
deleted file mode 100644
index a7eb56e840a9..000000000000
--- a/arch/ia64/lib/strnlen_user.S
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Returns 0 if exception before NUL or reaching the supplied limit (N),
- * a value greater than N if the string is longer than the limit, else
- * strlen(buffer) + 1 (the NUL is counted).
- *
- * Inputs:
- * in0: address of buffer
- * in1: string length limit N
- * Outputs:
- * r8: 0 in case of fault, strlen(buffer)+1 otherwise
- *
- * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strnlen_user)
- .prologue
- alloc r2=ar.pfs,2,0,0,0
- .save ar.lc, r16
- mov r16=ar.lc // preserve ar.lc
-
- .body
-
- add r3=-1,in1
- ;;
- mov ar.lc=r3
- mov r9=0
- ;;
- // XXX braindead strlen loop---this needs to be optimized
-.Loop1:
- EXCLR(.Lexit, ld1 r8=[in0],1)
- add r9=1,r9
- ;;
- cmp.eq p6,p0=r8,r0
-(p6) br.cond.dpnt .Lexit
- br.cloop.dptk.few .Loop1
-
- add r9=1,in1 // NUL not found---return N+1
- ;;
-.Lexit:
- mov r8=r9
- mov ar.lc=r16 // restore ar.lc
- br.ret.sptk.many rp
-END(__strnlen_user)
-EXPORT_SYMBOL(__strnlen_user)
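
[Editor's note: a minimal C sketch of the r8 convention documented in the
header above, again assuming get_user() for the faultable load;
strnlen_user_sketch is an illustrative name:

	#include <linux/uaccess.h>

	/* Returns 0 on fault, strlen()+1 if a NUL is found within the
	 * limit, and n+1 if the string is longer than n bytes. */
	static long strnlen_user_sketch(const char __user *s, long n)
	{
		long len = 0;

		while (len < n) {
			char c;

			if (get_user(c, s + len))
				return 0;
			len++;
			if (c == '\0')
				return len;	/* strlen() + 1 */
		}
		return n + 1;			/* NUL not found */
	}

The assembly counts iterations with ar.lc instead of an explicit index, which
is why it saves and restores that register in the prologue and exit path.]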
diff --git a/arch/ia64/lib/xor.S b/arch/ia64/lib/xor.S
deleted file mode 100644
index 6e2a69662c06..000000000000
--- a/arch/ia64/lib/xor.S
+++ /dev/null
@@ -1,181 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * arch/ia64/lib/xor.S
- *
- * Optimized RAID-5 checksumming functions for IA-64.
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(xor_ia64_2)
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 3, 0, 13, 16
- .save ar.lc, r30
- mov r30 = ar.lc
- .save pr, r29
- mov r29 = pr
- ;;
- .body
- mov r8 = in1
- mov ar.ec = 6 + 2
- shr in0 = in0, 3
- ;;
- adds in0 = -1, in0
- mov r16 = in1
- mov r17 = in2
- ;;
- mov ar.lc = in0
- mov pr.rot = 1 << 16
- ;;
- .rotr s1[6+1], s2[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[6+1])st8.nta [r8] = d[1], 8
- nop.f 0
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30
- mov pr = r29, -1
- br.ret.sptk.few rp
-END(xor_ia64_2)
-EXPORT_SYMBOL(xor_ia64_2)
-
-GLOBAL_ENTRY(xor_ia64_3)
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 4, 0, 20, 24
- .save ar.lc, r30
- mov r30 = ar.lc
- .save pr, r29
- mov r29 = pr
- ;;
- .body
- mov r8 = in1
- mov ar.ec = 6 + 2
- shr in0 = in0, 3
- ;;
- adds in0 = -1, in0
- mov r16 = in1
- mov r17 = in2
- ;;
- mov r18 = in3
- mov ar.lc = in0
- mov pr.rot = 1 << 16
- ;;
- .rotr s1[6+1], s2[6+1], s3[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
- ;;
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], s3[6]
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30
- mov pr = r29, -1
- br.ret.sptk.few rp
-END(xor_ia64_3)
-EXPORT_SYMBOL(xor_ia64_3)
-
-GLOBAL_ENTRY(xor_ia64_4)
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 5, 0, 27, 32
- .save ar.lc, r30
- mov r30 = ar.lc
- .save pr, r29
- mov r29 = pr
- ;;
- .body
- mov r8 = in1
- mov ar.ec = 6 + 2
- shr in0 = in0, 3
- ;;
- adds in0 = -1, in0
- mov r16 = in1
- mov r17 = in2
- ;;
- mov r18 = in3
- mov ar.lc = in0
- mov pr.rot = 1 << 16
- mov r19 = in4
- ;;
- .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[0]) ld8.nta s4[0] = [r19], 8
-(p[6]) xor r20 = s3[6], s4[6]
- ;;
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], r20
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30
- mov pr = r29, -1
- br.ret.sptk.few rp
-END(xor_ia64_4)
-EXPORT_SYMBOL(xor_ia64_4)
-
-GLOBAL_ENTRY(xor_ia64_5)
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 6, 0, 34, 40
- .save ar.lc, r30
- mov r30 = ar.lc
- .save pr, r29
- mov r29 = pr
- ;;
- .body
- mov r8 = in1
- mov ar.ec = 6 + 2
- shr in0 = in0, 3
- ;;
- adds in0 = -1, in0
- mov r16 = in1
- mov r17 = in2
- ;;
- mov r18 = in3
- mov ar.lc = in0
- mov pr.rot = 1 << 16
- mov r19 = in4
- mov r20 = in5
- ;;
- .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[0]) ld8.nta s4[0] = [r19], 8
-(p[6]) xor r21 = s3[6], s4[6]
- ;;
-(p[0]) ld8.nta s5[0] = [r20], 8
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], r21
- ;;
-(p[6]) xor d[0] = d[0], s5[6]
- nop.f 0
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30
- mov pr = r29, -1
- br.ret.sptk.few rp
-END(xor_ia64_5)
-EXPORT_SYMBOL(xor_ia64_5)
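
[Editor's note: stripped of the rotating-register software pipelining (the
ar.ec epilogue count, .rotr/.rotp declarations and pr.rot initialization
above), the arithmetic these routines perform is a word-wise XOR of the
source buffers into the first buffer. A minimal C sketch of the two- and
three-source cases; xor_sketch_* are illustrative names, and bytes is assumed
to be a multiple of 8, as the shr in0=in0,3 in the assembly also assumes:

	/* XOR the words of p2 into p1, eight bytes at a time. */
	static void xor_sketch_2(unsigned long bytes, unsigned long *p1,
				 const unsigned long *p2)
	{
		unsigned long words = bytes / 8;

		while (words--)
			*p1++ ^= *p2++;
	}

	/* Same, with two source buffers. */
	static void xor_sketch_3(unsigned long bytes, unsigned long *p1,
				 const unsigned long *p2,
				 const unsigned long *p3)
	{
		unsigned long words = bytes / 8;

		while (words--)
			*p1++ ^= *p2++ ^ *p3++;
	}

The assembly earns its keep by issuing loads under p[0] but not consuming
them until stage p[6], six rotations later, so the ld8.nta/xor/st8.nta work
of successive iterations overlaps with memory latency.]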