author    Ard Biesheuvel <ardb@kernel.org>  2022-10-20 15:54:33 +0200
committer Ard Biesheuvel <ardb@kernel.org>  2023-09-11 08:13:17 +0000
commit    cf8e8658100d4eae80ce9b21f7a81cb024dd5057 (patch)
tree      31d3b640bebf97c33d354768fc44dfd532c2df81 /arch/ia64/lib
parent    a0334bf78b95532cec54f56b53e8ae1bfe7e1ca1 (diff)
arch: Remove Itanium (IA-64) architecture
The Itanium architecture is obsolete, and an informal survey [0] reveals
that any residual use of Itanium hardware in production is mostly HP-UX
or OpenVMS based. The use of Linux on Itanium appears to be limited to
enthusiasts that occasionally boot a fresh Linux kernel to see whether
things are still working as intended, and perhaps to churn out some
distro packages that are rarely used in practice.

None of the original companies behind Itanium still produce or support
any hardware or software for the architecture, and it is listed as
'Orphaned' in the MAINTAINERS file, as apparently, none of the engineers
that contributed on behalf of those companies (nor anyone else, for that
matter) have been willing to support or maintain the architecture
upstream or even be responsible for applying the odd fix. The Intel
firmware team removed all IA-64 support from the Tianocore/EDK2
reference implementation of EFI in 2018. (Itanium is the original
architecture for which EFI was developed, and the way Linux supports it
deviates significantly from other architectures.) Some distros, such as
Debian and Gentoo, still maintain [unofficial] ia64 ports, but many
dropped support years ago.

While the argument is being made [1] that there is a 'for the common
good' angle to being able to build and run existing projects such as
the Grid Community Toolkit [2] on Itanium for interoperability testing,
the fact remains that none of those projects are known to be deployed
on Linux/ia64, and very few people actually have access to such a
system in the first place. Even if there were imaginable ways in which
Linux/ia64 could be put to good use today, what matters is whether
anyone is actually doing that, and this does not appear to be the case.

There are no emulators widely available, so boot testing Itanium is
generally infeasible for ordinary contributors. GCC still supports
IA-64, but its compile farm [3] no longer has any IA-64 machines. GLIBC
would like to get rid of IA-64 [4] too, because it would permit some
overdue code cleanups. In summary, the benefits to the ecosystem of
having IA-64 be part of it are mostly theoretical, whereas the
maintenance overhead of keeping it supported is real.

So let's rip off the band-aid, and remove the IA-64 arch code entirely.
This follows the timeline proposed by the Debian/ia64 maintainer [5],
which removes support in a controlled manner, leaving IA-64 in a known
good state in the most recent LTS release. Other projects will follow
once the kernel support is removed.

[0] https://lore.kernel.org/all/CAMj1kXFCMh_578jniKpUtx_j8ByHnt=s7S+yQ+vGbKt9ud7+kQ@mail.gmail.com/
[1] https://lore.kernel.org/all/0075883c-7c51-00f5-2c2d-5119c1820410@web.de/
[2] https://gridcf.org/gct-docs/latest/index.html
[3] https://cfarm.tetaneutral.net/machines/list/
[4] https://lore.kernel.org/all/87bkiilpc4.fsf@mid.deneb.enyo.de/
[5] https://lore.kernel.org/all/ff58a3e76e5102c94bb5946d99187b358def688a.camel@physik.fu-berlin.de/

Acked-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Diffstat (limited to 'arch/ia64/lib')
-rw-r--r--  arch/ia64/lib/Makefile                48
-rw-r--r--  arch/ia64/lib/checksum.c             102
-rw-r--r--  arch/ia64/lib/clear_page.S            79
-rw-r--r--  arch/ia64/lib/clear_user.S           212
-rw-r--r--  arch/ia64/lib/copy_page.S            101
-rw-r--r--  arch/ia64/lib/copy_page_mck.S        188
-rw-r--r--  arch/ia64/lib/copy_user.S            613
-rw-r--r--  arch/ia64/lib/csum_partial_copy.c     98
-rw-r--r--  arch/ia64/lib/do_csum.S              324
-rw-r--r--  arch/ia64/lib/flush.S                119
-rw-r--r--  arch/ia64/lib/idiv32.S                86
-rw-r--r--  arch/ia64/lib/idiv64.S                83
-rw-r--r--  arch/ia64/lib/io.c                    51
-rw-r--r--  arch/ia64/lib/ip_fast_csum.S         148
-rw-r--r--  arch/ia64/lib/memcpy.S               304
-rw-r--r--  arch/ia64/lib/memcpy_mck.S           659
-rw-r--r--  arch/ia64/lib/memset.S               365
-rw-r--r--  arch/ia64/lib/strlen.S               195
-rw-r--r--  arch/ia64/lib/strncpy_from_user.S     47
-rw-r--r--  arch/ia64/lib/strnlen_user.S          48
-rw-r--r--  arch/ia64/lib/xor.S                  181
21 files changed, 0 insertions, 4051 deletions
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
deleted file mode 100644
index 081fcba01dc0..000000000000
--- a/arch/ia64/lib/Makefile
+++ /dev/null
@@ -1,48 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for ia64-specific library routines.
-#
-
-lib-y := io.o __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
- __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \
- checksum.o clear_page.o csum_partial_copy.o \
- clear_user.o strncpy_from_user.o strnlen_user.o \
- flush.o ip_fast_csum.o do_csum.o \
- memset.o strlen.o xor.o
-
-lib-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o
-lib-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o
-
-AFLAGS___divdi3.o =
-AFLAGS___udivdi3.o = -DUNSIGNED
-AFLAGS___moddi3.o = -DMODULO
-AFLAGS___umoddi3.o = -DUNSIGNED -DMODULO
-
-AFLAGS___divsi3.o =
-AFLAGS___udivsi3.o = -DUNSIGNED
-AFLAGS___modsi3.o = -DMODULO
-AFLAGS___umodsi3.o = -DUNSIGNED -DMODULO
-
-$(obj)/__divdi3.o: $(src)/idiv64.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__udivdi3.o: $(src)/idiv64.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__moddi3.o: $(src)/idiv64.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__umoddi3.o: $(src)/idiv64.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__divsi3.o: $(src)/idiv32.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__udivsi3.o: $(src)/idiv32.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__modsi3.o: $(src)/idiv32.S FORCE
- $(call if_changed_rule,as_o_S)
-
-$(obj)/__umodsi3.o: $(src)/idiv32.S FORCE
- $(call if_changed_rule,as_o_S)
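
The AFLAGS_*.o lines and the per-object rules above build eight
division/modulo helpers from just two assembly sources: each object
recompiles idiv64.S or idiv32.S with a different -DUNSIGNED/-DMODULO
combination. A minimal C sketch of the same single-source pattern (the
file and function names here are illustrative, not taken from idiv64.S):

/* div.c: compiled four ways, e.g.
 *   cc -c -o __divdi3.o div.c
 *   cc -c -DUNSIGNED -DMODULO -o __umoddi3.o div.c
 */
#ifdef UNSIGNED
typedef unsigned long long int_t;
#else
typedef long long int_t;
#endif

int_t helper(int_t a, int_t b)
{
#ifdef MODULO
	return a % b;		/* __moddi3 / __umoddi3 flavor */
#else
	return a / b;		/* __divdi3 / __udivdi3 flavor */
#endif
}
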
diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c
deleted file mode 100644
index d26517fe3500..000000000000
--- a/arch/ia64/lib/checksum.c
+++ /dev/null
@@ -1,102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Network checksum routines
- *
- * Copyright (C) 1999, 2003 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * Most of the code comes from arch/alpha/lib/checksum.c
- *
- * This file contains network checksum routines that are better done
- * in an architecture-specific manner for speed.
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/byteorder.h>
-
-static inline unsigned short
-from64to16 (unsigned long x)
-{
- /* add up 32-bit words for 33 bits */
- x = (x & 0xffffffff) + (x >> 32);
- /* add up 16-bit and 17-bit words for 17+c bits */
- x = (x & 0xffff) + (x >> 16);
- /* add up 16-bit and 2-bit for 16+c bit */
- x = (x & 0xffff) + (x >> 16);
- /* add up carry.. */
- x = (x & 0xffff) + (x >> 16);
- return x;
-}
-
-/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented.
- */
-__sum16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
- __u8 proto, __wsum sum)
-{
- return (__force __sum16)~from64to16(
- (__force u64)saddr + (__force u64)daddr +
- (__force u64)sum + ((len + proto) << 8));
-}
-
-EXPORT_SYMBOL(csum_tcpudp_magic);
-
-__wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
- __u8 proto, __wsum sum)
-{
- unsigned long result;
-
- result = (__force u64)saddr + (__force u64)daddr +
- (__force u64)sum + ((len + proto) << 8);
-
- /* Fold down to 32-bits so we don't lose in the typedef-less network stack. */
- /* 64 to 33 */
- result = (result & 0xffffffff) + (result >> 32);
- /* 33 to 32 */
- result = (result & 0xffffffff) + (result >> 32);
- return (__force __wsum)result;
-}
-EXPORT_SYMBOL(csum_tcpudp_nofold);
-
-extern unsigned long do_csum (const unsigned char *, long);
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 32-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
- u64 result = do_csum(buff, len);
-
- /* add in old sum, and carry.. */
- result += (__force u32)sum;
- /* 32+c bits -> 32 bits */
- result = (result & 0xffffffff) + (result >> 32);
- return (__force __wsum)result;
-}
-
-EXPORT_SYMBOL(csum_partial);
-
-/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
- */
-__sum16 ip_compute_csum (const void *buff, int len)
-{
- return (__force __sum16)~do_csum(buff,len);
-}
-
-EXPORT_SYMBOL(ip_compute_csum);
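
The from64to16() folding above never loses a carry: each step adds the
high part of the running sum back into the low part, which preserves the
value modulo 0xffff. A stand-alone sketch (plain C, nothing assumed from
the kernel) that checks the folds against the closed-form
end-around-carry reduction:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* same folding steps as from64to16() above */
static uint16_t fold64to16(uint64_t x)
{
	x = (x & 0xffffffffULL) + (x >> 32);	/* 64 -> 33 bits */
	x = (x & 0xffff) + (x >> 16);		/* 33 -> 17+c bits */
	x = (x & 0xffff) + (x >> 16);		/* -> 16+c bits */
	x = (x & 0xffff) + (x >> 16);		/* absorb the last carry */
	return (uint16_t)x;
}

int main(void)
{
	const uint64_t samples[] = {
		0, 1, 0xffff, 0x10000, 0xfffffffeULL,
		0x123456789abcdef0ULL, ~0ULL,
	};
	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		uint64_t x = samples[i];
		/* sums are congruent mod 0xffff; nonzero multiples fold
		 * to 0xffff rather than 0 */
		uint16_t want = x ? (uint16_t)((x - 1) % 0xffff + 1) : 0;
		assert(fold64to16(x) == want);
	}
	puts("folding matches the mod-0xffff closed form");
	return 0;
}
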
diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S
deleted file mode 100644
index ba0dd2538fa5..000000000000
--- a/arch/ia64/lib/clear_page.S
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1999-2002 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- * David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
- *
- * 1/06/01 davidm Tuned for Itanium.
- * 2/12/02 kchen Tuned for both Itanium and McKinley
- * 3/08/02 davidm Some more tweaking
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#ifdef CONFIG_ITANIUM
-# define L3_LINE_SIZE 64 // Itanium L3 line size
-# define PREFETCH_LINES 9 // magic number
-#else
-# define L3_LINE_SIZE 128 // McKinley L3 line size
-# define PREFETCH_LINES 12 // magic number
-#endif
-
-#define saved_lc r2
-#define dst_fetch r3
-#define dst1 r8
-#define dst2 r9
-#define dst3 r10
-#define dst4 r11
-
-#define dst_last r31
-
-GLOBAL_ENTRY(clear_page)
- .prologue
- .regstk 1,0,0,0
- mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until
- .save ar.lc, saved_lc
- mov saved_lc = ar.lc
-
- .body
- mov ar.lc = (PREFETCH_LINES - 1)
- mov dst_fetch = in0
- adds dst1 = 16, in0
- adds dst2 = 32, in0
- ;;
-.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
- adds dst3 = 48, in0 // executing this multiple times is harmless
- br.cloop.sptk.few .fetch
- ;;
- addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
- mov ar.lc = r16 // one L3 line per iteration
- adds dst4 = 64, in0
- ;;
-#ifdef CONFIG_ITANIUM
- // Optimized for Itanium
-1: stf.spill.nta [dst1] = f0, 64
- stf.spill.nta [dst2] = f0, 64
- cmp.lt p8,p0=dst_fetch, dst_last
- ;;
-#else
- // Optimized for McKinley
-1: stf.spill.nta [dst1] = f0, 64
- stf.spill.nta [dst2] = f0, 64
- stf.spill.nta [dst3] = f0, 64
- stf.spill.nta [dst4] = f0, 128
- cmp.lt p8,p0=dst_fetch, dst_last
- ;;
- stf.spill.nta [dst1] = f0, 64
- stf.spill.nta [dst2] = f0, 64
-#endif
- stf.spill.nta [dst3] = f0, 64
-(p8) stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
- br.cloop.sptk.few 1b
- ;;
- mov ar.lc = saved_lc // restore lc
- br.ret.sptk.many rp
-END(clear_page)
-EXPORT_SYMBOL(clear_page)
diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S
deleted file mode 100644
index 1d9e45ccf8e5..000000000000
--- a/arch/ia64/lib/clear_user.S
+++ /dev/null
@@ -1,212 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This routine clears to zero a linear memory buffer in user space.
- *
- * Inputs:
- * in0: address of buffer
- * in1: length of buffer in bytes
- * Outputs:
- * r8: number of bytes that didn't get cleared due to a fault
- *
- * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-// arguments
-//
-#define buf r32
-#define len r33
-
-//
-// local registers
-//
-#define cnt r16
-#define buf2 r17
-#define saved_lc r18
-#define saved_pfs r19
-#define tmp r20
-#define len2 r21
-#define len3 r22
-
-//
-// Theory of operations:
-// - we check whether or not the buffer is small, i.e., less than 17
-// bytes, in which case we do the byte by byte loop.
-//
-// - Otherwise we go progressively from 1-byte stores to 8-byte stores in
-// the head part, the body is a 16-byte store loop, and we finish with the
-// tail for the last 15 bytes.
-// The good point about this breakdown is that the long buffer handling
-// contains only 2 branches.
-//
-// The reason for not using shifting & masking for both the head and the
-// tail is to stay semantically correct. This routine is not supposed
-// to write bytes outside of the buffer. While most of the time this would
-// be ok, we can't tolerate a mistake. A classical example is the case
-// of multithreaded code where the extra bytes touched are actually owned
-// by another thread which runs concurrently to ours. Another, less likely,
-// example is with device drivers where reading an I/O mapped location may
-// have side effects (same thing for writing).
-//
-
-GLOBAL_ENTRY(__do_clear_user)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,2,0,0,0
- cmp.eq p6,p0=r0,len // check for zero length
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc // preserve ar.lc (slow)
- .body
- ;; // avoid WAW on CFM
- adds tmp=-1,len // br.ctop is repeat/until
- mov ret0=len // return value is length at this point
-(p6) br.ret.spnt.many rp
- ;;
- cmp.lt p6,p0=16,len // if len > 16 then long memset
- mov ar.lc=tmp // initialize lc for small count
-(p6) br.cond.dptk .long_do_clear
- ;; // WAR on ar.lc
- //
- // worst case 16 iterations, avg 8 iterations
- //
-	// We could have played with the predicates to use the extra
-	// M slot for 2 stores/iteration, but the cost of initializing
-	// the various counters, compared to how long the loop is supposed
-	// to last on average, does not make this solution viable.
- //
-1:
- EX( .Lexit1, st1 [buf]=r0,1 )
- adds len=-1,len // countdown length using len
- br.cloop.dptk 1b
- ;; // avoid RAW on ar.lc
- //
	// .Lexit1: comes from the byte by byte loop
- // len contains bytes left
-.Lexit1:
- mov ret0=len // faster than using ar.lc
- mov ar.lc=saved_lc
- br.ret.sptk.many rp // end of short clear_user
-
-
- //
- // At this point we know we have more than 16 bytes to copy
- // so we focus on alignment (no branches required)
- //
- // The use of len/len2 for countdown of the number of bytes left
- // instead of ret0 is due to the fact that the exception code
- // changes the values of r8.
- //
-.long_do_clear:
- tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear)
- ;;
- EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned
-(p6) adds len=-1,len;; // sync because buf is modified
- tbit.nz p6,p0=buf,1
- ;;
- EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned
-(p6) adds len=-2,len;;
- tbit.nz p6,p0=buf,2
- ;;
- EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned
-(p6) adds len=-4,len;;
- tbit.nz p6,p0=buf,3
- ;;
- EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned
-(p6) adds len=-8,len;;
- shr.u cnt=len,4 // number of 128-bit (2x64bit) words
- ;;
- cmp.eq p6,p0=r0,cnt
- adds tmp=-1,cnt
-(p6) br.cond.dpnt .dotail // we have less than 16 bytes left
- ;;
- adds buf2=8,buf // setup second base pointer
- mov ar.lc=tmp
- ;;
-
- //
- // 16bytes/iteration core loop
- //
- // The second store can never generate a fault because
- // we come into the loop only when we are 16-byte aligned.
- // This means that if we cross a page then it will always be
- // in the first store and never in the second.
- //
- //
	// We need to keep track of the remaining length. A possible (optimistic)
	// way would be to use ar.lc and derive how many bytes were left by
	// computing left = 16*ar.lc + 16. This would avoid the addition at
	// every iteration.
	// However we need to keep the synchronization point. A template
	// M;;MB does not exist and thus we can keep the addition at no
	// extra cycle cost (it uses a nop slot anyway). It also simplifies the
	// (unlikely) error recovery code.
- //
-
-2: EX(.Lexit3, st8 [buf]=r0,16 )
- ;; // needed to get len correct when error
- st8 [buf2]=r0,16
- adds len=-16,len
- br.cloop.dptk 2b
- ;;
- mov ar.lc=saved_lc
- //
- // tail correction based on len only
- //
- // We alternate the use of len3,len2 to allow parallelism and correct
- // error handling. We also reuse p6/p7 to return correct value.
- // The addition of len2/len3 does not cost anything more compared to
- // the regular memset as we had empty slots.
- //
-.dotail:
- mov len2=len // for parallelization of error handling
- mov len3=len
- tbit.nz p6,p0=len,3
- ;;
- EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes
-(p6) adds len3=-8,len2
- tbit.nz p7,p6=len,2
- ;;
- EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes
-(p7) adds len2=-4,len3
- tbit.nz p6,p7=len,1
- ;;
- EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes
-(p6) adds len3=-2,len2
- tbit.nz p7,p6=len,0
- ;;
- EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left
- mov ret0=r0 // success
- br.ret.sptk.many rp // end of most likely path
-
- //
- // Outlined error handling code
- //
-
- //
- // .Lexit3: comes from core loop, need restore pr/lc
- // len contains bytes left
- //
- //
- // .Lexit2:
- // if p6 -> coming from st8 or st2 : len2 contains what's left
- // if p7 -> coming from st4 or st1 : len3 contains what's left
	// We must restore lc/pr even though they might not have been used.
-.Lexit2:
- .pred.rel "mutex", p6, p7
-(p6) mov len=len2
-(p7) mov len=len3
- ;;
- //
	// .Lexit3: comes from head, need not restore pr/lc
- // len contains bytes left
- //
-.Lexit3:
- mov ret0=len
- mov ar.lc=saved_lc
- br.ret.sptk.many rp
-END(__do_clear_user)
-EXPORT_SYMBOL(__do_clear_user)
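
The head/body/tail scheme described in the theory-of-operations comment
maps to roughly the following C. This is only a sketch: the real routine
also software-pipelines the body as two 8-byte stores per iteration and
must report how many bytes survive a fault. What it shares with the
assembly is the key property of never storing outside [buf, buf+len):

#include <stddef.h>
#include <stdint.h>

static void clear_exact(void *buf, size_t len)
{
	uint8_t *p = buf;

	if (len < 16) {				/* short case: byte loop */
		while (len--)
			*p++ = 0;
		return;
	}
	/* head: widen the stores only once the address is aligned */
	if ((uintptr_t)p & 1) { *p = 0;             p += 1; len -= 1; }
	if ((uintptr_t)p & 2) { *(uint16_t *)p = 0; p += 2; len -= 2; }
	if ((uintptr_t)p & 4) { *(uint32_t *)p = 0; p += 4; len -= 4; }
	/* body: wide stores */
	while (len >= 8)      { *(uint64_t *)p = 0; p += 8; len -= 8; }
	/* tail: finish byte-exactly */
	while (len--)
		*p++ = 0;
}
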
diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S
deleted file mode 100644
index c0a0e6b2af00..000000000000
--- a/arch/ia64/lib/copy_page.S
+++ /dev/null
@@ -1,101 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard copy_page() function
- *
- * Inputs:
- * in0: address of target page
- * in1: address of source page
- * Output:
- * no return value
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- * David Mosberger <davidm@hpl.hp.com>
- *
- * 4/06/01 davidm Tuned to make it perform well both for cached and uncached copies.
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define PIPE_DEPTH 3
-#define EPI p[PIPE_DEPTH-1]
-
-#define lcount r16
-#define saved_pr r17
-#define saved_lc r18
-#define saved_pfs r19
-#define src1 r20
-#define src2 r21
-#define tgt1 r22
-#define tgt2 r23
-#define srcf r24
-#define tgtf r25
-#define tgt_last r26
-
-#define Nrot ((8*PIPE_DEPTH+7)&~7)
-
-GLOBAL_ENTRY(copy_page)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
-
- .rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
- t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
- .rotp p[PIPE_DEPTH]
-
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc
- mov ar.ec=PIPE_DEPTH
-
- mov lcount=PAGE_SIZE/64-1
- .save pr, saved_pr
- mov saved_pr=pr
- mov pr.rot=1<<16
-
- .body
-
- mov src1=in1
- adds src2=8,in1
- mov tgt_last = PAGE_SIZE
- ;;
- adds tgt2=8,in0
- add srcf=512,in1
- mov ar.lc=lcount
- mov tgt1=in0
- add tgtf=512,in0
- add tgt_last = tgt_last, in0
- ;;
-1:
-(p[0]) ld8 t1[0]=[src1],16
-(EPI) st8 [tgt1]=t1[PIPE_DEPTH-1],16
-(p[0]) ld8 t2[0]=[src2],16
-(EPI) st8 [tgt2]=t2[PIPE_DEPTH-1],16
- cmp.ltu p6,p0 = tgtf, tgt_last
- ;;
-(p[0]) ld8 t3[0]=[src1],16
-(EPI) st8 [tgt1]=t3[PIPE_DEPTH-1],16
-(p[0]) ld8 t4[0]=[src2],16
-(EPI) st8 [tgt2]=t4[PIPE_DEPTH-1],16
- ;;
-(p[0]) ld8 t5[0]=[src1],16
-(EPI) st8 [tgt1]=t5[PIPE_DEPTH-1],16
-(p[0]) ld8 t6[0]=[src2],16
-(EPI) st8 [tgt2]=t6[PIPE_DEPTH-1],16
- ;;
-(p[0]) ld8 t7[0]=[src1],16
-(EPI) st8 [tgt1]=t7[PIPE_DEPTH-1],16
-(p[0]) ld8 t8[0]=[src2],16
-(EPI) st8 [tgt2]=t8[PIPE_DEPTH-1],16
-
-(p6) lfetch [srcf], 64
-(p6) lfetch [tgtf], 64
- br.ctop.sptk.few 1b
- ;;
- mov pr=saved_pr,0xffffffffffff0000 // restore predicates
- mov ar.pfs=saved_pfs
- mov ar.lc=saved_lc
- br.ret.sptk.many rp
-END(copy_page)
-EXPORT_SYMBOL(copy_page)
diff --git a/arch/ia64/lib/copy_page_mck.S b/arch/ia64/lib/copy_page_mck.S
deleted file mode 100644
index 5e8bb4b4b535..000000000000
--- a/arch/ia64/lib/copy_page_mck.S
+++ /dev/null
@@ -1,188 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * McKinley-optimized version of copy_page().
- *
- * Copyright (C) 2002 Hewlett-Packard Co
- * David Mosberger <davidm@hpl.hp.com>
- *
- * Inputs:
- * in0: address of target page
- * in1: address of source page
- * Output:
- * no return value
- *
- * General idea:
- * - use regular loads and stores to prefetch data to avoid consuming M-slot just for
- * lfetches => good for in-cache performance
- * - avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
- * cycle
- *
- * Principle of operation:
- * First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
- * To avoid secondary misses in L2, we prefetch both source and destination with a line-size
- * of 128 bytes. When both of these lines are in the L2 and the first half of the
- * source line is in L1, we start copying the remaining words. The second half of the
- * source line is prefetched in an earlier iteration, so that by the time we start
- * accessing it, it's also present in the L1.
- *
- * We use a software-pipelined loop to control the overall operation. The pipeline
- * has 2*PREFETCH_DIST+K stages. The first PREFETCH_DIST stages are used for prefetching
- * source cache-lines. The second PREFETCH_DIST stages are used for prefetching destination
- * cache-lines, the last K stages are used to copy the cache-line words not copied by
- * the prefetches. The four relevant points in the pipeline are called A, B, C, D:
- * p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
- * should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
- * into L1D and p[D] is TRUE if a cacheline needs to be copied.
- *
- * This all sounds very complicated, but thanks to the modulo-scheduled loop support,
- * the resulting code is very regular and quite easy to follow (once you get the idea).
- *
- * As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
- * as the separate .prefetch_loop. Logically, this loop performs exactly like the
- * main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
- * so that each loop iteration is faster (again, good for cached case).
- *
- * When reading the code, it helps to keep the following picture in mind:
- *
- * word 0 word 1
- * +------+------+---
- * | v[x] | t1 | ^
- * | t2 | t3 | |
- * | t4 | t5 | |
- * | t6 | t7 | | 128 bytes
- * | n[y] | t9 | | (L2 cache line)
- * | t10 | t11 | |
- * | t12 | t13 | |
- * | t14 | t15 | v
- * +------+------+---
- *
- * Here, v[x] is copied by the (memory) prefetch. n[y] is loaded at p[C]
- * to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
- * an order that avoids bank conflicts.
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define PREFETCH_DIST 8 // McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
-
-#define src0 r2
-#define src1 r3
-#define dst0 r9
-#define dst1 r10
-#define src_pre_mem r11
-#define dst_pre_mem r14
-#define src_pre_l2 r15
-#define dst_pre_l2 r16
-#define t1 r17
-#define t2 r18
-#define t3 r19
-#define t4 r20
-#define t5 t1 // alias!
-#define t6 t2 // alias!
-#define t7 t3 // alias!
-#define t9 t5 // alias!
-#define t10 t4 // alias!
-#define t11 t7 // alias!
-#define t12 t6 // alias!
-#define t14 t10 // alias!
-#define t13 r21
-#define t15 r22
-
-#define saved_lc r23
-#define saved_pr r24
-
-#define A 0
-#define B (PREFETCH_DIST)
-#define C (B + PREFETCH_DIST)
-#define D (C + 3)
-#define N (D + 1)
-#define Nrot ((N + 7) & ~7)
-
-GLOBAL_ENTRY(copy_page)
- .prologue
- alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
-
- .rotr v[2*PREFETCH_DIST], n[D-C+1]
- .rotp p[N]
-
- .save ar.lc, saved_lc
- mov saved_lc = ar.lc
- .save pr, saved_pr
- mov saved_pr = pr
- .body
-
- mov src_pre_mem = in1
- mov pr.rot = 0x10000
- mov ar.ec = 1 // special unrolled loop
-
- mov dst_pre_mem = in0
- mov ar.lc = 2*PREFETCH_DIST - 1
-
- add src_pre_l2 = 8*8, in1
- add dst_pre_l2 = 8*8, in0
- add src0 = 8, in1 // first t1 src
- add src1 = 3*8, in1 // first t3 src
- add dst0 = 8, in0 // first t1 dst
- add dst1 = 3*8, in0 // first t3 dst
- mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
- nop.m 0
- nop.i 0
- ;;
- // same as .line_copy loop, but with all predicated-off instructions removed:
-.prefetch_loop:
-(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0
-(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2
- br.ctop.sptk .prefetch_loop
- ;;
- cmp.eq p16, p0 = r0, r0 // reset p16 to 1 (br.ctop cleared it to zero)
- mov ar.lc = t1 // with 64KB pages, t1 is too big to fit in 8 bits!
- mov ar.ec = N // # of stages in pipeline
- ;;
-.line_copy:
-(p[D]) ld8 t2 = [src0], 3*8 // M0
-(p[D]) ld8 t4 = [src1], 3*8 // M1
-(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 prefetch dst from memory
-(p[D]) st8 [dst_pre_l2] = n[D-C], 128 // M3 prefetch dst from L2
- ;;
-(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 prefetch src from memory
-(p[C]) ld8 n[0] = [src_pre_l2], 128 // M1 prefetch src from L2
-(p[D]) st8 [dst0] = t1, 8 // M2
-(p[D]) st8 [dst1] = t3, 8 // M3
- ;;
-(p[D]) ld8 t5 = [src0], 8
-(p[D]) ld8 t7 = [src1], 3*8
-(p[D]) st8 [dst0] = t2, 3*8
-(p[D]) st8 [dst1] = t4, 3*8
- ;;
-(p[D]) ld8 t6 = [src0], 3*8
-(p[D]) ld8 t10 = [src1], 8
-(p[D]) st8 [dst0] = t5, 8
-(p[D]) st8 [dst1] = t7, 3*8
- ;;
-(p[D]) ld8 t9 = [src0], 3*8
-(p[D]) ld8 t11 = [src1], 3*8
-(p[D]) st8 [dst0] = t6, 3*8
-(p[D]) st8 [dst1] = t10, 8
- ;;
-(p[D]) ld8 t12 = [src0], 8
-(p[D]) ld8 t14 = [src1], 8
-(p[D]) st8 [dst0] = t9, 3*8
-(p[D]) st8 [dst1] = t11, 3*8
- ;;
-(p[D]) ld8 t13 = [src0], 4*8
-(p[D]) ld8 t15 = [src1], 4*8
-(p[D]) st8 [dst0] = t12, 8
-(p[D]) st8 [dst1] = t14, 8
- ;;
-(p[D-1])ld8 t1 = [src0], 8
-(p[D-1])ld8 t3 = [src1], 8
-(p[D]) st8 [dst0] = t13, 4*8
-(p[D]) st8 [dst1] = t15, 4*8
- br.ctop.sptk .line_copy
- ;;
- mov ar.lc = saved_lc
- mov pr = saved_pr, -1
- br.ret.sptk.many rp
-END(copy_page)
-EXPORT_SYMBOL(copy_page)
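
A rough C analogue of the PREFETCH_DIST pipeline above: keep the copy
position a fixed number of cache lines behind the prefetch position, so
a line is already resident by the time it is copied. This is a sketch
under the line-size assumptions stated in the header; __builtin_prefetch
stands in for the ld8/st8-based prefetch stages, and the staggered
L2-to-L1 fetch (the n[y] load) is omitted:

#include <stddef.h>
#include <string.h>

#define LINE	128	/* L2 line size assumed above */
#define DIST	8	/* PREFETCH_DIST: lines kept in flight */

static void copy_page_sketch(void *dst, const void *src, size_t size)
{
	const char *s = src;
	char *d = dst;

	for (size_t off = 0; off < size; off += LINE) {
		if (off + DIST * LINE < size) {
			__builtin_prefetch(s + off + DIST * LINE, 0); /* read */
			__builtin_prefetch(d + off + DIST * LINE, 1); /* write */
		}
		memcpy(d + off, s + off, LINE);
	}
}
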
diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S
deleted file mode 100644
index 8daab72cfe77..000000000000
--- a/arch/ia64/lib/copy_user.S
+++ /dev/null
@@ -1,613 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the copy_user() routine.
- * It is used to copy data across the kernel/user boundary.
- *
- * The source and destination are always on opposite sides of
- * the boundary. When reading from user space we must catch
- * faults on loads. When writing to user space we must catch
- * errors on stores. Note that because of the nature of the copy
- * we don't need to worry about overlapping regions.
- *
- *
- * Inputs:
- * in0 address of source buffer
- * in1 address of destination buffer
- * in2 number of bytes to copy
- *
- * Outputs:
- * ret0 0 in case of success. The number of bytes NOT copied in
- * case of error.
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * Fixme:
- *	- handle the case where we have more than 16 bytes and the
- *	  alignments are different.
- * - more benchmarking
- * - fix extraneous stop bit introduced by the EX() macro.
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-// Tuneable parameters
-//
-#define COPY_BREAK 16 // we do byte copy below (must be >=16)
-#define PIPE_DEPTH 21 // pipe depth
-
-#define EPI p[PIPE_DEPTH-1]
-
-//
-// arguments
-//
-#define dst in0
-#define src in1
-#define len in2
-
-//
-// local registers
-//
-#define t1 r2 // rshift in bytes
-#define t2 r3 // lshift in bytes
-#define rshift r14 // right shift in bits
-#define lshift r15 // left shift in bits
-#define word1 r16
-#define word2 r17
-#define cnt r18
-#define len2 r19
-#define saved_lc r20
-#define saved_pr r21
-#define tmp r22
-#define val r23
-#define src1 r24
-#define dst1 r25
-#define src2 r26
-#define dst2 r27
-#define len1 r28
-#define enddst r29
-#define endsrc r30
-#define saved_pfs r31
-
-GLOBAL_ENTRY(__copy_user)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
-
- .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
- .rotp p[PIPE_DEPTH]
-
- adds len2=-1,len // br.ctop is repeat/until
- mov ret0=r0
-
- ;; // RAW of cfm when len=0
- cmp.eq p8,p0=r0,len // check for zero length
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc // preserve ar.lc (slow)
-(p8)	br.ret.spnt.many rp		// empty memcpy()
- ;;
	add enddst=dst,len	// first byte after end of destination
	add endsrc=src,len	// first byte after end of source
- .save pr, saved_pr
- mov saved_pr=pr // preserve predicates
-
- .body
-
- mov dst1=dst // copy because of rotation
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
-
- mov src1=src // copy because of rotation
- mov ar.lc=len2 // initialize lc for small count
- cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy
-
- xor tmp=src,dst // same alignment test prepare
-(p10) br.cond.dptk .long_copy_user
- ;; // RAW pr.rot/p16 ?
- //
- // Now we do the byte by byte loop with software pipeline
- //
- // p7 is necessarily false by now
-1:
- EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
- EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
- br.ctop.dptk.few 1b
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.pfs=saved_pfs // restore ar.ec
- br.ret.sptk.many rp // end of short memcpy
-
- //
- // Not 8-byte aligned
- //
-.diff_align_copy_user:
- // At this point we know we have more than 16 bytes to copy
- // and also that src and dest do _not_ have the same alignment.
- and src2=0x7,src1 // src offset
- and dst2=0x7,dst1 // dst offset
- ;;
- // The basic idea is that we copy byte-by-byte at the head so
- // that we can reach 8-byte alignment for both src1 and dst1.
- // Then copy the body using software pipelined 8-byte copy,
- // shifting the two back-to-back words right and left, then copy
- // the tail by copying byte-by-byte.
- //
- // Fault handling. If the byte-by-byte at the head fails on the
	// load, then restart and finish the pipeline by copying zeros
	// to dst1. Then copy zeros for the rest of dst1.
	// If the 8-byte software pipeline fails on the load, do the same as
	// failure_in3 does. If the byte-by-byte at the tail fails, it is
	// handled simply by failure_in_pipe1.
	//
	// The p14 case means the source has more bytes in its first word
	// (by the shifted part), whereas in the p15 case some bytes from
	// the second source word must be copied into the tail of the first
	// destination word.
- //
-
- //
- // Optimization. If dst1 is 8-byte aligned (quite common), we don't need
- // to copy the head to dst1, to start 8-byte copy software pipeline.
- // We know src1 is not 8-byte aligned in this case.
- //
- cmp.eq p14,p15=r0,dst2
-(p15) br.cond.spnt 1f
- ;;
- sub t1=8,src2
- mov t2=src2
- ;;
- shl rshift=t2,3
- sub len1=len,t1 // set len1
- ;;
- sub lshift=64,rshift
- ;;
- br.cond.spnt .word_copy_user
- ;;
-1:
- cmp.leu p14,p15=src2,dst2
- sub t1=dst2,src2
- ;;
- .pred.rel "mutex", p14, p15
-(p14) sub word1=8,src2 // (8 - src offset)
-(p15) sub t1=r0,t1 // absolute value
-(p15) sub word1=8,dst2 // (8 - dst offset)
- ;;
- // For the case p14, we don't need to copy the shifted part to
- // the 1st word of destination.
- sub t2=8,t1
-(p14) sub word1=word1,t1
- ;;
- sub len1=len,word1 // resulting len
-(p15) shl rshift=t1,3 // in bits
-(p14) shl rshift=t2,3
- ;;
-(p14) sub len1=len1,t1
- adds cnt=-1,word1
- ;;
- sub lshift=64,rshift
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
- mov ar.lc=cnt
- ;;
-2:
- EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
- EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
- br.ctop.dptk.few 2b
- ;;
- clrrrb
- ;;
-.word_copy_user:
- cmp.gtu p9,p0=16,len1
-(p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy
- ;;
- shr.u cnt=len1,3 // number of 64-bit words
- ;;
- adds cnt=-1,cnt
- ;;
- .pred.rel "mutex", p14, p15
-(p14) sub src1=src1,t2
-(p15) sub src1=src1,t1
- //
- // Now both src1 and dst1 point to an 8-byte aligned address. And
- // we have more than 8 bytes to copy.
- //
- mov ar.lc=cnt
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
- ;;
-3:
- //
	// The pipeline consists of 3 stages:
- // 1 (p16): Load a word from src1
- // 2 (EPI_1): Shift right pair, saving to tmp
- // 3 (EPI): Store tmp to dst1
- //
- // To make it simple, use at least 2 (p16) loops to set up val1[n]
- // because we need 2 back-to-back val1[] to get tmp.
- // Note that this implies EPI_2 must be p18 or greater.
- //
-
-#define EPI_1 p[PIPE_DEPTH-2]
-#define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift
-#define CASE(pred, shift) \
- (pred) br.cond.spnt .copy_user_bit##shift
-#define BODY(rshift) \
-.copy_user_bit##rshift: \
-1: \
- EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \
-(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
- EX(3f,(p16) ld8 val1[1]=[src1],8); \
-(p16) mov val1[0]=r0; \
- br.ctop.dptk 1b; \
- ;; \
- br.cond.sptk.many .diff_align_do_tail; \
-2: \
-(EPI) st8 [dst1]=tmp,8; \
-(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
-3: \
-(p16) mov val1[1]=r0; \
-(p16) mov val1[0]=r0; \
- br.ctop.dptk 2b; \
- ;; \
- br.cond.sptk.many .failure_in2
-
- //
- // Since the instruction 'shrp' requires a fixed 128-bit value
- // specifying the bits to shift, we need to provide 7 cases
- // below.
- //
- SWITCH(p6, 8)
- SWITCH(p7, 16)
- SWITCH(p8, 24)
- SWITCH(p9, 32)
- SWITCH(p10, 40)
- SWITCH(p11, 48)
- SWITCH(p12, 56)
- ;;
- CASE(p6, 8)
- CASE(p7, 16)
- CASE(p8, 24)
- CASE(p9, 32)
- CASE(p10, 40)
- CASE(p11, 48)
- CASE(p12, 56)
- ;;
- BODY(8)
- BODY(16)
- BODY(24)
- BODY(32)
- BODY(40)
- BODY(48)
- BODY(56)
- ;;
-.diff_align_do_tail:
- .pred.rel "mutex", p14, p15
-(p14) sub src1=src1,t1
-(p14) adds dst1=-8,dst1
-(p15) sub dst1=dst1,t1
- ;;
-4:
- // Tail correction.
- //
	// The problem with this pipelined loop is that the last word is not
	// loaded, and thus part of the last word written is not correct.
- // To fix that, we simply copy the tail byte by byte.
-
- sub len1=endsrc,src1,1
- clrrrb
- ;;
- mov ar.ec=PIPE_DEPTH
- mov pr.rot=1<<16 // p16=true all others are false
- mov ar.lc=len1
- ;;
-5:
- EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
- EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
- br.ctop.dptk.few 5b
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
	// Beginning of long memcpy (i.e. > 16 bytes)
- //
-.long_copy_user:
- tbit.nz p6,p7=src1,0 // odd alignment
- and tmp=7,tmp
- ;;
- cmp.eq p10,p8=r0,tmp
- mov len1=len // copy because of rotation
-(p8) br.cond.dpnt .diff_align_copy_user
- ;;
- // At this point we know we have more than 16 bytes to copy
- // and also that both src and dest have the same alignment
- // which may not be the one we want. So for now we must move
- // forward slowly until we reach 16byte alignment: no need to
- // worry about reaching the end of buffer.
- //
- EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned
-(p6) adds len1=-1,len1;;
- tbit.nz p7,p0=src1,1
- ;;
- EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned
-(p7) adds len1=-2,len1;;
- tbit.nz p8,p0=src1,2
- ;;
- //
- // Stop bit not required after ld4 because if we fail on ld4
- // we have never executed the ld1, therefore st1 is not executed.
- //
- EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned
- ;;
- EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
- tbit.nz p9,p0=src1,3
- ;;
- //
- // Stop bit not required after ld8 because if we fail on ld8
- // we have never executed the ld2, therefore st2 is not executed.
- //
- EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned
- EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
-(p8) adds len1=-4,len1
- ;;
- EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
-(p9) adds len1=-8,len1;;
- shr.u cnt=len1,4 // number of 128-bit (2x64bit) words
- ;;
- EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
- tbit.nz p6,p0=len1,3
- cmp.eq p7,p0=r0,cnt
- adds tmp=-1,cnt // br.ctop is repeat/until
-(p7) br.cond.dpnt .dotail // we have less than 16 bytes left
- ;;
- adds src2=8,src1
- adds dst2=8,dst1
- mov ar.lc=tmp
- ;;
- //
- // 16bytes/iteration
- //
-2:
- EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
-(p16) ld8 val2[0]=[src2],16
-
- EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
-(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
- br.ctop.dptk 2b
- ;; // RAW on src1 when fall through from loop
- //
- // Tail correction based on len only
- //
- // No matter where we come from (loop or test) the src1 pointer
- // is 16 byte aligned AND we have less than 16 bytes to copy.
- //
-.dotail:
- EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes
- tbit.nz p7,p0=len1,2
- ;;
- EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes
- tbit.nz p8,p0=len1,1
- ;;
- EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes
- tbit.nz p9,p0=len1,0
- ;;
- EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
- ;;
- EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left
- mov ar.lc=saved_lc
- ;;
- EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
- mov pr=saved_pr,0xffffffffffff0000
- ;;
- EX(.failure_out, (p8) st2 [dst1]=val2[0],2)
- mov ar.pfs=saved_pfs
- ;;
- EX(.failure_out, (p9) st1 [dst1]=val2[1])
- br.ret.sptk.many rp
-
-
- //
- // Here we handle the case where the byte by byte copy fails
- // on the load.
- // Several factors make the zeroing of the rest of the buffer kind of
- // tricky:
- // - the pipeline: loads/stores are not in sync (pipeline)
- //
- // In the same loop iteration, the dst1 pointer does not directly
- // reflect where the faulty load was.
- //
- // - pipeline effect
	// When you get a fault on load, you may have valid data from
	// previous loads, not yet stored, in transit. Such data must be
	// stored normally before moving on to zeroing the rest.
- //
- // - single/multi dispersal independence.
- //
- // solution:
- // - we don't disrupt the pipeline, i.e. data in transit in
	// the software pipeline will eventually be moved to memory.
- // We simply replace the load with a simple mov and keep the
- // pipeline going. We can't really do this inline because
- // p16 is always reset to 1 when lc > 0.
- //
-.failure_in_pipe1:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
-1:
-(p16) mov val1[0]=r0
-(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
- br.ctop.dptk 1b
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
- // This is the case where the byte by byte copy fails on the load
- // when we copy the head. We need to finish the pipeline and copy
- // zeros for the rest of the destination. Since this happens
- // at the top we still need to fill the body and tail.
-.failure_in_pipe2:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
-2:
-(p16) mov val1[0]=r0
-(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
- br.ctop.dptk 2b
- ;;
- sub len=enddst,dst1,1 // precompute len
- br.cond.dptk.many .failure_in1bis
- ;;
-
- //
- // Here we handle the head & tail part when we check for alignment.
- // The following code handles only the load failures. The
	// main difficulty comes from the fact that loads/stores are
- // scheduled. So when you fail on a load, the stores corresponding
- // to previous successful loads must be executed.
- //
- // However some simplifications are possible given the way
- // things work.
- //
- // 1) HEAD
- // Theory of operation:
- //
- // Page A | Page B
- // ---------|-----
- // 1|8 x
- // 1 2|8 x
- // 4|8 x
- // 1 4|8 x
- // 2 4|8 x
- // 1 2 4|8 x
- // |1
- // |2 x
- // |4 x
- //
- // page_size >= 4k (2^12). (x means 4, 2, 1)
- // Here we suppose Page A exists and Page B does not.
- //
- // As we move towards eight byte alignment we may encounter faults.
- // The numbers on each page show the size of the load (current alignment).
- //
- // Key point:
- // - if you fail on 1, 2, 4 then you have never executed any smaller
- // size loads, e.g. failing ld4 means no ld1 nor ld2 executed
- // before.
- //
- // This allows us to simplify the cleanup code, because basically you
- // only have to worry about "pending" stores in the case of a failing
- // ld8(). Given the way the code is written today, this means only
- // worry about st2, st4. There we can use the information encapsulated
- // into the predicates.
- //
- // Other key point:
- // - if you fail on the ld8 in the head, it means you went straight
	// to it, i.e. 8-byte alignment within a nonexistent page.
	// Again this comes from the fact that if you crossed just for the ld8 then
	// you are 8-byte aligned but also 16-byte aligned, therefore you would
- // either go for the 16byte copy loop OR the ld8 in the tail part.
- // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
- // because it would mean you had 15bytes to copy in which case you
- // would have defaulted to the byte by byte copy.
- //
- //
- // 2) TAIL
	// Here we know we have less than 16 bytes AND we are either 8- or 16-byte
- // aligned.
- //
- // Key point:
- // This means that we either:
- // - are right on a page boundary
- // OR
- // - are at more than 16 bytes from a page boundary with
- // at most 15 bytes to copy: no chance of crossing.
- //
- // This allows us to assume that if we fail on a load we haven't possibly
- // executed any of the previous (tail) ones, so we don't need to do
- // any stores. For instance, if we fail on ld2, this means we had
- // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
- //
	// This means that we are in a situation similar to a fault in the
- // head part. That's nice!
- //
-.failure_in1:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
- sub len=endsrc,src1,1
- //
- // we know that ret0 can never be zero at this point
	// because we failed while trying to do a load, i.e. there is still
- // some work to do.
- // The failure_in1bis and length problem is taken care of at the
- // calling side.
- //
- ;;
-.failure_in1bis: // from (.failure_in3)
- mov ar.lc=len // Continue with a stupid byte store.
- ;;
-5:
- st1 [dst1]=r0,1
- br.cloop.dptk 5b
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
- // Here we simply restart the loop but instead
- // of doing loads we fill the pipeline with zeroes
- // We can't simply store r0 because we may have valid
- // data in transit in the pipeline.
- // ar.lc and ar.ec are setup correctly at this point
- //
- // we MUST use src1/endsrc here and not dst1/enddst because
- // of the pipeline effect.
- //
-.failure_in3:
- sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
- ;;
-2:
-(p16) mov val1[0]=r0
-(p16) mov val2[0]=r0
-(EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16
-(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
- br.ctop.dptk 2b
- ;;
- cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
- sub len=enddst,dst1,1 // precompute len
-(p6) br.cond.dptk .failure_in1bis
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
-.failure_in2:
- sub ret0=endsrc,src1
- cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
- sub len=enddst,dst1,1 // precompute len
-(p6) br.cond.dptk .failure_in1bis
- ;;
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- //
- // handling of failures on stores: that's the easy part
- //
-.failure_out:
- sub ret0=enddst,dst1
- mov pr=saved_pr,0xffffffffffff0000
- mov ar.lc=saved_lc
-
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-END(__copy_user)
-EXPORT_SYMBOL(__copy_user)
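
The shrp in each BODY() variant above is a funnel shift: it merges two
consecutive aligned source words into one aligned destination word when
source and destination disagree in alignment. A little-endian C sketch
(valid for the rshift values 8..56 actually used here; a shift count of
0 or 64 would be undefined in C):

#include <stdint.h>

/* equivalent of "shrp tmp = hi, lo, rshift": take the 128-bit pair
 * hi:lo, shift it right by rshift bits, keep the low 64 bits.  'lo' is
 * the earlier source word, 'hi' the one loaded after it. */
static uint64_t funnel(uint64_t hi, uint64_t lo, unsigned rshift)
{
	return (lo >> rshift) | (hi << (64 - rshift));
}
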
diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c
deleted file mode 100644
index 917e3138b277..000000000000
--- a/arch/ia64/lib/csum_partial_copy.c
+++ /dev/null
@@ -1,98 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Network Checksum & Copy routine
- *
- * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * Most of the code has been imported from Linux/Alpha
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-
-#include <net/checksum.h>
-
-/*
- * XXX Fixme: those 2 inlines are meant for debugging and will go away
- */
-static inline unsigned short
-from64to16(unsigned long x)
-{
- /* add up 32-bit words for 33 bits */
- x = (x & 0xffffffff) + (x >> 32);
- /* add up 16-bit and 17-bit words for 17+c bits */
- x = (x & 0xffff) + (x >> 16);
- /* add up 16-bit and 2-bit for 16+c bit */
- x = (x & 0xffff) + (x >> 16);
- /* add up carry.. */
- x = (x & 0xffff) + (x >> 16);
- return x;
-}
-
-static inline
-unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
-{
- int odd, count;
- unsigned long result = (unsigned long)psum;
-
- if (len <= 0)
- goto out;
- odd = 1 & (unsigned long) buff;
- if (odd) {
- result = *buff << 8;
- len--;
- buff++;
- }
- count = len >> 1; /* nr of 16-bit words.. */
- if (count) {
- if (2 & (unsigned long) buff) {
- result += *(unsigned short *) buff;
- count--;
- len -= 2;
- buff += 2;
- }
- count >>= 1; /* nr of 32-bit words.. */
- if (count) {
- if (4 & (unsigned long) buff) {
- result += *(unsigned int *) buff;
- count--;
- len -= 4;
- buff += 4;
- }
- count >>= 1; /* nr of 64-bit words.. */
- if (count) {
- unsigned long carry = 0;
- do {
- unsigned long w = *(unsigned long *) buff;
- count--;
- buff += 8;
- result += carry;
- result += w;
- carry = (w > result);
- } while (count);
- result += carry;
- result = (result & 0xffffffff) + (result >> 32);
- }
- if (len & 4) {
- result += *(unsigned int *) buff;
- buff += 4;
- }
- }
- if (len & 2) {
- result += *(unsigned short *) buff;
- buff += 2;
- }
- }
- if (len & 1)
- result += *buff;
-
- result = from64to16(result);
-
- if (odd)
- result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-
-out:
- return result;
-}
diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S
deleted file mode 100644
index 6004dad2597c..000000000000
--- a/arch/ia64/lib/do_csum.S
+++ /dev/null
@@ -1,324 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard do_csum() function
- *
- * Return: a 64bit quantity containing the 16bit Internet checksum
- *
- * Inputs:
- * in0: address of buffer to checksum (char *)
- * in1: length of the buffer (int)
- *
- * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * 02/04/22 Ken Chen <kenneth.w.chen@intel.com>
- * Data locality study on the checksum buffer.
- * More optimization cleanup - remove excessive stop bits.
- * 02/04/08 David Mosberger <davidm@hpl.hp.com>
- * More cleanup and tuning.
- * 01/04/18 Jun Nakajima <jun.nakajima@intel.com>
- *		Clean up and optimize the software pipeline, loading two
- * back-to-back 8-byte words per loop. Clean up the initialization
- * for the loop. Support the cases where load latency = 1 or 2.
- * Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
- */
-
-#include <asm/asmmacro.h>
-
-//
-// Theory of operations:
-// The goal is to go as quickly as possible to the point where
-// we can checksum 16 bytes/loop. Before reaching that point we must
-// take care of incorrect alignment of first byte.
-//
-// The code hereafter also takes care of the "tail" part of the buffer
-// before entering the core loop, if any. The checksum is a sum so it
-// allows us to commute operations. So we do the "head" and "tail"
-// first to finish at full speed in the body. Once we get the head and
-// tail values, we feed them into the pipeline, very handy initialization.
-//
-// Of course we deal with the special case where the whole buffer fits
-// into one 8 byte word. In this case we have only one entry in the pipeline.
-//
-// We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
-// possible load latency and also to accommodate for head and tail.
-//
-// The end of the function deals with folding the checksum from 64bits
-// down to 16bits taking care of the carry.
-//
-// This version avoids synchronization in the core loop by also using a
-// pipeline for the accumulation of the checksum in resultx[] (x=1,2).
-//
-// wordx[] (x=1,2)
-// |---|
-// | | 0 : new value loaded in pipeline
-// |---|
-// | | - : in transit data
-// |---|
-// | | LOAD_LATENCY : current value to add to checksum
-// |---|
-// | | LOAD_LATENCY+1 : previous value added to checksum
-// |---| (previous iteration)
-//
-// resultx[] (x=1,2)
-// |---|
-// | | 0 : initial value
-// |---|
-// | | LOAD_LATENCY-1 : new checksum
-// |---|
-// | | LOAD_LATENCY : previous value of checksum
-// |---|
-// | | LOAD_LATENCY+1 : final checksum when out of the loop
-// |---|
-//
-//
-// See RFC1071 "Computing the Internet Checksum" for various techniques for
-// calculating the Internet checksum.
-//
-// NOT YET DONE:
-// - Maybe another algorithm which would take care of the folding at the
-// end in a different manner
-// - Work with people more knowledgeable than me on the network stack
-// to figure out if we could not split the function depending on the
-// type of packet or alignment we get. Like the ip_fast_csum() routine
-// where we know we have at least 20bytes worth of data to checksum.
-// - Do a better job of handling small packets.
-// - Note on prefetching: it was found that under various loads, i.e. ftp read/write,
-// nfs read/write, the L1 cache hit rate is at 60% and the L2 cache hit rate is at 99.8%
-// on the data that buffer points to (partly because the checksum is often preceded by
-// a copy_from_user()). This finding indicates that lfetch will not be beneficial since
-// the data is already in the cache.
-//
-
-#define saved_pfs r11
-#define hmask r16
-#define tmask r17
-#define first1 r18
-#define firstval r19
-#define firstoff r20
-#define last r21
-#define lastval r22
-#define lastoff r23
-#define saved_lc r24
-#define saved_pr r25
-#define tmp1 r26
-#define tmp2 r27
-#define tmp3 r28
-#define carry1 r29
-#define carry2 r30
-#define first2 r31
-
-#define buf in0
-#define len in1
-
-#define LOAD_LATENCY 2 // XXX fix me
-
-#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
-# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
-#endif
-
-#define PIPE_DEPTH (LOAD_LATENCY+2)
-#define ELD p[LOAD_LATENCY] // end of load
-#define ELD_1 p[LOAD_LATENCY+1] // and next stage
-
-// unsigned long do_csum(unsigned char *buf,long len)
-
-GLOBAL_ENTRY(do_csum)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,2,16,0,16
- .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
- .rotp p[PIPE_DEPTH], pC1[2], pC2[2]
- mov ret0=r0 // in case we have zero length
- cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len)
- ;;
- add tmp1=buf,len // last byte's address
- .save pr, saved_pr
- mov saved_pr=pr // preserve predicates (rotation)
-(p6) br.ret.spnt.many rp // return if zero or negative length
-
- mov hmask=-1 // initialize head mask
- tbit.nz p15,p0=buf,0 // is buf an odd address?
- and first1=-8,buf // 8-byte align down address of first1 element
-
- and firstoff=7,buf // how many bytes off for first1 element
- mov tmask=-1 // initialize tail mask
-
- ;;
- adds tmp2=-1,tmp1 // last-1
- and lastoff=7,tmp1 // how many bytes off for last element
- ;;
- sub tmp1=8,lastoff // complement to lastoff
- and last=-8,tmp2 // address of word containing last byte
- ;;
- sub tmp3=last,first1 // tmp3=distance from first1 to last
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc // save lc
- cmp.eq p8,p9=last,first1 // everything fits in one word ?
-
- ld8 firstval=[first1],8 // load, ahead of time, "first1" word
- and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
- shl tmp2=firstoff,3 // number of bits
- ;;
-(p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed
- shl tmp1=tmp1,3 // number of bits
-(p9) adds tmp3=-8,tmp3 // effectively loaded
- ;;
-(p8) mov lastval=r0 // we don't need lastval if first1==last
	shl hmask=hmask,tmp2	// build head mask, mask off bytes [0,firstoff)
	shr.u tmask=tmask,tmp1	// build tail mask, mask off bytes [lastoff,8)
- ;;
- .body
-#define count tmp3
-
-(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only
-(p9)	and word2[0]=lastval,tmask	// mask the last word as appropriate
- shr.u count=count,3 // how many 8-byte?
- ;;
- // If count is odd, finish this 8-byte word so that we can
- // load two back-to-back 8-byte words per loop thereafter.
- and word1[0]=firstval,hmask // and mask it as appropriate
- tbit.nz p10,p11=count,0 // if (count is odd)
- ;;
-(p8) mov result1[0]=word1[0]
-(p9) add result1[0]=word1[0],word2[0]
- ;;
- cmp.ltu p6,p0=result1[0],word1[0] // check the carry
- cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte
- ;;
-(p6) adds result1[0]=1,result1[0]
-(p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word)
-(p11) br.cond.dptk .do_csum16 // if (count is even)
-
- // Here count is odd.
- ld8 word1[1]=[first1],8 // load an 8-byte word
- cmp.eq p9,p10=1,count // if (count == 1)
- adds count=-1,count // loaded an 8-byte word
- ;;
- add result1[0]=result1[0],word1[1]
- ;;
- cmp.ltu p6,p0=result1[0],word1[1]
- ;;
-(p6) adds result1[0]=1,result1[0]
-(p9) br.cond.sptk .do_csum_exit // if (count == 1) exit
- // Fall through to calculate the checksum, feeding result1[0] as
- // the initial value in result1[0].
- //
- // Calculate the checksum loading two 8-byte words per loop.
- //
-.do_csum16:
- add first2=8,first1
- shr.u count=count,1 // we do 16 bytes per loop
- ;;
- adds count=-1,count
- mov carry1=r0
- mov carry2=r0
- brp.loop.imp 1f,2f
- ;;
- mov ar.ec=PIPE_DEPTH
- mov ar.lc=count // set lc
- mov pr.rot=1<<16
- // result1[0] must be initialized in advance.
- mov result2[0]=r0
- ;;
- .align 32
-1:
-(ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
-(pC1[1])adds carry1=1,carry1
-(ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
-(pC2[1])adds carry2=1,carry2
-(ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
-(ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
-2:
-(p[0]) ld8 word1[0]=[first1],16
-(p[0]) ld8 word2[0]=[first2],16
- br.ctop.sptk 1b
- ;;
- // Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
-(pC1[1])adds carry1=1,carry1 // since we miss the last one
-(pC2[1])adds carry2=1,carry2
- ;;
- add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
- add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
- ;;
- cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
- cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
- ;;
-(p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
-(p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
- ;;
- add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
- ;;
- cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
- ;;
-(p6) adds result1[0]=1,result1[0]
- ;;
-.do_csum_exit:
- //
- // now fold 64 into 16 bits taking care of carry
- // that's not very good because it has lots of sequentiality
- //
- mov tmp3=0xffff
- zxt4 tmp1=result1[0]
- shr.u tmp2=result1[0],32
- ;;
- add result1[0]=tmp1,tmp2
- ;;
- and tmp1=result1[0],tmp3
- shr.u tmp2=result1[0],16
- ;;
- add result1[0]=tmp1,tmp2
- ;;
- and tmp1=result1[0],tmp3
- shr.u tmp2=result1[0],16
- ;;
- add result1[0]=tmp1,tmp2
- ;;
- and tmp1=result1[0],tmp3
- shr.u tmp2=result1[0],16
- ;;
- add ret0=tmp1,tmp2
- mov pr=saved_pr,0xffffffffffff0000
- ;;
- // if buf was odd then swap bytes
- mov ar.pfs=saved_pfs // restore ar.ec
-(p15) mux1 ret0=ret0,@rev // reverse word
- ;;
- mov ar.lc=saved_lc
-(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
- br.ret.sptk.many rp
-
-// I (Jun Nakajima) wrote equivalent code (see below), but it was
-// not much better than the original. So the original is kept so that
-// someone else can take up the challenge.
-//
-// shr.u word1[0]=result1[0],32
-// zxt4 result1[0]=result1[0]
-// ;;
-// add result1[0]=result1[0],word1[0]
-// ;;
-// zxt2 result2[0]=result1[0]
-// extr.u word1[0]=result1[0],16,16
-// shr.u carry1=result1[0],32
-// ;;
-// add result2[0]=result2[0],word1[0]
-// ;;
-// add result2[0]=result2[0],carry1
-// ;;
-// extr.u ret0=result2[0],16,16
-// ;;
-// add ret0=ret0,result2[0]
-// ;;
-// zxt2 ret0=ret0
-// mov ar.pfs=saved_pfs // restore ar.ec
-// mov pr=saved_pr,0xffffffffffff0000
-// ;;
-// // if buf was odd then swap bytes
-// mov ar.lc=saved_lc
-//(p15) mux1 ret0=ret0,@rev // reverse word
-// ;;
-//(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
-// br.ret.sptk.many rp
-
-END(do_csum)
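
The hmask/tmask setup at the top of do_csum reads, in C, roughly as
below: the buffer is summed as aligned 8-byte words, and the masks strip
the bytes before the buffer's first byte and after its last. A
little-endian sketch; the names follow the register aliases above:

#include <stdint.h>

static uint64_t head_mask(uintptr_t buf)
{
	unsigned firstoff = buf & 7;	/* bytes to drop at the head */

	return ~0ULL << (firstoff * 8);
}

static uint64_t tail_mask(uintptr_t buf, long len)
{
	unsigned lastoff = (buf + len) & 7; /* bytes used in the last word */

	return lastoff ? ~0ULL >> ((8 - lastoff) * 8) : ~0ULL;
}
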
diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S
deleted file mode 100644
index f8e795fe45cb..000000000000
--- a/arch/ia64/lib/flush.S
+++ /dev/null
@@ -1,119 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Cache flushing routines.
- *
- * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co
- * David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 05/28/05 Zoltan Menyhart Dynamic stride size
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
- /*
- * flush_icache_range(start,end)
- *
- * Make i-cache(s) coherent with d-caches.
- *
- * Must deal with range from start to end-1 but nothing else (need to
- * be careful not to touch addresses that may be unmapped).
- *
- * Note: "in0" and "in1" are preserved for debugging purposes.
- */
- .section .kprobes.text,"ax"
-GLOBAL_ENTRY(flush_icache_range)
-
- .prologue
- alloc r2=ar.pfs,2,0,0,0
- movl r3=ia64_i_cache_stride_shift
- mov r21=1
- ;;
- ld8 r20=[r3] // r20: stride shift
- sub r22=in1,r0,1 // last byte address
- ;;
- shr.u r23=in0,r20 // start / (stride size)
- shr.u r22=r22,r20 // (last byte address) / (stride size)
- shl r21=r21,r20 // r21: stride size of the i-cache(s)
- ;;
- sub r8=r22,r23 // number of strides - 1
- shl r24=r23,r20 // r24: addresses for "fc.i" =
- // "start" rounded down to stride boundary
- .save ar.lc,r3
- mov r3=ar.lc // save ar.lc
- ;;
-
- .body
- mov ar.lc=r8
- ;;
- /*
- * 32 byte aligned loop, even number of (actually 2) bundles
- */
-.Loop: fc.i r24 // issuable on M0 only
- add r24=r21,r24 // we flush "stride size" bytes per iteration
- nop.i 0
- br.cloop.sptk.few .Loop
- ;;
- sync.i
- ;;
- srlz.i
- ;;
- mov ar.lc=r3 // restore ar.lc
- br.ret.sptk.many rp
-END(flush_icache_range)
-EXPORT_SYMBOL_GPL(flush_icache_range)
-
- /*
- * clflush_cache_range(start,size)
- *
- * Flush cache lines from start to start+size-1.
- *
- * Must deal with range from start to start+size-1 but nothing else
- * (need to be careful not to touch addresses that may be
- * unmapped).
- *
- * Note: "in0" and "in1" are preserved for debugging purposes.
- */
- .section .kprobes.text,"ax"
-GLOBAL_ENTRY(clflush_cache_range)
-
- .prologue
- alloc r2=ar.pfs,2,0,0,0
- movl r3=ia64_cache_stride_shift
- mov r21=1
- add r22=in1,in0
- ;;
- ld8 r20=[r3] // r20: stride shift
- sub r22=r22,r0,1 // last byte address
- ;;
- shr.u r23=in0,r20 // start / (stride size)
- shr.u r22=r22,r20 // (last byte address) / (stride size)
- shl r21=r21,r20 // r21: stride size of the cache(s)
- ;;
- sub r8=r22,r23 // number of strides - 1
- shl r24=r23,r20 // r24: addresses for "fc" =
- // "start" rounded down to stride
- // boundary
- .save ar.lc,r3
- mov r3=ar.lc // save ar.lc
- ;;
-
- .body
- mov ar.lc=r8
- ;;
- /*
- * 32 byte aligned loop, even number of (actually 2) bundles
- */
-.Loop_fc:
- fc r24 // issuable on M0 only
- add r24=r21,r24 // we flush "stride size" bytes per iteration
- nop.i 0
- br.cloop.sptk.few .Loop_fc
- ;;
- sync.i
- ;;
- srlz.i
- ;;
- mov ar.lc=r3 // restore ar.lc
- br.ret.sptk.many rp
-END(clflush_cache_range)
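
Both routines above share one loop shape: round the start down to a stride
boundary, compute how many strides cover [start, end-1], and touch one line
per stride. A hedged C sketch follows; flush_line() is a hypothetical
stand-in for the fc.i/fc instruction, and the trailing sync.i/srlz.i
serialization is omitted.

extern void flush_line(unsigned long addr);  /* hypothetical, stands in for fc.i/fc */

static void flush_range(unsigned long start, unsigned long end,
                        unsigned long stride_shift)
{
        unsigned long stride = 1UL << stride_shift;
        unsigned long addr = start & ~(stride - 1);          /* round down */
        unsigned long last = (end - 1) >> stride_shift;      /* last stride index */
        unsigned long n = last - (start >> stride_shift) + 1;

        while (n--) {
                flush_line(addr);    /* one fc.i (or fc) per stride */
                addr += stride;
        }
}
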
diff --git a/arch/ia64/lib/idiv32.S b/arch/ia64/lib/idiv32.S
deleted file mode 100644
index 83586fbc51ff..000000000000
--- a/arch/ia64/lib/idiv32.S
+++ /dev/null
@@ -1,86 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2000 Hewlett-Packard Co
- * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 32-bit integer division.
- *
- * This code is based on the application note entitled "Divide, Square Root
- * and Remainder Algorithms for the IA-64 Architecture". This document
- * is available as Intel document number 248725-002 or via the web at
- * http://developer.intel.com/software/opensource/numerics/
- *
- * For more details on the theory behind these algorithms, see "IA-64
- * and Elementary Functions" by Peter Markstein; HP Professional Books
- * (http://www.goodreads.com/book/show/2019887.Ia_64_and_Elementary_Functions)
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-#ifdef MODULO
-# define OP mod
-#else
-# define OP div
-#endif
-
-#ifdef UNSIGNED
-# define SGN u
-# define EXTEND zxt4
-# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b
-# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b
-#else
-# define SGN
-# define EXTEND sxt4
-# define INT_TO_FP(a,b) fcvt.xf a=b
-# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b
-#endif
-
-#define PASTE1(a,b) a##b
-#define PASTE(a,b) PASTE1(a,b)
-#define NAME PASTE(PASTE(__,SGN),PASTE(OP,si3))
-
-GLOBAL_ENTRY(NAME)
- .regstk 2,0,0,0
- // Transfer inputs to FP registers.
- mov r2 = 0xffdd // r2 = -34 + 65535 (fp reg format bias)
- EXTEND in0 = in0 // in0 = a
- EXTEND in1 = in1 // in1 = b
- ;;
- setf.sig f8 = in0
- setf.sig f9 = in1
-#ifdef MODULO
- sub in1 = r0, in1 // in1 = -b
-#endif
- ;;
- // Convert the inputs to FP, to avoid FP software-assist faults.
- INT_TO_FP(f8, f8)
- INT_TO_FP(f9, f9)
- ;;
- setf.exp f7 = r2 // f7 = 2^-34
- frcpa.s1 f6, p6 = f8, f9 // y0 = frcpa(b)
- ;;
-(p6) fmpy.s1 f8 = f8, f6 // q0 = a*y0
-(p6) fnma.s1 f6 = f9, f6, f1 // e0 = -b*y0 + 1
- ;;
-#ifdef MODULO
- setf.sig f9 = in1 // f9 = -b
-#endif
-(p6) fma.s1 f8 = f6, f8, f8 // q1 = e0*q0 + q0
-(p6) fma.s1 f6 = f6, f6, f7 // e1 = e0*e0 + 2^-34
- ;;
-#ifdef MODULO
- setf.sig f7 = in0
-#endif
-(p6) fma.s1 f6 = f6, f8, f8 // q2 = e1*q1 + q1
- ;;
- FP_TO_INT(f6, f6) // q = trunc(q2)
- ;;
-#ifdef MODULO
- xma.l f6 = f6, f9, f7 // r = q*(-b) + a
- ;;
-#endif
- getf.sig r8 = f6 // transfer result to result register
- br.ret.sptk.many rp
-END(NAME)
-EXPORT_SYMBOL(NAME)
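
The sequence above is the Newton-Raphson reciprocal refinement from the
cited Intel application note. A rough C rendering of the signed-division
build follows; recip_approx() is a hypothetical stand-in for frcpa (good to
roughly 8 bits), and without IA-64's fused multiply-adds this is only an
approximation of the register-file computation, not a drop-in replacement.

extern double recip_approx(double b);   /* hypothetical stand-in for frcpa */

static int divsi3_sketch(int a, int b)  /* assumes b != 0 */
{
        double fa = (double)a, fb = (double)b;
        double y0 = recip_approx(fb);           /* y0 ~= 1/b            */
        double q0 = fa * y0;                    /* q0 = a*y0            */
        double e0 = 1.0 - fb * y0;              /* e0 = -b*y0 + 1       */
        double q1 = q0 + e0 * q0;               /* first refinement     */
        double e1 = e0 * e0 + 0x1p-34;          /* e1 = e0^2 + 2^-34    */
        double q2 = q1 + e1 * q1;               /* second refinement    */

        return (int)q2;                         /* q = trunc(q2)        */
}

The 2^-34 bias compensates for worst-case rounding error so that truncating
q2 yields the exact 32-bit quotient; the special-case handling frcpa
performs (e.g. b == 0, signalled via p6) is omitted from the sketch.
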
diff --git a/arch/ia64/lib/idiv64.S b/arch/ia64/lib/idiv64.S
deleted file mode 100644
index 5c9113691f72..000000000000
--- a/arch/ia64/lib/idiv64.S
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1999-2000 Hewlett-Packard Co
- * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 64-bit integer division.
- *
- * This code is based on the application note entitled "Divide, Square Root
- * and Remainder Algorithms for the IA-64 Architecture". This document
- * is available as Intel document number 248725-002 or via the web at
- * http://developer.intel.com/software/opensource/numerics/
- *
- * For more details on the theory behind these algorithms, see "IA-64
- * and Elementary Functions" by Peter Markstein; HP Professional Books
- * (http://www.goodreads.com/book/show/2019887.Ia_64_and_Elementary_Functions)
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-#ifdef MODULO
-# define OP mod
-#else
-# define OP div
-#endif
-
-#ifdef UNSIGNED
-# define SGN u
-# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b
-# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b
-#else
-# define SGN
-# define INT_TO_FP(a,b) fcvt.xf a=b
-# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b
-#endif
-
-#define PASTE1(a,b) a##b
-#define PASTE(a,b) PASTE1(a,b)
-#define NAME PASTE(PASTE(__,SGN),PASTE(OP,di3))
-
-GLOBAL_ENTRY(NAME)
- .regstk 2,0,0,0
- // Transfer inputs to FP registers.
- setf.sig f8 = in0
- setf.sig f9 = in1
- ;;
- // Convert the inputs to FP, to avoid FP software-assist faults.
- INT_TO_FP(f8, f8)
- INT_TO_FP(f9, f9)
- ;;
- frcpa.s1 f11, p6 = f8, f9 // y0 = frcpa(b)
- ;;
-(p6) fmpy.s1 f7 = f8, f11 // q0 = a*y0
-(p6) fnma.s1 f6 = f9, f11, f1 // e0 = -b*y0 + 1
- ;;
-(p6) fma.s1 f10 = f7, f6, f7 // q1 = q0*e0 + q0
-(p6) fmpy.s1 f7 = f6, f6 // e1 = e0*e0
- ;;
-#ifdef MODULO
- sub in1 = r0, in1 // in1 = -b
-#endif
-(p6) fma.s1 f10 = f10, f7, f10 // q2 = q1*e1 + q1
-(p6) fma.s1 f6 = f11, f6, f11 // y1 = y0*e0 + y0
- ;;
-(p6) fma.s1 f6 = f6, f7, f6 // y2 = y1*e1 + y1
-(p6) fnma.s1 f7 = f9, f10, f8 // r = -b*q2 + a
- ;;
-#ifdef MODULO
- setf.sig f8 = in0 // f8 = a
- setf.sig f9 = in1 // f9 = -b
-#endif
-(p6) fma.s1 f11 = f7, f6, f10 // q3 = r*y2 + q2
- ;;
- FP_TO_INT(f11, f11) // q = trunc(q3)
- ;;
-#ifdef MODULO
- xma.l f11 = f11, f9, f8 // r = q*(-b) + a
- ;;
-#endif
- getf.sig r8 = f11 // transfer result to result register
- br.ret.sptk.many rp
-END(NAME)
-EXPORT_SYMBOL(NAME)
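
The 64-bit variant needs one more refinement step (q3) than idiv32.S
because frcpa's initial estimate carries only about 8 valid bits and each
step roughly doubles the precision. For the MODULO build, the final xma.l
then recovers the remainder from the truncated quotient; in C the identity
is simply (sketch):

static long long mod_from_quot(long long a, long long b, long long q)
{
        return a - q * b;       /* r = q*(-b) + a */
}
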
diff --git a/arch/ia64/lib/io.c b/arch/ia64/lib/io.c
deleted file mode 100644
index c3e02462ed16..000000000000
--- a/arch/ia64/lib/io.c
+++ /dev/null
@@ -1,51 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/module.h>
-#include <linux/types.h>
-
-#include <asm/io.h>
-
-/*
- * Copy data from IO memory space to "real" memory space.
- * This needs to be optimized.
- */
-void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
-{
- char *dst = to;
-
- while (count) {
- count--;
- *dst++ = readb(from++);
- }
-}
-EXPORT_SYMBOL(memcpy_fromio);
-
-/*
- * Copy data from "real" memory space to IO memory space.
- * This needs to be optimized.
- */
-void memcpy_toio(volatile void __iomem *to, const void *from, long count)
-{
- const char *src = from;
-
- while (count) {
- count--;
- writeb(*src++, to++);
- }
-}
-EXPORT_SYMBOL(memcpy_toio);
-
-/*
- * "memset" on IO memory space.
- * This needs to be optimized.
- */
-void memset_io(volatile void __iomem *dst, int c, long count)
-{
- unsigned char ch = (char)(c & 0xff);
-
- while (count) {
- count--;
- writeb(ch, dst);
- dst++;
- }
-}
-EXPORT_SYMBOL(memset_io);
diff --git a/arch/ia64/lib/ip_fast_csum.S b/arch/ia64/lib/ip_fast_csum.S
deleted file mode 100644
index fcc0b812ce2e..000000000000
--- a/arch/ia64/lib/ip_fast_csum.S
+++ /dev/null
@@ -1,148 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Optimized version of the ip_fast_csum() function
- * Used for calculating IP header checksum
- *
- * Return: 16bit checksum, complemented
- *
- * Inputs:
- * in0: address of buffer to checksum (char *)
- * in1: length of the buffer (int)
- *
- * Copyright (C) 2002, 2006 Intel Corp.
- * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-/*
- * Since this function is most likely called with buf aligned on a 4-byte
- * boundary and 20 bytes in length, it can execute rather quickly compared
- * with calling the generic version of do_csum, which has lots of overhead
- * in handling various alignments and sizes. However, since no constraints
- * are put on the function's input arguments, cases where the alignment is
- * not 4-byte or the size is not 20 bytes are handled by the generic do_csum.
- */
-
-#define in0 r32
-#define in1 r33
-#define in2 r34
-#define in3 r35
-#define in4 r36
-#define ret0 r8
-
-GLOBAL_ENTRY(ip_fast_csum)
- .prologue
- .body
- cmp.ne p6,p7=5,in1 // size other than 20 bytes?
- and r14=3,in0 // is it aligned on 4-byte?
- add r15=4,in0 // second source pointer
- ;;
- cmp.ne.or.andcm p6,p7=r14,r0
- ;;
-(p7) ld4 r20=[in0],8
-(p7) ld4 r21=[r15],8
-(p6) br.spnt .generic
- ;;
- ld4 r22=[in0],8
- ld4 r23=[r15],8
- ;;
- ld4 r24=[in0]
- add r20=r20,r21
- add r22=r22,r23
- ;;
- add r20=r20,r22
- ;;
- add r20=r20,r24
- ;;
- shr.u ret0=r20,16 // now need to add the carry
- zxt2 r20=r20
- ;;
- add r20=ret0,r20
- ;;
- shr.u ret0=r20,16 // add carry again
- zxt2 r20=r20
- ;;
- add r20=ret0,r20
- ;;
- shr.u ret0=r20,16
- zxt2 r20=r20
- ;;
- add r20=ret0,r20
- mov r9=0xffff
- ;;
- andcm ret0=r9,r20
- .restore sp // reset frame state
- br.ret.sptk.many b0
- ;;
-
-.generic:
- .prologue
- .save ar.pfs, r35
- alloc r35=ar.pfs,2,2,2,0
- .save rp, r34
- mov r34=b0
- .body
- dep.z out1=in1,2,30
- mov out0=in0
- ;;
- br.call.sptk.many b0=do_csum
- ;;
- andcm ret0=-1,ret0
- mov ar.pfs=r35
- mov b0=r34
- br.ret.sptk.many b0
-END(ip_fast_csum)
-EXPORT_SYMBOL(ip_fast_csum)
-
-GLOBAL_ENTRY(csum_ipv6_magic)
- ld4 r20=[in0],4
- ld4 r21=[in1],4
- zxt4 in2=in2
- ;;
- ld4 r22=[in0],4
- ld4 r23=[in1],4
- dep r15=in3,in2,32,16
- ;;
- ld4 r24=[in0],4
- ld4 r25=[in1],4
- mux1 r15=r15,@rev
- add r16=r20,r21
- add r17=r22,r23
- zxt4 in4=in4
- ;;
- ld4 r26=[in0],4
- ld4 r27=[in1],4
- shr.u r15=r15,16
- add r18=r24,r25
- add r8=r16,r17
- ;;
- add r19=r26,r27
- add r8=r8,r18
- ;;
- add r8=r8,r19
- add r15=r15,in4
- ;;
- add r8=r8,r15
- ;;
- shr.u r10=r8,32 // now fold sum into short
- zxt4 r11=r8
- ;;
- add r8=r10,r11
- ;;
- shr.u r10=r8,16 // yeah, keep it rolling
- zxt2 r11=r8
- ;;
- add r8=r10,r11
- ;;
- shr.u r10=r8,16 // three times lucky
- zxt2 r11=r8
- ;;
- add r8=r10,r11
- mov r9=0xffff
- ;;
- andcm r8=r9,r8
- br.ret.sptk.many b0
-END(csum_ipv6_magic)
-EXPORT_SYMBOL(csum_ipv6_magic)
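
The aligned 20-byte fast path of ip_fast_csum boils down to this C sketch
(hypothetical helper name; the real routine falls back to do_csum for other
alignments and sizes, and the assembly unrolls the folds instead of looping):

static unsigned short iph_csum_sketch(const unsigned int *iph)
{
        unsigned long long sum = 0;
        int i;

        for (i = 0; i < 5; i++)         /* 20 bytes = 5 words */
                sum += iph[i];
        while (sum >> 16)               /* fold the carries into 16 bits */
                sum = (sum & 0xffff) + (sum >> 16);
        return (unsigned short)~sum;    /* complemented, as documented */
}
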
diff --git a/arch/ia64/lib/memcpy.S b/arch/ia64/lib/memcpy.S
deleted file mode 100644
index 35c9069a8345..000000000000
--- a/arch/ia64/lib/memcpy.S
+++ /dev/null
@@ -1,304 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard memcpy() function
- *
- * Inputs:
- * in0: destination address
- * in1: source address
- * in2: number of bytes to copy
- * Output:
- * no return value
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- * David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(memcpy)
-
-# define MEM_LAT 21 /* latency to memory */
-
-# define dst r2
-# define src r3
-# define retval r8
-# define saved_pfs r9
-# define saved_lc r10
-# define saved_pr r11
-# define cnt r16
-# define src2 r17
-# define t0 r18
-# define t1 r19
-# define t2 r20
-# define t3 r21
-# define t4 r22
-# define src_end r23
-
-# define N (MEM_LAT + 4)
-# define Nrot ((N + 7) & ~7)
-
- /*
- * First, check if everything (src, dst, len) is a multiple of eight. If
- * so, we handle everything with no taken branches (other than the loop
- * itself) and a small icache footprint. Otherwise, we jump off to
- * the more general copy routine handling arbitrary
- * sizes/alignment etc.
- */
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc
- or t0=in0,in1
- ;;
-
- or t0=t0,in2
- .save pr, saved_pr
- mov saved_pr=pr
-
- .body
-
- cmp.eq p6,p0=in2,r0 // zero length?
- mov retval=in0 // return dst
-(p6) br.ret.spnt.many rp // zero length, return immediately
- ;;
-
- mov dst=in0 // copy because of rotation
- shr.u cnt=in2,3 // number of 8-byte words to copy
- mov pr.rot=1<<16
- ;;
-
- adds cnt=-1,cnt // br.ctop is repeat/until
- cmp.gtu p7,p0=16,in2 // copying less than 16 bytes?
- mov ar.ec=N
- ;;
-
- and t0=0x7,t0
- mov ar.lc=cnt
- ;;
- cmp.ne p6,p0=t0,r0
-
- mov src=in1 // copy because of rotation
-(p7) br.cond.spnt.few .memcpy_short
-(p6) br.cond.spnt.few .memcpy_long
- ;;
- nop.m 0
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
- .rotr val[N]
- .rotp p[N]
- .align 32
-1: { .mib
-(p[0]) ld8 val[0]=[src],8
- nop.i 0
- brp.loop.imp 1b, 2f
-}
-2: { .mfb
-(p[N-1])st8 [dst]=val[N-1],8
- nop.f 0
- br.ctop.dptk.few 1b
-}
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,-1
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- /*
- * Small (<16 bytes) unaligned copying is done via a simple byte-at-a-time
- * copy loop. This performs relatively poorly on Itanium, but it doesn't
- * get used very often (gcc inlines small copies) and due to atomicity
- * issues, we want to avoid read-modify-write of entire words.
- */
- .align 32
-.memcpy_short:
- adds cnt=-1,in2 // br.ctop is repeat/until
- mov ar.ec=MEM_LAT
- brp.loop.imp 1f, 2f
- ;;
- mov ar.lc=cnt
- ;;
- nop.m 0
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
- nop.m 0
- ;;
- /*
- * It is faster to put a stop bit in the loop here because it makes
- * the pipeline shorter (and latency is what matters on short copies).
- */
- .align 32
-1: { .mib
-(p[0]) ld1 val[0]=[src],1
- nop.i 0
- brp.loop.imp 1b, 2f
-} ;;
-2: { .mfb
-(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
- nop.f 0
- br.ctop.dptk.few 1b
-} ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,-1
- mov ar.pfs=saved_pfs
- br.ret.sptk.many rp
-
- /*
- * Large (>= 16 bytes) copying is done in a fancy way. Latency isn't
- * an overriding concern here, but throughput is. We first do
- * sub-word copying until the destination is aligned, then we check
- * if the source is also aligned. If so, we do a simple load/store-loop
- * until there are less than 8 bytes left over and then we do the tail,
- * by storing the last few bytes using sub-word copying. If the source
- * is not aligned, we branch off to the non-congruent loop.
- *
- * stage: op:
- * 0 ld
- * :
- * MEM_LAT+3 shrp
- * MEM_LAT+4 st
- *
- * On Itanium, the pipeline itself runs without stalls. However, br.ctop
- * seems to introduce an unavoidable bubble in the pipeline so the overall
- * latency is 2 cycles/iteration. This gives us a _copy_ throughput
- * of 4 bytes/cycle. Still not bad.
- */
-# undef N
-# undef Nrot
-# define N (MEM_LAT + 5) /* number of stages */
-# define Nrot ((N+1 + 2 + 7) & ~7) /* number of rotating regs */
-
-#define LOG_LOOP_SIZE 6
-
-.memcpy_long:
- alloc t3=ar.pfs,3,Nrot,0,Nrot // resize register frame
- and t0=-8,src // t0 = src & ~7
- and t2=7,src // t2 = src & 7
- ;;
- ld8 t0=[t0] // t0 = 1st source word
- adds src2=7,src // src2 = (src + 7)
- sub t4=r0,dst // t4 = -dst
- ;;
- and src2=-8,src2 // src2 = (src + 7) & ~7
- shl t2=t2,3 // t2 = 8*(src & 7)
- shl t4=t4,3 // t4 = 8*(dst & 7)
- ;;
- ld8 t1=[src2] // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
- sub t3=64,t2 // t3 = 64-8*(src & 7)
- shr.u t0=t0,t2
- ;;
- add src_end=src,in2
- shl t1=t1,t3
- mov pr=t4,0x38 // (p5,p4,p3)=(dst & 7)
- ;;
- or t0=t0,t1
- mov cnt=r0
- adds src_end=-1,src_end
- ;;
-(p3) st1 [dst]=t0,1
-(p3) shr.u t0=t0,8
-(p3) adds cnt=1,cnt
- ;;
-(p4) st2 [dst]=t0,2
-(p4) shr.u t0=t0,16
-(p4) adds cnt=2,cnt
- ;;
-(p5) st4 [dst]=t0,4
-(p5) adds cnt=4,cnt
- and src_end=-8,src_end // src_end = last word of source buffer
- ;;
-
- // At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:
-
-1:{ add src=cnt,src // make src point to remainder of source buffer
- sub cnt=in2,cnt // cnt = number of bytes left to copy
- mov t4=ip
- } ;;
- and src2=-8,src // align source pointer
- adds t4=.memcpy_loops-1b,t4
- mov ar.ec=N
-
- and t0=7,src // t0 = src & 7
- shr.u t2=cnt,3 // t2 = number of 8-byte words left to copy
- shl cnt=cnt,3 // move bits 0-2 to 3-5
- ;;
-
- .rotr val[N+1], w[2]
- .rotp p[N]
-
- cmp.ne p6,p0=t0,r0 // is src aligned, too?
- shl t0=t0,LOG_LOOP_SIZE // t0 = 8*(src & 7)
- adds t2=-1,t2 // br.ctop is repeat/until
- ;;
- add t4=t0,t4
- mov pr=cnt,0x38 // set (p5,p4,p3) to # of last-word bytes to copy
- mov ar.lc=t2
- ;;
- nop.m 0
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
-(p6) ld8 val[1]=[src2],8 // prime the pump...
- mov b6=t4
- br.sptk.few b6
- ;;
-
-.memcpy_tail:
- // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
- // less than 8) and t0 contains the last few bytes of the src buffer:
-(p5) st4 [dst]=t0,4
-(p5) shr.u t0=t0,32
- mov ar.lc=saved_lc
- ;;
-(p4) st2 [dst]=t0,2
-(p4) shr.u t0=t0,16
- mov ar.pfs=saved_pfs
- ;;
-(p3) st1 [dst]=t0
- mov pr=saved_pr,-1
- br.ret.sptk.many rp
-
-///////////////////////////////////////////////////////
- .align 64
-
-#define COPY(shift,index) \
- 1: { .mib \
- (p[0]) ld8 val[0]=[src2],8; \
- (p[MEM_LAT+3]) shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift; \
- brp.loop.imp 1b, 2f \
- }; \
- 2: { .mfb \
- (p[MEM_LAT+4]) st8 [dst]=w[1],8; \
- nop.f 0; \
- br.ctop.dptk.few 1b; \
- }; \
- ;; \
- ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */ \
- ;; \
- shrp t0=val[N-1],val[N-index],shift; \
- br .memcpy_tail
-.memcpy_loops:
- COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
- COPY(8, 0)
- COPY(16, 0)
- COPY(24, 0)
- COPY(32, 0)
- COPY(40, 0)
- COPY(48, 0)
- COPY(56, 0)
-
-END(memcpy)
-EXPORT_SYMBOL(memcpy)
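
The COPY() jump-table entries above implement the classic shift-merge trick
for non-congruent src/dst alignment: each output word is stitched together
from two adjacent aligned source words with shrp. A little-endian C sketch,
assuming 0 < shift < 64 where shift = 8*(src & 7) (the shift == 0 case is
just the plain congruent loop, as COPY(0, 1) notes):

static void copy_shift_merge(unsigned long long *dst,
                             const unsigned long long *src_aligned,
                             unsigned int shift,        /* 8*(src & 7), nonzero */
                             unsigned long nwords)
{
        unsigned long long lo = src_aligned[0];
        unsigned long i;

        for (i = 0; i < nwords; i++) {
                unsigned long long hi = src_aligned[i + 1];
                /* shrp: top (64-shift) bits from lo, bottom shift bits from hi */
                dst[i] = (lo >> shift) | (hi << (64 - shift));
                lo = hi;
        }
}
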
diff --git a/arch/ia64/lib/memcpy_mck.S b/arch/ia64/lib/memcpy_mck.S
deleted file mode 100644
index c0d4362217ae..000000000000
--- a/arch/ia64/lib/memcpy_mck.S
+++ /dev/null
@@ -1,659 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Itanium 2-optimized version of memcpy and copy_user function
- *
- * Inputs:
- * in0: destination address
- * in1: source address
- * in2: number of bytes to copy
- * Output:
- * for memcpy: return dest
- * for copy_user: return 0 if success,
- * or number of bytes NOT copied if an error occurred.
- *
- * Copyright (C) 2002 Intel Corp.
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define EK(y...) EX(y)
-
-/* McKinley specific optimization */
-
-#define retval r8
-#define saved_pfs r31
-#define saved_lc r10
-#define saved_pr r11
-#define saved_in0 r14
-#define saved_in1 r15
-#define saved_in2 r16
-
-#define src0 r2
-#define src1 r3
-#define dst0 r17
-#define dst1 r18
-#define cnt r9
-
-/* r19-r30 are temp for each code section */
-#define PREFETCH_DIST 8
-#define src_pre_mem r19
-#define dst_pre_mem r20
-#define src_pre_l2 r21
-#define dst_pre_l2 r22
-#define t1 r23
-#define t2 r24
-#define t3 r25
-#define t4 r26
-#define t5 t1 // alias!
-#define t6 t2 // alias!
-#define t7 t3 // alias!
-#define n8 r27
-#define t9 t5 // alias!
-#define t10 t4 // alias!
-#define t11 t7 // alias!
-#define t12 t6 // alias!
-#define t14 t10 // alias!
-#define t13 r28
-#define t15 r29
-#define tmp r30
-
-/* defines for long_copy block */
-#define A 0
-#define B (PREFETCH_DIST)
-#define C (B + PREFETCH_DIST)
-#define D (C + 1)
-#define N (D + 1)
-#define Nrot ((N + 7) & ~7)
-
-/* alias */
-#define in0 r32
-#define in1 r33
-#define in2 r34
-
-GLOBAL_ENTRY(memcpy)
- and r28=0x7,in0
- and r29=0x7,in1
- mov f6=f0
- mov retval=in0
- br.cond.sptk .common_code
- ;;
-END(memcpy)
-EXPORT_SYMBOL(memcpy)
-GLOBAL_ENTRY(__copy_user)
- .prologue
-// check dest alignment
- and r28=0x7,in0
- and r29=0x7,in1
- mov f6=f1
- mov saved_in0=in0 // save dest pointer
- mov saved_in1=in1 // save src pointer
- mov retval=r0 // initialize return value
- ;;
-.common_code:
- cmp.gt p15,p0=8,in2 // check for small size
- cmp.ne p13,p0=0,r28 // check dest alignment
- cmp.ne p14,p0=0,r29 // check src alignment
- add src0=0,in1
- sub r30=8,r28 // for .align_dest
- mov saved_in2=in2 // save len
- ;;
- add dst0=0,in0
- add dst1=1,in0 // dest odd index
- cmp.le p6,p0 = 1,r30 // for .align_dest
-(p15) br.cond.dpnt .memcpy_short
-(p13) br.cond.dpnt .align_dest
-(p14) br.cond.dpnt .unaligned_src
- ;;
-
-// both dest and src are aligned on 8-byte boundary
-.aligned_src:
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
- .save pr, saved_pr
- mov saved_pr=pr
-
- shr.u cnt=in2,7 // this many cache lines
- ;;
- cmp.lt p6,p0=2*PREFETCH_DIST,cnt
- cmp.lt p7,p8=1,cnt
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc
- .body
- add cnt=-1,cnt
- add src_pre_mem=0,in1 // prefetch src pointer
- add dst_pre_mem=0,in0 // prefetch dest pointer
- ;;
-(p7) mov ar.lc=cnt // prefetch count
-(p8) mov ar.lc=r0
-(p6) br.cond.dpnt .long_copy
- ;;
-
-.prefetch:
- lfetch.fault [src_pre_mem], 128
- lfetch.fault.excl [dst_pre_mem], 128
- br.cloop.dptk.few .prefetch
- ;;
-
-.medium_copy:
- and tmp=31,in2 // copy length after the loop
- shr.u r29=in2,5 // number of 32-byte iterations
- add dst1=8,dst0 // 2nd dest pointer
- ;;
- add cnt=-1,r29 // ctop iteration adjustment
- cmp.eq p10,p0=r29,r0 // do we really need to loop?
- add src1=8,src0 // 2nd src pointer
- cmp.le p6,p0=8,tmp
- ;;
- cmp.le p7,p0=16,tmp
- mov ar.lc=cnt // loop setup
- cmp.eq p16,p17 = r0,r0
- mov ar.ec=2
-(p10) br.dpnt.few .aligned_src_tail
- ;;
- TEXT_ALIGN(32)
-1:
-EX(.ex_handler, (p16) ld8 r34=[src0],16)
-EK(.ex_handler, (p16) ld8 r38=[src1],16)
-EX(.ex_handler, (p17) st8 [dst0]=r33,16)
-EK(.ex_handler, (p17) st8 [dst1]=r37,16)
- ;;
-EX(.ex_handler, (p16) ld8 r32=[src0],16)
-EK(.ex_handler, (p16) ld8 r36=[src1],16)
-EX(.ex_handler, (p16) st8 [dst0]=r34,16)
-EK(.ex_handler, (p16) st8 [dst1]=r38,16)
- br.ctop.dptk.few 1b
- ;;
-
-.aligned_src_tail:
-EX(.ex_handler, (p6) ld8 t1=[src0])
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
-EX(.ex_hndlr_s, (p7) ld8 t2=[src1],8)
- cmp.le p8,p0=24,tmp
- and r21=-8,tmp
- ;;
-EX(.ex_hndlr_s, (p8) ld8 t3=[src1])
-EX(.ex_handler, (p6) st8 [dst0]=t1) // store byte 1
- and in2=7,tmp // remaining length
-EX(.ex_hndlr_d, (p7) st8 [dst1]=t2,8) // store byte 2
- add src0=src0,r21 // setting up src pointer
- add dst0=dst0,r21 // setting up dest pointer
- ;;
-EX(.ex_handler, (p8) st8 [dst1]=t3) // store byte 3
- mov pr=saved_pr,-1
- br.dptk.many .memcpy_short
- ;;
-
-/* code taken from copy_page_mck */
-.long_copy:
- .rotr v[2*PREFETCH_DIST]
- .rotp p[N]
-
- mov src_pre_mem = src0
- mov pr.rot = 0x10000
- mov ar.ec = 1 // special unrolled loop
-
- mov dst_pre_mem = dst0
-
- add src_pre_l2 = 8*8, src0
- add dst_pre_l2 = 8*8, dst0
- ;;
- add src0 = 8, src_pre_mem // first t1 src
- mov ar.lc = 2*PREFETCH_DIST - 1
- shr.u cnt=in2,7 // number of lines
- add src1 = 3*8, src_pre_mem // first t3 src
- add dst0 = 8, dst_pre_mem // first t1 dst
- add dst1 = 3*8, dst_pre_mem // first t3 dst
- ;;
- and tmp=127,in2 // remaining bytes after this block
- add cnt = -(2*PREFETCH_DIST) - 1, cnt
- // same as .line_copy loop, but with all predicated-off instructions removed:
-.prefetch_loop:
-EX(.ex_hndlr_lcpy_1, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0
-EK(.ex_hndlr_lcpy_1, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2
- br.ctop.sptk .prefetch_loop
- ;;
- cmp.eq p16, p0 = r0, r0 // reset p16 to 1
- mov ar.lc = cnt
- mov ar.ec = N // # of stages in pipeline
- ;;
-.line_copy:
-EX(.ex_handler, (p[D]) ld8 t2 = [src0], 3*8) // M0
-EK(.ex_handler, (p[D]) ld8 t4 = [src1], 3*8) // M1
-EX(.ex_handler_lcpy, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 prefetch dst from memory
-EK(.ex_handler_lcpy, (p[D]) st8 [dst_pre_l2] = n8, 128) // M3 prefetch dst from L2
- ;;
-EX(.ex_handler_lcpy, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0 prefetch src from memory
-EK(.ex_handler_lcpy, (p[C]) ld8 n8 = [src_pre_l2], 128) // M1 prefetch src from L2
-EX(.ex_handler, (p[D]) st8 [dst0] = t1, 8) // M2
-EK(.ex_handler, (p[D]) st8 [dst1] = t3, 8) // M3
- ;;
-EX(.ex_handler, (p[D]) ld8 t5 = [src0], 8)
-EK(.ex_handler, (p[D]) ld8 t7 = [src1], 3*8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t2, 3*8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t4, 3*8)
- ;;
-EX(.ex_handler, (p[D]) ld8 t6 = [src0], 3*8)
-EK(.ex_handler, (p[D]) ld8 t10 = [src1], 8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t5, 8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t7, 3*8)
- ;;
-EX(.ex_handler, (p[D]) ld8 t9 = [src0], 3*8)
-EK(.ex_handler, (p[D]) ld8 t11 = [src1], 3*8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t6, 3*8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t10, 8)
- ;;
-EX(.ex_handler, (p[D]) ld8 t12 = [src0], 8)
-EK(.ex_handler, (p[D]) ld8 t14 = [src1], 8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t9, 3*8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t11, 3*8)
- ;;
-EX(.ex_handler, (p[D]) ld8 t13 = [src0], 4*8)
-EK(.ex_handler, (p[D]) ld8 t15 = [src1], 4*8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t12, 8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t14, 8)
- ;;
-EX(.ex_handler, (p[C]) ld8 t1 = [src0], 8)
-EK(.ex_handler, (p[C]) ld8 t3 = [src1], 8)
-EX(.ex_handler, (p[D]) st8 [dst0] = t13, 4*8)
-EK(.ex_handler, (p[D]) st8 [dst1] = t15, 4*8)
- br.ctop.sptk .line_copy
- ;;
-
- add dst0=-8,dst0
- add src0=-8,src0
- mov in2=tmp
- .restore sp
- br.sptk.many .medium_copy
- ;;
-
-#define BLOCK_SIZE 128*32
-#define blocksize r23
-#define curlen r24
-
-// dest is on 8-byte boundary, src is not. We need to do
-// ld8-ld8, shrp, then st8. Max 8 bytes copied per cycle.
-.unaligned_src:
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,5,0,8
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc
- .save pr, saved_pr
- mov saved_pr=pr
- .body
-.4k_block:
- mov saved_in0=dst0 // need to save all input arguments
- mov saved_in2=in2
- mov blocksize=BLOCK_SIZE
- ;;
- cmp.lt p6,p7=blocksize,in2
- mov saved_in1=src0
- ;;
-(p6) mov in2=blocksize
- ;;
- shr.u r21=in2,7 // this many cache lines
- shr.u r22=in2,4 // number of 16-byte iterations
- and curlen=15,in2 // copy length after iteration
- and r30=7,src0 // source alignment
- ;;
- cmp.lt p7,p8=1,r21
- add cnt=-1,r21
- ;;
-
- add src_pre_mem=0,src0 // prefetch src pointer
- add dst_pre_mem=0,dst0 // prefetch dest pointer
- and src0=-8,src0 // 1st src pointer
-(p7) mov ar.lc = cnt
-(p8) mov ar.lc = r0
- ;;
- TEXT_ALIGN(32)
-1: lfetch.fault [src_pre_mem], 128
- lfetch.fault.excl [dst_pre_mem], 128
- br.cloop.dptk.few 1b
- ;;
-
- shladd dst1=r22,3,dst0 // 2nd dest pointer
- shladd src1=r22,3,src0 // 2nd src pointer
- cmp.eq p8,p9=r22,r0 // do we really need to loop?
- cmp.le p6,p7=8,curlen; // have at least 8 bytes remaining?
- add cnt=-1,r22 // ctop iteration adjustment
- ;;
-EX(.ex_handler, (p9) ld8 r33=[src0],8) // loop primer
-EK(.ex_handler, (p9) ld8 r37=[src1],8)
-(p8) br.dpnt.few .noloop
- ;;
-
-// The jump address is calculated based on src alignment. The COPYU
-// macro below needs to confine its size to a power of two, so an entry
-// can be calculated using shl instead of an expensive multiply. The
-// size is then hard coded by the following #define to match the
-// actual size. This makes it somewhat tedious when the COPYU macro
-// gets changed and this needs to be adjusted to match.
-#define LOOP_SIZE 6
-1:
- mov r29=ip // jmp_table thread
- mov ar.lc=cnt
- ;;
- add r29=.jump_table - 1b - (.jmp1-.jump_table), r29
- shl r28=r30, LOOP_SIZE // jmp_table thread
- mov ar.ec=2 // loop setup
- ;;
- add r29=r29,r28 // jmp_table thread
- cmp.eq p16,p17=r0,r0
- ;;
- mov b6=r29 // jmp_table thread
- ;;
- br.cond.sptk.few b6
-
-// for 8-15 byte case
-// We will skip the loop, but need to replicate the side effect
-// that the loop produces.
-.noloop:
-EX(.ex_handler, (p6) ld8 r37=[src1],8)
- add src0=8,src0
-(p6) shl r25=r30,3
- ;;
-EX(.ex_handler, (p6) ld8 r27=[src1])
-(p6) shr.u r28=r37,r25
-(p6) sub r26=64,r25
- ;;
-(p6) shl r27=r27,r26
- ;;
-(p6) or r21=r28,r27
-
-.unaligned_src_tail:
-/* check if we have more than blocksize to copy, if so go back */
- cmp.gt p8,p0=saved_in2,blocksize
- ;;
-(p8) add dst0=saved_in0,blocksize
-(p8) add src0=saved_in1,blocksize
-(p8) sub in2=saved_in2,blocksize
-(p8) br.dpnt .4k_block
- ;;
-
-/* we have up to 15 bytes to copy in the tail.
- * part of work is already done in the jump table code
- * we are at the following state.
- * src side:
- *
- * xxxxxx xx <----- r21 has xxxxxxxx already
- * -------- -------- --------
- * 0 8 16
- * ^
- * |
- * src1
- *
- * dst
- * -------- -------- --------
- * ^
- * |
- * dst1
- */
-EX(.ex_handler, (p6) st8 [dst1]=r21,8) // more than 8 bytes to copy
-(p6) add curlen=-8,curlen // update length
- mov ar.pfs=saved_pfs
- ;;
- mov ar.lc=saved_lc
- mov pr=saved_pr,-1
- mov in2=curlen // remaining length
- mov dst0=dst1 // dest pointer
- add src0=src1,r30 // forward by src alignment
- ;;
-
-// 7 bytes or fewer.
-.memcpy_short:
- cmp.le p8,p9 = 1,in2
- cmp.le p10,p11 = 2,in2
- cmp.le p12,p13 = 3,in2
- cmp.le p14,p15 = 4,in2
- add src1=1,src0 // second src pointer
- add dst1=1,dst0 // second dest pointer
- ;;
-
-EX(.ex_handler_short, (p8) ld1 t1=[src0],2)
-EK(.ex_handler_short, (p10) ld1 t2=[src1],2)
-(p9) br.ret.dpnt rp // 0 byte copy
- ;;
-
-EX(.ex_handler_short, (p8) st1 [dst0]=t1,2)
-EK(.ex_handler_short, (p10) st1 [dst1]=t2,2)
-(p11) br.ret.dpnt rp // 1 byte copy
-
-EX(.ex_handler_short, (p12) ld1 t3=[src0],2)
-EK(.ex_handler_short, (p14) ld1 t4=[src1],2)
-(p13) br.ret.dpnt rp // 2 byte copy
- ;;
-
- cmp.le p6,p7 = 5,in2
- cmp.le p8,p9 = 6,in2
- cmp.le p10,p11 = 7,in2
-
-EX(.ex_handler_short, (p12) st1 [dst0]=t3,2)
-EK(.ex_handler_short, (p14) st1 [dst1]=t4,2)
-(p15) br.ret.dpnt rp // 3 byte copy
- ;;
-
-EX(.ex_handler_short, (p6) ld1 t5=[src0],2)
-EK(.ex_handler_short, (p8) ld1 t6=[src1],2)
-(p7) br.ret.dpnt rp // 4 byte copy
- ;;
-
-EX(.ex_handler_short, (p6) st1 [dst0]=t5,2)
-EK(.ex_handler_short, (p8) st1 [dst1]=t6,2)
-(p9) br.ret.dptk rp // 5 byte copy
-
-EX(.ex_handler_short, (p10) ld1 t7=[src0],2)
-(p11) br.ret.dptk rp // 6 byte copy
- ;;
-
-EX(.ex_handler_short, (p10) st1 [dst0]=t7,2)
- br.ret.dptk rp // done all cases
-
-
-/* Align dest to the nearest 8-byte boundary. We know we have at
- * least 7 bytes to copy, enough to crawl to an 8-byte boundary.
- * The actual number of bytes to crawl depends on the dest alignment.
- * 7 bytes or fewer are taken care of at .memcpy_short.
-
- * src0 - source even index
- * src1 - source odd index
- * dst0 - dest even index
- * dst1 - dest odd index
- * r30 - distance to 8-byte boundary
- */
-
-.align_dest:
- add src1=1,in1 // source odd index
- cmp.le p7,p0 = 2,r30 // for .align_dest
- cmp.le p8,p0 = 3,r30 // for .align_dest
-EX(.ex_handler_short, (p6) ld1 t1=[src0],2)
- cmp.le p9,p0 = 4,r30 // for .align_dest
- cmp.le p10,p0 = 5,r30
- ;;
-EX(.ex_handler_short, (p7) ld1 t2=[src1],2)
-EK(.ex_handler_short, (p8) ld1 t3=[src0],2)
- cmp.le p11,p0 = 6,r30
-EX(.ex_handler_short, (p6) st1 [dst0] = t1,2)
- cmp.le p12,p0 = 7,r30
- ;;
-EX(.ex_handler_short, (p9) ld1 t4=[src1],2)
-EK(.ex_handler_short, (p10) ld1 t5=[src0],2)
-EX(.ex_handler_short, (p7) st1 [dst1] = t2,2)
-EK(.ex_handler_short, (p8) st1 [dst0] = t3,2)
- ;;
-EX(.ex_handler_short, (p11) ld1 t6=[src1],2)
-EK(.ex_handler_short, (p12) ld1 t7=[src0],2)
- cmp.eq p6,p7=r28,r29
-EX(.ex_handler_short, (p9) st1 [dst1] = t4,2)
-EK(.ex_handler_short, (p10) st1 [dst0] = t5,2)
- sub in2=in2,r30
- ;;
-EX(.ex_handler_short, (p11) st1 [dst1] = t6,2)
-EK(.ex_handler_short, (p12) st1 [dst0] = t7)
- add dst0=in0,r30 // setup arguments
- add src0=in1,r30
-(p6) br.cond.dptk .aligned_src
-(p7) br.cond.dpnt .unaligned_src
- ;;
-
-/* main loop body in jump table format */
-#define COPYU(shift) \
-1: \
-EX(.ex_handler, (p16) ld8 r32=[src0],8); /* 1 */ \
-EK(.ex_handler, (p16) ld8 r36=[src1],8); \
- (p17) shrp r35=r33,r34,shift;; /* 1 */ \
-EX(.ex_handler, (p6) ld8 r22=[src1]); /* common, prime for tail section */ \
- nop.m 0; \
- (p16) shrp r38=r36,r37,shift; \
-EX(.ex_handler, (p17) st8 [dst0]=r35,8); /* 1 */ \
-EK(.ex_handler, (p17) st8 [dst1]=r39,8); \
- br.ctop.dptk.few 1b;; \
- (p7) add src1=-8,src1; /* back out for <8 byte case */ \
- shrp r21=r22,r38,shift; /* speculative work */ \
- br.sptk.few .unaligned_src_tail /* branch out of jump table */ \
- ;;
- TEXT_ALIGN(32)
-.jump_table:
- COPYU(8) // unaligned cases
-.jmp1:
- COPYU(16)
- COPYU(24)
- COPYU(32)
- COPYU(40)
- COPYU(48)
- COPYU(56)
-
-#undef A
-#undef B
-#undef C
-#undef D
-
-/*
- * Due to lack of local tag support in the gcc 2.x assembler, it is not clear
- * which instruction failed in the bundle. The exception algorithm is that we
- * first figure out the faulting address, then detect whether any progress was
- * made on the copy; if so, we redo the copy from the last known copied
- * location up to the faulting address (exclusive). In the copy_from_user
- * case, the remaining bytes in the kernel buffer will be zeroed.
- *
- * Take copy_from_user as an example, in the code there are multiple loads
- * in a bundle and those multiple loads could span over two pages, the
- * faulting address is calculated as page_round_down(max(src0, src1)).
- * This is based on knowledge that if we can access one byte in a page, we
- * can access any byte in that page.
- *
- * predicate used in the exception handler:
- * p6-p7: direction
- * p10-p11: src faulting addr calculation
- * p12-p13: dst faulting addr calculation
- */
-
-#define A r19
-#define B r20
-#define C r21
-#define D r22
-#define F r28
-
-#define saved_retval loc0
-#define saved_rtlink loc1
-#define saved_pfs_stack loc2
-
-.ex_hndlr_s:
- add src0=8,src0
- br.sptk .ex_handler
- ;;
-.ex_hndlr_d:
- add dst0=8,dst0
- br.sptk .ex_handler
- ;;
-.ex_hndlr_lcpy_1:
- mov src1=src_pre_mem
- mov dst1=dst_pre_mem
- cmp.gtu p10,p11=src_pre_mem,saved_in1
- cmp.gtu p12,p13=dst_pre_mem,saved_in0
- ;;
-(p10) add src0=8,saved_in1
-(p11) mov src0=saved_in1
-(p12) add dst0=8,saved_in0
-(p13) mov dst0=saved_in0
- br.sptk .ex_handler
-.ex_handler_lcpy:
- // in the line_copy block, the preload addresses should always be
- // ahead of the other two src/dst pointers. Furthermore, src1/dst1
- // should always be ahead of src0/dst0.
- mov src1=src_pre_mem
- mov dst1=dst_pre_mem
-.ex_handler:
- mov pr=saved_pr,-1 // first restore pr, lc, and pfs
- mov ar.lc=saved_lc
- mov ar.pfs=saved_pfs
- ;;
-.ex_handler_short: // faults in these sections didn't change pr, lc, or pfs
- cmp.ltu p6,p7=saved_in0, saved_in1 // get the copy direction
- cmp.ltu p10,p11=src0,src1
- cmp.ltu p12,p13=dst0,dst1
- fcmp.eq p8,p0=f6,f0 // is it memcpy?
- mov tmp = dst0
- ;;
-(p11) mov src1 = src0 // pick the larger of the two
-(p13) mov dst0 = dst1 // make dst0 the smaller one
-(p13) mov dst1 = tmp // and dst1 the larger one
- ;;
-(p6) dep F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
-(p7) dep F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
- ;;
-(p6) cmp.le p14,p0=dst0,saved_in0 // no progress has been made on store
-(p7) cmp.le p14,p0=src0,saved_in1 // no progress has been made on load
- mov retval=saved_in2
-(p8) ld1 tmp=[src1] // force an oops for memcpy call
-(p8) st1 [dst1]=r0 // force an oops for memcpy call
-(p14) br.ret.sptk.many rp
-
-/*
- * The remaining bytes to copy are calculated as:
- *
- * A = (faulting_addr - orig_src) -> len to faulting ld address
- * or
- * (faulting_addr - orig_dst) -> len to faulting st address
- * B = (cur_dst - orig_dst) -> len copied so far
- * C = A - B -> len need to be copied
- * D = orig_len - A -> len that needs to be left alone
- */
-(p6) sub A = F, saved_in0
-(p7) sub A = F, saved_in1
- clrrrb
- ;;
- alloc saved_pfs_stack=ar.pfs,3,3,3,0
- cmp.lt p8,p0=A,r0
- sub B = dst0, saved_in0 // how many bytes copied so far
- ;;
-(p8) mov A = 0; // A shouldn't be negative, cap it
- ;;
- sub C = A, B
- sub D = saved_in2, A
- ;;
- cmp.gt p8,p0=C,r0 // more than 1 byte?
- mov r8=0
- mov saved_retval = D
- mov saved_rtlink = b0
-
- add out0=saved_in0, B
- add out1=saved_in1, B
- mov out2=C
-(p8) br.call.sptk.few b0=__copy_user // recursive call
- ;;
-
- add saved_retval=saved_retval,r8 // above might return non-zero value
- ;;
-
- mov retval=saved_retval
- mov ar.pfs=saved_pfs_stack
- mov b0=saved_rtlink
- br.ret.sptk.many rp
-
-/* end of McKinley specific optimization */
-END(__copy_user)
-EXPORT_SYMBOL(__copy_user)
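
The A/B/C/D arithmetic in the exception handler is hard to follow in
predicated assembly, so here is a C sketch with illustrative names (F is
the faulting address rounded down to a page boundary, as in the code; the
caller only retries when retry_len > 0, matching the cmp.gt guard):

static void recover_sketch(unsigned long F,    /* faulting addr, page-rounded */
                           unsigned long orig, unsigned long cur,
                           unsigned long orig_len,
                           unsigned long *retry_off, unsigned long *retry_len,
                           unsigned long *not_copied)
{
        long A = (long)(F - orig);      /* len up to the faulting address */
        unsigned long B = cur - orig;   /* len copied so far */

        if (A < 0)
                A = 0;                  /* A shouldn't be negative; cap it */
        *retry_off = B;                 /* resume the copy at offset B */
        *retry_len = (unsigned long)A - B;              /* C = A - B */
        *not_copied = orig_len - (unsigned long)A;      /* D: left alone */
}
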
diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S
deleted file mode 100644
index 552c5c7e4d06..000000000000
--- a/arch/ia64/lib/memset.S
+++ /dev/null
@@ -1,365 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Optimized version of the standard memset() function.
-
- Copyright (c) 2002 Hewlett-Packard Co/CERN
- Sverre Jarp <Sverre.Jarp@cern.ch>
-
- Return: dest
-
- Inputs:
- in0: dest
- in1: value
- in2: count
-
- The algorithm is fairly straightforward: set byte by byte until we
- get to a 16B-aligned address, then loop on 128B chunks using an
- early store as prefetching, then loop on 32B chunks, then clear remaining
- words, and finally clear remaining bytes.
- Since a stf.spill f0 can store 16B in one go, we use this instruction
- to get peak speed when value = 0. */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#undef ret
-
-#define dest in0
-#define value in1
-#define cnt in2
-
-#define tmp r31
-#define save_lc r30
-#define ptr0 r29
-#define ptr1 r28
-#define ptr2 r27
-#define ptr3 r26
-#define ptr9 r24
-#define loopcnt r23
-#define linecnt r22
-#define bytecnt r21
-
-#define fvalue f6
-
-// This routine uses only scratch predicate registers (p6 - p15)
-#define p_scr p6 // default register for same-cycle branches
-#define p_nz p7
-#define p_zr p8
-#define p_unalgn p9
-#define p_y p11
-#define p_n p12
-#define p_yy p13
-#define p_nn p14
-
-#define MIN1 15
-#define MIN1P1HALF 8
-#define LINE_SIZE 128
-#define LSIZE_SH 7 // shift amount
-#define PREF_AHEAD 8
-
-GLOBAL_ENTRY(memset)
-{ .mmi
- .prologue
- alloc tmp = ar.pfs, 3, 0, 0, 0
- lfetch.nt1 [dest] //
- .save ar.lc, save_lc
- mov.i save_lc = ar.lc
- .body
-} { .mmi
- mov ret0 = dest // return value
- cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero
- cmp.eq p_scr, p0 = cnt, r0
-;; }
-{ .mmi
- and ptr2 = -(MIN1+1), dest // aligned address
- and tmp = MIN1, dest // prepare to check for correct alignment
- tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U)
-} { .mib
- mov ptr1 = dest
- mux1 value = value, @brcst // create 8 identical bytes in word
-(p_scr) br.ret.dpnt.many rp // return immediately if count = 0
-;; }
-{ .mib
- cmp.ne p_unalgn, p0 = tmp, r0 //
-} { .mib
- sub bytecnt = (MIN1+1), tmp // NB: # of bytes to move is 1 higher than loopcnt
- cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task?
-(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
-;; }
-{ .mmi
-(p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment
-(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ?
-;; }
-{ .mib
-(p_y) add cnt = -8, cnt //
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ?
-} { .mib
-(p_y) st8 [ptr2] = value,-4 //
-(p_n) add ptr2 = 4, ptr2 //
-;; }
-{ .mib
-(p_yy) add cnt = -4, cnt //
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ?
-} { .mib
-(p_yy) st4 [ptr2] = value,-2 //
-(p_nn) add ptr2 = 2, ptr2 //
-;; }
-{ .mmi
- mov tmp = LINE_SIZE+1 // for compare
-(p_y) add cnt = -2, cnt //
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ?
-} { .mmi
- setf.sig fvalue=value // transfer value to FLP side
-(p_y) st2 [ptr2] = value,-1 //
-(p_n) add ptr2 = 1, ptr2 //
-;; }
-
-{ .mmi
-(p_yy) st1 [ptr2] = value //
- cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task?
-} { .mbb
-(p_yy) add cnt = -1, cnt //
-(p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few
-;; }
-
-{ .mib
- nop.m 0
- shr.u linecnt = cnt, LSIZE_SH
-(p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill
-;; }
-
- TEXT_ALIGN(32) // --------------------- // L1A: store ahead into cache lines; fill later
-{ .mmi
- and tmp = -(LINE_SIZE), cnt // compute end of range
- mov ptr9 = ptr1 // used for prefetching
- and cnt = (LINE_SIZE-1), cnt // remainder
-} { .mmi
- mov loopcnt = PREF_AHEAD-1 // default prefetch loop
- cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
-;; }
-{ .mmi
-(p_scr) add loopcnt = -1, linecnt //
- add ptr2 = 8, ptr1 // start of stores (beyond prefetch stores)
- add ptr1 = tmp, ptr1 // first address beyond total range
-;; }
-{ .mmi
- add tmp = -1, linecnt // next loop count
- mov.i ar.lc = loopcnt //
-;; }
-.pref_l1a:
-{ .mib
- stf8 [ptr9] = fvalue, 128 // Do stores one cache line apart
- nop.i 0
- br.cloop.dptk.few .pref_l1a
-;; }
-{ .mmi
- add ptr0 = 16, ptr2 // Two stores in parallel
- mov.i ar.lc = tmp //
-;; }
-.l1ax:
- { .mmi
- stf8 [ptr2] = fvalue, 8
- stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 24
- stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 8
- stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 24
- stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 8
- stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 24
- stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
- stf8 [ptr2] = fvalue, 8
- stf8 [ptr0] = fvalue, 32
- cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
- ;; }
-{ .mmb
- stf8 [ptr2] = fvalue, 24
-(p_scr) stf8 [ptr9] = fvalue, 128
- br.cloop.dptk.few .l1ax
-;; }
-{ .mbb
- cmp.le p_scr, p0 = 8, cnt // just a few bytes left ?
-(p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2
- br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3
-;; }
-
- TEXT_ALIGN(32)
-.l1b: // ------------------------------------ // L1B: store ahead into cache lines; fill later
-{ .mmi
- and tmp = -(LINE_SIZE), cnt // compute end of range
- mov ptr9 = ptr1 // used for prefetching
- and cnt = (LINE_SIZE-1), cnt // remainder
-} { .mmi
- mov loopcnt = PREF_AHEAD-1 // default prefetch loop
- cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
-;; }
-{ .mmi
-(p_scr) add loopcnt = -1, linecnt
- add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores)
- add ptr1 = tmp, ptr1 // first address beyond total range
-;; }
-{ .mmi
- add tmp = -1, linecnt // next loop count
- mov.i ar.lc = loopcnt
-;; }
-.pref_l1b:
-{ .mib
- stf.spill [ptr9] = f0, 128 // Do stores one cache line apart
- nop.i 0
- br.cloop.dptk.few .pref_l1b
-;; }
-{ .mmi
- add ptr0 = 16, ptr2 // Two stores in parallel
- mov.i ar.lc = tmp
-;; }
-.l1bx:
- { .mmi
- stf.spill [ptr2] = f0, 32
- stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
- stf.spill [ptr2] = f0, 32
- stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
- stf.spill [ptr2] = f0, 32
- stf.spill [ptr0] = f0, 64
- cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
- ;; }
-{ .mmb
- stf.spill [ptr2] = f0, 32
-(p_scr) stf.spill [ptr9] = f0, 128
- br.cloop.dptk.few .l1bx
-;; }
-{ .mib
- cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
-(p_scr) br.cond.dpnt.many .move_bytes_from_alignment //
-;; }
-
-.fraction_of_line:
-{ .mib
- add ptr2 = 16, ptr1
- shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32
-;; }
-{ .mib
- cmp.eq p_scr, p0 = loopcnt, r0
- add loopcnt = -1, loopcnt
-(p_scr) br.cond.dpnt.many .store_words
-;; }
-{ .mib
- and cnt = 0x1f, cnt // compute the remaining cnt
- mov.i ar.lc = loopcnt
-;; }
- TEXT_ALIGN(32)
-.l2: // ------------------------------------ // L2A: store 32B in 2 cycles
-{ .mmb
- stf8 [ptr1] = fvalue, 8
- stf8 [ptr2] = fvalue, 8
-;; } { .mmb
- stf8 [ptr1] = fvalue, 24
- stf8 [ptr2] = fvalue, 24
- br.cloop.dptk.many .l2
-;; }
-.store_words:
-{ .mib
- cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
-(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch
-;; }
-
-{ .mmi
- stf8 [ptr1] = fvalue, 8 // store
- cmp.le p_y, p_n = 16, cnt
- add cnt = -8, cnt // subtract
-;; }
-{ .mmi
-(p_y) stf8 [ptr1] = fvalue, 8 // store
-(p_y) cmp.le.unc p_yy, p_nn = 16, cnt
-(p_y) add cnt = -8, cnt // subtract
-;; }
-{ .mmi // store
-(p_yy) stf8 [ptr1] = fvalue, 8
-(p_yy) add cnt = -8, cnt // subtract
-;; }
-
-.move_bytes_from_alignment:
-{ .mib
- cmp.eq p_scr, p0 = cnt, r0
- tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ?
-(p_scr) br.cond.dpnt.few .restore_and_exit
-;; }
-{ .mib
-(p_y) st4 [ptr1] = value,4
- tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ?
-;; }
-{ .mib
-(p_yy) st2 [ptr1] = value,2
- tbit.nz.unc p_y, p0 = cnt, 0 // should we terminate with a st1 ?
-;; }
-
-{ .mib
-(p_y) st1 [ptr1] = value
-;; }
-.restore_and_exit:
-{ .mib
- nop.m 0
- mov.i ar.lc = save_lc
- br.ret.sptk.many rp
-;; }
-
-.move_bytes_unaligned:
-{ .mmi
- .pred.rel "mutex",p_y, p_n
- .pred.rel "mutex",p_yy, p_nn
-(p_n) cmp.le p_yy, p_nn = 4, cnt
-(p_y) cmp.le p_yy, p_nn = 5, cnt
-(p_n) add ptr2 = 2, ptr1
-} { .mmi
-(p_y) add ptr2 = 3, ptr1
-(p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte [15, 14 (or less) left]
-(p_y) add cnt = -1, cnt
-;; }
-{ .mmi
-(p_yy) cmp.le.unc p_y, p0 = 8, cnt
- add ptr3 = ptr1, cnt // prepare last store
- mov.i ar.lc = save_lc
-} { .mmi
-(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
-(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [11, 10 (or less) left]
-(p_yy) add cnt = -4, cnt
-;; }
-{ .mmi
-(p_y) cmp.le.unc p_yy, p0 = 8, cnt
- add ptr3 = -1, ptr3 // last store
- tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ?
-} { .mmi
-(p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
-(p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [7, 6 (or less) left]
-(p_y) add cnt = -4, cnt
-;; }
-{ .mmi
-(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
-(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [3, 2 (or less) left]
- tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ?
-} { .mmi
-(p_yy) add cnt = -4, cnt
-;; }
-{ .mmb
-(p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes
-(p_y) st1 [ptr3] = value // fill last byte (using ptr3)
- br.ret.sptk.many rp
-}
-END(memset)
-EXPORT_SYMBOL(memset)
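
Collapsing the software pipelining and the separate stf8/stf.spill paths,
the staging described in the header comment looks roughly like this in C
(a sketch only: the 32-byte stage is folded into the word loop, and the
stf.spill shortcut for value == 0 is not modelled):

static void *memset_sketch(void *dest, int value, unsigned long cnt)
{
        unsigned char *p = dest;
        /* mux1 @brcst: replicate the byte across all 8 lanes */
        unsigned long long v8 = (unsigned char)value * 0x0101010101010101ULL;

        while (cnt && ((unsigned long)p & 15)) {        /* crawl to 16B */
                *p++ = (unsigned char)value;
                cnt--;
        }
        while (cnt >= 128) {                            /* full cache lines */
                unsigned long long *q = (void *)p;
                int i;
                for (i = 0; i < 16; i++)
                        q[i] = v8;
                p += 128;
                cnt -= 128;
        }
        while (cnt >= 8) {                              /* remaining words */
                *(unsigned long long *)(void *)p = v8;
                p += 8;
                cnt -= 8;
        }
        while (cnt--)                                   /* remaining bytes */
                *p++ = (unsigned char)value;
        return dest;
}
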
diff --git a/arch/ia64/lib/strlen.S b/arch/ia64/lib/strlen.S
deleted file mode 100644
index 1f4a46c15127..000000000000
--- a/arch/ia64/lib/strlen.S
+++ /dev/null
@@ -1,195 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard strlen() function
- *
- *
- * Inputs:
- * in0 address of string
- *
- * Outputs:
- * ret0 the number of characters in the string (0 if empty string)
- * does not count the \0
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * 09/24/99 S.Eranian add speculation recovery code
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-//
-// This is an enhanced version of the basic strlen. It includes a combination
-// of compute zero index (czx), parallel comparisons, speculative loads, and
-// loop unrolling using rotating registers.
-//
-// General Ideas about the algorithm:
-// The goal is to look at the string in chunks of 8 bytes,
-// so we need to do a few extra checks at the beginning because the
-// string may not be 8-byte aligned. In this case we load the 8-byte
-// quantity which includes the start of the string and mask the unused
-// bytes with 0xff to avoid confusing czx.
-// We use speculative loads and software pipelining to hide memory
-// latency and do read ahead safely. This way we defer any exception.
-//
-// Because we don't want the kernel to be relying on particular
-// settings of the DCR register, we provide recovery code in case
-// speculation fails. The recovery code is going to "redo" the work using
-// only normal loads. If we still get a fault then we generate a
-// kernel panic. Otherwise we return the strlen as usual.
-//
-// The fact that speculation may fail can be caused, for instance, by
-// the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
-// a NaT bit will be set if the translation is not present. The normal
-// load, on the other hand, will cause the translation to be inserted
-// if the mapping exists.
-//
-// It should be noted that we execute recovery code only when we need
-// to use the data that has been speculatively loaded: we don't execute
-// recovery code on pure read ahead data.
-//
-// Remarks:
-// - the cmp r0,r0 is used as a fast way to initialize a predicate
-// register to 1. This is required to make sure that we get the parallel
-// compare correct.
-//
-// - we don't use the epilogue counter to exit the loop but we need to set
-// it to zero beforehand.
-//
-// - after the loop we must test for NaT values because neither the
-// czx nor the cmp instruction raises a NaT consumption fault. We must be
-// careful not to look too far for a NaT we don't care about.
-// For instance we don't need to look at a NaT in val2 if the zero byte
-// was in val1.
-//
-// - Clearly performance tuning is required.
-//
-//
-//
-#define saved_pfs r11
-#define tmp r10
-#define base r16
-#define orig r17
-#define saved_pr r18
-#define src r19
-#define mask r20
-#define val r21
-#define val1 r22
-#define val2 r23
-
-GLOBAL_ENTRY(strlen)
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8
-
- .rotr v[2], w[2] // declares our 4 aliases
-
- extr.u tmp=in0,0,3 // tmp=least significant 3 bits
- mov orig=in0 // keep track of initial byte address
- dep src=0,in0,0,3 // src=8byte-aligned in0 address
- .save pr, saved_pr
- mov saved_pr=pr // preserve predicates (rotation)
- ;;
-
- .body
-
- ld8 v[1]=[src],8 // must not speculate: can fail here
- shl tmp=tmp,3 // multiply by 8bits/byte
- mov mask=-1 // our mask
- ;;
- ld8.s w[1]=[src],8 // speculatively load next
- cmp.eq p6,p0=r0,r0 // sets p6 to true for cmp.and
- sub tmp=64,tmp // how many bits to shift our mask on the right
- ;;
- shr.u mask=mask,tmp // zero enough bits to hold v[1]'s valid part
- mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs)
- ;;
- add base=-16,src // keep track of aligned base
- or v[1]=v[1],mask // now we have a safe initial byte pattern
- ;;
-1:
- ld8.s v[0]=[src],8 // speculatively load next
- czx1.r val1=v[1] // search 0 byte from right
- czx1.r val2=w[1] // search 0 byte from right following 8bytes
- ;;
- ld8.s w[0]=[src],8 // speculatively load next to next
- cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8
- cmp.eq.and p6,p0=8,val2 // p6 = p6 and val2==8
-(p6) br.wtop.dptk 1b // loop until p6 == 0
- ;;
- //
- // We must try the recovery code iff
- // val1_is_nat || (val1==8 && val2_is_nat)
- //
- // XXX Fixme
- // - there must be a better way of doing the test
- //
- cmp.eq p8,p9=8,val1 // p8 = val1 had zero (disambiguate)
- tnat.nz p6,p7=val1 // test NaT on val1
-(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT
- ;;
- //
- // if we come here, p7 is true, i.e., initialized for cmp
- //
- cmp.eq.and p7,p0=8,val1 // val1==8?
- tnat.nz.and p7,p0=val2 // test NaT if val2
-(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT
- ;;
-(p8) mov val1=val2 // the other test got us out of the loop
-(p8) adds src=-16,src // correct position when 3 ahead
-(p9) adds src=-24,src // correct position when 4 ahead
- ;;
- sub ret0=src,orig // distance from base
- sub tmp=8,val1 // which byte in word
- mov pr=saved_pr,0xffffffffffff0000
- ;;
- sub ret0=ret0,tmp // adjust
- mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
- br.ret.sptk.many rp // end of normal execution
-
- //
- // Outlined recovery code when speculation failed
- //
- // This time we don't use speculation and rely on the normal exception
- // mechanism. That's why the loop is not as good as the previous one,
- // because read ahead is not possible.
- //
- // IMPORTANT:
- // Please note that in the case of strlen() as opposed to strlen_user()
- // we don't use the exception mechanism, as this function is not
- // supposed to fail. If that happens it means we have a bug and the
- // code will cause a kernel fault.
- //
- // XXX Fixme
- // - today we restart from the beginning of the string instead
- // of trying to continue where we left off.
- //
-.recover:
- ld8 val=[base],8 // will fail if unrecoverable fault
- ;;
- or val=val,mask // remask first bytes
- cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop
- ;;
- //
- // ar.ec is still zero here
- //
-2:
-(p6) ld8 val=[base],8 // will fail if unrecoverable fault
- ;;
- czx1.r val1=val // search 0 byte from right
- ;;
- cmp.eq p6,p0=8,val1 // val1==8 ?
-(p6) br.wtop.dptk 2b // loop until p6 == 0
- ;; // (avoid WAW on p63)
- sub ret0=base,orig // distance from base
- sub tmp=8,val1
- mov pr=saved_pr,0xffffffffffff0000
- ;;
- sub ret0=ret0,tmp // length=now - back -1
- mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
- br.ret.sptk.many rp // end of successful recovery code
-END(strlen)
-EXPORT_SYMBOL(strlen)
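
Ignoring speculation and the recovery path, the core scan above is the
familiar word-at-a-time zero-byte search. A C sketch using the classic bit
trick in place of czx1.r (assumes little-endian and an 8-byte-aligned start;
the initial masking done via v[1]|mask in the assembly is omitted):

static unsigned long strlen_sketch(const char *s)
{
        const unsigned long long *w = (const void *)s;  /* assumed aligned */
        unsigned long len = 0;
        unsigned long long x = *w++;

        /* nonzero iff some byte of x is zero (stand-in for czx1.r == 8) */
        while (!((x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL)) {
                len += 8;               /* no zero byte in this word */
                x = *w++;
        }
        while (x & 0xff) {              /* index of the first zero byte */
                x >>= 8;
                len++;
        }
        return len;
}
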
diff --git a/arch/ia64/lib/strncpy_from_user.S b/arch/ia64/lib/strncpy_from_user.S
deleted file mode 100644
index a287169bd953..000000000000
--- a/arch/ia64/lib/strncpy_from_user.S
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Just like strncpy() except that if a fault occurs during copying,
- * -EFAULT is returned.
- *
- * Inputs:
- * in0: address of destination buffer
- * in1: address of string to be copied
- * in2: length of buffer in bytes
- * Outputs:
- * r8: -EFAULT in case of fault or number of bytes copied if no fault
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 00/03/06 D. Mosberger Fixed to return proper return value (bug found
- * by Andreas Schwab <schwab@suse.de>).
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strncpy_from_user)
- alloc r2=ar.pfs,3,0,0,0
- mov r8=0
- mov r9=in1
- ;;
- add r10=in1,in2
- cmp.eq p6,p0=r0,in2
-(p6) br.ret.spnt.many rp
-
- // XXX braindead copy loop---this needs to be optimized
-.Loop1:
- EX(.Lexit, ld1 r8=[in1],1)
- ;;
- EX(.Lexit, st1 [in0]=r8,1)
- cmp.ne p6,p7=r8,r0
- ;;
-(p6) cmp.ne.unc p8,p0=in1,r10
-(p8) br.cond.dpnt.few .Loop1
- ;;
-(p6) mov r8=in2 // buffer filled up---return buffer length
-(p7) sub r8=in1,r9,1 // return string length (excluding NUL character)
-[.Lexit:]
- br.ret.sptk.many rp
-END(__strncpy_from_user)
-EXPORT_SYMBOL(__strncpy_from_user)
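
The return-value convention fixed in the 00/03/06 change is easier to see
in C (fault handling, which yields -EFAULT via the EX() annotations, is
ignored in this sketch):

static long strncpy_from_user_sketch(char *dst, const char *src, long n)
{
        long i;

        if (n == 0)
                return 0;
        for (i = 0; i < n; i++) {
                dst[i] = src[i];
                if (src[i] == '\0')
                        return i;       /* string length, NUL excluded */
        }
        return n;                       /* buffer filled up: buffer length */
}
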
diff --git a/arch/ia64/lib/strnlen_user.S b/arch/ia64/lib/strnlen_user.S
deleted file mode 100644
index a7eb56e840a9..000000000000
--- a/arch/ia64/lib/strnlen_user.S
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Returns 0 if exception before NUL or reaching the supplied limit (N),
- * a value greater than N if the string is longer than the limit, else
- * strlen(buffer) + 1 (the NUL is counted).
- *
- * Inputs:
- * in0: address of buffer
- * in1: string length limit N
- * Outputs:
- * r8: 0 in case of fault, strlen(buffer)+1 otherwise
- *
- * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strnlen_user)
- .prologue
- alloc r2=ar.pfs,2,0,0,0
- .save ar.lc, r16
- mov r16=ar.lc // preserve ar.lc
-
- .body
-
- add r3=-1,in1
- ;;
- mov ar.lc=r3
- mov r9=0
- ;;
- // XXX braindead strlen loop---this needs to be optimized
-.Loop1:
- EXCLR(.Lexit, ld1 r8=[in0],1)
- add r9=1,r9
- ;;
- cmp.eq p6,p0=r8,r0
-(p6) br.cond.dpnt .Lexit
- br.cloop.dptk.few .Loop1
-
- add r9=1,in1 // NUL not found---return N+1
- ;;
-.Lexit:
- mov r8=r9
- mov ar.lc=r16 // restore ar.lc
- br.ret.sptk.many rp
-END(__strnlen_user)
-EXPORT_SYMBOL(__strnlen_user)
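
[Editor's note: a minimal C sketch of the r8 convention documented in the
header above, again assuming get_user() for the faultable load;
strnlen_user_sketch is an illustrative name:

	#include <linux/uaccess.h>

	/* Returns 0 on fault, strlen()+1 if a NUL is found within the
	 * limit, and n+1 if the string is longer than n bytes. */
	static long strnlen_user_sketch(const char __user *s, long n)
	{
		long len = 0;

		while (len < n) {
			char c;

			if (get_user(c, s + len))
				return 0;
			len++;
			if (c == '\0')
				return len;	/* strlen() + 1 */
		}
		return n + 1;			/* NUL not found */
	}

The assembly counts iterations with ar.lc instead of an explicit index, which
is why it saves and restores that register in the prologue and exit path.]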
diff --git a/arch/ia64/lib/xor.S b/arch/ia64/lib/xor.S
deleted file mode 100644
index 6e2a69662c06..000000000000
--- a/arch/ia64/lib/xor.S
+++ /dev/null
@@ -1,181 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * arch/ia64/lib/xor.S
- *
- * Optimized RAID-5 checksumming functions for IA-64.
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(xor_ia64_2)
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 3, 0, 13, 16
- .save ar.lc, r30
- mov r30 = ar.lc
- .save pr, r29
- mov r29 = pr
- ;;
- .body
- mov r8 = in1
- mov ar.ec = 6 + 2
- shr in0 = in0, 3
- ;;
- adds in0 = -1, in0
- mov r16 = in1
- mov r17 = in2
- ;;
- mov ar.lc = in0
- mov pr.rot = 1 << 16
- ;;
- .rotr s1[6+1], s2[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[6+1])st8.nta [r8] = d[1], 8
- nop.f 0
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30
- mov pr = r29, -1
- br.ret.sptk.few rp
-END(xor_ia64_2)
-EXPORT_SYMBOL(xor_ia64_2)
-
-GLOBAL_ENTRY(xor_ia64_3)
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 4, 0, 20, 24
- .save ar.lc, r30
- mov r30 = ar.lc
- .save pr, r29
- mov r29 = pr
- ;;
- .body
- mov r8 = in1
- mov ar.ec = 6 + 2
- shr in0 = in0, 3
- ;;
- adds in0 = -1, in0
- mov r16 = in1
- mov r17 = in2
- ;;
- mov r18 = in3
- mov ar.lc = in0
- mov pr.rot = 1 << 16
- ;;
- .rotr s1[6+1], s2[6+1], s3[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
- ;;
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], s3[6]
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30
- mov pr = r29, -1
- br.ret.sptk.few rp
-END(xor_ia64_3)
-EXPORT_SYMBOL(xor_ia64_3)
-
-GLOBAL_ENTRY(xor_ia64_4)
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 5, 0, 27, 32
- .save ar.lc, r30
- mov r30 = ar.lc
- .save pr, r29
- mov r29 = pr
- ;;
- .body
- mov r8 = in1
- mov ar.ec = 6 + 2
- shr in0 = in0, 3
- ;;
- adds in0 = -1, in0
- mov r16 = in1
- mov r17 = in2
- ;;
- mov r18 = in3
- mov ar.lc = in0
- mov pr.rot = 1 << 16
- mov r19 = in4
- ;;
- .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[0]) ld8.nta s4[0] = [r19], 8
-(p[6]) xor r20 = s3[6], s4[6]
- ;;
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], r20
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30
- mov pr = r29, -1
- br.ret.sptk.few rp
-END(xor_ia64_4)
-EXPORT_SYMBOL(xor_ia64_4)
-
-GLOBAL_ENTRY(xor_ia64_5)
- .prologue
- .fframe 0
- .save ar.pfs, r31
- alloc r31 = ar.pfs, 6, 0, 34, 40
- .save ar.lc, r30
- mov r30 = ar.lc
- .save pr, r29
- mov r29 = pr
- ;;
- .body
- mov r8 = in1
- mov ar.ec = 6 + 2
- shr in0 = in0, 3
- ;;
- adds in0 = -1, in0
- mov r16 = in1
- mov r17 = in2
- ;;
- mov r18 = in3
- mov ar.lc = in0
- mov pr.rot = 1 << 16
- mov r19 = in4
- mov r20 = in5
- ;;
- .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
- .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[0]) ld8.nta s4[0] = [r19], 8
-(p[6]) xor r21 = s3[6], s4[6]
- ;;
-(p[0]) ld8.nta s5[0] = [r20], 8
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], r21
- ;;
-(p[6]) xor d[0] = d[0], s5[6]
- nop.f 0
- br.ctop.dptk.few 0b
- ;;
- mov ar.lc = r30
- mov pr = r29, -1
- br.ret.sptk.few rp
-END(xor_ia64_5)
-EXPORT_SYMBOL(xor_ia64_5)
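
[Editor's note: stripped of the rotating-register software pipelining (the
ar.ec epilogue count, .rotr/.rotp declarations and pr.rot initialization
above), the arithmetic these routines perform is a word-wise XOR of the
source buffers into the first buffer. A minimal C sketch of the two- and
three-source cases; xor_sketch_* are illustrative names, and bytes is assumed
to be a multiple of 8, as the shr in0=in0,3 in the assembly also assumes:

	/* XOR the words of p2 into p1, eight bytes at a time. */
	static void xor_sketch_2(unsigned long bytes, unsigned long *p1,
				 const unsigned long *p2)
	{
		unsigned long words = bytes / 8;

		while (words--)
			*p1++ ^= *p2++;
	}

	/* Same, with two source buffers. */
	static void xor_sketch_3(unsigned long bytes, unsigned long *p1,
				 const unsigned long *p2,
				 const unsigned long *p3)
	{
		unsigned long words = bytes / 8;

		while (words--)
			*p1++ ^= *p2++ ^ *p3++;
	}

The assembly earns its keep by issuing loads under p[0] but not consuming
them until stage p[6], six rotations later, so the ld8.nta/xor/st8.nta work
of successive iterations overlaps with memory latency.]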