summaryrefslogtreecommitdiff
path: root/arch/ia64/lib/memcpy.S
diff options
context:
space:
mode:
author Ard Biesheuvel <ardb@kernel.org> 2022-10-20 15:54:33 +0200
committer Ard Biesheuvel <ardb@kernel.org> 2023-09-11 08:13:17 +0000
commit cf8e8658100d4eae80ce9b21f7a81cb024dd5057 (patch)
tree 31d3b640bebf97c33d354768fc44dfd532c2df81 /arch/ia64/lib/memcpy.S
parent a0334bf78b95532cec54f56b53e8ae1bfe7e1ca1 (diff)
arch: Remove Itanium (IA-64) architecture
The Itanium architecture is obsolete, and an informal survey [0] reveals that any residual use of Itanium hardware in production is mostly HP-UX or OpenVMS based. The use of Linux on Itanium appears to be limited to enthusiasts that occasionally boot a fresh Linux kernel to see whether things are still working as intended, and perhaps to churn out some distro packages that are rarely used in practice. None of the original companies behind Itanium still produce or support any hardware or software for the architecture, and it is listed as 'Orphaned' in the MAINTAINERS file, as apparently, none of the engineers that contributed on behalf of those companies (nor anyone else, for that matter) have been willing to support or maintain the architecture upstream or even be responsible for applying the odd fix. The Intel firmware team removed all IA-64 support from the Tianocore/EDK2 reference implementation of EFI in 2018. (Itanium is the original architecture for which EFI was developed, and the way Linux supports it deviates significantly from other architectures.) Some distros, such as Debian and Gentoo, still maintain [unofficial] ia64 ports, but many have dropped support years ago. While the argument is being made [1] that there is a 'for the common good' angle to being able to build and run existing projects such as the Grid Community Toolkit [2] on Itanium for interoperability testing, the fact remains that none of those projects are known to be deployed on Linux/ia64, and very few people actually have access to such a system in the first place. Even if there were ways imaginable in which Linux/ia64 could be put to good use today, what matters is whether anyone is actually doing that, and this does not appear to be the case. There are no emulators widely available, and so boot testing Itanium is generally infeasible for ordinary contributors. GCC still supports IA-64 but its compile farm [3] no longer has any IA-64 machines. 
GLIBC would like to get rid of IA-64 [4] too because it would permit some overdue code cleanups. In summary, the benefits to the ecosystem of having IA-64 be part of it are mostly theoretical, whereas the maintenance overhead of keeping it supported is real. So let's rip off the band aid, and remove the IA-64 arch code entirely. This follows the timeline proposed by the Debian/ia64 maintainer [5], which removes support in a controlled manner, leaving IA-64 in a known good state in the most recent LTS release. Other projects will follow once the kernel support is removed. [0] https://lore.kernel.org/all/CAMj1kXFCMh_578jniKpUtx_j8ByHnt=s7S+yQ+vGbKt9ud7+kQ@mail.gmail.com/ [1] https://lore.kernel.org/all/0075883c-7c51-00f5-2c2d-5119c1820410@web.de/ [2] https://gridcf.org/gct-docs/latest/index.html [3] https://cfarm.tetaneutral.net/machines/list/ [4] https://lore.kernel.org/all/87bkiilpc4.fsf@mid.deneb.enyo.de/ [5] https://lore.kernel.org/all/ff58a3e76e5102c94bb5946d99187b358def688a.camel@physik.fu-berlin.de/ Acked-by: Tony Luck <tony.luck@intel.com> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Diffstat (limited to 'arch/ia64/lib/memcpy.S')
-rw-r--r-- arch/ia64/lib/memcpy.S 304
1 files changed, 0 insertions, 304 deletions
diff --git a/arch/ia64/lib/memcpy.S b/arch/ia64/lib/memcpy.S
deleted file mode 100644
index 35c9069a8345..000000000000
--- a/arch/ia64/lib/memcpy.S
+++ /dev/null
@@ -1,304 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard memcpy() function
- *
- * Inputs:
- * in0: destination address
- * in1: source address
- * in2: number of bytes to copy
- * Output:
- * r8: destination address (the value passed in in0)
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- * Stephane Eranian <eranian@hpl.hp.com>
- * David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(memcpy)
-
-# define MEM_LAT 21 /* latency to memory; sets the software-pipeline depth (N, ar.ec) */
-
-# define dst r2 /* running destination pointer (copy of in0) */
-# define src r3 /* running source pointer (copy of in1) */
-# define retval r8 /* return value: set to in0 */
-# define saved_pfs r9 /* ar.pfs as read by the alloc at entry */
-# define saved_lc r10 /* caller's ar.lc, restored before returning */
-# define saved_pr r11 /* caller's predicates, restored before returning */
-# define cnt r16 /* loop trip count / byte count */
-# define src2 r17 /* 8-byte-aligned source pointer (long path) */
-# define t0 r18 /* scratch */
-# define t1 r19 /* scratch */
-# define t2 r20 /* scratch */
-# define t3 r21 /* scratch */
-# define t4 r22 /* scratch */
-# define src_end r23 /* address of the last source word (long path) */
-
-# define N (MEM_LAT + 4) /* pipeline stages of the aligned loop */
-# define Nrot ((N + 7) & ~7) /* N rounded up to a multiple of 8 */
-
- /*
- * First, check if everything (src, dst, len) is a multiple of eight. If
- * so, we handle everything with no taken branches (other than the loop
- * itself) and a small icache footprint. Otherwise, we jump off to
- * the more general copy routine handling arbitrary
- * sizes/alignment etc.
- */
- .prologue
- .save ar.pfs, saved_pfs
- alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot // 3 inputs, Nrot locals, 0 outputs, Nrot rotating
- .save ar.lc, saved_lc
- mov saved_lc=ar.lc // keep caller's loop count
- or t0=in0,in1 // t0 = dst | src
- ;;
-
- or t0=t0,in2 // t0 = dst | src | len (low 3 bits are tested below)
- .save pr, saved_pr
- mov saved_pr=pr // keep caller's predicates
-
- .body
-
- cmp.eq p6,p0=in2,r0 // zero length?
- mov retval=in0 // return dst
-(p6) br.ret.spnt.many rp // zero length, return immediately
- ;;
-
- mov dst=in0 // copy because of rotation
- shr.u cnt=in2,3 // number of 8-byte words to copy
- mov pr.rot=1<<16 // set p16 and clear the other rotating predicates
- ;;
-
- adds cnt=-1,cnt // br.ctop is repeat/until
- cmp.gtu p7,p0=16,in2 // copying less than 16 bytes?
- mov ar.ec=N // epilogue count = pipeline depth
- ;;
-
- and t0=0x7,t0 // t0 = low 3 bits of (dst | src | len)
- mov ar.lc=cnt // loop count = number of words - 1
- ;;
- cmp.ne p6,p0=t0,r0 // p6 = dst, src or len is not a multiple of 8
-
- mov src=in1 // copy because of rotation
-(p7) br.cond.spnt.few .memcpy_short // fewer than 16 bytes: byte loop
-(p6) br.cond.spnt.few .memcpy_long // something unaligned: general routine
- ;;
- nop.m 0 // NOTE(review): nop padding, presumably for loop alignment/timing -- confirm
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
- .rotr val[N] // rotating data registers, one per pipeline stage
- .rotp p[N] // rotating stage predicates
- .align 32
-1: { .mib
-(p[0]) ld8 val[0]=[src],8 // first stage: load 8 bytes, src += 8
- nop.i 0
- brp.loop.imp 1b, 2f // prediction hint for the loop branch at 2:
-}
-2: { .mfb
-(p[N-1])st8 [dst]=val[N-1],8 // last stage: store 8 bytes, dst += 8
- nop.f 0
- br.ctop.dptk.few 1b // rotate registers and loop while ar.lc/ar.ec allow
-}
- ;;
- mov ar.lc=saved_lc // restore caller's loop count
- mov pr=saved_pr,-1 // restore all caller predicates
- mov ar.pfs=saved_pfs // restore ar.pfs saved at entry
- br.ret.sptk.many rp
-
- /*
- * Small (<16 bytes) copying, aligned or not, is done via a simple byte-at-a-time
- * copy loop. This performs relatively poorly on Itanium, but it doesn't
- * get used very often (gcc inlines small copies) and due to atomicity
- * issues, we want to avoid read-modify-write of entire words.
- */
- .align 32
-.memcpy_short:
- adds cnt=-1,in2 // br.ctop is repeat/until
- mov ar.ec=MEM_LAT // epilogue count = depth of the byte pipeline
- brp.loop.imp 1f, 2f // prediction hint for the loop branch at 2:
- ;;
- mov ar.lc=cnt // loop count = number of bytes - 1
- ;;
- nop.m 0
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
- nop.m 0
- ;;
- /*
- * It is faster to put a stop bit in the loop here because it makes
- * the pipeline shorter (and latency is what matters on short copies).
- */
- .align 32
-1: { .mib
-(p[0]) ld1 val[0]=[src],1 // first stage: load 1 byte, src += 1
- nop.i 0
- brp.loop.imp 1b, 2f
-} ;;
-2: { .mfb
-(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1 // last stage: store 1 byte, dst += 1
- nop.f 0
- br.ctop.dptk.few 1b // rotate registers and loop while ar.lc/ar.ec allow
-} ;;
- mov ar.lc=saved_lc // restore caller's loop count
- mov pr=saved_pr,-1 // restore all caller predicates
- mov ar.pfs=saved_pfs // restore ar.pfs saved at entry
- br.ret.sptk.many rp
-
- /*
- * Large (>= 16 bytes) copying is done in a fancy way. Latency isn't
- * an overriding concern here, but throughput is. We first do
- * sub-word copying until the destination is aligned, then we check
- * if the source is also aligned. If so, we do a simple load/store-loop
- * until there are fewer than 8 bytes left over and then we do the tail,
- * by storing the last few bytes using sub-word copying. If the source
- * is not aligned, we branch off to the non-congruent loop.
- *
- * stage: op:
- * 0 ld
- * :
- * MEM_LAT+3 shrp
- * MEM_LAT+4 st
- *
- * On Itanium, the pipeline itself runs without stalls. However, br.ctop
- * seems to introduce an unavoidable bubble in the pipeline so the overall
- * latency is 2 cycles/iteration. This gives us a _copy_ throughput
- * of 4 byte/cycle. Still not bad.
- */
-# undef N
-# undef Nrot
-# define N (MEM_LAT + 5) /* number of stages */
-# define Nrot ((N+1 + 2 + 7) & ~7) /* number of rotating regs */
-
-#define LOG_LOOP_SIZE 6 /* log2 of the byte stride between COPY() entries in .memcpy_loops */
-
-.memcpy_long:
- alloc t3=ar.pfs,3,Nrot,0,Nrot // resize register frame
- and t0=-8,src // t0 = src & ~7
- and t2=7,src // t2 = src & 7
- ;;
- ld8 t0=[t0] // t0 = 1st source word
- adds src2=7,src // src2 = (src + 7)
- sub t4=r0,dst // t4 = -dst
- ;;
- and src2=-8,src2 // src2 = (src + 7) & ~7
- shl t2=t2,3 // t2 = 8*(src & 7)
- shl t4=t4,3 // t4 = 8*(-dst), so bits 3-5 hold (-dst) & 7
- ;;
- ld8 t1=[src2] // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
- sub t3=64,t2 // t3 = 64-8*(src & 7)
- shr.u t0=t0,t2 // shift out the bytes that lie below src
- ;;
- add src_end=src,in2 // src_end = src + len
- shl t1=t1,t3 // move the following word's bytes up into place
- mov pr=t4,0x38 // (p5,p4,p3) = (-dst) & 7 = bytes needed to align dst
- ;;
- or t0=t0,t1 // t0 = the 8 source bytes starting at src
- mov cnt=r0 // cnt = bytes copied so far
- adds src_end=-1,src_end // src_end = address of the last source byte
- ;;
-(p3) st1 [dst]=t0,1 // 1 byte, dst += 1
-(p3) shr.u t0=t0,8 // drop the byte just stored
-(p3) adds cnt=1,cnt
- ;;
-(p4) st2 [dst]=t0,2 // 2 bytes, dst += 2
-(p4) shr.u t0=t0,16 // drop the bytes just stored
-(p4) adds cnt=2,cnt
- ;;
-(p5) st4 [dst]=t0,4 // 4 bytes, dst += 4
-(p5) adds cnt=4,cnt
- and src_end=-8,src_end // src_end = last word of source buffer
- ;;
-
- // At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:
-
-1:{ add src=cnt,src // make src point to remainder of source buffer
- sub cnt=in2,cnt // cnt = number of bytes left to copy
- mov t4=ip // t4 = address of this bundle (label 1)
- } ;;
- and src2=-8,src // align source pointer
- adds t4=.memcpy_loops-1b,t4 // t4 = address of .memcpy_loops
- mov ar.ec=N // epilogue count = depth of the long-copy pipeline
-
- and t0=7,src // t0 = src & 7
- shr.u t2=cnt,3 // t2 = number of 8-byte words left to copy
- shl cnt=cnt,3 // move bits 0-2 to 3-5
- ;;
-
- .rotr val[N+1], w[2] // rotating data registers: loaded words and merged output words
- .rotp p[N] // rotating stage predicates
-
- cmp.ne p6,p0=t0,r0 // is src aligned, too?
- shl t0=t0,LOG_LOOP_SIZE // t0 = (src & 7) << LOG_LOOP_SIZE = offset of the matching COPY() loop
- adds t2=-1,t2 // br.ctop is repeat/until
- ;;
- add t4=t0,t4 // t4 = entry point of the loop for this source misalignment
- mov pr=cnt,0x38 // set (p5,p4,p3) to # of last-word bytes to copy
- mov ar.lc=t2 // loop count = number of words - 1
- ;;
- nop.m 0
- ;;
- nop.m 0
- nop.i 0
- ;;
- nop.m 0
- ;;
-(p6) ld8 val[1]=[src2],8 // prime the pump...
- mov b6=t4
- br.sptk.few b6 // jump into the selected loop in .memcpy_loops
- ;;
-
-.memcpy_tail:
- // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
- // less than 8) and t0 contains the last few bytes of the src buffer:
-(p5) st4 [dst]=t0,4 // 4 tail bytes, dst += 4
-(p5) shr.u t0=t0,32 // drop the bytes just stored
- mov ar.lc=saved_lc // restore caller's loop count
- ;;
-(p4) st2 [dst]=t0,2 // 2 tail bytes, dst += 2
-(p4) shr.u t0=t0,16 // drop the bytes just stored
- mov ar.pfs=saved_pfs // restore ar.pfs saved at entry
- ;;
-(p3) st1 [dst]=t0 // final tail byte
- mov pr=saved_pr,-1 // restore all caller predicates
- br.ret.sptk.many rp
-
-///////////////////////////////////////////////////////
- .align 64 // 64 == 1 << LOG_LOOP_SIZE, the stride used to index the COPY() entries below
-
-#define COPY(shift,index) \
- 1: { .mib \
- (p[0]) ld8 val[0]=[src2],8; \
- (p[MEM_LAT+3]) shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift; \
- brp.loop.imp 1b, 2f \
- }; \
- 2: { .mfb \
- (p[MEM_LAT+4]) st8 [dst]=w[1],8; \
- nop.f 0; \
- br.ctop.dptk.few 1b; \
- }; \
- ;; \
- ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */ \
- ;; \
- shrp t0=val[N-1],val[N-index],shift; \
- br .memcpy_tail
-.memcpy_loops:
- COPY(0, 1) /* src & 7 == 0: no point special casing this---it doesn't go any faster without shrp */
- COPY(8, 0) /* src & 7 == 1 */
- COPY(16, 0) /* src & 7 == 2 */
- COPY(24, 0) /* src & 7 == 3 */
- COPY(32, 0) /* src & 7 == 4 */
- COPY(40, 0) /* src & 7 == 5 */
- COPY(48, 0) /* src & 7 == 6 */
- COPY(56, 0) /* src & 7 == 7 */
-
-END(memcpy)
-EXPORT_SYMBOL(memcpy)