diff options
Diffstat (limited to 'lib/raid6')
| -rw-r--r-- | lib/raid6/Makefile | 43 | ||||
| -rw-r--r-- | lib/raid6/algos.c | 36 | ||||
| -rw-r--r-- | lib/raid6/avx512.c | 4 | ||||
| -rw-r--r-- | lib/raid6/int.uc | 9 | ||||
| -rw-r--r-- | lib/raid6/loongarch.h | 38 | ||||
| -rw-r--r-- | lib/raid6/loongarch_simd.c | 422 | ||||
| -rw-r--r-- | lib/raid6/mktables.c | 2 | ||||
| -rw-r--r-- | lib/raid6/neon.c | 17 | ||||
| -rw-r--r-- | lib/raid6/neon.h | 22 | ||||
| -rw-r--r-- | lib/raid6/neon.uc | 1 | ||||
| -rw-r--r-- | lib/raid6/recov.c | 7 | ||||
| -rw-r--r-- | lib/raid6/recov_avx2.c | 6 | ||||
| -rw-r--r-- | lib/raid6/recov_avx512.c | 12 | ||||
| -rw-r--r-- | lib/raid6/recov_loongarch_simd.c | 513 | ||||
| -rw-r--r-- | lib/raid6/recov_neon.c | 29 | ||||
| -rw-r--r-- | lib/raid6/recov_neon_inner.c | 1 | ||||
| -rw-r--r-- | lib/raid6/recov_rvv.c | 222 | ||||
| -rw-r--r-- | lib/raid6/recov_s390xc.c | 7 | ||||
| -rw-r--r-- | lib/raid6/recov_ssse3.c | 6 | ||||
| -rw-r--r-- | lib/raid6/rvv.c | 1228 | ||||
| -rw-r--r-- | lib/raid6/rvv.h | 56 | ||||
| -rw-r--r-- | lib/raid6/s390vx.uc | 65 | ||||
| -rw-r--r-- | lib/raid6/test/.gitignore | 3 | ||||
| -rw-r--r-- | lib/raid6/test/Makefile | 67 |
24 files changed, 2638 insertions, 178 deletions
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 45e17619422b..5be0a4e60ab1 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -2,13 +2,15 @@ obj-$(CONFIG_RAID6_PQ) += raid6_pq.o raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ - int8.o int16.o int32.o + int8.o raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \ vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o +raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o +raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o hostprogs += mktables @@ -20,7 +22,7 @@ altivec_flags += -isystem $(shell $(CC) -print-file-name=include) ifdef CONFIG_CC_IS_CLANG # clang ppc port does not yet support -maltivec when -msoft-float is # enabled. A future release of clang will resolve this -# https://bugs.llvm.org/show_bug.cgi?id=31177 +# https://llvm.org/pr31177 CFLAGS_REMOVE_altivec1.o += -msoft-float CFLAGS_REMOVE_altivec2.o += -msoft-float CFLAGS_REMOVE_altivec4.o += -msoft-float @@ -32,29 +34,10 @@ CFLAGS_REMOVE_vpermxor8.o += -msoft-float endif endif -# The GCC option -ffreestanding is required in order to compile code containing -# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel) -ifeq ($(CONFIG_KERNEL_MODE_NEON),y) -NEON_FLAGS := -ffreestanding -# Enable <arm_neon.h> -NEON_FLAGS += -isystem $(shell $(CC) -print-file-name=include) -ifeq ($(ARCH),arm) -NEON_FLAGS += -march=armv7-a -mfloat-abi=softfp -mfpu=neon -endif -CFLAGS_recov_neon_inner.o += $(NEON_FLAGS) -ifeq ($(ARCH),arm64) -CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only -CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only -CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only -CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only -CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only -endif -endif - quiet_cmd_unroll = UNROLL $@ - cmd_unroll = $(AWK) -v N=$* -f $(srctree)/$(src)/unroll.awk < $< > $@ + cmd_unroll = $(AWK) -v N=$* -f $(src)/unroll.awk < $< > $@ -targets += int1.c int2.c int4.c int8.c int16.c int32.c +targets += int1.c int2.c int4.c int8.c $(obj)/int%.c: $(src)/int.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) @@ -74,10 +57,16 @@ targets += vpermxor1.c vpermxor2.c vpermxor4.c vpermxor8.c $(obj)/vpermxor%.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) -CFLAGS_neon1.o += $(NEON_FLAGS) -CFLAGS_neon2.o += $(NEON_FLAGS) -CFLAGS_neon4.o += $(NEON_FLAGS) -CFLAGS_neon8.o += $(NEON_FLAGS) +CFLAGS_neon1.o += $(CC_FLAGS_FPU) +CFLAGS_neon2.o += $(CC_FLAGS_FPU) +CFLAGS_neon4.o += $(CC_FLAGS_FPU) +CFLAGS_neon8.o += $(CC_FLAGS_FPU) +CFLAGS_recov_neon_inner.o += $(CC_FLAGS_FPU) +CFLAGS_REMOVE_neon1.o += $(CC_FLAGS_NO_FPU) +CFLAGS_REMOVE_neon2.o += $(CC_FLAGS_NO_FPU) +CFLAGS_REMOVE_neon4.o += $(CC_FLAGS_NO_FPU) +CFLAGS_REMOVE_neon8.o += $(CC_FLAGS_NO_FPU) +CFLAGS_REMOVE_recov_neon_inner.o += $(CC_FLAGS_NO_FPU) targets += neon1.c neon2.c neon4.c neon8.c $(obj)/neon%.c: $(src)/neon.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index a22a05c9af8a..799e0e5eac26 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -18,9 +18,6 @@ #else #include <linux/module.h> #include <linux/gfp.h> -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -EXPORT_SYMBOL(raid6_empty_zero_page); #endif struct raid6_calls raid6_call; @@ -28,10 +25,8 @@ EXPORT_SYMBOL_GPL(raid6_call); const struct raid6_calls * const raid6_algos[] = { #if defined(__i386__) && !defined(__arch_um__) -#ifdef CONFIG_AS_AVX512 &raid6_avx512x2, &raid6_avx512x1, -#endif &raid6_avx2x2, &raid6_avx2x1, &raid6_sse2x2, @@ -42,11 +37,9 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_mmxx1, #endif #if defined(__x86_64__) && !defined(__arch_um__) -#ifdef CONFIG_AS_AVX512 &raid6_avx512x4, &raid6_avx512x2, &raid6_avx512x1, -#endif &raid6_avx2x4, &raid6_avx2x2, &raid6_avx2x1, @@ -73,9 +66,19 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_neonx2, &raid6_neonx1, #endif -#if defined(__ia64__) - &raid6_intx32, - &raid6_intx16, +#ifdef CONFIG_LOONGARCH +#ifdef CONFIG_CPU_HAS_LASX + &raid6_lasx, +#endif +#ifdef CONFIG_CPU_HAS_LSX + &raid6_lsx, +#endif +#endif +#ifdef CONFIG_RISCV_ISA_V + &raid6_rvvx1, + &raid6_rvvx2, + &raid6_rvvx4, + &raid6_rvvx8, #endif &raid6_intx8, &raid6_intx4, @@ -92,9 +95,7 @@ EXPORT_SYMBOL_GPL(raid6_datap_recov); const struct raid6_recov_calls *const raid6_recov_algos[] = { #ifdef CONFIG_X86 -#ifdef CONFIG_AS_AVX512 &raid6_recov_avx512, -#endif &raid6_recov_avx2, &raid6_recov_ssse3, #endif @@ -104,6 +105,17 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = { #if defined(CONFIG_KERNEL_MODE_NEON) &raid6_recov_neon, #endif +#ifdef CONFIG_LOONGARCH +#ifdef CONFIG_CPU_HAS_LASX + &raid6_recov_lasx, +#endif +#ifdef CONFIG_CPU_HAS_LSX + &raid6_recov_lsx, +#endif +#endif +#ifdef CONFIG_RISCV_ISA_V + &raid6_recov_rvv, +#endif &raid6_recov_intx1, NULL }; diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c index 9c3e822e1adf..009bd0adeebf 100644 --- a/lib/raid6/avx512.c +++ b/lib/raid6/avx512.c @@ -17,8 +17,6 @@ * */ -#ifdef CONFIG_AS_AVX512 - #include <linux/raid/pq.h> #include "x86.h" @@ -560,5 +558,3 @@ const struct raid6_calls raid6_avx512x4 = { .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ }; #endif - -#endif /* CONFIG_AS_AVX512 */ diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc index 558aeac9342a..1ba56c3fa482 100644 --- a/lib/raid6/int.uc +++ b/lib/raid6/int.uc @@ -42,13 +42,6 @@ typedef u32 unative_t; /* - * IA-64 wants insane amounts of unrolling. On other architectures that - * is just a waste of space. - */ -#if ($# <= 8) || defined(__ia64__) - - -/* * These sub-operations are separate inlines since they can sometimes be * specially optimized using architecture-specific hacks. */ @@ -152,5 +145,3 @@ const struct raid6_calls raid6_intx$# = { "int" NSTRING "x$#", 0 }; - -#endif diff --git a/lib/raid6/loongarch.h b/lib/raid6/loongarch.h new file mode 100644 index 000000000000..acfc33ce7056 --- /dev/null +++ b/lib/raid6/loongarch.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2023 WANG Xuerui <git@xen0n.name> + * + * raid6/loongarch.h + * + * Definitions common to LoongArch RAID-6 code only + */ + +#ifndef _LIB_RAID6_LOONGARCH_H +#define _LIB_RAID6_LOONGARCH_H + +#ifdef __KERNEL__ + +#include <asm/cpu-features.h> +#include <asm/fpu.h> + +#else /* for user-space testing */ + +#include <sys/auxv.h> + +/* have to supply these defines for glibc 2.37- and musl */ +#ifndef HWCAP_LOONGARCH_LSX +#define HWCAP_LOONGARCH_LSX (1 << 4) +#endif +#ifndef HWCAP_LOONGARCH_LASX +#define HWCAP_LOONGARCH_LASX (1 << 5) +#endif + +#define kernel_fpu_begin() +#define kernel_fpu_end() + +#define cpu_has_lsx (getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LSX) +#define cpu_has_lasx (getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LASX) + +#endif /* __KERNEL__ */ + +#endif /* _LIB_RAID6_LOONGARCH_H */ diff --git a/lib/raid6/loongarch_simd.c b/lib/raid6/loongarch_simd.c new file mode 100644 index 000000000000..aa5d9f924ca3 --- /dev/null +++ b/lib/raid6/loongarch_simd.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX) + * + * Copyright 2023 WANG Xuerui <git@xen0n.name> + * + * Based on the generic RAID-6 code (int.uc): + * + * Copyright 2002-2004 H. Peter Anvin + */ + +#include <linux/raid/pq.h> +#include "loongarch.h" + +/* + * The vector algorithms are currently priority 0, which means the generic + * scalar algorithms are not being disabled if vector support is present. + * This is like the similar LoongArch RAID5 XOR code, with the main reason + * repeated here: it cannot be ruled out at this point of time, that some + * future (maybe reduced) models could run the vector algorithms slower than + * the scalar ones, maybe for errata or micro-op reasons. It may be + * appropriate to revisit this after one or two more uarch generations. + */ + +#ifdef CONFIG_CPU_HAS_LSX +#define NSIZE 16 + +static int raid6_has_lsx(void) +{ + return cpu_has_lsx; +} + +static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $vr0, $vr1, $vr2, $vr3: wp + * $vr4, $vr5, $vr6, $vr7: wq + * $vr8, $vr9, $vr10, $vr11: wd + * $vr12, $vr13, $vr14, $vr15: w2 + * $vr16, $vr17, $vr18, $vr19: w1 + */ + for (d = 0; d < bytes; d += NSIZE*4) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE])); + asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE])); + asm volatile("vori.b $vr4, $vr0, 0"); + asm volatile("vori.b $vr5, $vr1, 0"); + asm volatile("vori.b $vr6, $vr2, 0"); + asm volatile("vori.b $vr7, $vr3, 0"); + for (z = z0-1; z >= 0; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE])); + asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE])); + asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, $vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("vxor.v $vr16, $vr16, $vr12"); + asm volatile("vxor.v $vr17, $vr17, $vr13"); + asm volatile("vxor.v $vr18, $vr18, $vr14"); + asm volatile("vxor.v $vr19, $vr19, $vr15"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr8"); + asm volatile("vxor.v $vr5, $vr17, $vr9"); + asm volatile("vxor.v $vr6, $vr18, $vr10"); + asm volatile("vxor.v $vr7, $vr19, $vr11"); + } + /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */ + asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0])); + asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1])); + asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2])); + asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3])); + /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */ + asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0])); + asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1])); + asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2])); + asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3])); + } + + kernel_fpu_end(); +} + +static void raid6_lsx_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $vr0, $vr1, $vr2, $vr3: wp + * $vr4, $vr5, $vr6, $vr7: wq + * $vr8, $vr9, $vr10, $vr11: wd + * $vr12, $vr13, $vr14, $vr15: w2 + * $vr16, $vr17, $vr18, $vr19: w1 + */ + for (d = 0; d < bytes; d += NSIZE*4) { + /* P/Q data pages */ + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE])); + asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE])); + asm volatile("vori.b $vr4, $vr0, 0"); + asm volatile("vori.b $vr5, $vr1, 0"); + asm volatile("vori.b $vr6, $vr2, 0"); + asm volatile("vori.b $vr7, $vr3, 0"); + for (z = z0-1; z >= start; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE])); + asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE])); + asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, $vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("vxor.v $vr16, $vr16, $vr12"); + asm volatile("vxor.v $vr17, $vr17, $vr13"); + asm volatile("vxor.v $vr18, $vr18, $vr14"); + asm volatile("vxor.v $vr19, $vr19, $vr15"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr8"); + asm volatile("vxor.v $vr5, $vr17, $vr9"); + asm volatile("vxor.v $vr6, $vr18, $vr10"); + asm volatile("vxor.v $vr7, $vr19, $vr11"); + } + + /* P/Q left side optimization */ + for (z = start-1; z >= 0; z--) { + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, $vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* wq$$ = w1$$ ^ w2$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr12"); + asm volatile("vxor.v $vr5, $vr17, $vr13"); + asm volatile("vxor.v $vr6, $vr18, $vr14"); + asm volatile("vxor.v $vr7, $vr19, $vr15"); + } + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + */ + asm volatile( + "vld $vr20, %0\n\t" + "vld $vr21, %1\n\t" + "vld $vr22, %2\n\t" + "vld $vr23, %3\n\t" + "vld $vr24, %4\n\t" + "vld $vr25, %5\n\t" + "vld $vr26, %6\n\t" + "vld $vr27, %7\n\t" + "vxor.v $vr20, $vr20, $vr0\n\t" + "vxor.v $vr21, $vr21, $vr1\n\t" + "vxor.v $vr22, $vr22, $vr2\n\t" + "vxor.v $vr23, $vr23, $vr3\n\t" + "vxor.v $vr24, $vr24, $vr4\n\t" + "vxor.v $vr25, $vr25, $vr5\n\t" + "vxor.v $vr26, $vr26, $vr6\n\t" + "vxor.v $vr27, $vr27, $vr7\n\t" + "vst $vr20, %0\n\t" + "vst $vr21, %1\n\t" + "vst $vr22, %2\n\t" + "vst $vr23, %3\n\t" + "vst $vr24, %4\n\t" + "vst $vr25, %5\n\t" + "vst $vr26, %6\n\t" + "vst $vr27, %7\n\t" + : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]), + "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]), + "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]), + "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3]) + ); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_lsx = { + raid6_lsx_gen_syndrome, + raid6_lsx_xor_syndrome, + raid6_has_lsx, + "lsx", + .priority = 0 /* see the comment near the top of the file for reason */ +}; + +#undef NSIZE +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX +#define NSIZE 32 + +static int raid6_has_lasx(void) +{ + return cpu_has_lasx; +} + +static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $xr0, $xr1: wp + * $xr2, $xr3: wq + * $xr4, $xr5: wd + * $xr6, $xr7: w2 + * $xr8, $xr9: w1 + */ + for (d = 0; d < bytes; d += NSIZE*2) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("xvori.b $xr2, $xr0, 0"); + asm volatile("xvori.b $xr3, $xr1, 0"); + for (z = z0-1; z >= 0; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("xvxor.v $xr8, $xr8, $xr6"); + asm volatile("xvxor.v $xr9, $xr9, $xr7"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr4"); + asm volatile("xvxor.v $xr3, $xr9, $xr5"); + } + /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */ + asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0])); + asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1])); + /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */ + asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0])); + asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1])); + } + + kernel_fpu_end(); +} + +static void raid6_lasx_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $xr0, $xr1: wp + * $xr2, $xr3: wq + * $xr4, $xr5: wd + * $xr6, $xr7: w2 + * $xr8, $xr9: w1 + */ + for (d = 0; d < bytes; d += NSIZE*2) { + /* P/Q data pages */ + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("xvori.b $xr2, $xr0, 0"); + asm volatile("xvori.b $xr3, $xr1, 0"); + for (z = z0-1; z >= start; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("xvxor.v $xr8, $xr8, $xr6"); + asm volatile("xvxor.v $xr9, $xr9, $xr7"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr4"); + asm volatile("xvxor.v $xr3, $xr9, $xr5"); + } + + /* P/Q left side optimization */ + for (z = start-1; z >= 0; z--) { + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* wq$$ = w1$$ ^ w2$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr6"); + asm volatile("xvxor.v $xr3, $xr9, $xr7"); + } + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + */ + asm volatile( + "xvld $xr10, %0\n\t" + "xvld $xr11, %1\n\t" + "xvld $xr12, %2\n\t" + "xvld $xr13, %3\n\t" + "xvxor.v $xr10, $xr10, $xr0\n\t" + "xvxor.v $xr11, $xr11, $xr1\n\t" + "xvxor.v $xr12, $xr12, $xr2\n\t" + "xvxor.v $xr13, $xr13, $xr3\n\t" + "xvst $xr10, %0\n\t" + "xvst $xr11, %1\n\t" + "xvst $xr12, %2\n\t" + "xvst $xr13, %3\n\t" + : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]), + "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]) + ); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_lasx = { + raid6_lasx_gen_syndrome, + raid6_lasx_xor_syndrome, + raid6_has_lasx, + "lasx", + .priority = 0 /* see the comment near the top of the file for reason */ +}; +#undef NSIZE +#endif /* CONFIG_CPU_HAS_LASX */ diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c index f02e10fa6238..3be03793237c 100644 --- a/lib/raid6/mktables.c +++ b/lib/raid6/mktables.c @@ -56,7 +56,9 @@ int main(int argc, char *argv[]) uint8_t v; uint8_t exptbl[256], invtbl[256]; + printf("#ifdef __KERNEL__\n"); printf("#include <linux/export.h>\n"); + printf("#endif\n"); printf("#include <linux/raid/pq.h>\n"); /* Compute multiplication table */ diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c index 0a2e76035ea9..6d9474ce6da9 100644 --- a/lib/raid6/neon.c +++ b/lib/raid6/neon.c @@ -8,10 +8,9 @@ #include <linux/raid/pq.h> #ifdef __KERNEL__ -#include <asm/neon.h> +#include <asm/simd.h> #else -#define kernel_neon_begin() -#define kernel_neon_end() +#define scoped_ksimd() #define cpu_has_neon() (1) #endif @@ -32,10 +31,9 @@ { \ void raid6_neon ## _n ## _gen_syndrome_real(int, \ unsigned long, void**); \ - kernel_neon_begin(); \ - raid6_neon ## _n ## _gen_syndrome_real(disks, \ + scoped_ksimd() \ + raid6_neon ## _n ## _gen_syndrome_real(disks, \ (unsigned long)bytes, ptrs); \ - kernel_neon_end(); \ } \ static void raid6_neon ## _n ## _xor_syndrome(int disks, \ int start, int stop, \ @@ -43,10 +41,9 @@ { \ void raid6_neon ## _n ## _xor_syndrome_real(int, \ int, int, unsigned long, void**); \ - kernel_neon_begin(); \ - raid6_neon ## _n ## _xor_syndrome_real(disks, \ - start, stop, (unsigned long)bytes, ptrs); \ - kernel_neon_end(); \ + scoped_ksimd() \ + raid6_neon ## _n ## _xor_syndrome_real(disks, \ + start, stop, (unsigned long)bytes, ptrs);\ } \ struct raid6_calls const raid6_neonx ## _n = { \ raid6_neon ## _n ## _gen_syndrome, \ diff --git a/lib/raid6/neon.h b/lib/raid6/neon.h new file mode 100644 index 000000000000..2ca41ee9b499 --- /dev/null +++ b/lib/raid6/neon.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only + +void raid6_neon1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon1_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon2_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon4_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon8_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, + uint8_t *dq, const uint8_t *pbmul, + const uint8_t *qmul); + +void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, + const uint8_t *qmul); + + diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc index b7c68030da4f..355270af0cd6 100644 --- a/lib/raid6/neon.uc +++ b/lib/raid6/neon.uc @@ -25,6 +25,7 @@ */ #include <arm_neon.h> +#include "neon.h" typedef uint8x16_t unative_t; diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index e49d519de6cb..b5e47c008b41 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c @@ -13,7 +13,6 @@ * the syndrome.) */ -#include <linux/export.h> #include <linux/raid/pq.h> /* Recover two failed data blocks. */ @@ -32,10 +31,10 @@ static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -73,7 +72,7 @@ static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c index 4e8095403ee2..97d598d2535c 100644 --- a/lib/raid6/recov_avx2.c +++ b/lib/raid6/recov_avx2.c @@ -28,10 +28,10 @@ static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -196,7 +196,7 @@ static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c index fd9e15bf3f30..7986120ca444 100644 --- a/lib/raid6/recov_avx512.c +++ b/lib/raid6/recov_avx512.c @@ -6,8 +6,6 @@ * Author: Megha Dey <megha.dey@linux.intel.com> */ -#ifdef CONFIG_AS_AVX512 - #include <linux/raid/pq.h> #include "x86.h" @@ -39,10 +37,10 @@ static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila, */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -240,7 +238,7 @@ static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila, */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -377,7 +375,3 @@ const struct raid6_recov_calls raid6_recov_avx512 = { #endif .priority = 3, }; - -#else -#warning "your version of binutils lacks AVX512 support" -#endif diff --git a/lib/raid6/recov_loongarch_simd.c b/lib/raid6/recov_loongarch_simd.c new file mode 100644 index 000000000000..93dc515997a1 --- /dev/null +++ b/lib/raid6/recov_loongarch_simd.c @@ -0,0 +1,513 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX) + * + * Copyright (C) 2023 WANG Xuerui <git@xen0n.name> + * + * Originally based on recov_avx2.c and recov_ssse3.c: + * + * Copyright (C) 2012 Intel Corporation + * Author: Jim Kukunas <james.t.kukunas@linux.intel.com> + */ + +#include <linux/raid/pq.h> +#include "loongarch.h" + +/* + * Unlike with the syndrome calculation algorithms, there's no boot-time + * selection of recovery algorithms by benchmarking, so we have to specify + * the priorities and hope the future cores will all have decent vector + * support (i.e. no LASX slower than LSX, or even scalar code). + */ + +#ifdef CONFIG_CPU_HAS_LSX +static int raid6_has_lsx(void) +{ + return cpu_has_lsx; +} + +static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* + * vr20, vr21: qmul + * vr22, vr23: pbmul + */ + asm volatile("vld $vr20, %0" : : "m" (qmul[0])); + asm volatile("vld $vr21, %0" : : "m" (qmul[16])); + asm volatile("vld $vr22, %0" : : "m" (pbmul[0])); + asm volatile("vld $vr23, %0" : : "m" (pbmul[16])); + + while (bytes) { + /* vr4 - vr7: Q */ + asm volatile("vld $vr4, %0" : : "m" (q[0])); + asm volatile("vld $vr5, %0" : : "m" (q[16])); + asm volatile("vld $vr6, %0" : : "m" (q[32])); + asm volatile("vld $vr7, %0" : : "m" (q[48])); + /* vr4 - vr7: Q + Qxy */ + asm volatile("vld $vr8, %0" : : "m" (dq[0])); + asm volatile("vld $vr9, %0" : : "m" (dq[16])); + asm volatile("vld $vr10, %0" : : "m" (dq[32])); + asm volatile("vld $vr11, %0" : : "m" (dq[48])); + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + /* vr0 - vr3: P */ + asm volatile("vld $vr0, %0" : : "m" (p[0])); + asm volatile("vld $vr1, %0" : : "m" (p[16])); + asm volatile("vld $vr2, %0" : : "m" (p[32])); + asm volatile("vld $vr3, %0" : : "m" (p[48])); + /* vr0 - vr3: P + Pxy */ + asm volatile("vld $vr8, %0" : : "m" (dp[0])); + asm volatile("vld $vr9, %0" : : "m" (dp[16])); + asm volatile("vld $vr10, %0" : : "m" (dp[32])); + asm volatile("vld $vr11, %0" : : "m" (dp[48])); + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + + /* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */ + asm volatile("vsrli.b $vr8, $vr4, 4"); + asm volatile("vsrli.b $vr9, $vr5, 4"); + asm volatile("vsrli.b $vr10, $vr6, 4"); + asm volatile("vsrli.b $vr11, $vr7, 4"); + /* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */ + asm volatile("vandi.b $vr4, $vr4, 0x0f"); + asm volatile("vandi.b $vr5, $vr5, 0x0f"); + asm volatile("vandi.b $vr6, $vr6, 0x0f"); + asm volatile("vandi.b $vr7, $vr7, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4"); + asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5"); + asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6"); + asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7"); + /* lookup from qmul[16] */ + asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8"); + asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9"); + asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10"); + asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11"); + /* vr16 - vr19: B(Q + Qxy) */ + asm volatile("vxor.v $vr16, $vr8, $vr4"); + asm volatile("vxor.v $vr17, $vr9, $vr5"); + asm volatile("vxor.v $vr18, $vr10, $vr6"); + asm volatile("vxor.v $vr19, $vr11, $vr7"); + + /* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */ + asm volatile("vsrli.b $vr4, $vr0, 4"); + asm volatile("vsrli.b $vr5, $vr1, 4"); + asm volatile("vsrli.b $vr6, $vr2, 4"); + asm volatile("vsrli.b $vr7, $vr3, 4"); + /* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */ + asm volatile("vandi.b $vr12, $vr0, 0x0f"); + asm volatile("vandi.b $vr13, $vr1, 0x0f"); + asm volatile("vandi.b $vr14, $vr2, 0x0f"); + asm volatile("vandi.b $vr15, $vr3, 0x0f"); + /* lookup from pbmul[0] */ + asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12"); + asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13"); + asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14"); + asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15"); + /* lookup from pbmul[16] */ + asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4"); + asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5"); + asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6"); + asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7"); + /* vr4 - vr7: A(P + Pxy) */ + asm volatile("vxor.v $vr4, $vr4, $vr12"); + asm volatile("vxor.v $vr5, $vr5, $vr13"); + asm volatile("vxor.v $vr6, $vr6, $vr14"); + asm volatile("vxor.v $vr7, $vr7, $vr15"); + + /* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx */ + asm volatile("vxor.v $vr4, $vr4, $vr16"); + asm volatile("vxor.v $vr5, $vr5, $vr17"); + asm volatile("vxor.v $vr6, $vr6, $vr18"); + asm volatile("vxor.v $vr7, $vr7, $vr19"); + asm volatile("vst $vr4, %0" : "=m" (dq[0])); + asm volatile("vst $vr5, %0" : "=m" (dq[16])); + asm volatile("vst $vr6, %0" : "=m" (dq[32])); + asm volatile("vst $vr7, %0" : "=m" (dq[48])); + + /* vr0 - vr3: P + Pxy + Dx = Dy */ + asm volatile("vxor.v $vr0, $vr0, $vr4"); + asm volatile("vxor.v $vr1, $vr1, $vr5"); + asm volatile("vxor.v $vr2, $vr2, $vr6"); + asm volatile("vxor.v $vr3, $vr3, $vr7"); + asm volatile("vst $vr0, %0" : "=m" (dp[0])); + asm volatile("vst $vr1, %0" : "=m" (dp[16])); + asm volatile("vst $vr2, %0" : "=m" (dp[32])); + asm volatile("vst $vr3, %0" : "=m" (dp[48])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + /* vr22, vr23: qmul */ + asm volatile("vld $vr22, %0" : : "m" (qmul[0])); + asm volatile("vld $vr23, %0" : : "m" (qmul[16])); + + while (bytes) { + /* vr0 - vr3: P + Dx */ + asm volatile("vld $vr0, %0" : : "m" (p[0])); + asm volatile("vld $vr1, %0" : : "m" (p[16])); + asm volatile("vld $vr2, %0" : : "m" (p[32])); + asm volatile("vld $vr3, %0" : : "m" (p[48])); + /* vr4 - vr7: Qx */ + asm volatile("vld $vr4, %0" : : "m" (dq[0])); + asm volatile("vld $vr5, %0" : : "m" (dq[16])); + asm volatile("vld $vr6, %0" : : "m" (dq[32])); + asm volatile("vld $vr7, %0" : : "m" (dq[48])); + /* vr4 - vr7: Q + Qx */ + asm volatile("vld $vr8, %0" : : "m" (q[0])); + asm volatile("vld $vr9, %0" : : "m" (q[16])); + asm volatile("vld $vr10, %0" : : "m" (q[32])); + asm volatile("vld $vr11, %0" : : "m" (q[48])); + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + + /* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */ + asm volatile("vsrli.b $vr8, $vr4, 4"); + asm volatile("vsrli.b $vr9, $vr5, 4"); + asm volatile("vsrli.b $vr10, $vr6, 4"); + asm volatile("vsrli.b $vr11, $vr7, 4"); + /* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */ + asm volatile("vandi.b $vr4, $vr4, 0x0f"); + asm volatile("vandi.b $vr5, $vr5, 0x0f"); + asm volatile("vandi.b $vr6, $vr6, 0x0f"); + asm volatile("vandi.b $vr7, $vr7, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4"); + asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5"); + asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6"); + asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7"); + /* lookup from qmul[16] */ + asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8"); + asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9"); + asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10"); + asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11"); + /* vr4 - vr7: qmul(Q + Qx) = Dx */ + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + asm volatile("vst $vr4, %0" : "=m" (dq[0])); + asm volatile("vst $vr5, %0" : "=m" (dq[16])); + asm volatile("vst $vr6, %0" : "=m" (dq[32])); + asm volatile("vst $vr7, %0" : "=m" (dq[48])); + + /* vr0 - vr3: P + Dx + Dx = P */ + asm volatile("vxor.v $vr0, $vr0, $vr4"); + asm volatile("vxor.v $vr1, $vr1, $vr5"); + asm volatile("vxor.v $vr2, $vr2, $vr6"); + asm volatile("vxor.v $vr3, $vr3, $vr7"); + asm volatile("vst $vr0, %0" : "=m" (p[0])); + asm volatile("vst $vr1, %0" : "=m" (p[16])); + asm volatile("vst $vr2, %0" : "=m" (p[32])); + asm volatile("vst $vr3, %0" : "=m" (p[48])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_lsx = { + .data2 = raid6_2data_recov_lsx, + .datap = raid6_datap_recov_lsx, + .valid = raid6_has_lsx, + .name = "lsx", + .priority = 1, +}; +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX +static int raid6_has_lasx(void) +{ + return cpu_has_lasx; +} + +static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* + * xr20, xr21: qmul + * xr22, xr23: pbmul + */ + asm volatile("vld $vr20, %0" : : "m" (qmul[0])); + asm volatile("vld $vr21, %0" : : "m" (qmul[16])); + asm volatile("vld $vr22, %0" : : "m" (pbmul[0])); + asm volatile("vld $vr23, %0" : : "m" (pbmul[16])); + asm volatile("xvreplve0.q $xr20, $xr20"); + asm volatile("xvreplve0.q $xr21, $xr21"); + asm volatile("xvreplve0.q $xr22, $xr22"); + asm volatile("xvreplve0.q $xr23, $xr23"); + + while (bytes) { + /* xr0, xr1: Q */ + asm volatile("xvld $xr0, %0" : : "m" (q[0])); + asm volatile("xvld $xr1, %0" : : "m" (q[32])); + /* xr0, xr1: Q + Qxy */ + asm volatile("xvld $xr4, %0" : : "m" (dq[0])); + asm volatile("xvld $xr5, %0" : : "m" (dq[32])); + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* xr2, xr3: P */ + asm volatile("xvld $xr2, %0" : : "m" (p[0])); + asm volatile("xvld $xr3, %0" : : "m" (p[32])); + /* xr2, xr3: P + Pxy */ + asm volatile("xvld $xr4, %0" : : "m" (dp[0])); + asm volatile("xvld $xr5, %0" : : "m" (dp[32])); + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */ + asm volatile("xvsrli.b $xr4, $xr0, 4"); + asm volatile("xvsrli.b $xr5, $xr1, 4"); + /* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */ + asm volatile("xvandi.b $xr0, $xr0, 0x0f"); + asm volatile("xvandi.b $xr1, $xr1, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0"); + asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1"); + /* lookup from qmul[16] */ + asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4"); + asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5"); + /* xr6, xr7: B(Q + Qxy) */ + asm volatile("xvxor.v $xr6, $xr4, $xr0"); + asm volatile("xvxor.v $xr7, $xr5, $xr1"); + + /* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */ + asm volatile("xvsrli.b $xr4, $xr2, 4"); + asm volatile("xvsrli.b $xr5, $xr3, 4"); + /* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */ + asm volatile("xvandi.b $xr0, $xr2, 0x0f"); + asm volatile("xvandi.b $xr1, $xr3, 0x0f"); + /* lookup from pbmul[0] */ + asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0"); + asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1"); + /* lookup from pbmul[16] */ + asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4"); + asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5"); + /* xr0, xr1: A(P + Pxy) */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + + /* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */ + asm volatile("xvxor.v $xr0, $xr0, $xr6"); + asm volatile("xvxor.v $xr1, $xr1, $xr7"); + + /* xr2, xr3: P + Pxy + Dx = Dy */ + asm volatile("xvxor.v $xr2, $xr2, $xr0"); + asm volatile("xvxor.v $xr3, $xr3, $xr1"); + + asm volatile("xvst $xr0, %0" : "=m" (dq[0])); + asm volatile("xvst $xr1, %0" : "=m" (dq[32])); + asm volatile("xvst $xr2, %0" : "=m" (dp[0])); + asm volatile("xvst $xr3, %0" : "=m" (dp[32])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + /* xr22, xr23: qmul */ + asm volatile("vld $vr22, %0" : : "m" (qmul[0])); + asm volatile("xvreplve0.q $xr22, $xr22"); + asm volatile("vld $vr23, %0" : : "m" (qmul[16])); + asm volatile("xvreplve0.q $xr23, $xr23"); + + while (bytes) { + /* xr0, xr1: P + Dx */ + asm volatile("xvld $xr0, %0" : : "m" (p[0])); + asm volatile("xvld $xr1, %0" : : "m" (p[32])); + /* xr2, xr3: Qx */ + asm volatile("xvld $xr2, %0" : : "m" (dq[0])); + asm volatile("xvld $xr3, %0" : : "m" (dq[32])); + /* xr2, xr3: Q + Qx */ + asm volatile("xvld $xr4, %0" : : "m" (q[0])); + asm volatile("xvld $xr5, %0" : : "m" (q[32])); + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */ + asm volatile("xvsrli.b $xr4, $xr2, 4"); + asm volatile("xvsrli.b $xr5, $xr3, 4"); + /* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */ + asm volatile("xvandi.b $xr2, $xr2, 0x0f"); + asm volatile("xvandi.b $xr3, $xr3, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2"); + asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3"); + /* lookup from qmul[16] */ + asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4"); + asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5"); + /* xr2, xr3: qmul(Q + Qx) = Dx */ + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr0, xr1: P + Dx + Dx = P */ + asm volatile("xvxor.v $xr0, $xr0, $xr2"); + asm volatile("xvxor.v $xr1, $xr1, $xr3"); + + asm volatile("xvst $xr2, %0" : "=m" (dq[0])); + asm volatile("xvst $xr3, %0" : "=m" (dq[32])); + asm volatile("xvst $xr0, %0" : "=m" (p[0])); + asm volatile("xvst $xr1, %0" : "=m" (p[32])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_lasx = { + .data2 = raid6_2data_recov_lasx, + .datap = raid6_datap_recov_lasx, + .valid = raid6_has_lasx, + .name = "lasx", + .priority = 2, +}; +#endif /* CONFIG_CPU_HAS_LASX */ diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c index d6fba8bf8c0a..9d99aeabd31a 100644 --- a/lib/raid6/recov_neon.c +++ b/lib/raid6/recov_neon.c @@ -7,10 +7,10 @@ #include <linux/raid/pq.h> #ifdef __KERNEL__ -#include <asm/neon.h> +#include <asm/simd.h> +#include "neon.h" #else -#define kernel_neon_begin() -#define kernel_neon_end() +#define scoped_ksimd() #define cpu_has_neon() (1) #endif @@ -19,13 +19,6 @@ static int raid6_has_neon(void) return cpu_has_neon(); } -void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, - uint8_t *dq, const uint8_t *pbmul, - const uint8_t *qmul); - -void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, - const uint8_t *qmul); - static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, int failb, void **ptrs) { @@ -42,10 +35,10 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -61,9 +54,8 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]]; - kernel_neon_begin(); - __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul); - kernel_neon_end(); + scoped_ksimd() + __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul); } static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, @@ -80,7 +72,7 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, * Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -92,9 +84,8 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, /* Now, pick the proper data tables */ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; - kernel_neon_begin(); - __raid6_datap_recov_neon(bytes, p, q, dq, qmul); - kernel_neon_end(); + scoped_ksimd() + __raid6_datap_recov_neon(bytes, p, q, dq, qmul); } const struct raid6_recov_calls raid6_recov_neon = { diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c index 90eb80d43790..f9e7e8f5a151 100644 --- a/lib/raid6/recov_neon_inner.c +++ b/lib/raid6/recov_neon_inner.c @@ -5,6 +5,7 @@ */ #include <arm_neon.h> +#include "neon.h" #ifdef CONFIG_ARM /* diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c new file mode 100644 index 000000000000..40c393206b6a --- /dev/null +++ b/lib/raid6/recov_rvv.c @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2024 Institute of Software, CAS. + * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn> + */ + +#include <linux/raid/pq.h> +#include "rvv.h" + +static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp, + u8 *dq, const u8 *pbmul, + const u8 *qmul) +{ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli x0, %[avl], e8, m1, ta, ma\n" + ".option pop\n" + : : + [avl]"r"(16) + ); + + /* + * while ( bytes-- ) { + * uint8_t px, qx, db; + * + * px = *p ^ *dp; + * qx = qmul[*q ^ *dq]; + * *dq++ = db = pbmul[px] ^ qx; + * *dp++ = db ^ px; + * p++; q++; + * } + */ + while (bytes) { + /* + * v0:px, v1:dp, + * v2:qx, v3:dq, + * v4:vx, v5:vy, + * v6:qm0, v7:qm1, + * v8:pm0, v9:pm1, + * v14:p/qm[vx], v15:p/qm[vy] + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[px])\n" + "vle8.v v1, (%[dp])\n" + "vxor.vv v0, v0, v1\n" + "vle8.v v2, (%[qx])\n" + "vle8.v v3, (%[dq])\n" + "vxor.vv v4, v2, v3\n" + "vsrl.vi v5, v4, 4\n" + "vand.vi v4, v4, 0xf\n" + "vle8.v v6, (%[qm0])\n" + "vle8.v v7, (%[qm1])\n" + "vrgather.vv v14, v6, v4\n" /* v14 = qm[vx] */ + "vrgather.vv v15, v7, v5\n" /* v15 = qm[vy] */ + "vxor.vv v2, v14, v15\n" /* v2 = qmul[*q ^ *dq] */ + + "vsrl.vi v5, v0, 4\n" + "vand.vi v4, v0, 0xf\n" + "vle8.v v8, (%[pm0])\n" + "vle8.v v9, (%[pm1])\n" + "vrgather.vv v14, v8, v4\n" /* v14 = pm[vx] */ + "vrgather.vv v15, v9, v5\n" /* v15 = pm[vy] */ + "vxor.vv v4, v14, v15\n" /* v4 = pbmul[px] */ + "vxor.vv v3, v4, v2\n" /* v3 = db = pbmul[px] ^ qx */ + "vxor.vv v1, v3, v0\n" /* v1 = db ^ px; */ + "vse8.v v3, (%[dq])\n" + "vse8.v v1, (%[dp])\n" + ".option pop\n" + : : + [px]"r"(p), + [dp]"r"(dp), + [qx]"r"(q), + [dq]"r"(dq), + [qm0]"r"(qmul), + [qm1]"r"(qmul + 16), + [pm0]"r"(pbmul), + [pm1]"r"(pbmul + 16) + :); + + bytes -= 16; + p += 16; + q += 16; + dp += 16; + dq += 16; + } +} + +static void __raid6_datap_recov_rvv(int bytes, u8 *p, u8 *q, + u8 *dq, const u8 *qmul) +{ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli x0, %[avl], e8, m1, ta, ma\n" + ".option pop\n" + : : + [avl]"r"(16) + ); + + /* + * while (bytes--) { + * *p++ ^= *dq = qmul[*q ^ *dq]; + * q++; dq++; + * } + */ + while (bytes) { + /* + * v0:vx, v1:vy, + * v2:dq, v3:p, + * v4:qm0, v5:qm1, + * v10:m[vx], v11:m[vy] + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[vx])\n" + "vle8.v v2, (%[dq])\n" + "vxor.vv v0, v0, v2\n" + "vsrl.vi v1, v0, 4\n" + "vand.vi v0, v0, 0xf\n" + "vle8.v v4, (%[qm0])\n" + "vle8.v v5, (%[qm1])\n" + "vrgather.vv v10, v4, v0\n" + "vrgather.vv v11, v5, v1\n" + "vxor.vv v0, v10, v11\n" + "vle8.v v1, (%[vy])\n" + "vxor.vv v1, v0, v1\n" + "vse8.v v0, (%[dq])\n" + "vse8.v v1, (%[vy])\n" + ".option pop\n" + : : + [vx]"r"(q), + [vy]"r"(p), + [dq]"r"(dq), + [qm0]"r"(qmul), + [qm1]"r"(qmul + 16) + :); + + bytes -= 16; + p += 16; + q += 16; + dq += 16; + } +} + +static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_vector_begin(); + __raid6_2data_recov_rvv(bytes, p, q, dp, dq, pbmul, qmul); + kernel_vector_end(); +} + +static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_vector_begin(); + __raid6_datap_recov_rvv(bytes, p, q, dq, qmul); + kernel_vector_end(); +} + +const struct raid6_recov_calls raid6_recov_rvv = { + .data2 = raid6_2data_recov_rvv, + .datap = raid6_datap_recov_rvv, + .valid = rvv_has_vector, + .name = "rvv", + .priority = 1, +}; diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c index 179eec900cea..487018f81192 100644 --- a/lib/raid6/recov_s390xc.c +++ b/lib/raid6/recov_s390xc.c @@ -6,7 +6,6 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/export.h> #include <linux/raid/pq.h> static inline void xor_block(u8 *p1, u8 *p2) @@ -35,10 +34,10 @@ static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -82,7 +81,7 @@ static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c index 4bfa3c6b60de..2e849185c32b 100644 --- a/lib/raid6/recov_ssse3.c +++ b/lib/raid6/recov_ssse3.c @@ -30,10 +30,10 @@ static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -203,7 +203,7 @@ static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c new file mode 100644 index 000000000000..75c9dafedb28 --- /dev/null +++ b/lib/raid6/rvv.c @@ -0,0 +1,1228 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RAID-6 syndrome calculation using RISC-V vector instructions + * + * Copyright 2024 Institute of Software, CAS. + * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn> + * + * Based on neon.uc: + * Copyright 2002-2004 H. Peter Anvin + */ + +#include "rvv.h" + +#ifdef __riscv_vector +#error "This code must be built without compiler support for vector" +#endif + +static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */ + for (d = 0; d < bytes; d += nsize * 1) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]) + ); + + for (z = z0 - 1 ; z >= 0 ; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]) + ); + } +} + +static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */ + for (d = 0 ; d < bytes ; d += nsize * 1) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]) + ); + } +} + +static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + */ + for (d = 0; d < bytes; d += nsize * 2) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]) + ); + + for (z = z0 - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + "vse8.v v4, (%[wp1])\n" + "vse8.v v5, (%[wq1])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]) + ); + } +} + +static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + */ + for (d = 0; d < bytes; d += nsize * 2) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v5, v7, v6\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + * v4:wp1, v5:wq1, v6:p1, v7:q1 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + + "vle8.v v6, (%[wp1])\n" + "vle8.v v7, (%[wq1])\n" + "vxor.vv v6, v6, v4\n" + "vxor.vv v7, v7, v5\n" + "vse8.v v6, (%[wp1])\n" + "vse8.v v7, (%[wq1])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]) + ); + } +} + +static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + */ + for (d = 0; d < bytes; d += nsize * 4) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + "vle8.v v8, (%[wp2])\n" + "vmv.v.v v9, v8\n" + "vle8.v v12, (%[wp3])\n" + "vmv.v.v v13, v12\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]), + [wp2]"r"(&dptr[z0][d + 2 * nsize]), + [wp3]"r"(&dptr[z0][d + 3 * nsize]) + ); + + for (z = z0 - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [wd2]"r"(&dptr[z][d + 2 * nsize]), + [wd3]"r"(&dptr[z][d + 3 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + "vse8.v v4, (%[wp1])\n" + "vse8.v v5, (%[wq1])\n" + "vse8.v v8, (%[wp2])\n" + "vse8.v v9, (%[wq2])\n" + "vse8.v v12, (%[wp3])\n" + "vse8.v v13, (%[wq3])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]), + [wp2]"r"(&p[d + nsize * 2]), + [wq2]"r"(&q[d + nsize * 2]), + [wp3]"r"(&p[d + nsize * 3]), + [wq3]"r"(&q[d + nsize * 3]) + ); + } +} + +static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + */ + for (d = 0; d < bytes; d += nsize * 4) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + "vle8.v v8, (%[wp2])\n" + "vmv.v.v v9, v8\n" + "vle8.v v12, (%[wp3])\n" + "vmv.v.v v13, v12\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]), + [wp2]"r"(&dptr[z0][d + 2 * nsize]), + [wp3]"r"(&dptr[z0][d + 3 * nsize]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [wd2]"r"(&dptr[z][d + 2 * nsize]), + [wd3]"r"(&dptr[z][d + 3 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v5, v7, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v9, v11, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v13, v15, v14\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + * v4:wp1, v5:wq1, v6:p1, v7:q1 + * v8:wp2, v9:wq2, v10:p2, v11:q2 + * v12:wp3, v13:wq3, v14:p3, v15:q3 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + + "vle8.v v6, (%[wp1])\n" + "vle8.v v7, (%[wq1])\n" + "vxor.vv v6, v6, v4\n" + "vxor.vv v7, v7, v5\n" + "vse8.v v6, (%[wp1])\n" + "vse8.v v7, (%[wq1])\n" + + "vle8.v v10, (%[wp2])\n" + "vle8.v v11, (%[wq2])\n" + "vxor.vv v10, v10, v8\n" + "vxor.vv v11, v11, v9\n" + "vse8.v v10, (%[wp2])\n" + "vse8.v v11, (%[wq2])\n" + + "vle8.v v14, (%[wp3])\n" + "vle8.v v15, (%[wq3])\n" + "vxor.vv v14, v14, v12\n" + "vxor.vv v15, v15, v13\n" + "vse8.v v14, (%[wp3])\n" + "vse8.v v15, (%[wq3])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]), + [wp2]"r"(&p[d + nsize * 2]), + [wq2]"r"(&q[d + nsize * 2]), + [wp3]"r"(&p[d + nsize * 3]), + [wq3]"r"(&q[d + nsize * 3]) + ); + } +} + +static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14 + * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15 + * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16 + * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17 + */ + for (d = 0; d < bytes; d += nsize * 8) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + "vle8.v v8, (%[wp2])\n" + "vmv.v.v v9, v8\n" + "vle8.v v12, (%[wp3])\n" + "vmv.v.v v13, v12\n" + "vle8.v v16, (%[wp4])\n" + "vmv.v.v v17, v16\n" + "vle8.v v20, (%[wp5])\n" + "vmv.v.v v21, v20\n" + "vle8.v v24, (%[wp6])\n" + "vmv.v.v v25, v24\n" + "vle8.v v28, (%[wp7])\n" + "vmv.v.v v29, v28\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]), + [wp2]"r"(&dptr[z0][d + 2 * nsize]), + [wp3]"r"(&dptr[z0][d + 3 * nsize]), + [wp4]"r"(&dptr[z0][d + 4 * nsize]), + [wp5]"r"(&dptr[z0][d + 5 * nsize]), + [wp6]"r"(&dptr[z0][d + 6 * nsize]), + [wp7]"r"(&dptr[z0][d + 7 * nsize]) + ); + + for (z = z0 - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + + "vsra.vi v18, v17, 7\n" + "vsll.vi v19, v17, 1\n" + "vand.vx v18, v18, %[x1d]\n" + "vxor.vv v19, v19, v18\n" + "vle8.v v18, (%[wd4])\n" + "vxor.vv v17, v19, v18\n" + "vxor.vv v16, v16, v18\n" + + "vsra.vi v22, v21, 7\n" + "vsll.vi v23, v21, 1\n" + "vand.vx v22, v22, %[x1d]\n" + "vxor.vv v23, v23, v22\n" + "vle8.v v22, (%[wd5])\n" + "vxor.vv v21, v23, v22\n" + "vxor.vv v20, v20, v22\n" + + "vsra.vi v26, v25, 7\n" + "vsll.vi v27, v25, 1\n" + "vand.vx v26, v26, %[x1d]\n" + "vxor.vv v27, v27, v26\n" + "vle8.v v26, (%[wd6])\n" + "vxor.vv v25, v27, v26\n" + "vxor.vv v24, v24, v26\n" + + "vsra.vi v30, v29, 7\n" + "vsll.vi v31, v29, 1\n" + "vand.vx v30, v30, %[x1d]\n" + "vxor.vv v31, v31, v30\n" + "vle8.v v30, (%[wd7])\n" + "vxor.vv v29, v31, v30\n" + "vxor.vv v28, v28, v30\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [wd2]"r"(&dptr[z][d + 2 * nsize]), + [wd3]"r"(&dptr[z][d + 3 * nsize]), + [wd4]"r"(&dptr[z][d + 4 * nsize]), + [wd5]"r"(&dptr[z][d + 5 * nsize]), + [wd6]"r"(&dptr[z][d + 6 * nsize]), + [wd7]"r"(&dptr[z][d + 7 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + "vse8.v v4, (%[wp1])\n" + "vse8.v v5, (%[wq1])\n" + "vse8.v v8, (%[wp2])\n" + "vse8.v v9, (%[wq2])\n" + "vse8.v v12, (%[wp3])\n" + "vse8.v v13, (%[wq3])\n" + "vse8.v v16, (%[wp4])\n" + "vse8.v v17, (%[wq4])\n" + "vse8.v v20, (%[wp5])\n" + "vse8.v v21, (%[wq5])\n" + "vse8.v v24, (%[wp6])\n" + "vse8.v v25, (%[wq6])\n" + "vse8.v v28, (%[wp7])\n" + "vse8.v v29, (%[wq7])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]), + [wp2]"r"(&p[d + nsize * 2]), + [wq2]"r"(&q[d + nsize * 2]), + [wp3]"r"(&p[d + nsize * 3]), + [wq3]"r"(&q[d + nsize * 3]), + [wp4]"r"(&p[d + nsize * 4]), + [wq4]"r"(&q[d + nsize * 4]), + [wp5]"r"(&p[d + nsize * 5]), + [wq5]"r"(&q[d + nsize * 5]), + [wp6]"r"(&p[d + nsize * 6]), + [wq6]"r"(&q[d + nsize * 6]), + [wp7]"r"(&p[d + nsize * 7]), + [wq7]"r"(&q[d + nsize * 7]) + ); + } +} + +static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14 + * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15 + * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16 + * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17 + */ + for (d = 0; d < bytes; d += nsize * 8) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + "vle8.v v8, (%[wp2])\n" + "vmv.v.v v9, v8\n" + "vle8.v v12, (%[wp3])\n" + "vmv.v.v v13, v12\n" + "vle8.v v16, (%[wp4])\n" + "vmv.v.v v17, v16\n" + "vle8.v v20, (%[wp5])\n" + "vmv.v.v v21, v20\n" + "vle8.v v24, (%[wp6])\n" + "vmv.v.v v25, v24\n" + "vle8.v v28, (%[wp7])\n" + "vmv.v.v v29, v28\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]), + [wp2]"r"(&dptr[z0][d + 2 * nsize]), + [wp3]"r"(&dptr[z0][d + 3 * nsize]), + [wp4]"r"(&dptr[z0][d + 4 * nsize]), + [wp5]"r"(&dptr[z0][d + 5 * nsize]), + [wp6]"r"(&dptr[z0][d + 6 * nsize]), + [wp7]"r"(&dptr[z0][d + 7 * nsize]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + + "vsra.vi v18, v17, 7\n" + "vsll.vi v19, v17, 1\n" + "vand.vx v18, v18, %[x1d]\n" + "vxor.vv v19, v19, v18\n" + "vle8.v v18, (%[wd4])\n" + "vxor.vv v17, v19, v18\n" + "vxor.vv v16, v16, v18\n" + + "vsra.vi v22, v21, 7\n" + "vsll.vi v23, v21, 1\n" + "vand.vx v22, v22, %[x1d]\n" + "vxor.vv v23, v23, v22\n" + "vle8.v v22, (%[wd5])\n" + "vxor.vv v21, v23, v22\n" + "vxor.vv v20, v20, v22\n" + + "vsra.vi v26, v25, 7\n" + "vsll.vi v27, v25, 1\n" + "vand.vx v26, v26, %[x1d]\n" + "vxor.vv v27, v27, v26\n" + "vle8.v v26, (%[wd6])\n" + "vxor.vv v25, v27, v26\n" + "vxor.vv v24, v24, v26\n" + + "vsra.vi v30, v29, 7\n" + "vsll.vi v31, v29, 1\n" + "vand.vx v30, v30, %[x1d]\n" + "vxor.vv v31, v31, v30\n" + "vle8.v v30, (%[wd7])\n" + "vxor.vv v29, v31, v30\n" + "vxor.vv v28, v28, v30\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [wd2]"r"(&dptr[z][d + 2 * nsize]), + [wd3]"r"(&dptr[z][d + 3 * nsize]), + [wd4]"r"(&dptr[z][d + 4 * nsize]), + [wd5]"r"(&dptr[z][d + 5 * nsize]), + [wd6]"r"(&dptr[z][d + 6 * nsize]), + [wd7]"r"(&dptr[z][d + 7 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v5, v7, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v9, v11, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v13, v15, v14\n" + + "vsra.vi v18, v17, 7\n" + "vsll.vi v19, v17, 1\n" + "vand.vx v18, v18, %[x1d]\n" + "vxor.vv v17, v19, v18\n" + + "vsra.vi v22, v21, 7\n" + "vsll.vi v23, v21, 1\n" + "vand.vx v22, v22, %[x1d]\n" + "vxor.vv v21, v23, v22\n" + + "vsra.vi v26, v25, 7\n" + "vsll.vi v27, v25, 1\n" + "vand.vx v26, v26, %[x1d]\n" + "vxor.vv v25, v27, v26\n" + + "vsra.vi v30, v29, 7\n" + "vsll.vi v31, v29, 1\n" + "vand.vx v30, v30, %[x1d]\n" + "vxor.vv v29, v31, v30\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + * v4:wp1, v5:wq1, v6:p1, v7:q1 + * v8:wp2, v9:wq2, v10:p2, v11:q2 + * v12:wp3, v13:wq3, v14:p3, v15:q3 + * v16:wp4, v17:wq4, v18:p4, v19:q4 + * v20:wp5, v21:wq5, v22:p5, v23:q5 + * v24:wp6, v25:wq6, v26:p6, v27:q6 + * v28:wp7, v29:wq7, v30:p7, v31:q7 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + + "vle8.v v6, (%[wp1])\n" + "vle8.v v7, (%[wq1])\n" + "vxor.vv v6, v6, v4\n" + "vxor.vv v7, v7, v5\n" + "vse8.v v6, (%[wp1])\n" + "vse8.v v7, (%[wq1])\n" + + "vle8.v v10, (%[wp2])\n" + "vle8.v v11, (%[wq2])\n" + "vxor.vv v10, v10, v8\n" + "vxor.vv v11, v11, v9\n" + "vse8.v v10, (%[wp2])\n" + "vse8.v v11, (%[wq2])\n" + + "vle8.v v14, (%[wp3])\n" + "vle8.v v15, (%[wq3])\n" + "vxor.vv v14, v14, v12\n" + "vxor.vv v15, v15, v13\n" + "vse8.v v14, (%[wp3])\n" + "vse8.v v15, (%[wq3])\n" + + "vle8.v v18, (%[wp4])\n" + "vle8.v v19, (%[wq4])\n" + "vxor.vv v18, v18, v16\n" + "vxor.vv v19, v19, v17\n" + "vse8.v v18, (%[wp4])\n" + "vse8.v v19, (%[wq4])\n" + + "vle8.v v22, (%[wp5])\n" + "vle8.v v23, (%[wq5])\n" + "vxor.vv v22, v22, v20\n" + "vxor.vv v23, v23, v21\n" + "vse8.v v22, (%[wp5])\n" + "vse8.v v23, (%[wq5])\n" + + "vle8.v v26, (%[wp6])\n" + "vle8.v v27, (%[wq6])\n" + "vxor.vv v26, v26, v24\n" + "vxor.vv v27, v27, v25\n" + "vse8.v v26, (%[wp6])\n" + "vse8.v v27, (%[wq6])\n" + + "vle8.v v30, (%[wp7])\n" + "vle8.v v31, (%[wq7])\n" + "vxor.vv v30, v30, v28\n" + "vxor.vv v31, v31, v29\n" + "vse8.v v30, (%[wp7])\n" + "vse8.v v31, (%[wq7])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]), + [wp2]"r"(&p[d + nsize * 2]), + [wq2]"r"(&q[d + nsize * 2]), + [wp3]"r"(&p[d + nsize * 3]), + [wq3]"r"(&q[d + nsize * 3]), + [wp4]"r"(&p[d + nsize * 4]), + [wq4]"r"(&q[d + nsize * 4]), + [wp5]"r"(&p[d + nsize * 5]), + [wq5]"r"(&q[d + nsize * 5]), + [wp6]"r"(&p[d + nsize * 6]), + [wq6]"r"(&q[d + nsize * 6]), + [wp7]"r"(&p[d + nsize * 7]), + [wq7]"r"(&q[d + nsize * 7]) + ); + } +} + +RAID6_RVV_WRAPPER(1); +RAID6_RVV_WRAPPER(2); +RAID6_RVV_WRAPPER(4); +RAID6_RVV_WRAPPER(8); diff --git a/lib/raid6/rvv.h b/lib/raid6/rvv.h new file mode 100644 index 000000000000..6d0708a2c8a4 --- /dev/null +++ b/lib/raid6/rvv.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2024 Institute of Software, CAS. + * + * raid6/rvv.h + * + * Definitions for RISC-V RAID-6 code + */ + +#ifdef __KERNEL__ +#include <asm/vector.h> +#else +#define kernel_vector_begin() +#define kernel_vector_end() +#include <sys/auxv.h> +#include <asm/hwcap.h> +#define has_vector() (getauxval(AT_HWCAP) & COMPAT_HWCAP_ISA_V) +#endif + +#include <linux/raid/pq.h> + +static int rvv_has_vector(void) +{ + return has_vector(); +} + +#define RAID6_RVV_WRAPPER(_n) \ + static void raid6_rvv ## _n ## _gen_syndrome(int disks, \ + size_t bytes, void **ptrs) \ + { \ + void raid6_rvv ## _n ## _gen_syndrome_real(int d, \ + unsigned long b, void **p); \ + kernel_vector_begin(); \ + raid6_rvv ## _n ## _gen_syndrome_real(disks, \ + (unsigned long)bytes, ptrs); \ + kernel_vector_end(); \ + } \ + static void raid6_rvv ## _n ## _xor_syndrome(int disks, \ + int start, int stop, \ + size_t bytes, void **ptrs) \ + { \ + void raid6_rvv ## _n ## _xor_syndrome_real(int d, \ + int s1, int s2, \ + unsigned long b, void **p); \ + kernel_vector_begin(); \ + raid6_rvv ## _n ## _xor_syndrome_real(disks, \ + start, stop, (unsigned long)bytes, ptrs); \ + kernel_vector_end(); \ + } \ + struct raid6_calls const raid6_rvvx ## _n = { \ + raid6_rvv ## _n ## _gen_syndrome, \ + raid6_rvv ## _n ## _xor_syndrome, \ + rvv_has_vector, \ + "rvvx" #_n, \ + 0 \ + } diff --git a/lib/raid6/s390vx.uc b/lib/raid6/s390vx.uc index b25dfc9c7759..8aa53eb2f395 100644 --- a/lib/raid6/s390vx.uc +++ b/lib/raid6/s390vx.uc @@ -11,16 +11,16 @@ * This file is postprocessed using unroll.awk. */ +#include <linux/cpufeature.h> #include <linux/raid/pq.h> -#include <asm/fpu/api.h> -#include <asm/vx-insn.h> +#include <asm/fpu.h> #define NSIZE 16 -static inline void LOAD_CONST(void) +static __always_inline void LOAD_CONST(void) { - asm volatile("VREPIB %v24,7"); - asm volatile("VREPIB %v25,0x1d"); + fpu_vrepib(24, 0x07); + fpu_vrepib(25, 0x1d); } /* @@ -28,10 +28,7 @@ static inline void LOAD_CONST(void) * vector register y left by 1 bit and stores the result in * vector register x. */ -static inline void SHLBYTE(int x, int y) -{ - asm volatile ("VAB %0,%1,%1" : : "i" (x), "i" (y)); -} +#define SHLBYTE(x, y) fpu_vab(x, y, y) /* * For each of the 16 bytes in the vector register y the MASK() @@ -39,49 +36,17 @@ static inline void SHLBYTE(int x, int y) * or 0x00 if the high bit is 0. The result is stored in vector * register x. */ -static inline void MASK(int x, int y) -{ - asm volatile ("VESRAVB %0,%1,24" : : "i" (x), "i" (y)); -} - -static inline void AND(int x, int y, int z) -{ - asm volatile ("VN %0,%1,%2" : : "i" (x), "i" (y), "i" (z)); -} - -static inline void XOR(int x, int y, int z) -{ - asm volatile ("VX %0,%1,%2" : : "i" (x), "i" (y), "i" (z)); -} +#define MASK(x, y) fpu_vesravb(x, y, 24) -static inline void LOAD_DATA(int x, u8 *ptr) -{ - typedef struct { u8 _[16 * $#]; } addrtype; - register addrtype *__ptr asm("1") = (addrtype *) ptr; - - asm volatile ("VLM %2,%3,0,%1" - : : "m" (*__ptr), "a" (__ptr), "i" (x), - "i" (x + $# - 1)); -} - -static inline void STORE_DATA(int x, u8 *ptr) -{ - typedef struct { u8 _[16 * $#]; } addrtype; - register addrtype *__ptr asm("1") = (addrtype *) ptr; - - asm volatile ("VSTM %2,%3,0,1" - : "=m" (*__ptr) : "a" (__ptr), "i" (x), - "i" (x + $# - 1)); -} - -static inline void COPY_VEC(int x, int y) -{ - asm volatile ("VLR %0,%1" : : "i" (x), "i" (y)); -} +#define AND(x, y, z) fpu_vn(x, y, z) +#define XOR(x, y, z) fpu_vx(x, y, z) +#define LOAD_DATA(x, ptr) fpu_vlm(x, x + $# - 1, ptr) +#define STORE_DATA(x, ptr) fpu_vstm(x, x + $# - 1, ptr) +#define COPY_VEC(x, y) fpu_vlr(x, y) static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs) { - struct kernel_fpu vxstate; + DECLARE_KERNEL_FPU_ONSTACK32(vxstate); u8 **dptr, *p, *q; int d, z, z0; @@ -114,7 +79,7 @@ static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs) static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop, size_t bytes, void **ptrs) { - struct kernel_fpu vxstate; + DECLARE_KERNEL_FPU_ONSTACK32(vxstate); u8 **dptr, *p, *q; int d, z, z0; @@ -158,7 +123,7 @@ static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop, static int raid6_s390vx$#_valid(void) { - return MACHINE_HAS_VX; + return cpu_has_vx(); } const struct raid6_calls raid6_s390vx$# = { diff --git a/lib/raid6/test/.gitignore b/lib/raid6/test/.gitignore new file mode 100644 index 000000000000..1b68a77f348f --- /dev/null +++ b/lib/raid6/test/.gitignore @@ -0,0 +1,3 @@ +/int.uc +/neon.uc +/raid6test diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 4fb7700a741b..09bbe2b14cce 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -6,14 +6,15 @@ pound := \# -CC = gcc -OPTFLAGS = -O2 # Adjust as desired -CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) -LD = ld -AWK = awk -f -AR = ar -RANLIB = ranlib -OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o +# Adjust as desired +CC = gcc +OPTFLAGS = -O2 +CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) +LD = ld +AWK = awk -f +AR = ar +RANLIB = ranlib +OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/) ifeq ($(ARCH),i386) @@ -34,24 +35,42 @@ ifeq ($(ARCH),aarch64) HAS_NEON = yes endif +ifeq ($(findstring riscv,$(ARCH)),riscv) + CFLAGS += -I../../../arch/riscv/include -DCONFIG_RISCV=1 + HAS_RVV = yes +endif + +ifeq ($(findstring ppc,$(ARCH)),ppc) + CFLAGS += -I../../../arch/powerpc/include + HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\ + gcc -c -x c - >/dev/null && rm ./-.o && echo yes) +endif + +ifeq ($(ARCH),loongarch64) + CFLAGS += -I../../../arch/loongarch/include -DCONFIG_LOONGARCH=1 + CFLAGS += $(shell echo 'vld $$vr0, $$zero, 0' | \ + gcc -c -x assembler - >/dev/null 2>&1 && \ + rm ./-.o && echo -DCONFIG_CPU_HAS_LSX=1) + CFLAGS += $(shell echo 'xvld $$xr0, $$zero, 0' | \ + gcc -c -x assembler - >/dev/null 2>&1 && \ + rm ./-.o && echo -DCONFIG_CPU_HAS_LASX=1) +endif + ifeq ($(IS_X86),yes) OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o CFLAGS += -DCONFIG_X86 - CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \ - gcc -c -x assembler - >/dev/null 2>&1 && \ - rm ./-.o && echo -DCONFIG_AS_AVX512=1) else ifeq ($(HAS_NEON),yes) OBJS += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 -else - HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\ - gcc -c -x c - >/dev/null && rm ./-.o && echo yes) - ifeq ($(HAS_ALTIVEC),yes) - CFLAGS += -I../../../arch/powerpc/include - CFLAGS += -DCONFIG_ALTIVEC - OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \ - vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o - endif +else ifeq ($(HAS_ALTIVEC),yes) + CFLAGS += -DCONFIG_ALTIVEC + OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \ + vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o +else ifeq ($(ARCH),loongarch64) + OBJS += loongarch_simd.o recov_loongarch_simd.o +else ifeq ($(HAS_RVV),yes) + OBJS += rvv.o recov_rvv.o + CFLAGS += -DCONFIG_RISCV_ISA_V=1 endif .c.o: @@ -63,12 +82,12 @@ endif %.uc: ../%.uc cp -f $< $@ -all: raid6.a raid6test +all: raid6.a raid6test raid6.a: $(OBJS) - rm -f $@ - $(AR) cq $@ $^ - $(RANLIB) $@ + rm -f $@ + $(AR) cq $@ $^ + $(RANLIB) $@ raid6test: test.c raid6.a $(CC) $(CFLAGS) -o raid6test $^ |
