diff options
Diffstat (limited to 'lib/raid6')
34 files changed, 5120 insertions, 259 deletions
diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore index 162becacf97c..6be57745afd1 100644 --- a/lib/raid6/.gitignore +++ b/lib/raid6/.gitignore @@ -1,4 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only mktables altivec*.c int*.c tables.c +neon?.c +s390vx?.c +vpermxor*.c diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 9f7c184725d7..5be0a4e60ab1 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -1,77 +1,82 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_RAID6_PQ) += raid6_pq.o raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ - int8.o int16.o int32.o + int8.o -raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o -raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o +raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o +raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \ + vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o +raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o +raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o +raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o +raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o -hostprogs-y += mktables - -quiet_cmd_unroll = UNROLL $@ - cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$(UNROLL) \ - < $< > $@ || ( rm -f $@ && exit 1 ) +hostprogs += mktables ifeq ($(CONFIG_ALTIVEC),y) -altivec_flags := -maltivec -mabi=altivec +altivec_flags := -maltivec $(call cc-option,-mabi=altivec) +# Enable <altivec.h> +altivec_flags += -isystem $(shell $(CC) -print-file-name=include) + +ifdef CONFIG_CC_IS_CLANG +# clang ppc port does not yet support -maltivec when -msoft-float is +# enabled. A future release of clang will resolve this +# https://llvm.org/pr31177 +CFLAGS_REMOVE_altivec1.o += -msoft-float +CFLAGS_REMOVE_altivec2.o += -msoft-float +CFLAGS_REMOVE_altivec4.o += -msoft-float +CFLAGS_REMOVE_altivec8.o += -msoft-float +CFLAGS_REMOVE_vpermxor1.o += -msoft-float +CFLAGS_REMOVE_vpermxor2.o += -msoft-float +CFLAGS_REMOVE_vpermxor4.o += -msoft-float +CFLAGS_REMOVE_vpermxor8.o += -msoft-float +endif endif -targets += int1.c -$(obj)/int1.c: UNROLL := 1 -$(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) - -targets += int2.c -$(obj)/int2.c: UNROLL := 2 -$(obj)/int2.c: $(src)/int.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) - -targets += int4.c -$(obj)/int4.c: UNROLL := 4 -$(obj)/int4.c: $(src)/int.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) - -targets += int8.c -$(obj)/int8.c: UNROLL := 8 -$(obj)/int8.c: $(src)/int.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) - -targets += int16.c -$(obj)/int16.c: UNROLL := 16 -$(obj)/int16.c: $(src)/int.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) +quiet_cmd_unroll = UNROLL $@ + cmd_unroll = $(AWK) -v N=$* -f $(src)/unroll.awk < $< > $@ -targets += int32.c -$(obj)/int32.c: UNROLL := 32 -$(obj)/int32.c: $(src)/int.uc $(src)/unroll.awk FORCE +targets += int1.c int2.c int4.c int8.c +$(obj)/int%.c: $(src)/int.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) CFLAGS_altivec1.o += $(altivec_flags) -targets += altivec1.c -$(obj)/altivec1.c: UNROLL := 1 -$(obj)/altivec1.c: $(src)/altivec.uc $(src)/unroll.awk FORCE +CFLAGS_altivec2.o += $(altivec_flags) +CFLAGS_altivec4.o += $(altivec_flags) +CFLAGS_altivec8.o += $(altivec_flags) +targets += altivec1.c altivec2.c altivec4.c altivec8.c +$(obj)/altivec%.c: $(src)/altivec.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) -CFLAGS_altivec2.o += $(altivec_flags) -targets += altivec2.c -$(obj)/altivec2.c: UNROLL := 2 -$(obj)/altivec2.c: $(src)/altivec.uc $(src)/unroll.awk FORCE +CFLAGS_vpermxor1.o += $(altivec_flags) +CFLAGS_vpermxor2.o += $(altivec_flags) +CFLAGS_vpermxor4.o += $(altivec_flags) +CFLAGS_vpermxor8.o += $(altivec_flags) +targets += vpermxor1.c vpermxor2.c vpermxor4.c vpermxor8.c +$(obj)/vpermxor%.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) -CFLAGS_altivec4.o += $(altivec_flags) -targets += altivec4.c -$(obj)/altivec4.c: UNROLL := 4 -$(obj)/altivec4.c: $(src)/altivec.uc $(src)/unroll.awk FORCE +CFLAGS_neon1.o += $(CC_FLAGS_FPU) +CFLAGS_neon2.o += $(CC_FLAGS_FPU) +CFLAGS_neon4.o += $(CC_FLAGS_FPU) +CFLAGS_neon8.o += $(CC_FLAGS_FPU) +CFLAGS_recov_neon_inner.o += $(CC_FLAGS_FPU) +CFLAGS_REMOVE_neon1.o += $(CC_FLAGS_NO_FPU) +CFLAGS_REMOVE_neon2.o += $(CC_FLAGS_NO_FPU) +CFLAGS_REMOVE_neon4.o += $(CC_FLAGS_NO_FPU) +CFLAGS_REMOVE_neon8.o += $(CC_FLAGS_NO_FPU) +CFLAGS_REMOVE_recov_neon_inner.o += $(CC_FLAGS_NO_FPU) +targets += neon1.c neon2.c neon4.c neon8.c +$(obj)/neon%.c: $(src)/neon.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) -CFLAGS_altivec8.o += $(altivec_flags) -targets += altivec8.c -$(obj)/altivec8.c: UNROLL := 8 -$(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE +targets += s390vx8.c +$(obj)/s390vx%.c: $(src)/s390vx.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) quiet_cmd_mktable = TABLE $@ - cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) + cmd_mktable = $(obj)/mktables > $@ targets += tables.c $(obj)/tables.c: $(obj)/mktables FORCE diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 6d7316fe9f30..799e0e5eac26 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -1,13 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright 2002 H. Peter Anvin - All Rights Reserved * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * * ----------------------------------------------------------------------- */ /* @@ -23,53 +18,72 @@ #else #include <linux/module.h> #include <linux/gfp.h> -#if !RAID6_USE_EMPTY_ZERO_PAGE -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -EXPORT_SYMBOL(raid6_empty_zero_page); -#endif #endif struct raid6_calls raid6_call; EXPORT_SYMBOL_GPL(raid6_call); const struct raid6_calls * const raid6_algos[] = { -#if defined(__ia64__) - &raid6_intx16, - &raid6_intx32, -#endif #if defined(__i386__) && !defined(__arch_um__) - &raid6_mmxx1, - &raid6_mmxx2, - &raid6_sse1x1, - &raid6_sse1x2, - &raid6_sse2x1, - &raid6_sse2x2, -#ifdef CONFIG_AS_AVX2 - &raid6_avx2x1, + &raid6_avx512x2, + &raid6_avx512x1, &raid6_avx2x2, -#endif + &raid6_avx2x1, + &raid6_sse2x2, + &raid6_sse2x1, + &raid6_sse1x2, + &raid6_sse1x1, + &raid6_mmxx2, + &raid6_mmxx1, #endif #if defined(__x86_64__) && !defined(__arch_um__) - &raid6_sse2x1, - &raid6_sse2x2, - &raid6_sse2x4, -#ifdef CONFIG_AS_AVX2 - &raid6_avx2x1, - &raid6_avx2x2, + &raid6_avx512x4, + &raid6_avx512x2, + &raid6_avx512x1, &raid6_avx2x4, -#endif + &raid6_avx2x2, + &raid6_avx2x1, + &raid6_sse2x4, + &raid6_sse2x2, + &raid6_sse2x1, #endif #ifdef CONFIG_ALTIVEC - &raid6_altivec1, - &raid6_altivec2, - &raid6_altivec4, + &raid6_vpermxor8, + &raid6_vpermxor4, + &raid6_vpermxor2, + &raid6_vpermxor1, &raid6_altivec8, + &raid6_altivec4, + &raid6_altivec2, + &raid6_altivec1, +#endif +#if defined(CONFIG_S390) + &raid6_s390vx8, +#endif +#ifdef CONFIG_KERNEL_MODE_NEON + &raid6_neonx8, + &raid6_neonx4, + &raid6_neonx2, + &raid6_neonx1, +#endif +#ifdef CONFIG_LOONGARCH +#ifdef CONFIG_CPU_HAS_LASX + &raid6_lasx, +#endif +#ifdef CONFIG_CPU_HAS_LSX + &raid6_lsx, +#endif +#endif +#ifdef CONFIG_RISCV_ISA_V + &raid6_rvvx1, + &raid6_rvvx2, + &raid6_rvvx4, + &raid6_rvvx8, #endif - &raid6_intx1, - &raid6_intx2, - &raid6_intx4, &raid6_intx8, + &raid6_intx4, + &raid6_intx2, + &raid6_intx1, NULL }; @@ -80,12 +94,28 @@ void (*raid6_datap_recov)(int, size_t, int, void **); EXPORT_SYMBOL_GPL(raid6_datap_recov); const struct raid6_recov_calls *const raid6_recov_algos[] = { -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) -#ifdef CONFIG_AS_AVX2 +#ifdef CONFIG_X86 + &raid6_recov_avx512, &raid6_recov_avx2, -#endif &raid6_recov_ssse3, #endif +#ifdef CONFIG_S390 + &raid6_recov_s390xc, +#endif +#if defined(CONFIG_KERNEL_MODE_NEON) + &raid6_recov_neon, +#endif +#ifdef CONFIG_LOONGARCH +#ifdef CONFIG_CPU_HAS_LASX + &raid6_recov_lasx, +#endif +#ifdef CONFIG_CPU_HAS_LSX + &raid6_recov_lsx, +#endif +#endif +#ifdef CONFIG_RISCV_ISA_V + &raid6_recov_rvv, +#endif &raid6_recov_intx1, NULL }; @@ -98,6 +128,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = { #define time_before(x, y) ((x) < (y)) #endif +#define RAID6_TEST_DISKS 8 +#define RAID6_TEST_DISKS_ORDER 3 + static inline const struct raid6_recov_calls *raid6_choose_recov(void) { const struct raid6_recov_calls *const *algo; @@ -112,25 +145,31 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) raid6_2data_recov = best->data2; raid6_datap_recov = best->datap; - printk("raid6: using %s recovery algorithm\n", best->name); + pr_info("raid6: using %s recovery algorithm\n", best->name); } else - printk("raid6: Yikes! No recovery algorithm found!\n"); + pr_err("raid6: Yikes! No recovery algorithm found!\n"); return best; } static inline const struct raid6_calls *raid6_choose_gen( - void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) + void *(*const dptrs)[RAID6_TEST_DISKS], const int disks) { - unsigned long perf, bestperf, j0, j1; + unsigned long perf, bestgenperf, j0, j1; + int start = (disks>>1)-1, stop = disks-3; /* work on the second half of the disks */ const struct raid6_calls *const *algo; const struct raid6_calls *best; - for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { - if (!best || (*algo)->prefer >= best->prefer) { + for (bestgenperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { + if (!best || (*algo)->priority >= best->priority) { if ((*algo)->valid && !(*algo)->valid()) continue; + if (!IS_ENABLED(CONFIG_RAID6_PQ_BENCHMARK)) { + best = *algo; + break; + } + perf = 0; preempt_disable(); @@ -144,23 +183,55 @@ static inline const struct raid6_calls *raid6_choose_gen( } preempt_enable(); - if (perf > bestperf) { - bestperf = perf; + if (perf > bestgenperf) { + bestgenperf = perf; best = *algo; } - printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, - (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); + pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name, + (perf * HZ * (disks-2)) >> + (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2)); } } - if (best) { - printk("raid6: using algorithm %s (%ld MB/s)\n", - best->name, - (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); - raid6_call = *best; - } else - printk("raid6: Yikes! No algorithm found!\n"); + if (!best) { + pr_err("raid6: Yikes! No algorithm found!\n"); + goto out; + } + + raid6_call = *best; + + if (!IS_ENABLED(CONFIG_RAID6_PQ_BENCHMARK)) { + pr_info("raid6: skipped pq benchmark and selected %s\n", + best->name); + goto out; + } + + pr_info("raid6: using algorithm %s gen() %ld MB/s\n", + best->name, + (bestgenperf * HZ * (disks - 2)) >> + (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2)); + + if (best->xor_syndrome) { + perf = 0; + + preempt_disable(); + j0 = jiffies; + while ((j1 = jiffies) == j0) + cpu_relax(); + while (time_before(jiffies, + j1 + (1 << RAID6_TIME_JIFFIES_LG2))) { + best->xor_syndrome(disks, start, stop, + PAGE_SIZE, *dptrs); + perf++; + } + preempt_enable(); + + pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n", + (perf * HZ * (disks - 2)) >> + (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2 + 1)); + } +out: return best; } @@ -170,27 +241,33 @@ static inline const struct raid6_calls *raid6_choose_gen( int __init raid6_select_algo(void) { - const int disks = (65536/PAGE_SIZE)+2; + const int disks = RAID6_TEST_DISKS; const struct raid6_calls *gen_best; const struct raid6_recov_calls *rec_best; - char *syndromes; - void *dptrs[(65536/PAGE_SIZE)+2]; - int i; - - for (i = 0; i < disks-2; i++) - dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; + char *disk_ptr, *p; + void *dptrs[RAID6_TEST_DISKS]; + int i, cycle; + + /* prepare the buffer and fill it circularly with gfmul table */ + disk_ptr = (char *)__get_free_pages(GFP_KERNEL, RAID6_TEST_DISKS_ORDER); + if (!disk_ptr) { + pr_err("raid6: Yikes! No memory available.\n"); + return -ENOMEM; + } - /* Normal code - use a 2-page allocation to avoid D$ conflict */ - syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); + p = disk_ptr; + for (i = 0; i < disks; i++) + dptrs[i] = p + PAGE_SIZE * i; - if (!syndromes) { - printk("raid6: Yikes! No memory available.\n"); - return -ENOMEM; + cycle = ((disks - 2) * PAGE_SIZE) / 65536; + for (i = 0; i < cycle; i++) { + memcpy(p, raid6_gfmul, 65536); + p += 65536; } - dptrs[disks-2] = syndromes; - dptrs[disks-1] = syndromes + PAGE_SIZE; + if ((disks - 2) * PAGE_SIZE % 65536) + memcpy(p, raid6_gfmul, (disks - 2) * PAGE_SIZE % 65536); /* select raid gen_syndrome function */ gen_best = raid6_choose_gen(&dptrs, disks); @@ -198,7 +275,7 @@ int __init raid6_select_algo(void) /* select raid recover functions */ rec_best = raid6_choose_recov(); - free_pages((unsigned long)syndromes, 1); + free_pages((unsigned long)disk_ptr, RAID6_TEST_DISKS_ORDER); return gen_best && rec_best ? 0 : -EINVAL; } diff --git a/lib/raid6/altivec.uc b/lib/raid6/altivec.uc index 7cc12b532e95..d20ed0d11411 100644 --- a/lib/raid6/altivec.uc +++ b/lib/raid6/altivec.uc @@ -24,10 +24,13 @@ #include <linux/raid/pq.h> +#ifdef CONFIG_ALTIVEC + #include <altivec.h> #ifdef __KERNEL__ # include <asm/cputable.h> # include <asm/switch_to.h> +#endif /* __KERNEL__ */ /* * This is the C data type to use. We use a vector of @@ -101,6 +104,7 @@ static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs) raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs); + disable_kernel_altivec(); preempt_enable(); } @@ -119,6 +123,7 @@ int raid6_have_altivec(void) const struct raid6_calls raid6_altivec$# = { raid6_altivec$#_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_altivec, "altivecx$#", 0 diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c index bc3b1dd436eb..059024234dce 100644 --- a/lib/raid6/avx2.c +++ b/lib/raid6/avx2.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 2012 Intel Corporation @@ -5,13 +6,6 @@ * * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * * ----------------------------------------------------------------------- */ /* @@ -19,8 +13,6 @@ * */ -#ifdef CONFIG_AS_AVX2 - #include <linux/raid/pq.h> #include "x86.h" @@ -87,11 +79,60 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } +static void raid6_avx21_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 32) { + asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); + asm volatile("vpxor %ymm4,%ymm2,%ymm2"); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + } + asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + const struct raid6_calls raid6_avx2x1 = { raid6_avx21_gen_syndrome, + raid6_avx21_xor_syndrome, raid6_have_avx2, "avx2x1", - 1 /* Has cache hints */ + .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */ }; /* @@ -148,11 +189,80 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } +static void raid6_avx22_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 64) { + asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); + asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32])); + asm volatile("vpxor %ymm4,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm3,%ymm3"); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" + :: "m" (dptr[z][d+32])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + } + asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); + asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32])); + asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d])); + asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + const struct raid6_calls raid6_avx2x2 = { raid6_avx22_gen_syndrome, + raid6_avx22_xor_syndrome, raid6_have_avx2, "avx2x2", - 1 /* Has cache hints */ + .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */ }; #ifdef CONFIG_X86_64 @@ -240,12 +350,121 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } +static void raid6_avx24_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 128) { + asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32])); + asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64])); + asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); + asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32])); + asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64])); + asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96])); + asm volatile("vpxor %ymm4,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm3,%ymm3"); + asm volatile("vpxor %ymm12,%ymm10,%ymm10"); + asm volatile("vpxor %ymm14,%ymm11,%ymm11"); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); + asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64])); + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpxor %ymm13,%ymm13,%ymm13"); + asm volatile("vpxor %ymm15,%ymm15,%ymm15"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13"); + asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); + asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpand %ymm0,%ymm13,%ymm13"); + asm volatile("vpand %ymm0,%ymm15,%ymm15"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" + :: "m" (dptr[z][d+32])); + asm volatile("vmovdqa %0,%%ymm13" + :: "m" (dptr[z][d+64])); + asm volatile("vmovdqa %0,%%ymm15" + :: "m" (dptr[z][d+96])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm13,%ymm10,%ymm10"); + asm volatile("vpxor %ymm15,%ymm11,%ymm11"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + } + asm volatile("prefetchnta %0" :: "m" (q[d])); + asm volatile("prefetchnta %0" :: "m" (q[d+64])); + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpxor %ymm13,%ymm13,%ymm13"); + asm volatile("vpxor %ymm15,%ymm15,%ymm15"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13"); + asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); + asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpand %ymm0,%ymm13,%ymm13"); + asm volatile("vpand %ymm0,%ymm15,%ymm15"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + } + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); + asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64])); + asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96])); + asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); + asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32])); + asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64])); + asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96])); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); + asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64])); + asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + const struct raid6_calls raid6_avx2x4 = { raid6_avx24_gen_syndrome, + raid6_avx24_xor_syndrome, raid6_have_avx2, "avx2x4", - 1 /* Has cache hints */ + .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */ }; -#endif - -#endif /* CONFIG_AS_AVX2 */ +#endif /* CONFIG_X86_64 */ diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c new file mode 100644 index 000000000000..009bd0adeebf --- /dev/null +++ b/lib/raid6/avx512.c @@ -0,0 +1,560 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* -*- linux-c -*- -------------------------------------------------------- + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Gayatri Kammela <gayatri.kammela@intel.com> + * Author: Megha Dey <megha.dey@linux.intel.com> + * + * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved + * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * ----------------------------------------------------------------------- + */ + +/* + * AVX512 implementation of RAID-6 syndrome functions + * + */ + +#include <linux/raid/pq.h> +#include "x86.h" + +static const struct raid6_avx512_constants { + u64 x1d[8]; +} raid6_avx512_constants __aligned(512/8) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, +}; + +static int raid6_have_avx512(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +} + +static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0; d < bytes; d += 64) { + asm volatile("prefetchnta %0\n\t" + "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ + "prefetchnta %1\n\t" + "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ + "vmovdqa64 %1,%%zmm6" + : + : "m" (dptr[z0][d]), "m" (dptr[z0-1][d])); + for (z = z0-2; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" + "vmovdqa64 %0,%%zmm6" + : + : "m" (dptr[z][d])); + } + asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" + "vmovntdq %%zmm2,%0\n\t" + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" + "vmovntdq %%zmm4,%1\n\t" + "vpxorq %%zmm4,%%zmm4,%%zmm4" + : + : "m" (p[d]), "m" (q[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5121_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + : : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 64) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm2\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2" + : + : "m" (dptr[z0][d]), "m" (p[d])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4" + : + : "m" (dptr[z][d])); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4" + : + : ); + } + asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" + /* Don't use movntdq for r/w memory area < cache line */ + "vmovdqa64 %%zmm4,%0\n\t" + "vmovdqa64 %%zmm2,%1" + : + : "m" (q[d]), "m" (p[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx512x1 = { + raid6_avx5121_gen_syndrome, + raid6_avx5121_xor_syndrome, + raid6_have_avx512, + "avx512x1", + .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ +}; + +/* + * Unrolled-by-2 AVX512 implementation + */ +static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + /* We uniformly assume a single prefetch covers at least 64 bytes */ + for (d = 0; d < bytes; d += 128) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ + "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */ + "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ + "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */ + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64])); + for (z = z0-1; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64])); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vmovntdq %%zmm4,%2\n\t" + "vmovntdq %%zmm6,%3" + : + : "m" (p[d]), "m" (p[d+64]), "m" (q[d]), + "m" (q[d+64])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5122_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + : : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 128) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm6\n\t" + "vmovdqa64 %2,%%zmm2\n\t" + "vmovdqa64 %3,%%zmm3\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm3,%%zmm3" + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), + "m" (p[d]), "m" (p[d+64])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64])); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : ); + } + asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" + "vpxorq %1,%%zmm6,%%zmm6\n\t" + /* Don't use movntdq for r/w + * memory area < cache line + */ + "vmovdqa64 %%zmm4,%0\n\t" + "vmovdqa64 %%zmm6,%1\n\t" + "vmovdqa64 %%zmm2,%2\n\t" + "vmovdqa64 %%zmm3,%3" + : + : "m" (q[d]), "m" (q[d+64]), "m" (p[d]), + "m" (p[d+64])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx512x2 = { + raid6_avx5122_gen_syndrome, + raid6_avx5122_xor_syndrome, + raid6_have_avx512, + "avx512x2", + .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ +}; + +#ifdef CONFIG_X86_64 + +/* + * Unrolled-by-4 AVX2 implementation + */ +static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */ + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */ + "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */ + "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */ + "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */ + "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */ + "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */ + "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */ + "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0; d < bytes; d += 256) { + for (z = z0; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "prefetchnta %2\n\t" + "prefetchnta %3\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vmovdqa64 %2,%%zmm13\n\t" + "vmovdqa64 %3,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm15,%%zmm11,%%zmm11\n" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64]), + "m" (dptr[z][d+128]), "m" (dptr[z][d+192])); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" + "vmovntdq %%zmm10,%2\n\t" + "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" + "vmovntdq %%zmm11,%3\n\t" + "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" + "vmovntdq %%zmm4,%4\n\t" + "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" + "vmovntdq %%zmm6,%5\n\t" + "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" + "vmovntdq %%zmm12,%6\n\t" + "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" + "vmovntdq %%zmm14,%7\n\t" + "vpxorq %%zmm14,%%zmm14,%%zmm14" + : + : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), + "m" (q[d+128]), "m" (q[d+192])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5124_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + :: "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 256) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm6\n\t" + "vmovdqa64 %2,%%zmm12\n\t" + "vmovdqa64 %3,%%zmm14\n\t" + "vmovdqa64 %4,%%zmm2\n\t" + "vmovdqa64 %5,%%zmm3\n\t" + "vmovdqa64 %6,%%zmm10\n\t" + "vmovdqa64 %7,%%zmm11\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm14,%%zmm11,%%zmm11" + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), + "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]), + "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" + "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" + "prefetchnta %0\n\t" + "prefetchnta %2\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vmovdqa64 %2,%%zmm13\n\t" + "vmovdqa64 %3,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64]), + "m" (dptr[z][d+128]), + "m" (dptr[z][d+192])); + } + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + : + : "m" (q[d]), "m" (q[d+128])); + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" + "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : ); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vmovntdq %%zmm10,%2\n\t" + "vmovntdq %%zmm11,%3\n\t" + "vpxorq %4,%%zmm4,%%zmm4\n\t" + "vpxorq %5,%%zmm6,%%zmm6\n\t" + "vpxorq %6,%%zmm12,%%zmm12\n\t" + "vpxorq %7,%%zmm14,%%zmm14\n\t" + "vmovntdq %%zmm4,%4\n\t" + "vmovntdq %%zmm6,%5\n\t" + "vmovntdq %%zmm12,%6\n\t" + "vmovntdq %%zmm14,%7" + : + : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), + "m" (q[d+128]), "m" (q[d+192])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} +const struct raid6_calls raid6_avx512x4 = { + raid6_avx5124_gen_syndrome, + raid6_avx5124_xor_syndrome, + raid6_have_avx512, + "avx512x4", + .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ +}; +#endif diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc index 5b50f8dfc5d2..1ba56c3fa482 100644 --- a/lib/raid6/int.uc +++ b/lib/raid6/int.uc @@ -42,13 +42,6 @@ typedef u32 unative_t; /* - * IA-64 wants insane amounts of unrolling. On other architectures that - * is just a waste of space. - */ -#if ($# <= 8) || defined(__ia64__) - - -/* * These sub-operations are separate inlines since they can sometimes be * specially optimized using architecture-specific hacks. */ @@ -107,11 +100,48 @@ static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs) } } +static void raid6_int$#_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + /* P/Q data pages */ + wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; + for ( z = z0-1 ; z >= start ; z-- ) { + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + wp$$ ^= wd$$; + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ &= NBYTES(0x1d); + w1$$ ^= w2$$; + wq$$ = w1$$ ^ wd$$; + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ &= NBYTES(0x1d); + wq$$ = w1$$ ^ w2$$; + } + *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + } + +} + const struct raid6_calls raid6_intx$# = { raid6_int$#_gen_syndrome, - NULL, /* always valid */ + raid6_int$#_xor_syndrome, + NULL, /* always valid */ "int" NSTRING "x$#", 0 }; - -#endif diff --git a/lib/raid6/loongarch.h b/lib/raid6/loongarch.h new file mode 100644 index 000000000000..acfc33ce7056 --- /dev/null +++ b/lib/raid6/loongarch.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2023 WANG Xuerui <git@xen0n.name> + * + * raid6/loongarch.h + * + * Definitions common to LoongArch RAID-6 code only + */ + +#ifndef _LIB_RAID6_LOONGARCH_H +#define _LIB_RAID6_LOONGARCH_H + +#ifdef __KERNEL__ + +#include <asm/cpu-features.h> +#include <asm/fpu.h> + +#else /* for user-space testing */ + +#include <sys/auxv.h> + +/* have to supply these defines for glibc 2.37- and musl */ +#ifndef HWCAP_LOONGARCH_LSX +#define HWCAP_LOONGARCH_LSX (1 << 4) +#endif +#ifndef HWCAP_LOONGARCH_LASX +#define HWCAP_LOONGARCH_LASX (1 << 5) +#endif + +#define kernel_fpu_begin() +#define kernel_fpu_end() + +#define cpu_has_lsx (getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LSX) +#define cpu_has_lasx (getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LASX) + +#endif /* __KERNEL__ */ + +#endif /* _LIB_RAID6_LOONGARCH_H */ diff --git a/lib/raid6/loongarch_simd.c b/lib/raid6/loongarch_simd.c new file mode 100644 index 000000000000..aa5d9f924ca3 --- /dev/null +++ b/lib/raid6/loongarch_simd.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX) + * + * Copyright 2023 WANG Xuerui <git@xen0n.name> + * + * Based on the generic RAID-6 code (int.uc): + * + * Copyright 2002-2004 H. Peter Anvin + */ + +#include <linux/raid/pq.h> +#include "loongarch.h" + +/* + * The vector algorithms are currently priority 0, which means the generic + * scalar algorithms are not being disabled if vector support is present. + * This is like the similar LoongArch RAID5 XOR code, with the main reason + * repeated here: it cannot be ruled out at this point of time, that some + * future (maybe reduced) models could run the vector algorithms slower than + * the scalar ones, maybe for errata or micro-op reasons. It may be + * appropriate to revisit this after one or two more uarch generations. + */ + +#ifdef CONFIG_CPU_HAS_LSX +#define NSIZE 16 + +static int raid6_has_lsx(void) +{ + return cpu_has_lsx; +} + +static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $vr0, $vr1, $vr2, $vr3: wp + * $vr4, $vr5, $vr6, $vr7: wq + * $vr8, $vr9, $vr10, $vr11: wd + * $vr12, $vr13, $vr14, $vr15: w2 + * $vr16, $vr17, $vr18, $vr19: w1 + */ + for (d = 0; d < bytes; d += NSIZE*4) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE])); + asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE])); + asm volatile("vori.b $vr4, $vr0, 0"); + asm volatile("vori.b $vr5, $vr1, 0"); + asm volatile("vori.b $vr6, $vr2, 0"); + asm volatile("vori.b $vr7, $vr3, 0"); + for (z = z0-1; z >= 0; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE])); + asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE])); + asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, $vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("vxor.v $vr16, $vr16, $vr12"); + asm volatile("vxor.v $vr17, $vr17, $vr13"); + asm volatile("vxor.v $vr18, $vr18, $vr14"); + asm volatile("vxor.v $vr19, $vr19, $vr15"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr8"); + asm volatile("vxor.v $vr5, $vr17, $vr9"); + asm volatile("vxor.v $vr6, $vr18, $vr10"); + asm volatile("vxor.v $vr7, $vr19, $vr11"); + } + /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */ + asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0])); + asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1])); + asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2])); + asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3])); + /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */ + asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0])); + asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1])); + asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2])); + asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3])); + } + + kernel_fpu_end(); +} + +static void raid6_lsx_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $vr0, $vr1, $vr2, $vr3: wp + * $vr4, $vr5, $vr6, $vr7: wq + * $vr8, $vr9, $vr10, $vr11: wd + * $vr12, $vr13, $vr14, $vr15: w2 + * $vr16, $vr17, $vr18, $vr19: w1 + */ + for (d = 0; d < bytes; d += NSIZE*4) { + /* P/Q data pages */ + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE])); + asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE])); + asm volatile("vori.b $vr4, $vr0, 0"); + asm volatile("vori.b $vr5, $vr1, 0"); + asm volatile("vori.b $vr6, $vr2, 0"); + asm volatile("vori.b $vr7, $vr3, 0"); + for (z = z0-1; z >= start; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE])); + asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE])); + asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, $vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("vxor.v $vr16, $vr16, $vr12"); + asm volatile("vxor.v $vr17, $vr17, $vr13"); + asm volatile("vxor.v $vr18, $vr18, $vr14"); + asm volatile("vxor.v $vr19, $vr19, $vr15"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr8"); + asm volatile("vxor.v $vr5, $vr17, $vr9"); + asm volatile("vxor.v $vr6, $vr18, $vr10"); + asm volatile("vxor.v $vr7, $vr19, $vr11"); + } + + /* P/Q left side optimization */ + for (z = start-1; z >= 0; z--) { + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, $vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* wq$$ = w1$$ ^ w2$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr12"); + asm volatile("vxor.v $vr5, $vr17, $vr13"); + asm volatile("vxor.v $vr6, $vr18, $vr14"); + asm volatile("vxor.v $vr7, $vr19, $vr15"); + } + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + */ + asm volatile( + "vld $vr20, %0\n\t" + "vld $vr21, %1\n\t" + "vld $vr22, %2\n\t" + "vld $vr23, %3\n\t" + "vld $vr24, %4\n\t" + "vld $vr25, %5\n\t" + "vld $vr26, %6\n\t" + "vld $vr27, %7\n\t" + "vxor.v $vr20, $vr20, $vr0\n\t" + "vxor.v $vr21, $vr21, $vr1\n\t" + "vxor.v $vr22, $vr22, $vr2\n\t" + "vxor.v $vr23, $vr23, $vr3\n\t" + "vxor.v $vr24, $vr24, $vr4\n\t" + "vxor.v $vr25, $vr25, $vr5\n\t" + "vxor.v $vr26, $vr26, $vr6\n\t" + "vxor.v $vr27, $vr27, $vr7\n\t" + "vst $vr20, %0\n\t" + "vst $vr21, %1\n\t" + "vst $vr22, %2\n\t" + "vst $vr23, %3\n\t" + "vst $vr24, %4\n\t" + "vst $vr25, %5\n\t" + "vst $vr26, %6\n\t" + "vst $vr27, %7\n\t" + : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]), + "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]), + "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]), + "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3]) + ); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_lsx = { + raid6_lsx_gen_syndrome, + raid6_lsx_xor_syndrome, + raid6_has_lsx, + "lsx", + .priority = 0 /* see the comment near the top of the file for reason */ +}; + +#undef NSIZE +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX +#define NSIZE 32 + +static int raid6_has_lasx(void) +{ + return cpu_has_lasx; +} + +static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $xr0, $xr1: wp + * $xr2, $xr3: wq + * $xr4, $xr5: wd + * $xr6, $xr7: w2 + * $xr8, $xr9: w1 + */ + for (d = 0; d < bytes; d += NSIZE*2) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("xvori.b $xr2, $xr0, 0"); + asm volatile("xvori.b $xr3, $xr1, 0"); + for (z = z0-1; z >= 0; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("xvxor.v $xr8, $xr8, $xr6"); + asm volatile("xvxor.v $xr9, $xr9, $xr7"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr4"); + asm volatile("xvxor.v $xr3, $xr9, $xr5"); + } + /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */ + asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0])); + asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1])); + /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */ + asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0])); + asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1])); + } + + kernel_fpu_end(); +} + +static void raid6_lasx_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $xr0, $xr1: wp + * $xr2, $xr3: wq + * $xr4, $xr5: wd + * $xr6, $xr7: w2 + * $xr8, $xr9: w1 + */ + for (d = 0; d < bytes; d += NSIZE*2) { + /* P/Q data pages */ + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("xvori.b $xr2, $xr0, 0"); + asm volatile("xvori.b $xr3, $xr1, 0"); + for (z = z0-1; z >= start; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("xvxor.v $xr8, $xr8, $xr6"); + asm volatile("xvxor.v $xr9, $xr9, $xr7"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr4"); + asm volatile("xvxor.v $xr3, $xr9, $xr5"); + } + + /* P/Q left side optimization */ + for (z = start-1; z >= 0; z--) { + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* wq$$ = w1$$ ^ w2$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr6"); + asm volatile("xvxor.v $xr3, $xr9, $xr7"); + } + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + */ + asm volatile( + "xvld $xr10, %0\n\t" + "xvld $xr11, %1\n\t" + "xvld $xr12, %2\n\t" + "xvld $xr13, %3\n\t" + "xvxor.v $xr10, $xr10, $xr0\n\t" + "xvxor.v $xr11, $xr11, $xr1\n\t" + "xvxor.v $xr12, $xr12, $xr2\n\t" + "xvxor.v $xr13, $xr13, $xr3\n\t" + "xvst $xr10, %0\n\t" + "xvst $xr11, %1\n\t" + "xvst $xr12, %2\n\t" + "xvst $xr13, %3\n\t" + : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]), + "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]) + ); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_lasx = { + raid6_lasx_gen_syndrome, + raid6_lasx_xor_syndrome, + raid6_has_lasx, + "lasx", + .priority = 0 /* see the comment near the top of the file for reason */ +}; +#undef NSIZE +#endif /* CONFIG_CPU_HAS_LASX */ diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c index 39787db588b0..3be03793237c 100644 --- a/lib/raid6/mktables.c +++ b/lib/raid6/mktables.c @@ -1,11 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2 or (at your - * option) any later version; incorporated herein by reference. - * * ----------------------------------------------------------------------- */ /* @@ -59,8 +56,10 @@ int main(int argc, char *argv[]) uint8_t v; uint8_t exptbl[256], invtbl[256]; - printf("#include <linux/raid/pq.h>\n"); + printf("#ifdef __KERNEL__\n"); printf("#include <linux/export.h>\n"); + printf("#endif\n"); + printf("#include <linux/raid/pq.h>\n"); /* Compute multiplication table */ printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -125,6 +124,26 @@ int main(int argc, char *argv[]) printf("EXPORT_SYMBOL(raid6_gfexp);\n"); printf("#endif\n"); + /* Compute log-of-2 table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gflog[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) { + v = 255; + for (k = 0; k < 256; k++) + if (exptbl[k] == (i + j)) { + v = k; + break; + } + printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); + } + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gflog);\n"); + printf("#endif\n"); + /* Compute inverse table x^-1 == x^254 */ printf("\nconst u8 __attribute__((aligned(256)))\n" "raid6_gfinv[256] =\n" "{\n"); diff --git a/lib/raid6/mmx.c b/lib/raid6/mmx.c index 590c71c9e200..3a5bf53a297b 100644 --- a/lib/raid6/mmx.c +++ b/lib/raid6/mmx.c @@ -1,13 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright 2002 H. Peter Anvin - All Rights Reserved * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * * ----------------------------------------------------------------------- */ /* @@ -76,6 +71,7 @@ static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_mmxx1 = { raid6_mmx1_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_mmx, "mmxx1", 0 @@ -134,6 +130,7 @@ static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_mmxx2 = { raid6_mmx2_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_mmx, "mmxx2", 0 diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c new file mode 100644 index 000000000000..6d9474ce6da9 --- /dev/null +++ b/lib/raid6/neon.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/lib/raid6/neon.c - RAID6 syndrome calculation using ARM NEON intrinsics + * + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/raid/pq.h> + +#ifdef __KERNEL__ +#include <asm/simd.h> +#else +#define scoped_ksimd() +#define cpu_has_neon() (1) +#endif + +/* + * There are 2 reasons these wrappers are kept in a separate compilation unit + * from the actual implementations in neonN.c (generated from neon.uc by + * unroll.awk): + * - the actual implementations use NEON intrinsics, and the GCC support header + * (arm_neon.h) is not fully compatible (type wise) with the kernel; + * - the neonN.c files are compiled with -mfpu=neon and optimization enabled, + * and we have to make sure that we never use *any* NEON/VFP instructions + * outside a kernel_neon_begin()/kernel_neon_end() pair. + */ + +#define RAID6_NEON_WRAPPER(_n) \ + static void raid6_neon ## _n ## _gen_syndrome(int disks, \ + size_t bytes, void **ptrs) \ + { \ + void raid6_neon ## _n ## _gen_syndrome_real(int, \ + unsigned long, void**); \ + scoped_ksimd() \ + raid6_neon ## _n ## _gen_syndrome_real(disks, \ + (unsigned long)bytes, ptrs); \ + } \ + static void raid6_neon ## _n ## _xor_syndrome(int disks, \ + int start, int stop, \ + size_t bytes, void **ptrs) \ + { \ + void raid6_neon ## _n ## _xor_syndrome_real(int, \ + int, int, unsigned long, void**); \ + scoped_ksimd() \ + raid6_neon ## _n ## _xor_syndrome_real(disks, \ + start, stop, (unsigned long)bytes, ptrs);\ + } \ + struct raid6_calls const raid6_neonx ## _n = { \ + raid6_neon ## _n ## _gen_syndrome, \ + raid6_neon ## _n ## _xor_syndrome, \ + raid6_have_neon, \ + "neonx" #_n, \ + 0 \ + } + +static int raid6_have_neon(void) +{ + return cpu_has_neon(); +} + +RAID6_NEON_WRAPPER(1); +RAID6_NEON_WRAPPER(2); +RAID6_NEON_WRAPPER(4); +RAID6_NEON_WRAPPER(8); diff --git a/lib/raid6/neon.h b/lib/raid6/neon.h new file mode 100644 index 000000000000..2ca41ee9b499 --- /dev/null +++ b/lib/raid6/neon.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only + +void raid6_neon1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon1_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon2_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon4_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon8_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, + uint8_t *dq, const uint8_t *pbmul, + const uint8_t *qmul); + +void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, + const uint8_t *qmul); + + diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc new file mode 100644 index 000000000000..355270af0cd6 --- /dev/null +++ b/lib/raid6/neon.uc @@ -0,0 +1,153 @@ +/* ----------------------------------------------------------------------- + * + * neon.uc - RAID-6 syndrome calculation using ARM NEON instructions + * + * Copyright (C) 2012 Rob Herring + * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * Based on altivec.uc: + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * neon$#.c + * + * $#-way unrolled NEON intrinsics math RAID-6 instruction set + * + * This file is postprocessed using unroll.awk + */ + +#include <arm_neon.h> +#include "neon.h" + +typedef uint8x16_t unative_t; + +#define NSIZE sizeof(unative_t) + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline unative_t SHLBYTE(unative_t v) +{ + return vshlq_n_u8(v, 1); +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline unative_t MASK(unative_t v) +{ + return (unative_t)vshrq_n_s8((int8x16_t)v, 7); +} + +static inline unative_t PMUL(unative_t v, unative_t u) +{ + return (unative_t)vmulq_p8((poly8x16_t)v, (poly8x16_t)u); +} + +void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + uint8_t **dptr = (uint8_t **)ptrs; + uint8_t *p, *q; + int d, z, z0; + + register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + const unative_t x1d = vdupq_n_u8(0x1d); + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]); + for ( z = z0-1 ; z >= 0 ; z-- ) { + wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]); + wp$$ = veorq_u8(wp$$, wd$$); + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + + w2$$ = vandq_u8(w2$$, x1d); + w1$$ = veorq_u8(w1$$, w2$$); + wq$$ = veorq_u8(w1$$, wd$$); + } + vst1q_u8(&p[d+NSIZE*$$], wp$$); + vst1q_u8(&q[d+NSIZE*$$], wq$$); + } +} + +void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + uint8_t **dptr = (uint8_t **)ptrs; + uint8_t *p, *q; + int d, z, z0; + + register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + const unative_t x1d = vdupq_n_u8(0x1d); + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]); + wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$); + + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]); + wp$$ = veorq_u8(wp$$, wd$$); + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + + w2$$ = vandq_u8(w2$$, x1d); + w1$$ = veorq_u8(w1$$, w2$$); + wq$$ = veorq_u8(w1$$, wd$$); + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 3 ; z -= 4 ) { + w2$$ = vshrq_n_u8(wq$$, 4); + w1$$ = vshlq_n_u8(wq$$, 4); + + w2$$ = PMUL(w2$$, x1d); + wq$$ = veorq_u8(w1$$, w2$$); + } + + switch (z) { + case 2: + w2$$ = vshrq_n_u8(wq$$, 5); + w1$$ = vshlq_n_u8(wq$$, 3); + + w2$$ = PMUL(w2$$, x1d); + wq$$ = veorq_u8(w1$$, w2$$); + break; + case 1: + w2$$ = vshrq_n_u8(wq$$, 6); + w1$$ = vshlq_n_u8(wq$$, 2); + + w2$$ = PMUL(w2$$, x1d); + wq$$ = veorq_u8(w1$$, w2$$); + break; + case 0: + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + + w2$$ = vandq_u8(w2$$, x1d); + wq$$ = veorq_u8(w1$$, w2$$); + } + w1$$ = vld1q_u8(&q[d+NSIZE*$$]); + wq$$ = veorq_u8(wq$$, w1$$); + + vst1q_u8(&p[d+NSIZE*$$], wp$$); + vst1q_u8(&q[d+NSIZE*$$], wq$$); + } +} diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index a95bccb8497d..b5e47c008b41 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c @@ -1,13 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright 2002 H. Peter Anvin - All Rights Reserved * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * * ----------------------------------------------------------------------- */ /* @@ -18,7 +13,6 @@ * the syndrome.) */ -#include <linux/export.h> #include <linux/raid/pq.h> /* Recover two failed data blocks. */ @@ -37,10 +31,10 @@ static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -78,7 +72,7 @@ static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c index e1eea433a493..97d598d2535c 100644 --- a/lib/raid6/recov_avx2.c +++ b/lib/raid6/recov_avx2.c @@ -1,15 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012 Intel Corporation * Author: Jim Kukunas <james.t.kukunas@linux.intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ -#if CONFIG_AS_AVX2 - #include <linux/raid/pq.h> #include "x86.h" @@ -34,10 +28,10 @@ static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -202,7 +196,7 @@ static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -317,7 +311,3 @@ const struct raid6_recov_calls raid6_recov_avx2 = { #endif .priority = 2, }; - -#else -#warning "your version of binutils lacks AVX2 support" -#endif diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c new file mode 100644 index 000000000000..7986120ca444 --- /dev/null +++ b/lib/raid6/recov_avx512.c @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2016 Intel Corporation + * + * Author: Gayatri Kammela <gayatri.kammela@intel.com> + * Author: Megha Dey <megha.dey@linux.intel.com> + */ + +#include <linux/raid/pq.h> +#include "x86.h" + +static int raid6_has_avx512(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +} + +static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* zmm0 = x0f[16] */ + asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa64 %0, %%zmm1\n\t" + "vmovdqa64 %1, %%zmm9\n\t" + "vmovdqa64 %2, %%zmm0\n\t" + "vmovdqa64 %3, %%zmm8\n\t" + "vpxorq %4, %%zmm1, %%zmm1\n\t" + "vpxorq %5, %%zmm9, %%zmm9\n\t" + "vpxorq %6, %%zmm0, %%zmm0\n\t" + "vpxorq %7, %%zmm8, %%zmm8" + : + : "m" (q[0]), "m" (q[64]), "m" (p[0]), + "m" (p[64]), "m" (dq[0]), "m" (dq[64]), + "m" (dp[0]), "m" (dp[64])); + + /* + * 1 = dq[0] ^ q[0] + * 9 = dq[64] ^ q[64] + * 0 = dp[0] ^ p[0] + * 8 = dp[64] ^ p[64] + */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm5" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t" + "vpsraw $4, %%zmm9, %%zmm12\n\t" + "vpandq %%zmm7, %%zmm1, %%zmm1\n\t" + "vpandq %%zmm7, %%zmm9, %%zmm9\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm12, %%zmm12\n\t" + "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t" + "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t" + "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t" + "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t" + "vpxorq %%zmm4, %%zmm5, %%zmm5" + : + : ); + + /* + * 5 = qx[0] + * 15 = qx[64] + */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm1\n\t" + "vpsraw $4, %%zmm0, %%zmm2\n\t" + "vpsraw $4, %%zmm8, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm0, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm8, %%zmm14\n\t" + "vpandq %%zmm7, %%zmm2, %%zmm2\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t" + "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t" + "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm12, %%zmm13, %%zmm13" + : + : "m" (pbmul[0]), "m" (pbmul[16])); + + /* + * 1 = pbmul[px[0]] + * 13 = pbmul[px[64]] + */ + asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm15, %%zmm13, %%zmm13" + : + : ); + + /* + * 1 = db = DQ + * 13 = db[64] = DQ[64] + */ + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm13,%1\n\t" + "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t" + "vpxorq %%zmm13, %%zmm8, %%zmm8" + : + : "m" (dq[0]), "m" (dq[64])); + + asm volatile("vmovdqa64 %%zmm0, %0\n\t" + "vmovdqa64 %%zmm8, %1" + : + : "m" (dp[0]), "m" (dp[64])); + + bytes -= 128; + p += 128; + q += 128; + dp += 128; + dq += 128; +#else + asm volatile("vmovdqa64 %0, %%zmm1\n\t" + "vmovdqa64 %1, %%zmm0\n\t" + "vpxorq %2, %%zmm1, %%zmm1\n\t" + "vpxorq %3, %%zmm0, %%zmm0" + : + : "m" (*q), "m" (*p), "m"(*dq), "m" (*dp)); + + /* 1 = dq ^ q; 0 = dp ^ p */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm5" + : + : "m" (qmul[0]), "m" (qmul[16])); + + /* + * 1 = dq ^ q + * 3 = dq ^ p >> 4 + */ + asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm1, %%zmm1\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t" + "vpxorq %%zmm4, %%zmm5, %%zmm5" + : + : ); + + /* 5 = qx */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm1" + : + : "m" (pbmul[0]), "m" (pbmul[16])); + + asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t" + "vpandq %%zmm7, %%zmm0, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm2, %%zmm2\n\t" + "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm4, %%zmm1, %%zmm1" + : + : ); + + /* 1 = pbmul[px] */ + asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t" + /* 1 = db = DQ */ + "vmovdqa64 %%zmm1, %0\n\t" + : + : "m" (dq[0])); + + asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t" + "vmovdqa64 %%zmm0, %0" + : + : "m" (dp[0])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; +#endif + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa64 %0, %%zmm3\n\t" + "vmovdqa64 %1, %%zmm8\n\t" + "vpxorq %2, %%zmm3, %%zmm3\n\t" + "vpxorq %3, %%zmm8, %%zmm8" + : + : "m" (dq[0]), "m" (dq[64]), "m" (q[0]), + "m" (q[64])); + + /* + * 3 = q[0] ^ dq[0] + * 8 = q[64] ^ dq[64] + */ + asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t" + "vmovapd %%zmm0, %%zmm13\n\t" + "vbroadcasti64x2 %1, %%zmm1\n\t" + "vmovapd %%zmm1, %%zmm14" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t" + "vpsraw $4, %%zmm8, %%zmm12\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm8, %%zmm8\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm12, %%zmm12\n\t" + "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t" + "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t" + "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t" + "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm13, %%zmm14, %%zmm14" + : + : ); + + /* + * 1 = qmul[q[0] ^ dq[0]] + * 14 = qmul[q[64] ^ dq[64]] + */ + asm volatile("vmovdqa64 %0, %%zmm2\n\t" + "vmovdqa64 %1, %%zmm12\n\t" + "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t" + "vpxorq %%zmm14, %%zmm12, %%zmm12" + : + : "m" (p[0]), "m" (p[64])); + + /* + * 2 = p[0] ^ qmul[q[0] ^ dq[0]] + * 12 = p[64] ^ qmul[q[64] ^ dq[64]] + */ + + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm14, %1\n\t" + "vmovdqa64 %%zmm2, %2\n\t" + "vmovdqa64 %%zmm12,%3" + : + : "m" (dq[0]), "m" (dq[64]), "m" (p[0]), + "m" (p[64])); + + bytes -= 128; + p += 128; + q += 128; + dq += 128; +#else + asm volatile("vmovdqa64 %0, %%zmm3\n\t" + "vpxorq %1, %%zmm3, %%zmm3" + : + : "m" (dq[0]), "m" (q[0])); + + /* 3 = q ^ dq */ + + asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t" + "vbroadcasti64x2 %1, %%zmm1" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm0, %%zmm1, %%zmm1" + : + : ); + + /* 1 = qmul[q ^ dq] */ + + asm volatile("vmovdqa64 %0, %%zmm2\n\t" + "vpxorq %%zmm1, %%zmm2, %%zmm2" + : + : "m" (p[0])); + + /* 2 = p ^ qmul[q ^ dq] */ + + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm2, %1" + : + : "m" (dq[0]), "m" (p[0])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_avx512 = { + .data2 = raid6_2data_recov_avx512, + .datap = raid6_datap_recov_avx512, + .valid = raid6_has_avx512, +#ifdef CONFIG_X86_64 + .name = "avx512x2", +#else + .name = "avx512x1", +#endif + .priority = 3, +}; diff --git a/lib/raid6/recov_loongarch_simd.c b/lib/raid6/recov_loongarch_simd.c new file mode 100644 index 000000000000..93dc515997a1 --- /dev/null +++ b/lib/raid6/recov_loongarch_simd.c @@ -0,0 +1,513 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX) + * + * Copyright (C) 2023 WANG Xuerui <git@xen0n.name> + * + * Originally based on recov_avx2.c and recov_ssse3.c: + * + * Copyright (C) 2012 Intel Corporation + * Author: Jim Kukunas <james.t.kukunas@linux.intel.com> + */ + +#include <linux/raid/pq.h> +#include "loongarch.h" + +/* + * Unlike with the syndrome calculation algorithms, there's no boot-time + * selection of recovery algorithms by benchmarking, so we have to specify + * the priorities and hope the future cores will all have decent vector + * support (i.e. no LASX slower than LSX, or even scalar code). + */ + +#ifdef CONFIG_CPU_HAS_LSX +static int raid6_has_lsx(void) +{ + return cpu_has_lsx; +} + +static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* + * vr20, vr21: qmul + * vr22, vr23: pbmul + */ + asm volatile("vld $vr20, %0" : : "m" (qmul[0])); + asm volatile("vld $vr21, %0" : : "m" (qmul[16])); + asm volatile("vld $vr22, %0" : : "m" (pbmul[0])); + asm volatile("vld $vr23, %0" : : "m" (pbmul[16])); + + while (bytes) { + /* vr4 - vr7: Q */ + asm volatile("vld $vr4, %0" : : "m" (q[0])); + asm volatile("vld $vr5, %0" : : "m" (q[16])); + asm volatile("vld $vr6, %0" : : "m" (q[32])); + asm volatile("vld $vr7, %0" : : "m" (q[48])); + /* vr4 - vr7: Q + Qxy */ + asm volatile("vld $vr8, %0" : : "m" (dq[0])); + asm volatile("vld $vr9, %0" : : "m" (dq[16])); + asm volatile("vld $vr10, %0" : : "m" (dq[32])); + asm volatile("vld $vr11, %0" : : "m" (dq[48])); + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + /* vr0 - vr3: P */ + asm volatile("vld $vr0, %0" : : "m" (p[0])); + asm volatile("vld $vr1, %0" : : "m" (p[16])); + asm volatile("vld $vr2, %0" : : "m" (p[32])); + asm volatile("vld $vr3, %0" : : "m" (p[48])); + /* vr0 - vr3: P + Pxy */ + asm volatile("vld $vr8, %0" : : "m" (dp[0])); + asm volatile("vld $vr9, %0" : : "m" (dp[16])); + asm volatile("vld $vr10, %0" : : "m" (dp[32])); + asm volatile("vld $vr11, %0" : : "m" (dp[48])); + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + + /* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */ + asm volatile("vsrli.b $vr8, $vr4, 4"); + asm volatile("vsrli.b $vr9, $vr5, 4"); + asm volatile("vsrli.b $vr10, $vr6, 4"); + asm volatile("vsrli.b $vr11, $vr7, 4"); + /* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */ + asm volatile("vandi.b $vr4, $vr4, 0x0f"); + asm volatile("vandi.b $vr5, $vr5, 0x0f"); + asm volatile("vandi.b $vr6, $vr6, 0x0f"); + asm volatile("vandi.b $vr7, $vr7, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4"); + asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5"); + asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6"); + asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7"); + /* lookup from qmul[16] */ + asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8"); + asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9"); + asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10"); + asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11"); + /* vr16 - vr19: B(Q + Qxy) */ + asm volatile("vxor.v $vr16, $vr8, $vr4"); + asm volatile("vxor.v $vr17, $vr9, $vr5"); + asm volatile("vxor.v $vr18, $vr10, $vr6"); + asm volatile("vxor.v $vr19, $vr11, $vr7"); + + /* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */ + asm volatile("vsrli.b $vr4, $vr0, 4"); + asm volatile("vsrli.b $vr5, $vr1, 4"); + asm volatile("vsrli.b $vr6, $vr2, 4"); + asm volatile("vsrli.b $vr7, $vr3, 4"); + /* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */ + asm volatile("vandi.b $vr12, $vr0, 0x0f"); + asm volatile("vandi.b $vr13, $vr1, 0x0f"); + asm volatile("vandi.b $vr14, $vr2, 0x0f"); + asm volatile("vandi.b $vr15, $vr3, 0x0f"); + /* lookup from pbmul[0] */ + asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12"); + asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13"); + asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14"); + asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15"); + /* lookup from pbmul[16] */ + asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4"); + asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5"); + asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6"); + asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7"); + /* vr4 - vr7: A(P + Pxy) */ + asm volatile("vxor.v $vr4, $vr4, $vr12"); + asm volatile("vxor.v $vr5, $vr5, $vr13"); + asm volatile("vxor.v $vr6, $vr6, $vr14"); + asm volatile("vxor.v $vr7, $vr7, $vr15"); + + /* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx */ + asm volatile("vxor.v $vr4, $vr4, $vr16"); + asm volatile("vxor.v $vr5, $vr5, $vr17"); + asm volatile("vxor.v $vr6, $vr6, $vr18"); + asm volatile("vxor.v $vr7, $vr7, $vr19"); + asm volatile("vst $vr4, %0" : "=m" (dq[0])); + asm volatile("vst $vr5, %0" : "=m" (dq[16])); + asm volatile("vst $vr6, %0" : "=m" (dq[32])); + asm volatile("vst $vr7, %0" : "=m" (dq[48])); + + /* vr0 - vr3: P + Pxy + Dx = Dy */ + asm volatile("vxor.v $vr0, $vr0, $vr4"); + asm volatile("vxor.v $vr1, $vr1, $vr5"); + asm volatile("vxor.v $vr2, $vr2, $vr6"); + asm volatile("vxor.v $vr3, $vr3, $vr7"); + asm volatile("vst $vr0, %0" : "=m" (dp[0])); + asm volatile("vst $vr1, %0" : "=m" (dp[16])); + asm volatile("vst $vr2, %0" : "=m" (dp[32])); + asm volatile("vst $vr3, %0" : "=m" (dp[48])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + /* vr22, vr23: qmul */ + asm volatile("vld $vr22, %0" : : "m" (qmul[0])); + asm volatile("vld $vr23, %0" : : "m" (qmul[16])); + + while (bytes) { + /* vr0 - vr3: P + Dx */ + asm volatile("vld $vr0, %0" : : "m" (p[0])); + asm volatile("vld $vr1, %0" : : "m" (p[16])); + asm volatile("vld $vr2, %0" : : "m" (p[32])); + asm volatile("vld $vr3, %0" : : "m" (p[48])); + /* vr4 - vr7: Qx */ + asm volatile("vld $vr4, %0" : : "m" (dq[0])); + asm volatile("vld $vr5, %0" : : "m" (dq[16])); + asm volatile("vld $vr6, %0" : : "m" (dq[32])); + asm volatile("vld $vr7, %0" : : "m" (dq[48])); + /* vr4 - vr7: Q + Qx */ + asm volatile("vld $vr8, %0" : : "m" (q[0])); + asm volatile("vld $vr9, %0" : : "m" (q[16])); + asm volatile("vld $vr10, %0" : : "m" (q[32])); + asm volatile("vld $vr11, %0" : : "m" (q[48])); + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + + /* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */ + asm volatile("vsrli.b $vr8, $vr4, 4"); + asm volatile("vsrli.b $vr9, $vr5, 4"); + asm volatile("vsrli.b $vr10, $vr6, 4"); + asm volatile("vsrli.b $vr11, $vr7, 4"); + /* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */ + asm volatile("vandi.b $vr4, $vr4, 0x0f"); + asm volatile("vandi.b $vr5, $vr5, 0x0f"); + asm volatile("vandi.b $vr6, $vr6, 0x0f"); + asm volatile("vandi.b $vr7, $vr7, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4"); + asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5"); + asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6"); + asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7"); + /* lookup from qmul[16] */ + asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8"); + asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9"); + asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10"); + asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11"); + /* vr4 - vr7: qmul(Q + Qx) = Dx */ + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + asm volatile("vst $vr4, %0" : "=m" (dq[0])); + asm volatile("vst $vr5, %0" : "=m" (dq[16])); + asm volatile("vst $vr6, %0" : "=m" (dq[32])); + asm volatile("vst $vr7, %0" : "=m" (dq[48])); + + /* vr0 - vr3: P + Dx + Dx = P */ + asm volatile("vxor.v $vr0, $vr0, $vr4"); + asm volatile("vxor.v $vr1, $vr1, $vr5"); + asm volatile("vxor.v $vr2, $vr2, $vr6"); + asm volatile("vxor.v $vr3, $vr3, $vr7"); + asm volatile("vst $vr0, %0" : "=m" (p[0])); + asm volatile("vst $vr1, %0" : "=m" (p[16])); + asm volatile("vst $vr2, %0" : "=m" (p[32])); + asm volatile("vst $vr3, %0" : "=m" (p[48])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_lsx = { + .data2 = raid6_2data_recov_lsx, + .datap = raid6_datap_recov_lsx, + .valid = raid6_has_lsx, + .name = "lsx", + .priority = 1, +}; +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX +static int raid6_has_lasx(void) +{ + return cpu_has_lasx; +} + +static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* + * xr20, xr21: qmul + * xr22, xr23: pbmul + */ + asm volatile("vld $vr20, %0" : : "m" (qmul[0])); + asm volatile("vld $vr21, %0" : : "m" (qmul[16])); + asm volatile("vld $vr22, %0" : : "m" (pbmul[0])); + asm volatile("vld $vr23, %0" : : "m" (pbmul[16])); + asm volatile("xvreplve0.q $xr20, $xr20"); + asm volatile("xvreplve0.q $xr21, $xr21"); + asm volatile("xvreplve0.q $xr22, $xr22"); + asm volatile("xvreplve0.q $xr23, $xr23"); + + while (bytes) { + /* xr0, xr1: Q */ + asm volatile("xvld $xr0, %0" : : "m" (q[0])); + asm volatile("xvld $xr1, %0" : : "m" (q[32])); + /* xr0, xr1: Q + Qxy */ + asm volatile("xvld $xr4, %0" : : "m" (dq[0])); + asm volatile("xvld $xr5, %0" : : "m" (dq[32])); + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* xr2, xr3: P */ + asm volatile("xvld $xr2, %0" : : "m" (p[0])); + asm volatile("xvld $xr3, %0" : : "m" (p[32])); + /* xr2, xr3: P + Pxy */ + asm volatile("xvld $xr4, %0" : : "m" (dp[0])); + asm volatile("xvld $xr5, %0" : : "m" (dp[32])); + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */ + asm volatile("xvsrli.b $xr4, $xr0, 4"); + asm volatile("xvsrli.b $xr5, $xr1, 4"); + /* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */ + asm volatile("xvandi.b $xr0, $xr0, 0x0f"); + asm volatile("xvandi.b $xr1, $xr1, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0"); + asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1"); + /* lookup from qmul[16] */ + asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4"); + asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5"); + /* xr6, xr7: B(Q + Qxy) */ + asm volatile("xvxor.v $xr6, $xr4, $xr0"); + asm volatile("xvxor.v $xr7, $xr5, $xr1"); + + /* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */ + asm volatile("xvsrli.b $xr4, $xr2, 4"); + asm volatile("xvsrli.b $xr5, $xr3, 4"); + /* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */ + asm volatile("xvandi.b $xr0, $xr2, 0x0f"); + asm volatile("xvandi.b $xr1, $xr3, 0x0f"); + /* lookup from pbmul[0] */ + asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0"); + asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1"); + /* lookup from pbmul[16] */ + asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4"); + asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5"); + /* xr0, xr1: A(P + Pxy) */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + + /* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */ + asm volatile("xvxor.v $xr0, $xr0, $xr6"); + asm volatile("xvxor.v $xr1, $xr1, $xr7"); + + /* xr2, xr3: P + Pxy + Dx = Dy */ + asm volatile("xvxor.v $xr2, $xr2, $xr0"); + asm volatile("xvxor.v $xr3, $xr3, $xr1"); + + asm volatile("xvst $xr0, %0" : "=m" (dq[0])); + asm volatile("xvst $xr1, %0" : "=m" (dq[32])); + asm volatile("xvst $xr2, %0" : "=m" (dp[0])); + asm volatile("xvst $xr3, %0" : "=m" (dp[32])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + /* xr22, xr23: qmul */ + asm volatile("vld $vr22, %0" : : "m" (qmul[0])); + asm volatile("xvreplve0.q $xr22, $xr22"); + asm volatile("vld $vr23, %0" : : "m" (qmul[16])); + asm volatile("xvreplve0.q $xr23, $xr23"); + + while (bytes) { + /* xr0, xr1: P + Dx */ + asm volatile("xvld $xr0, %0" : : "m" (p[0])); + asm volatile("xvld $xr1, %0" : : "m" (p[32])); + /* xr2, xr3: Qx */ + asm volatile("xvld $xr2, %0" : : "m" (dq[0])); + asm volatile("xvld $xr3, %0" : : "m" (dq[32])); + /* xr2, xr3: Q + Qx */ + asm volatile("xvld $xr4, %0" : : "m" (q[0])); + asm volatile("xvld $xr5, %0" : : "m" (q[32])); + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */ + asm volatile("xvsrli.b $xr4, $xr2, 4"); + asm volatile("xvsrli.b $xr5, $xr3, 4"); + /* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */ + asm volatile("xvandi.b $xr2, $xr2, 0x0f"); + asm volatile("xvandi.b $xr3, $xr3, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2"); + asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3"); + /* lookup from qmul[16] */ + asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4"); + asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5"); + /* xr2, xr3: qmul(Q + Qx) = Dx */ + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr0, xr1: P + Dx + Dx = P */ + asm volatile("xvxor.v $xr0, $xr0, $xr2"); + asm volatile("xvxor.v $xr1, $xr1, $xr3"); + + asm volatile("xvst $xr2, %0" : "=m" (dq[0])); + asm volatile("xvst $xr3, %0" : "=m" (dq[32])); + asm volatile("xvst $xr0, %0" : "=m" (p[0])); + asm volatile("xvst $xr1, %0" : "=m" (p[32])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_lasx = { + .data2 = raid6_2data_recov_lasx, + .datap = raid6_datap_recov_lasx, + .valid = raid6_has_lasx, + .name = "lasx", + .priority = 2, +}; +#endif /* CONFIG_CPU_HAS_LASX */ diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c new file mode 100644 index 000000000000..9d99aeabd31a --- /dev/null +++ b/lib/raid6/recov_neon.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 Intel Corporation + * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <linux/raid/pq.h> + +#ifdef __KERNEL__ +#include <asm/simd.h> +#include "neon.h" +#else +#define scoped_ksimd() +#define cpu_has_neon() (1) +#endif + +static int raid6_has_neon(void) +{ + return cpu_has_neon(); +} + +static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + scoped_ksimd() + __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul); +} + +static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + scoped_ksimd() + __raid6_datap_recov_neon(bytes, p, q, dq, qmul); +} + +const struct raid6_recov_calls raid6_recov_neon = { + .data2 = raid6_2data_recov_neon, + .datap = raid6_datap_recov_neon, + .valid = raid6_has_neon, + .name = "neon", + .priority = 10, +}; diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c new file mode 100644 index 000000000000..f9e7e8f5a151 --- /dev/null +++ b/lib/raid6/recov_neon_inner.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 Intel Corporation + * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <arm_neon.h> +#include "neon.h" + +#ifdef CONFIG_ARM +/* + * AArch32 does not provide this intrinsic natively because it does not + * implement the underlying instruction. AArch32 only provides a 64-bit + * wide vtbl.8 instruction, so use that instead. + */ +static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b) +{ + union { + uint8x16_t val; + uint8x8x2_t pair; + } __a = { a }; + + return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)), + vtbl2_u8(__a.pair, vget_high_u8(b))); +} +#endif + +void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, + uint8_t *dq, const uint8_t *pbmul, + const uint8_t *qmul) +{ + uint8x16_t pm0 = vld1q_u8(pbmul); + uint8x16_t pm1 = vld1q_u8(pbmul + 16); + uint8x16_t qm0 = vld1q_u8(qmul); + uint8x16_t qm1 = vld1q_u8(qmul + 16); + uint8x16_t x0f = vdupq_n_u8(0x0f); + + /* + * while ( bytes-- ) { + * uint8_t px, qx, db; + * + * px = *p ^ *dp; + * qx = qmul[*q ^ *dq]; + * *dq++ = db = pbmul[px] ^ qx; + * *dp++ = db ^ px; + * p++; q++; + * } + */ + + while (bytes) { + uint8x16_t vx, vy, px, qx, db; + + px = veorq_u8(vld1q_u8(p), vld1q_u8(dp)); + vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq)); + + vy = vshrq_n_u8(vx, 4); + vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f)); + vy = vqtbl1q_u8(qm1, vy); + qx = veorq_u8(vx, vy); + + vy = vshrq_n_u8(px, 4); + vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f)); + vy = vqtbl1q_u8(pm1, vy); + vx = veorq_u8(vx, vy); + db = veorq_u8(vx, qx); + + vst1q_u8(dq, db); + vst1q_u8(dp, veorq_u8(db, px)); + + bytes -= 16; + p += 16; + q += 16; + dp += 16; + dq += 16; + } +} + +void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, + const uint8_t *qmul) +{ + uint8x16_t qm0 = vld1q_u8(qmul); + uint8x16_t qm1 = vld1q_u8(qmul + 16); + uint8x16_t x0f = vdupq_n_u8(0x0f); + + /* + * while (bytes--) { + * *p++ ^= *dq = qmul[*q ^ *dq]; + * q++; dq++; + * } + */ + + while (bytes) { + uint8x16_t vx, vy; + + vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq)); + + vy = vshrq_n_u8(vx, 4); + vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f)); + vy = vqtbl1q_u8(qm1, vy); + vx = veorq_u8(vx, vy); + vy = veorq_u8(vx, vld1q_u8(p)); + + vst1q_u8(dq, vx); + vst1q_u8(p, vy); + + bytes -= 16; + p += 16; + q += 16; + dq += 16; + } +} diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c new file mode 100644 index 000000000000..40c393206b6a --- /dev/null +++ b/lib/raid6/recov_rvv.c @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2024 Institute of Software, CAS. + * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn> + */ + +#include <linux/raid/pq.h> +#include "rvv.h" + +static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp, + u8 *dq, const u8 *pbmul, + const u8 *qmul) +{ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli x0, %[avl], e8, m1, ta, ma\n" + ".option pop\n" + : : + [avl]"r"(16) + ); + + /* + * while ( bytes-- ) { + * uint8_t px, qx, db; + * + * px = *p ^ *dp; + * qx = qmul[*q ^ *dq]; + * *dq++ = db = pbmul[px] ^ qx; + * *dp++ = db ^ px; + * p++; q++; + * } + */ + while (bytes) { + /* + * v0:px, v1:dp, + * v2:qx, v3:dq, + * v4:vx, v5:vy, + * v6:qm0, v7:qm1, + * v8:pm0, v9:pm1, + * v14:p/qm[vx], v15:p/qm[vy] + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[px])\n" + "vle8.v v1, (%[dp])\n" + "vxor.vv v0, v0, v1\n" + "vle8.v v2, (%[qx])\n" + "vle8.v v3, (%[dq])\n" + "vxor.vv v4, v2, v3\n" + "vsrl.vi v5, v4, 4\n" + "vand.vi v4, v4, 0xf\n" + "vle8.v v6, (%[qm0])\n" + "vle8.v v7, (%[qm1])\n" + "vrgather.vv v14, v6, v4\n" /* v14 = qm[vx] */ + "vrgather.vv v15, v7, v5\n" /* v15 = qm[vy] */ + "vxor.vv v2, v14, v15\n" /* v2 = qmul[*q ^ *dq] */ + + "vsrl.vi v5, v0, 4\n" + "vand.vi v4, v0, 0xf\n" + "vle8.v v8, (%[pm0])\n" + "vle8.v v9, (%[pm1])\n" + "vrgather.vv v14, v8, v4\n" /* v14 = pm[vx] */ + "vrgather.vv v15, v9, v5\n" /* v15 = pm[vy] */ + "vxor.vv v4, v14, v15\n" /* v4 = pbmul[px] */ + "vxor.vv v3, v4, v2\n" /* v3 = db = pbmul[px] ^ qx */ + "vxor.vv v1, v3, v0\n" /* v1 = db ^ px; */ + "vse8.v v3, (%[dq])\n" + "vse8.v v1, (%[dp])\n" + ".option pop\n" + : : + [px]"r"(p), + [dp]"r"(dp), + [qx]"r"(q), + [dq]"r"(dq), + [qm0]"r"(qmul), + [qm1]"r"(qmul + 16), + [pm0]"r"(pbmul), + [pm1]"r"(pbmul + 16) + :); + + bytes -= 16; + p += 16; + q += 16; + dp += 16; + dq += 16; + } +} + +static void __raid6_datap_recov_rvv(int bytes, u8 *p, u8 *q, + u8 *dq, const u8 *qmul) +{ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli x0, %[avl], e8, m1, ta, ma\n" + ".option pop\n" + : : + [avl]"r"(16) + ); + + /* + * while (bytes--) { + * *p++ ^= *dq = qmul[*q ^ *dq]; + * q++; dq++; + * } + */ + while (bytes) { + /* + * v0:vx, v1:vy, + * v2:dq, v3:p, + * v4:qm0, v5:qm1, + * v10:m[vx], v11:m[vy] + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[vx])\n" + "vle8.v v2, (%[dq])\n" + "vxor.vv v0, v0, v2\n" + "vsrl.vi v1, v0, 4\n" + "vand.vi v0, v0, 0xf\n" + "vle8.v v4, (%[qm0])\n" + "vle8.v v5, (%[qm1])\n" + "vrgather.vv v10, v4, v0\n" + "vrgather.vv v11, v5, v1\n" + "vxor.vv v0, v10, v11\n" + "vle8.v v1, (%[vy])\n" + "vxor.vv v1, v0, v1\n" + "vse8.v v0, (%[dq])\n" + "vse8.v v1, (%[vy])\n" + ".option pop\n" + : : + [vx]"r"(q), + [vy]"r"(p), + [dq]"r"(dq), + [qm0]"r"(qmul), + [qm1]"r"(qmul + 16) + :); + + bytes -= 16; + p += 16; + q += 16; + dq += 16; + } +} + +static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_vector_begin(); + __raid6_2data_recov_rvv(bytes, p, q, dp, dq, pbmul, qmul); + kernel_vector_end(); +} + +static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_vector_begin(); + __raid6_datap_recov_rvv(bytes, p, q, dq, qmul); + kernel_vector_end(); +} + +const struct raid6_recov_calls raid6_recov_rvv = { + .data2 = raid6_2data_recov_rvv, + .datap = raid6_datap_recov_rvv, + .valid = rvv_has_vector, + .name = "rvv", + .priority = 1, +}; diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c new file mode 100644 index 000000000000..487018f81192 --- /dev/null +++ b/lib/raid6/recov_s390xc.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RAID-6 data recovery in dual failure mode based on the XC instruction. + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/raid/pq.h> + +static inline void xor_block(u8 *p1, u8 *p2) +{ + typedef struct { u8 _[256]; } addrtype; + + asm volatile( + " xc 0(256,%[p1]),0(%[p2])\n" + : "+m" (*(addrtype *) p1) : "m" (*(addrtype *) p2), + [p1] "a" (p1), [p2] "a" (p2) : "cc"); +} + +/* Recover two failed data blocks. */ +static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + int i; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; + + /* Now do it... */ + while (bytes) { + xor_block(dp, p); + xor_block(dq, q); + for (i = 0; i < 256; i++) + dq[i] = pbmul[dp[i]] ^ qmul[dq[i]]; + xor_block(dp, dq); + p += 256; + q += 256; + dp += 256; + dq += 256; + bytes -= 256; + } +} + +/* Recover failure of one data block plus the P block */ +static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + int i; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + /* Now do it... */ + while (bytes) { + xor_block(dq, q); + for (i = 0; i < 256; i++) + dq[i] = qmul[dq[i]]; + xor_block(p, dq); + p += 256; + q += 256; + dq += 256; + bytes -= 256; + } +} + + +const struct raid6_recov_calls raid6_recov_s390xc = { + .data2 = raid6_2data_recov_s390xc, + .datap = raid6_datap_recov_s390xc, + .valid = NULL, + .name = "s390xc", + .priority = 1, +}; diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c index a9168328f03b..2e849185c32b 100644 --- a/lib/raid6/recov_ssse3.c +++ b/lib/raid6/recov_ssse3.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012 Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ #include <linux/raid/pq.h> @@ -34,10 +30,10 @@ static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -207,7 +203,7 @@ static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c new file mode 100644 index 000000000000..75c9dafedb28 --- /dev/null +++ b/lib/raid6/rvv.c @@ -0,0 +1,1228 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RAID-6 syndrome calculation using RISC-V vector instructions + * + * Copyright 2024 Institute of Software, CAS. + * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn> + * + * Based on neon.uc: + * Copyright 2002-2004 H. Peter Anvin + */ + +#include "rvv.h" + +#ifdef __riscv_vector +#error "This code must be built without compiler support for vector" +#endif + +static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */ + for (d = 0; d < bytes; d += nsize * 1) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]) + ); + + for (z = z0 - 1 ; z >= 0 ; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]) + ); + } +} + +static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */ + for (d = 0 ; d < bytes ; d += nsize * 1) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]) + ); + } +} + +static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + */ + for (d = 0; d < bytes; d += nsize * 2) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]) + ); + + for (z = z0 - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + "vse8.v v4, (%[wp1])\n" + "vse8.v v5, (%[wq1])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]) + ); + } +} + +static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + */ + for (d = 0; d < bytes; d += nsize * 2) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v5, v7, v6\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + * v4:wp1, v5:wq1, v6:p1, v7:q1 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + + "vle8.v v6, (%[wp1])\n" + "vle8.v v7, (%[wq1])\n" + "vxor.vv v6, v6, v4\n" + "vxor.vv v7, v7, v5\n" + "vse8.v v6, (%[wp1])\n" + "vse8.v v7, (%[wq1])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]) + ); + } +} + +static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + */ + for (d = 0; d < bytes; d += nsize * 4) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + "vle8.v v8, (%[wp2])\n" + "vmv.v.v v9, v8\n" + "vle8.v v12, (%[wp3])\n" + "vmv.v.v v13, v12\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]), + [wp2]"r"(&dptr[z0][d + 2 * nsize]), + [wp3]"r"(&dptr[z0][d + 3 * nsize]) + ); + + for (z = z0 - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [wd2]"r"(&dptr[z][d + 2 * nsize]), + [wd3]"r"(&dptr[z][d + 3 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + "vse8.v v4, (%[wp1])\n" + "vse8.v v5, (%[wq1])\n" + "vse8.v v8, (%[wp2])\n" + "vse8.v v9, (%[wq2])\n" + "vse8.v v12, (%[wp3])\n" + "vse8.v v13, (%[wq3])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]), + [wp2]"r"(&p[d + nsize * 2]), + [wq2]"r"(&q[d + nsize * 2]), + [wp3]"r"(&p[d + nsize * 3]), + [wq3]"r"(&q[d + nsize * 3]) + ); + } +} + +static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + */ + for (d = 0; d < bytes; d += nsize * 4) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + "vle8.v v8, (%[wp2])\n" + "vmv.v.v v9, v8\n" + "vle8.v v12, (%[wp3])\n" + "vmv.v.v v13, v12\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]), + [wp2]"r"(&dptr[z0][d + 2 * nsize]), + [wp3]"r"(&dptr[z0][d + 3 * nsize]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [wd2]"r"(&dptr[z][d + 2 * nsize]), + [wd3]"r"(&dptr[z][d + 3 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v5, v7, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v9, v11, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v13, v15, v14\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + * v4:wp1, v5:wq1, v6:p1, v7:q1 + * v8:wp2, v9:wq2, v10:p2, v11:q2 + * v12:wp3, v13:wq3, v14:p3, v15:q3 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + + "vle8.v v6, (%[wp1])\n" + "vle8.v v7, (%[wq1])\n" + "vxor.vv v6, v6, v4\n" + "vxor.vv v7, v7, v5\n" + "vse8.v v6, (%[wp1])\n" + "vse8.v v7, (%[wq1])\n" + + "vle8.v v10, (%[wp2])\n" + "vle8.v v11, (%[wq2])\n" + "vxor.vv v10, v10, v8\n" + "vxor.vv v11, v11, v9\n" + "vse8.v v10, (%[wp2])\n" + "vse8.v v11, (%[wq2])\n" + + "vle8.v v14, (%[wp3])\n" + "vle8.v v15, (%[wq3])\n" + "vxor.vv v14, v14, v12\n" + "vxor.vv v15, v15, v13\n" + "vse8.v v14, (%[wp3])\n" + "vse8.v v15, (%[wq3])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]), + [wp2]"r"(&p[d + nsize * 2]), + [wq2]"r"(&q[d + nsize * 2]), + [wp3]"r"(&p[d + nsize * 3]), + [wq3]"r"(&q[d + nsize * 3]) + ); + } +} + +static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14 + * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15 + * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16 + * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17 + */ + for (d = 0; d < bytes; d += nsize * 8) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + "vle8.v v8, (%[wp2])\n" + "vmv.v.v v9, v8\n" + "vle8.v v12, (%[wp3])\n" + "vmv.v.v v13, v12\n" + "vle8.v v16, (%[wp4])\n" + "vmv.v.v v17, v16\n" + "vle8.v v20, (%[wp5])\n" + "vmv.v.v v21, v20\n" + "vle8.v v24, (%[wp6])\n" + "vmv.v.v v25, v24\n" + "vle8.v v28, (%[wp7])\n" + "vmv.v.v v29, v28\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]), + [wp2]"r"(&dptr[z0][d + 2 * nsize]), + [wp3]"r"(&dptr[z0][d + 3 * nsize]), + [wp4]"r"(&dptr[z0][d + 4 * nsize]), + [wp5]"r"(&dptr[z0][d + 5 * nsize]), + [wp6]"r"(&dptr[z0][d + 6 * nsize]), + [wp7]"r"(&dptr[z0][d + 7 * nsize]) + ); + + for (z = z0 - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + + "vsra.vi v18, v17, 7\n" + "vsll.vi v19, v17, 1\n" + "vand.vx v18, v18, %[x1d]\n" + "vxor.vv v19, v19, v18\n" + "vle8.v v18, (%[wd4])\n" + "vxor.vv v17, v19, v18\n" + "vxor.vv v16, v16, v18\n" + + "vsra.vi v22, v21, 7\n" + "vsll.vi v23, v21, 1\n" + "vand.vx v22, v22, %[x1d]\n" + "vxor.vv v23, v23, v22\n" + "vle8.v v22, (%[wd5])\n" + "vxor.vv v21, v23, v22\n" + "vxor.vv v20, v20, v22\n" + + "vsra.vi v26, v25, 7\n" + "vsll.vi v27, v25, 1\n" + "vand.vx v26, v26, %[x1d]\n" + "vxor.vv v27, v27, v26\n" + "vle8.v v26, (%[wd6])\n" + "vxor.vv v25, v27, v26\n" + "vxor.vv v24, v24, v26\n" + + "vsra.vi v30, v29, 7\n" + "vsll.vi v31, v29, 1\n" + "vand.vx v30, v30, %[x1d]\n" + "vxor.vv v31, v31, v30\n" + "vle8.v v30, (%[wd7])\n" + "vxor.vv v29, v31, v30\n" + "vxor.vv v28, v28, v30\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [wd2]"r"(&dptr[z][d + 2 * nsize]), + [wd3]"r"(&dptr[z][d + 3 * nsize]), + [wd4]"r"(&dptr[z][d + 4 * nsize]), + [wd5]"r"(&dptr[z][d + 5 * nsize]), + [wd6]"r"(&dptr[z][d + 6 * nsize]), + [wd7]"r"(&dptr[z][d + 7 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + "vse8.v v4, (%[wp1])\n" + "vse8.v v5, (%[wq1])\n" + "vse8.v v8, (%[wp2])\n" + "vse8.v v9, (%[wq2])\n" + "vse8.v v12, (%[wp3])\n" + "vse8.v v13, (%[wq3])\n" + "vse8.v v16, (%[wp4])\n" + "vse8.v v17, (%[wq4])\n" + "vse8.v v20, (%[wp5])\n" + "vse8.v v21, (%[wq5])\n" + "vse8.v v24, (%[wp6])\n" + "vse8.v v25, (%[wq6])\n" + "vse8.v v28, (%[wp7])\n" + "vse8.v v29, (%[wq7])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]), + [wp2]"r"(&p[d + nsize * 2]), + [wq2]"r"(&q[d + nsize * 2]), + [wp3]"r"(&p[d + nsize * 3]), + [wq3]"r"(&q[d + nsize * 3]), + [wp4]"r"(&p[d + nsize * 4]), + [wq4]"r"(&q[d + nsize * 4]), + [wp5]"r"(&p[d + nsize * 5]), + [wq5]"r"(&q[d + nsize * 5]), + [wp6]"r"(&p[d + nsize * 6]), + [wq6]"r"(&q[d + nsize * 6]), + [wp7]"r"(&p[d + nsize * 7]), + [wq7]"r"(&q[d + nsize * 7]) + ); + } +} + +static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14 + * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15 + * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16 + * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17 + */ + for (d = 0; d < bytes; d += nsize * 8) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + "vle8.v v8, (%[wp2])\n" + "vmv.v.v v9, v8\n" + "vle8.v v12, (%[wp3])\n" + "vmv.v.v v13, v12\n" + "vle8.v v16, (%[wp4])\n" + "vmv.v.v v17, v16\n" + "vle8.v v20, (%[wp5])\n" + "vmv.v.v v21, v20\n" + "vle8.v v24, (%[wp6])\n" + "vmv.v.v v25, v24\n" + "vle8.v v28, (%[wp7])\n" + "vmv.v.v v29, v28\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]), + [wp2]"r"(&dptr[z0][d + 2 * nsize]), + [wp3]"r"(&dptr[z0][d + 3 * nsize]), + [wp4]"r"(&dptr[z0][d + 4 * nsize]), + [wp5]"r"(&dptr[z0][d + 5 * nsize]), + [wp6]"r"(&dptr[z0][d + 6 * nsize]), + [wp7]"r"(&dptr[z0][d + 7 * nsize]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + + "vsra.vi v18, v17, 7\n" + "vsll.vi v19, v17, 1\n" + "vand.vx v18, v18, %[x1d]\n" + "vxor.vv v19, v19, v18\n" + "vle8.v v18, (%[wd4])\n" + "vxor.vv v17, v19, v18\n" + "vxor.vv v16, v16, v18\n" + + "vsra.vi v22, v21, 7\n" + "vsll.vi v23, v21, 1\n" + "vand.vx v22, v22, %[x1d]\n" + "vxor.vv v23, v23, v22\n" + "vle8.v v22, (%[wd5])\n" + "vxor.vv v21, v23, v22\n" + "vxor.vv v20, v20, v22\n" + + "vsra.vi v26, v25, 7\n" + "vsll.vi v27, v25, 1\n" + "vand.vx v26, v26, %[x1d]\n" + "vxor.vv v27, v27, v26\n" + "vle8.v v26, (%[wd6])\n" + "vxor.vv v25, v27, v26\n" + "vxor.vv v24, v24, v26\n" + + "vsra.vi v30, v29, 7\n" + "vsll.vi v31, v29, 1\n" + "vand.vx v30, v30, %[x1d]\n" + "vxor.vv v31, v31, v30\n" + "vle8.v v30, (%[wd7])\n" + "vxor.vv v29, v31, v30\n" + "vxor.vv v28, v28, v30\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [wd2]"r"(&dptr[z][d + 2 * nsize]), + [wd3]"r"(&dptr[z][d + 3 * nsize]), + [wd4]"r"(&dptr[z][d + 4 * nsize]), + [wd5]"r"(&dptr[z][d + 5 * nsize]), + [wd6]"r"(&dptr[z][d + 6 * nsize]), + [wd7]"r"(&dptr[z][d + 7 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v5, v7, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v9, v11, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v13, v15, v14\n" + + "vsra.vi v18, v17, 7\n" + "vsll.vi v19, v17, 1\n" + "vand.vx v18, v18, %[x1d]\n" + "vxor.vv v17, v19, v18\n" + + "vsra.vi v22, v21, 7\n" + "vsll.vi v23, v21, 1\n" + "vand.vx v22, v22, %[x1d]\n" + "vxor.vv v21, v23, v22\n" + + "vsra.vi v26, v25, 7\n" + "vsll.vi v27, v25, 1\n" + "vand.vx v26, v26, %[x1d]\n" + "vxor.vv v25, v27, v26\n" + + "vsra.vi v30, v29, 7\n" + "vsll.vi v31, v29, 1\n" + "vand.vx v30, v30, %[x1d]\n" + "vxor.vv v29, v31, v30\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + * v4:wp1, v5:wq1, v6:p1, v7:q1 + * v8:wp2, v9:wq2, v10:p2, v11:q2 + * v12:wp3, v13:wq3, v14:p3, v15:q3 + * v16:wp4, v17:wq4, v18:p4, v19:q4 + * v20:wp5, v21:wq5, v22:p5, v23:q5 + * v24:wp6, v25:wq6, v26:p6, v27:q6 + * v28:wp7, v29:wq7, v30:p7, v31:q7 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + + "vle8.v v6, (%[wp1])\n" + "vle8.v v7, (%[wq1])\n" + "vxor.vv v6, v6, v4\n" + "vxor.vv v7, v7, v5\n" + "vse8.v v6, (%[wp1])\n" + "vse8.v v7, (%[wq1])\n" + + "vle8.v v10, (%[wp2])\n" + "vle8.v v11, (%[wq2])\n" + "vxor.vv v10, v10, v8\n" + "vxor.vv v11, v11, v9\n" + "vse8.v v10, (%[wp2])\n" + "vse8.v v11, (%[wq2])\n" + + "vle8.v v14, (%[wp3])\n" + "vle8.v v15, (%[wq3])\n" + "vxor.vv v14, v14, v12\n" + "vxor.vv v15, v15, v13\n" + "vse8.v v14, (%[wp3])\n" + "vse8.v v15, (%[wq3])\n" + + "vle8.v v18, (%[wp4])\n" + "vle8.v v19, (%[wq4])\n" + "vxor.vv v18, v18, v16\n" + "vxor.vv v19, v19, v17\n" + "vse8.v v18, (%[wp4])\n" + "vse8.v v19, (%[wq4])\n" + + "vle8.v v22, (%[wp5])\n" + "vle8.v v23, (%[wq5])\n" + "vxor.vv v22, v22, v20\n" + "vxor.vv v23, v23, v21\n" + "vse8.v v22, (%[wp5])\n" + "vse8.v v23, (%[wq5])\n" + + "vle8.v v26, (%[wp6])\n" + "vle8.v v27, (%[wq6])\n" + "vxor.vv v26, v26, v24\n" + "vxor.vv v27, v27, v25\n" + "vse8.v v26, (%[wp6])\n" + "vse8.v v27, (%[wq6])\n" + + "vle8.v v30, (%[wp7])\n" + "vle8.v v31, (%[wq7])\n" + "vxor.vv v30, v30, v28\n" + "vxor.vv v31, v31, v29\n" + "vse8.v v30, (%[wp7])\n" + "vse8.v v31, (%[wq7])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]), + [wp2]"r"(&p[d + nsize * 2]), + [wq2]"r"(&q[d + nsize * 2]), + [wp3]"r"(&p[d + nsize * 3]), + [wq3]"r"(&q[d + nsize * 3]), + [wp4]"r"(&p[d + nsize * 4]), + [wq4]"r"(&q[d + nsize * 4]), + [wp5]"r"(&p[d + nsize * 5]), + [wq5]"r"(&q[d + nsize * 5]), + [wp6]"r"(&p[d + nsize * 6]), + [wq6]"r"(&q[d + nsize * 6]), + [wp7]"r"(&p[d + nsize * 7]), + [wq7]"r"(&q[d + nsize * 7]) + ); + } +} + +RAID6_RVV_WRAPPER(1); +RAID6_RVV_WRAPPER(2); +RAID6_RVV_WRAPPER(4); +RAID6_RVV_WRAPPER(8); diff --git a/lib/raid6/rvv.h b/lib/raid6/rvv.h new file mode 100644 index 000000000000..6d0708a2c8a4 --- /dev/null +++ b/lib/raid6/rvv.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2024 Institute of Software, CAS. + * + * raid6/rvv.h + * + * Definitions for RISC-V RAID-6 code + */ + +#ifdef __KERNEL__ +#include <asm/vector.h> +#else +#define kernel_vector_begin() +#define kernel_vector_end() +#include <sys/auxv.h> +#include <asm/hwcap.h> +#define has_vector() (getauxval(AT_HWCAP) & COMPAT_HWCAP_ISA_V) +#endif + +#include <linux/raid/pq.h> + +static int rvv_has_vector(void) +{ + return has_vector(); +} + +#define RAID6_RVV_WRAPPER(_n) \ + static void raid6_rvv ## _n ## _gen_syndrome(int disks, \ + size_t bytes, void **ptrs) \ + { \ + void raid6_rvv ## _n ## _gen_syndrome_real(int d, \ + unsigned long b, void **p); \ + kernel_vector_begin(); \ + raid6_rvv ## _n ## _gen_syndrome_real(disks, \ + (unsigned long)bytes, ptrs); \ + kernel_vector_end(); \ + } \ + static void raid6_rvv ## _n ## _xor_syndrome(int disks, \ + int start, int stop, \ + size_t bytes, void **ptrs) \ + { \ + void raid6_rvv ## _n ## _xor_syndrome_real(int d, \ + int s1, int s2, \ + unsigned long b, void **p); \ + kernel_vector_begin(); \ + raid6_rvv ## _n ## _xor_syndrome_real(disks, \ + start, stop, (unsigned long)bytes, ptrs); \ + kernel_vector_end(); \ + } \ + struct raid6_calls const raid6_rvvx ## _n = { \ + raid6_rvv ## _n ## _gen_syndrome, \ + raid6_rvv ## _n ## _xor_syndrome, \ + rvv_has_vector, \ + "rvvx" #_n, \ + 0 \ + } diff --git a/lib/raid6/s390vx.uc b/lib/raid6/s390vx.uc new file mode 100644 index 000000000000..8aa53eb2f395 --- /dev/null +++ b/lib/raid6/s390vx.uc @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * raid6_vx$#.c + * + * $#-way unrolled RAID6 gen/xor functions for s390 + * based on the vector facility + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + * + * This file is postprocessed using unroll.awk. + */ + +#include <linux/cpufeature.h> +#include <linux/raid/pq.h> +#include <asm/fpu.h> + +#define NSIZE 16 + +static __always_inline void LOAD_CONST(void) +{ + fpu_vrepib(24, 0x07); + fpu_vrepib(25, 0x1d); +} + +/* + * The SHLBYTE() operation shifts each of the 16 bytes in + * vector register y left by 1 bit and stores the result in + * vector register x. + */ +#define SHLBYTE(x, y) fpu_vab(x, y, y) + +/* + * For each of the 16 bytes in the vector register y the MASK() + * operation returns 0xFF if the high bit of the byte is 1, + * or 0x00 if the high bit is 0. The result is stored in vector + * register x. + */ +#define MASK(x, y) fpu_vesravb(x, y, 24) + +#define AND(x, y, z) fpu_vn(x, y, z) +#define XOR(x, y, z) fpu_vx(x, y, z) +#define LOAD_DATA(x, ptr) fpu_vlm(x, x + $# - 1, ptr) +#define STORE_DATA(x, ptr) fpu_vstm(x, x + $# - 1, ptr) +#define COPY_VEC(x, y) fpu_vlr(x, y) + +static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + DECLARE_KERNEL_FPU_ONSTACK32(vxstate); + u8 **dptr, *p, *q; + int d, z, z0; + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + LOAD_CONST(); + + dptr = (u8 **) ptrs; + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + for (d = 0; d < bytes; d += $#*NSIZE) { + LOAD_DATA(0,&dptr[z0][d]); + COPY_VEC(8+$$,0+$$); + for (z = z0 - 1; z >= 0; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + LOAD_DATA(16,&dptr[z][d]); + XOR(0+$$,0+$$,16+$$); + XOR(8+$$,8+$$,16+$$); + } + STORE_DATA(0,&p[d]); + STORE_DATA(8,&q[d]); + } + kernel_fpu_end(&vxstate, KERNEL_VXR); +} + +static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + DECLARE_KERNEL_FPU_ONSTACK32(vxstate); + u8 **dptr, *p, *q; + int d, z, z0; + + dptr = (u8 **) ptrs; + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + LOAD_CONST(); + + for (d = 0; d < bytes; d += $#*NSIZE) { + /* P/Q data pages */ + LOAD_DATA(0,&dptr[z0][d]); + COPY_VEC(8+$$,0+$$); + for (z = z0 - 1; z >= start; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + LOAD_DATA(16,&dptr[z][d]); + XOR(0+$$,0+$$,16+$$); + XOR(8+$$,8+$$,16+$$); + } + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + } + LOAD_DATA(16,&p[d]); + XOR(16+$$,16+$$,0+$$); + STORE_DATA(16,&p[d]); + LOAD_DATA(16,&q[d]); + XOR(16+$$,16+$$,8+$$); + STORE_DATA(16,&q[d]); + } + kernel_fpu_end(&vxstate, KERNEL_VXR); +} + +static int raid6_s390vx$#_valid(void) +{ + return cpu_has_vx(); +} + +const struct raid6_calls raid6_s390vx$# = { + raid6_s390vx$#_gen_syndrome, + raid6_s390vx$#_xor_syndrome, + raid6_s390vx$#_valid, + "vx128x$#", + 1 +}; diff --git a/lib/raid6/sse1.c b/lib/raid6/sse1.c index f76297139445..692fa3a93bf0 100644 --- a/lib/raid6/sse1.c +++ b/lib/raid6/sse1.c @@ -1,13 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright 2002 H. Peter Anvin - All Rights Reserved * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * * ----------------------------------------------------------------------- */ /* @@ -92,6 +87,7 @@ static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_sse1x1 = { raid6_sse11_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_sse1_or_mmxext, "sse1x1", 1 /* Has cache hints */ @@ -154,6 +150,7 @@ static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_sse1x2 = { raid6_sse12_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_sse1_or_mmxext, "sse1x2", 1 /* Has cache hints */ diff --git a/lib/raid6/sse2.c b/lib/raid6/sse2.c index 85b82c85f28e..2930220249c9 100644 --- a/lib/raid6/sse2.c +++ b/lib/raid6/sse2.c @@ -1,13 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright 2002 H. Peter Anvin - All Rights Reserved * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * * ----------------------------------------------------------------------- */ /* @@ -88,8 +83,58 @@ static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } + +static void raid6_sse21_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); + asm volatile("pxor %xmm4,%xmm2"); + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm5,%xmm4"); + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + } + asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("movdqa %%xmm4,%0" : "=m" (q[d])); + asm volatile("movdqa %%xmm2,%0" : "=m" (p[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + const struct raid6_calls raid6_sse2x1 = { raid6_sse21_gen_syndrome, + raid6_sse21_xor_syndrome, raid6_have_sse2, "sse2x1", 1 /* Has cache hints */ @@ -150,8 +195,76 @@ static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } +static void raid6_sse22_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + + for ( d = 0 ; d < bytes ; d += 32 ) { + asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16])); + asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); + asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16])); + asm volatile("pxor %xmm4,%xmm2"); + asm volatile("pxor %xmm6,%xmm3"); + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + } + asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); + asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("movdqa %%xmm4,%0" : "=m" (q[d])); + asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16])); + asm volatile("movdqa %%xmm2,%0" : "=m" (p[d])); + asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + const struct raid6_calls raid6_sse2x2 = { raid6_sse22_gen_syndrome, + raid6_sse22_xor_syndrome, raid6_have_sse2, "sse2x2", 1 /* Has cache hints */ @@ -248,8 +361,117 @@ static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } +static void raid6_sse24_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); + + for ( d = 0 ; d < bytes ; d += 64 ) { + asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16])); + asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32])); + asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48])); + asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); + asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16])); + asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32])); + asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48])); + asm volatile("pxor %xmm4,%xmm2"); + asm volatile("pxor %xmm6,%xmm3"); + asm volatile("pxor %xmm12,%xmm10"); + asm volatile("pxor %xmm14,%xmm11"); + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); + asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pxor %xmm13,%xmm13"); + asm volatile("pxor %xmm15,%xmm15"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("pcmpgtb %xmm12,%xmm13"); + asm volatile("pcmpgtb %xmm14,%xmm15"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("paddb %xmm12,%xmm12"); + asm volatile("paddb %xmm14,%xmm14"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pand %xmm0,%xmm13"); + asm volatile("pand %xmm0,%xmm15"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); + asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); + asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm13,%xmm10"); + asm volatile("pxor %xmm15,%xmm11"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + } + asm volatile("prefetchnta %0" :: "m" (q[d])); + asm volatile("prefetchnta %0" :: "m" (q[d+32])); + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pxor %xmm13,%xmm13"); + asm volatile("pxor %xmm15,%xmm15"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("pcmpgtb %xmm12,%xmm13"); + asm volatile("pcmpgtb %xmm14,%xmm15"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("paddb %xmm12,%xmm12"); + asm volatile("paddb %xmm14,%xmm14"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pand %xmm0,%xmm13"); + asm volatile("pand %xmm0,%xmm15"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + } + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); + asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); + asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); + asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); + asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16])); + asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32])); + asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48])); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); + asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); + asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + + const struct raid6_calls raid6_sse2x4 = { raid6_sse24_gen_syndrome, + raid6_sse24_xor_syndrome, raid6_have_sse2, "sse2x4", 1 /* Has cache hints */ diff --git a/lib/raid6/test/.gitignore b/lib/raid6/test/.gitignore new file mode 100644 index 000000000000..1b68a77f348f --- /dev/null +++ b/lib/raid6/test/.gitignore @@ -0,0 +1,3 @@ +/int.uc +/neon.uc +/raid6test diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 087332dbf8aa..09bbe2b14cce 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -1,16 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0 # # This is a simple Makefile to test some of the RAID-6 code # from userspace. # -CC = gcc -OPTFLAGS = -O2 # Adjust as desired -CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) -LD = ld -AWK = awk -f -AR = ar -RANLIB = ranlib -OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o +pound := \# + +# Adjust as desired +CC = gcc +OPTFLAGS = -O2 +CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) +LD = ld +AWK = awk -f +AR = ar +RANLIB = ranlib +OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/) ifeq ($(ARCH),i386) @@ -22,18 +26,51 @@ ifeq ($(ARCH),x86_64) IS_X86 = yes endif +ifeq ($(ARCH),arm) + CFLAGS += -I../../../arch/arm/include -mfpu=neon + HAS_NEON = yes +endif +ifeq ($(ARCH),aarch64) + CFLAGS += -I../../../arch/arm64/include + HAS_NEON = yes +endif + +ifeq ($(findstring riscv,$(ARCH)),riscv) + CFLAGS += -I../../../arch/riscv/include -DCONFIG_RISCV=1 + HAS_RVV = yes +endif + +ifeq ($(findstring ppc,$(ARCH)),ppc) + CFLAGS += -I../../../arch/powerpc/include + HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\ + gcc -c -x c - >/dev/null && rm ./-.o && echo yes) +endif + +ifeq ($(ARCH),loongarch64) + CFLAGS += -I../../../arch/loongarch/include -DCONFIG_LOONGARCH=1 + CFLAGS += $(shell echo 'vld $$vr0, $$zero, 0' | \ + gcc -c -x assembler - >/dev/null 2>&1 && \ + rm ./-.o && echo -DCONFIG_CPU_HAS_LSX=1) + CFLAGS += $(shell echo 'xvld $$xr0, $$zero, 0' | \ + gcc -c -x assembler - >/dev/null 2>&1 && \ + rm ./-.o && echo -DCONFIG_CPU_HAS_LASX=1) +endif + ifeq ($(IS_X86),yes) - OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o - CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ - gcc -c -x assembler - >&/dev/null && \ - rm ./-.o && echo -DCONFIG_AS_AVX2=1) -else - HAS_ALTIVEC := $(shell echo -e '\#include <altivec.h>\nvector int a;' |\ - gcc -c -x c - >&/dev/null && \ - rm ./-.o && echo yes) - ifeq ($(HAS_ALTIVEC),yes) - OBJS += altivec1.o altivec2.o altivec4.o altivec8.o - endif + OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o + CFLAGS += -DCONFIG_X86 +else ifeq ($(HAS_NEON),yes) + OBJS += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o + CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 +else ifeq ($(HAS_ALTIVEC),yes) + CFLAGS += -DCONFIG_ALTIVEC + OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \ + vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o +else ifeq ($(ARCH),loongarch64) + OBJS += loongarch_simd.o recov_loongarch_simd.o +else ifeq ($(HAS_RVV),yes) + OBJS += rvv.o recov_rvv.o + CFLAGS += -DCONFIG_RISCV_ISA_V=1 endif .c.o: @@ -45,16 +82,28 @@ endif %.uc: ../%.uc cp -f $< $@ -all: raid6.a raid6test +all: raid6.a raid6test raid6.a: $(OBJS) - rm -f $@ - $(AR) cq $@ $^ - $(RANLIB) $@ + rm -f $@ + $(AR) cq $@ $^ + $(RANLIB) $@ raid6test: test.c raid6.a $(CC) $(CFLAGS) -o raid6test $^ +neon1.c: neon.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=1 < neon.uc > $@ + +neon2.c: neon.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=2 < neon.uc > $@ + +neon4.c: neon.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=4 < neon.uc > $@ + +neon8.c: neon.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=8 < neon.uc > $@ + altivec1.c: altivec.uc ../unroll.awk $(AWK) ../unroll.awk -vN=1 < altivec.uc > $@ @@ -67,6 +116,18 @@ altivec4.c: altivec.uc ../unroll.awk altivec8.c: altivec.uc ../unroll.awk $(AWK) ../unroll.awk -vN=8 < altivec.uc > $@ +vpermxor1.c: vpermxor.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=1 < vpermxor.uc > $@ + +vpermxor2.c: vpermxor.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=2 < vpermxor.uc > $@ + +vpermxor4.c: vpermxor.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=4 < vpermxor.uc > $@ + +vpermxor8.c: vpermxor.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=8 < vpermxor.uc > $@ + int1.c: int.uc ../unroll.awk $(AWK) ../unroll.awk -vN=1 < int.uc > $@ @@ -89,7 +150,7 @@ tables.c: mktables ./mktables > tables.c clean: - rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c tables.c raid6test + rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c vpermxor*.c neon*.c tables.c raid6test spotless: clean rm -f *~ diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c index 5a485b7a7d3c..841a55242aba 100644 --- a/lib/raid6/test/test.c +++ b/lib/raid6/test/test.c @@ -1,11 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2 or (at your - * option) any later version; incorporated herein by reference. - * * ----------------------------------------------------------------------- */ /* @@ -21,18 +18,18 @@ #define NDISKS 16 /* Including P and Q */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -struct raid6_calls raid6_call; +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); char *dataptrs[NDISKS]; -char data[NDISKS][PAGE_SIZE]; -char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; +char data[NDISKS][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); +char recovi[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); +char recovj[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); -static void makedata(void) +static void makedata(int start, int stop) { int i, j; - for (i = 0; i < NDISKS; i++) { + for (i = start; i <= stop; i++) { for (j = 0; j < PAGE_SIZE; j++) data[i][j] = rand(); @@ -91,34 +88,55 @@ int main(int argc, char *argv[]) { const struct raid6_calls *const *algo; const struct raid6_recov_calls *const *ra; - int i, j; + int i, j, p1, p2; int err = 0; - makedata(); + makedata(0, NDISKS-1); for (ra = raid6_recov_algos; *ra; ra++) { if ((*ra)->valid && !(*ra)->valid()) continue; + raid6_2data_recov = (*ra)->data2; raid6_datap_recov = (*ra)->datap; printf("using recovery %s\n", (*ra)->name); for (algo = raid6_algos; *algo; algo++) { - if (!(*algo)->valid || (*algo)->valid()) { - raid6_call = **algo; + if ((*algo)->valid && !(*algo)->valid()) + continue; + + raid6_call = **algo; + + /* Nuke syndromes */ + memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); + + /* Generate assumed good syndrome */ + raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, + (void **)&dataptrs); + + for (i = 0; i < NDISKS-1; i++) + for (j = i+1; j < NDISKS; j++) + err += test_disks(i, j); + + if (!raid6_call.xor_syndrome) + continue; + + for (p1 = 0; p1 < NDISKS-2; p1++) + for (p2 = p1; p2 < NDISKS-2; p2++) { - /* Nuke syndromes */ - memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); + /* Simulate rmw run */ + raid6_call.xor_syndrome(NDISKS, p1, p2, PAGE_SIZE, + (void **)&dataptrs); + makedata(p1, p2); + raid6_call.xor_syndrome(NDISKS, p1, p2, PAGE_SIZE, + (void **)&dataptrs); - /* Generate assumed good syndrome */ - raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, - (void **)&dataptrs); + for (i = 0; i < NDISKS-1; i++) + for (j = i+1; j < NDISKS; j++) + err += test_disks(i, j); + } - for (i = 0; i < NDISKS-1; i++) - for (j = i+1; j < NDISKS; j++) - err += test_disks(i, j); - } } printf("\n"); } diff --git a/lib/raid6/unroll.awk b/lib/raid6/unroll.awk index c6aa03631df8..0809805a7e23 100644 --- a/lib/raid6/unroll.awk +++ b/lib/raid6/unroll.awk @@ -13,7 +13,7 @@ BEGIN { for (i = 0; i < rep; ++i) { tmp = $0 gsub(/\$\$/, i, tmp) - gsub(/\$\#/, n, tmp) + gsub(/\$#/, n, tmp) gsub(/\$\*/, "$", tmp) print tmp } diff --git a/lib/raid6/vpermxor.uc b/lib/raid6/vpermxor.uc new file mode 100644 index 000000000000..1bfb127fbfe8 --- /dev/null +++ b/lib/raid6/vpermxor.uc @@ -0,0 +1,105 @@ +/* + * Copyright 2017, Matt Brown, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * vpermxor$#.c + * + * Based on H. Peter Anvin's paper - The mathematics of RAID-6 + * + * $#-way unrolled portable integer math RAID-6 instruction set + * This file is postprocessed using unroll.awk + * + * vpermxor$#.c makes use of the vpermxor instruction to optimise the RAID6 Q + * syndrome calculations. + * This can be run on systems which have both Altivec and vpermxor instruction. + * + * This instruction was introduced in POWER8 - ISA v2.07. + */ + +#include <linux/raid/pq.h> +#ifdef CONFIG_ALTIVEC + +#include <altivec.h> +#include <asm/ppc-opcode.h> +#ifdef __KERNEL__ +#include <asm/cputable.h> +#include <asm/switch_to.h> +#endif + +typedef vector unsigned char unative_t; +#define NSIZE sizeof(unative_t) + +static const vector unsigned char gf_low = {0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, + 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, + 0x06, 0x04, 0x02,0x00}; +static const vector unsigned char gf_high = {0xfd, 0xdd, 0xbd, 0x9d, 0x7d, 0x5d, + 0x3d, 0x1d, 0xe0, 0xc0, 0xa0, 0x80, + 0x60, 0x40, 0x20, 0x00}; + +static void noinline raid6_vpermxor$#_gen_syndrome_real(int disks, size_t bytes, + void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + unative_t wp$$, wq$$, wd$$; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for (d = 0; d < bytes; d += NSIZE*$#) { + wp$$ = wq$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; + + for (z = z0-1; z>=0; z--) { + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + /* P syndrome */ + wp$$ = vec_xor(wp$$, wd$$); + + /* Q syndrome */ + asm(VPERMXOR(%0,%1,%2,%3):"=v"(wq$$):"v"(gf_high), "v"(gf_low), "v"(wq$$)); + wq$$ = vec_xor(wq$$, wd$$); + } + *(unative_t *)&p[d+NSIZE*$$] = wp$$; + *(unative_t *)&q[d+NSIZE*$$] = wq$$; + } +} + +static void raid6_vpermxor$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + preempt_disable(); + enable_kernel_altivec(); + + raid6_vpermxor$#_gen_syndrome_real(disks, bytes, ptrs); + + disable_kernel_altivec(); + preempt_enable(); +} + +int raid6_have_altivec_vpermxor(void); +#if $# == 1 +int raid6_have_altivec_vpermxor(void) +{ + /* Check if arch has both altivec and the vpermxor instructions */ +# ifdef __KERNEL__ + return (cpu_has_feature(CPU_FTR_ALTIVEC_COMP) && + cpu_has_feature(CPU_FTR_ARCH_207S)); +# else + return 1; +#endif + +} +#endif + +const struct raid6_calls raid6_vpermxor$# = { + raid6_vpermxor$#_gen_syndrome, + NULL, + raid6_have_altivec_vpermxor, + "vpermxor$#", + 0 +}; +#endif diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h index b7595484a815..9a6ff37115e7 100644 --- a/lib/raid6/x86.h +++ b/lib/raid6/x86.h @@ -1,13 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* ----------------------------------------------------------------------- * * * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * * ----------------------------------------------------------------------- */ /* @@ -23,7 +18,7 @@ #ifdef __KERNEL__ /* Real code */ -#include <asm/i387.h> +#include <asm/fpu/api.h> #else /* Dummy code for user space testing */ @@ -46,6 +41,16 @@ static inline void kernel_fpu_end(void) #define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ #define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ (9*32+17) /* AVX-512 DQ (Double/Quad granular) + * Instructions + */ +#define X86_FEATURE_AVX512BW (9*32+30) /* AVX-512 BW (Byte/Word granular) + * Instructions + */ +#define X86_FEATURE_AVX512VL (9*32+31) /* AVX-512 VL (128/256 Vector Length) + * Extensions + */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ /* Should work well enough on modern CPUs for testing */ |
