Diffstat (limited to 'lib/raid6')
-rw-r--r--  lib/raid6/.gitignore               |    1
-rw-r--r--  lib/raid6/Makefile                 |  139
-rw-r--r--  lib/raid6/algos.c                  |  166
-rw-r--r--  lib/raid6/avx2.c                   |   20
-rw-r--r--  lib/raid6/avx512.c                 |   17
-rw-r--r--  lib/raid6/int.uc                   |    9
-rw-r--r--  lib/raid6/loongarch.h              |   38
-rw-r--r--  lib/raid6/loongarch_simd.c         |  422
-rw-r--r--  lib/raid6/mktables.c               |    9
-rw-r--r--  lib/raid6/mmx.c                    |    7
-rw-r--r--  lib/raid6/neon.c                   |   22
-rw-r--r--  lib/raid6/neon.h                   |   22
-rw-r--r--  lib/raid6/neon.uc                  |    6
-rw-r--r--  lib/raid6/recov.c                  |   14
-rw-r--r--  lib/raid6/recov_avx2.c             |   18
-rw-r--r--  lib/raid6/recov_avx512.c           |   19
-rw-r--r--  lib/raid6/recov_loongarch_simd.c   |  513
-rw-r--r--  lib/raid6/recov_neon.c             |   35
-rw-r--r--  lib/raid6/recov_neon_inner.c       |   26
-rw-r--r--  lib/raid6/recov_rvv.c              |  222
-rw-r--r--  lib/raid6/recov_s390xc.c           |    7
-rw-r--r--  lib/raid6/recov_ssse3.c            |   18
-rw-r--r--  lib/raid6/rvv.c                    | 1228
-rw-r--r--  lib/raid6/rvv.h                    |   56
-rw-r--r--  lib/raid6/s390vx.uc                |   66
-rw-r--r--  lib/raid6/sse1.c                   |    7
-rw-r--r--  lib/raid6/sse2.c                   |    7
-rw-r--r--  lib/raid6/test/.gitignore          |    3
-rw-r--r--  lib/raid6/test/Makefile            |   76
-rw-r--r--  lib/raid6/test/test.c              |    6
-rw-r--r--  lib/raid6/unroll.awk               |    2
-rw-r--r--  lib/raid6/vpermxor.uc              |    2
-rw-r--r--  lib/raid6/x86.h                    |    7
33 files changed, 2755 insertions, 455 deletions
diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore
index 3de0d8921286..6be57745afd1 100644
--- a/lib/raid6/.gitignore
+++ b/lib/raid6/.gitignore
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
mktables
altivec*.c
int*.c
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 4e90d443d1b0..5be0a4e60ab1 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -2,31 +2,31 @@
obj-$(CONFIG_RAID6_PQ) += raid6_pq.o
raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \
- int8.o int16.o int32.o
+ int8.o
raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
+raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o
+raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o
-hostprogs-y += mktables
-
-quiet_cmd_unroll = UNROLL $@
- cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$(UNROLL) < $< > $@
+hostprogs += mktables
ifeq ($(CONFIG_ALTIVEC),y)
altivec_flags := -maltivec $(call cc-option,-mabi=altivec)
+# Enable <altivec.h>
+altivec_flags += -isystem $(shell $(CC) -print-file-name=include)
ifdef CONFIG_CC_IS_CLANG
# clang ppc port does not yet support -maltivec when -msoft-float is
# enabled. A future release of clang will resolve this
-# https://bugs.llvm.org/show_bug.cgi?id=31177
+# https://llvm.org/pr31177
CFLAGS_REMOVE_altivec1.o += -msoft-float
CFLAGS_REMOVE_altivec2.o += -msoft-float
CFLAGS_REMOVE_altivec4.o += -msoft-float
CFLAGS_REMOVE_altivec8.o += -msoft-float
-CFLAGS_REMOVE_altivec8.o += -msoft-float
CFLAGS_REMOVE_vpermxor1.o += -msoft-float
CFLAGS_REMOVE_vpermxor2.o += -msoft-float
CFLAGS_REMOVE_vpermxor4.o += -msoft-float
@@ -34,128 +34,45 @@ CFLAGS_REMOVE_vpermxor8.o += -msoft-float
endif
endif
-# The GCC option -ffreestanding is required in order to compile code containing
-# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
-ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
-NEON_FLAGS := -ffreestanding
-ifeq ($(ARCH),arm)
-NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon
-endif
-CFLAGS_recov_neon_inner.o += $(NEON_FLAGS)
-ifeq ($(ARCH),arm64)
-CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only
-endif
-endif
-
-targets += int1.c
-$(obj)/int1.c: UNROLL := 1
-$(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE
- $(call if_changed,unroll)
-
-targets += int2.c
-$(obj)/int2.c: UNROLL := 2
-$(obj)/int2.c: $(src)/int.uc $(src)/unroll.awk FORCE
- $(call if_changed,unroll)
-
-targets += int4.c
-$(obj)/int4.c: UNROLL := 4
-$(obj)/int4.c: $(src)/int.uc $(src)/unroll.awk FORCE
- $(call if_changed,unroll)
-
-targets += int8.c
-$(obj)/int8.c: UNROLL := 8
-$(obj)/int8.c: $(src)/int.uc $(src)/unroll.awk FORCE
- $(call if_changed,unroll)
-
-targets += int16.c
-$(obj)/int16.c: UNROLL := 16
-$(obj)/int16.c: $(src)/int.uc $(src)/unroll.awk FORCE
- $(call if_changed,unroll)
+quiet_cmd_unroll = UNROLL $@
+ cmd_unroll = $(AWK) -v N=$* -f $(src)/unroll.awk < $< > $@
-targets += int32.c
-$(obj)/int32.c: UNROLL := 32
-$(obj)/int32.c: $(src)/int.uc $(src)/unroll.awk FORCE
+targets += int1.c int2.c int4.c int8.c
+$(obj)/int%.c: $(src)/int.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
CFLAGS_altivec1.o += $(altivec_flags)
-targets += altivec1.c
-$(obj)/altivec1.c: UNROLL := 1
-$(obj)/altivec1.c: $(src)/altivec.uc $(src)/unroll.awk FORCE
- $(call if_changed,unroll)
-
CFLAGS_altivec2.o += $(altivec_flags)
-targets += altivec2.c
-$(obj)/altivec2.c: UNROLL := 2
-$(obj)/altivec2.c: $(src)/altivec.uc $(src)/unroll.awk FORCE
- $(call if_changed,unroll)
-
CFLAGS_altivec4.o += $(altivec_flags)
-targets += altivec4.c
-$(obj)/altivec4.c: UNROLL := 4
-$(obj)/altivec4.c: $(src)/altivec.uc $(src)/unroll.awk FORCE
- $(call if_changed,unroll)
-
CFLAGS_altivec8.o += $(altivec_flags)
-targets += altivec8.c
-$(obj)/altivec8.c: UNROLL := 8
-$(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE
+targets += altivec1.c altivec2.c altivec4.c altivec8.c
+$(obj)/altivec%.c: $(src)/altivec.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
CFLAGS_vpermxor1.o += $(altivec_flags)
-targets += vpermxor1.c
-$(obj)/vpermxor1.c: UNROLL := 1
-$(obj)/vpermxor1.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE
- $(call if_changed,unroll)
-
CFLAGS_vpermxor2.o += $(altivec_flags)
-targets += vpermxor2.c
-$(obj)/vpermxor2.c: UNROLL := 2
-$(obj)/vpermxor2.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE
- $(call if_changed,unroll)
-
CFLAGS_vpermxor4.o += $(altivec_flags)
-targets += vpermxor4.c
-$(obj)/vpermxor4.c: UNROLL := 4
-$(obj)/vpermxor4.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE
- $(call if_changed,unroll)
-
CFLAGS_vpermxor8.o += $(altivec_flags)
-targets += vpermxor8.c
-$(obj)/vpermxor8.c: UNROLL := 8
-$(obj)/vpermxor8.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE
- $(call if_changed,unroll)
-
-CFLAGS_neon1.o += $(NEON_FLAGS)
-targets += neon1.c
-$(obj)/neon1.c: UNROLL := 1
-$(obj)/neon1.c: $(src)/neon.uc $(src)/unroll.awk FORCE
- $(call if_changed,unroll)
-
-CFLAGS_neon2.o += $(NEON_FLAGS)
-targets += neon2.c
-$(obj)/neon2.c: UNROLL := 2
-$(obj)/neon2.c: $(src)/neon.uc $(src)/unroll.awk FORCE
- $(call if_changed,unroll)
-
-CFLAGS_neon4.o += $(NEON_FLAGS)
-targets += neon4.c
-$(obj)/neon4.c: UNROLL := 4
-$(obj)/neon4.c: $(src)/neon.uc $(src)/unroll.awk FORCE
+targets += vpermxor1.c vpermxor2.c vpermxor4.c vpermxor8.c
+$(obj)/vpermxor%.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
-CFLAGS_neon8.o += $(NEON_FLAGS)
-targets += neon8.c
-$(obj)/neon8.c: UNROLL := 8
-$(obj)/neon8.c: $(src)/neon.uc $(src)/unroll.awk FORCE
+CFLAGS_neon1.o += $(CC_FLAGS_FPU)
+CFLAGS_neon2.o += $(CC_FLAGS_FPU)
+CFLAGS_neon4.o += $(CC_FLAGS_FPU)
+CFLAGS_neon8.o += $(CC_FLAGS_FPU)
+CFLAGS_recov_neon_inner.o += $(CC_FLAGS_FPU)
+CFLAGS_REMOVE_neon1.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_neon2.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_neon4.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_neon8.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_recov_neon_inner.o += $(CC_FLAGS_NO_FPU)
+targets += neon1.c neon2.c neon4.c neon8.c
+$(obj)/neon%.c: $(src)/neon.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
targets += s390vx8.c
-$(obj)/s390vx8.c: UNROLL := 8
-$(obj)/s390vx8.c: $(src)/s390vx.uc $(src)/unroll.awk FORCE
+$(obj)/s390vx%.c: $(src)/s390vx.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
quiet_cmd_mktable = TABLE $@
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 7e4f7a8ffa8e..799e0e5eac26 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002 H. Peter Anvin - All Rights Reserved
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 59 Temple Place Ste 330,
- * Boston MA 02111-1307, USA; either version 2 of the License, or
- * (at your option) any later version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
/*
@@ -23,11 +18,6 @@
#else
#include <linux/module.h>
#include <linux/gfp.h>
-#if !RAID6_USE_EMPTY_ZERO_PAGE
-/* In .bss so it's zeroed */
-const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
-EXPORT_SYMBOL(raid6_empty_zero_page);
-#endif
#endif
struct raid6_calls raid6_call;
@@ -35,14 +25,10 @@ EXPORT_SYMBOL_GPL(raid6_call);
const struct raid6_calls * const raid6_algos[] = {
#if defined(__i386__) && !defined(__arch_um__)
-#ifdef CONFIG_AS_AVX512
&raid6_avx512x2,
&raid6_avx512x1,
-#endif
-#ifdef CONFIG_AS_AVX2
&raid6_avx2x2,
&raid6_avx2x1,
-#endif
&raid6_sse2x2,
&raid6_sse2x1,
&raid6_sse1x2,
@@ -51,16 +37,12 @@ const struct raid6_calls * const raid6_algos[] = {
&raid6_mmxx1,
#endif
#if defined(__x86_64__) && !defined(__arch_um__)
-#ifdef CONFIG_AS_AVX512
&raid6_avx512x4,
&raid6_avx512x2,
&raid6_avx512x1,
-#endif
-#ifdef CONFIG_AS_AVX2
&raid6_avx2x4,
&raid6_avx2x2,
&raid6_avx2x1,
-#endif
&raid6_sse2x4,
&raid6_sse2x2,
&raid6_sse2x1,
@@ -84,9 +66,19 @@ const struct raid6_calls * const raid6_algos[] = {
&raid6_neonx2,
&raid6_neonx1,
#endif
-#if defined(__ia64__)
- &raid6_intx32,
- &raid6_intx16,
+#ifdef CONFIG_LOONGARCH
+#ifdef CONFIG_CPU_HAS_LASX
+ &raid6_lasx,
+#endif
+#ifdef CONFIG_CPU_HAS_LSX
+ &raid6_lsx,
+#endif
+#endif
+#ifdef CONFIG_RISCV_ISA_V
+ &raid6_rvvx1,
+ &raid6_rvvx2,
+ &raid6_rvvx4,
+ &raid6_rvvx8,
#endif
&raid6_intx8,
&raid6_intx4,
@@ -102,13 +94,9 @@ void (*raid6_datap_recov)(int, size_t, int, void **);
EXPORT_SYMBOL_GPL(raid6_datap_recov);
const struct raid6_recov_calls *const raid6_recov_algos[] = {
-#ifdef CONFIG_AS_AVX512
+#ifdef CONFIG_X86
&raid6_recov_avx512,
-#endif
-#ifdef CONFIG_AS_AVX2
&raid6_recov_avx2,
-#endif
-#ifdef CONFIG_AS_SSSE3
&raid6_recov_ssse3,
#endif
#ifdef CONFIG_S390
@@ -117,6 +105,17 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
#if defined(CONFIG_KERNEL_MODE_NEON)
&raid6_recov_neon,
#endif
+#ifdef CONFIG_LOONGARCH
+#ifdef CONFIG_CPU_HAS_LASX
+ &raid6_recov_lasx,
+#endif
+#ifdef CONFIG_CPU_HAS_LSX
+ &raid6_recov_lsx,
+#endif
+#endif
+#ifdef CONFIG_RISCV_ISA_V
+ &raid6_recov_rvv,
+#endif
&raid6_recov_intx1,
NULL
};
@@ -129,6 +128,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
#define time_before(x, y) ((x) < (y))
#endif
+#define RAID6_TEST_DISKS 8
+#define RAID6_TEST_DISKS_ORDER 3
+
static inline const struct raid6_recov_calls *raid6_choose_recov(void)
{
const struct raid6_recov_calls *const *algo;
@@ -151,15 +153,15 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void)
}
static inline const struct raid6_calls *raid6_choose_gen(
- void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks)
+ void *(*const dptrs)[RAID6_TEST_DISKS], const int disks)
{
- unsigned long perf, bestgenperf, bestxorperf, j0, j1;
+ unsigned long perf, bestgenperf, j0, j1;
int start = (disks>>1)-1, stop = disks-3; /* work on the second half of the disks */
const struct raid6_calls *const *algo;
const struct raid6_calls *best;
- for (bestgenperf = 0, bestxorperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) {
- if (!best || (*algo)->prefer >= best->prefer) {
+ for (bestgenperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) {
+ if (!best || (*algo)->priority >= best->priority) {
if ((*algo)->valid && !(*algo)->valid())
continue;
@@ -186,44 +188,50 @@ static inline const struct raid6_calls *raid6_choose_gen(
best = *algo;
}
pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name,
- (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
-
- if (!(*algo)->xor_syndrome)
- continue;
+ (perf * HZ * (disks-2)) >>
+ (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2));
+ }
+ }
- perf = 0;
+ if (!best) {
+ pr_err("raid6: Yikes! No algorithm found!\n");
+ goto out;
+ }
- preempt_disable();
- j0 = jiffies;
- while ((j1 = jiffies) == j0)
- cpu_relax();
- while (time_before(jiffies,
- j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
- (*algo)->xor_syndrome(disks, start, stop,
- PAGE_SIZE, *dptrs);
- perf++;
- }
- preempt_enable();
+ raid6_call = *best;
- if (best == *algo)
- bestxorperf = perf;
+ if (!IS_ENABLED(CONFIG_RAID6_PQ_BENCHMARK)) {
+ pr_info("raid6: skipped pq benchmark and selected %s\n",
+ best->name);
+ goto out;
+ }
- pr_info("raid6: %-8s xor() %5ld MB/s\n", (*algo)->name,
- (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2+1));
+ pr_info("raid6: using algorithm %s gen() %ld MB/s\n",
+ best->name,
+ (bestgenperf * HZ * (disks - 2)) >>
+ (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2));
+
+ if (best->xor_syndrome) {
+ perf = 0;
+
+ preempt_disable();
+ j0 = jiffies;
+ while ((j1 = jiffies) == j0)
+ cpu_relax();
+ while (time_before(jiffies,
+ j1 + (1 << RAID6_TIME_JIFFIES_LG2))) {
+ best->xor_syndrome(disks, start, stop,
+ PAGE_SIZE, *dptrs);
+ perf++;
}
- }
+ preempt_enable();
- if (best) {
- pr_info("raid6: using algorithm %s gen() %ld MB/s\n",
- best->name,
- (bestgenperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
- if (best->xor_syndrome)
- pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n",
- (bestxorperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2+1));
- raid6_call = *best;
- } else
- pr_err("raid6: Yikes! No algorithm found!\n");
+ pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n",
+ (perf * HZ * (disks - 2)) >>
+ (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2 + 1));
+ }
+out:
return best;
}
@@ -233,27 +241,33 @@ static inline const struct raid6_calls *raid6_choose_gen(
int __init raid6_select_algo(void)
{
- const int disks = (65536/PAGE_SIZE)+2;
+ const int disks = RAID6_TEST_DISKS;
const struct raid6_calls *gen_best;
const struct raid6_recov_calls *rec_best;
- char *syndromes;
- void *dptrs[(65536/PAGE_SIZE)+2];
- int i;
+ char *disk_ptr, *p;
+ void *dptrs[RAID6_TEST_DISKS];
+ int i, cycle;
- for (i = 0; i < disks-2; i++)
- dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i;
-
- /* Normal code - use a 2-page allocation to avoid D$ conflict */
- syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
-
- if (!syndromes) {
+ /* prepare the buffer and fill it circularly with gfmul table */
+ disk_ptr = (char *)__get_free_pages(GFP_KERNEL, RAID6_TEST_DISKS_ORDER);
+ if (!disk_ptr) {
pr_err("raid6: Yikes! No memory available.\n");
return -ENOMEM;
}
- dptrs[disks-2] = syndromes;
- dptrs[disks-1] = syndromes + PAGE_SIZE;
+ p = disk_ptr;
+ for (i = 0; i < disks; i++)
+ dptrs[i] = p + PAGE_SIZE * i;
+
+ cycle = ((disks - 2) * PAGE_SIZE) / 65536;
+ for (i = 0; i < cycle; i++) {
+ memcpy(p, raid6_gfmul, 65536);
+ p += 65536;
+ }
+
+ if ((disks - 2) * PAGE_SIZE % 65536)
+ memcpy(p, raid6_gfmul, (disks - 2) * PAGE_SIZE % 65536);
/* select raid gen_syndrome function */
gen_best = raid6_choose_gen(&dptrs, disks);
@@ -261,7 +275,7 @@ int __init raid6_select_algo(void)
/* select raid recover functions */
rec_best = raid6_choose_recov();
- free_pages((unsigned long)syndromes, 1);
+ free_pages((unsigned long)disk_ptr, RAID6_TEST_DISKS_ORDER);
return gen_best && rec_best ? 0 : -EINVAL;
}
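
For reference (not part of the patch): the MB/s figures printed above follow from the benchmark loop because each gen_syndrome() call now touches (disks - 2) data pages and the loop runs for (1 << RAID6_TIME_JIFFIES_LG2) jiffies, i.e. that many 1/HZ-second ticks. A minimal sketch of the conversion, with raid6_perf_to_mbs() as a hypothetical helper (the patch computes the value inline):

static unsigned long raid6_perf_to_mbs(unsigned long perf, int disks)
{
	/*
	 * bytes/sec = perf * (disks - 2) * PAGE_SIZE * HZ
	 *                  / (1 << RAID6_TIME_JIFFIES_LG2)
	 * MB/sec    = bytes/sec >> 20
	 * Folding PAGE_SIZE == 1 << PAGE_SHIFT into the shift gives the
	 * expression used in raid6_choose_gen() above.
	 */
	return (perf * HZ * (disks - 2)) >>
	       (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2);
}
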
diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c
index 20bca3d44f67..059024234dce 100644
--- a/lib/raid6/avx2.c
+++ b/lib/raid6/avx2.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 2012 Intel Corporation
@@ -5,13 +6,6 @@
*
* Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
*
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 59 Temple Place Ste 330,
- * Boston MA 02111-1307, USA; either version 2 of the License, or
- * (at your option) any later version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
/*
@@ -19,8 +13,6 @@
*
*/
-#ifdef CONFIG_AS_AVX2
-
#include <linux/raid/pq.h>
#include "x86.h"
@@ -140,7 +132,7 @@ const struct raid6_calls raid6_avx2x1 = {
raid6_avx21_xor_syndrome,
raid6_have_avx2,
"avx2x1",
- 1 /* Has cache hints */
+ .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */
};
/*
@@ -270,7 +262,7 @@ const struct raid6_calls raid6_avx2x2 = {
raid6_avx22_xor_syndrome,
raid6_have_avx2,
"avx2x2",
- 1 /* Has cache hints */
+ .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */
};
#ifdef CONFIG_X86_64
@@ -473,8 +465,6 @@ const struct raid6_calls raid6_avx2x4 = {
raid6_avx24_xor_syndrome,
raid6_have_avx2,
"avx2x4",
- 1 /* Has cache hints */
+ .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */
};
-#endif
-
-#endif /* CONFIG_AS_AVX2 */
+#endif /* CONFIG_X86_64 */
diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c
index 46df7977b971..009bd0adeebf 100644
--- a/lib/raid6/avx512.c
+++ b/lib/raid6/avx512.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- --------------------------------------------------------
*
* Copyright (C) 2016 Intel Corporation
@@ -8,12 +9,6 @@
* Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
* Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 59 Temple Place Ste 330,
- * Boston MA 02111-1307, USA; either version 2 of the License, or
- * (at your option) any later version; incorporated herein by reference.
- *
* -----------------------------------------------------------------------
*/
@@ -22,8 +17,6 @@
*
*/
-#ifdef CONFIG_AS_AVX512
-
#include <linux/raid/pq.h>
#include "x86.h"
@@ -167,7 +160,7 @@ const struct raid6_calls raid6_avx512x1 = {
raid6_avx5121_xor_syndrome,
raid6_have_avx512,
"avx512x1",
- 1 /* Has cache hints */
+ .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */
};
/*
@@ -324,7 +317,7 @@ const struct raid6_calls raid6_avx512x2 = {
raid6_avx5122_xor_syndrome,
raid6_have_avx512,
"avx512x2",
- 1 /* Has cache hints */
+ .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */
};
#ifdef CONFIG_X86_64
@@ -562,8 +555,6 @@ const struct raid6_calls raid6_avx512x4 = {
raid6_avx5124_xor_syndrome,
raid6_have_avx512,
"avx512x4",
- 1 /* Has cache hints */
+ .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */
};
#endif
-
-#endif /* CONFIG_AS_AVX512 */
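
For context (a rough sketch, field comments paraphrased): the .priority designated initializers above replace the old positional "prefer" value in struct raid6_calls from <linux/raid/pq.h>, and raid6_choose_gen() only benchmarks an algorithm whose priority is not lower than that of the current best candidate.

struct raid6_calls {
	void (*gen_syndrome)(int, size_t, void **);
	void (*xor_syndrome)(int, int, int, size_t, void **);
	int  (*valid)(void);	/* returns 1 if this routine set is usable */
	const char *name;	/* name of this routine set */
	int priority;		/* relative priority ranking if non-zero */
};
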
diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc
index 558aeac9342a..1ba56c3fa482 100644
--- a/lib/raid6/int.uc
+++ b/lib/raid6/int.uc
@@ -42,13 +42,6 @@ typedef u32 unative_t;
/*
- * IA-64 wants insane amounts of unrolling. On other architectures that
- * is just a waste of space.
- */
-#if ($# <= 8) || defined(__ia64__)
-
-
-/*
* These sub-operations are separate inlines since they can sometimes be
* specially optimized using architecture-specific hacks.
*/
@@ -152,5 +145,3 @@ const struct raid6_calls raid6_intx$# = {
"int" NSTRING "x$#",
0
};
-
-#endif
diff --git a/lib/raid6/loongarch.h b/lib/raid6/loongarch.h
new file mode 100644
index 000000000000..acfc33ce7056
--- /dev/null
+++ b/lib/raid6/loongarch.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ *
+ * raid6/loongarch.h
+ *
+ * Definitions common to LoongArch RAID-6 code only
+ */
+
+#ifndef _LIB_RAID6_LOONGARCH_H
+#define _LIB_RAID6_LOONGARCH_H
+
+#ifdef __KERNEL__
+
+#include <asm/cpu-features.h>
+#include <asm/fpu.h>
+
+#else /* for user-space testing */
+
+#include <sys/auxv.h>
+
+/* have to supply these defines for glibc 2.37- and musl */
+#ifndef HWCAP_LOONGARCH_LSX
+#define HWCAP_LOONGARCH_LSX (1 << 4)
+#endif
+#ifndef HWCAP_LOONGARCH_LASX
+#define HWCAP_LOONGARCH_LASX (1 << 5)
+#endif
+
+#define kernel_fpu_begin()
+#define kernel_fpu_end()
+
+#define cpu_has_lsx (getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LSX)
+#define cpu_has_lasx (getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LASX)
+
+#endif /* __KERNEL__ */
+
+#endif /* _LIB_RAID6_LOONGARCH_H */
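
A minimal user-space sketch of how the fallback HWCAP definitions above are exercised by the raid6 test harness; the program below is illustrative only and not part of the patch.

#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP_LOONGARCH_LSX
#define HWCAP_LOONGARCH_LSX  (1 << 4)
#endif
#ifndef HWCAP_LOONGARCH_LASX
#define HWCAP_LOONGARCH_LASX (1 << 5)
#endif

int main(void)
{
	unsigned long hwcap = getauxval(AT_HWCAP);

	/* On non-LoongArch hosts these bits are simply never set */
	printf("LSX:  %s\n", (hwcap & HWCAP_LOONGARCH_LSX) ? "yes" : "no");
	printf("LASX: %s\n", (hwcap & HWCAP_LOONGARCH_LASX) ? "yes" : "no");
	return 0;
}
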
diff --git a/lib/raid6/loongarch_simd.c b/lib/raid6/loongarch_simd.c
new file mode 100644
index 000000000000..aa5d9f924ca3
--- /dev/null
+++ b/lib/raid6/loongarch_simd.c
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX)
+ *
+ * Copyright 2023 WANG Xuerui <git@xen0n.name>
+ *
+ * Based on the generic RAID-6 code (int.uc):
+ *
+ * Copyright 2002-2004 H. Peter Anvin
+ */
+
+#include <linux/raid/pq.h>
+#include "loongarch.h"
+
+/*
+ * The vector algorithms are currently priority 0, which means the generic
+ * scalar algorithms are not being disabled if vector support is present.
+ * This is like the similar LoongArch RAID5 XOR code, with the main reason
+ * repeated here: it cannot be ruled out at this point in time that some
+ * future (maybe reduced) models could run the vector algorithms slower than
+ * the scalar ones, maybe for errata or micro-op reasons. It may be
+ * appropriate to revisit this after one or two more uarch generations.
+ */
+
+#ifdef CONFIG_CPU_HAS_LSX
+#define NSIZE 16
+
+static int raid6_has_lsx(void)
+{
+ return cpu_has_lsx;
+}
+
+static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ /*
+ * $vr0, $vr1, $vr2, $vr3: wp
+ * $vr4, $vr5, $vr6, $vr7: wq
+ * $vr8, $vr9, $vr10, $vr11: wd
+ * $vr12, $vr13, $vr14, $vr15: w2
+ * $vr16, $vr17, $vr18, $vr19: w1
+ */
+ for (d = 0; d < bytes; d += NSIZE*4) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+ asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+ asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
+ asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
+ asm volatile("vori.b $vr4, $vr0, 0");
+ asm volatile("vori.b $vr5, $vr1, 0");
+ asm volatile("vori.b $vr6, $vr2, 0");
+ asm volatile("vori.b $vr7, $vr3, 0");
+ for (z = z0-1; z >= 0; z--) {
+ /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+ asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
+ asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
+ asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
+ asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
+ /* wp$$ ^= wd$$; */
+ asm volatile("vxor.v $vr0, $vr0, $vr8");
+ asm volatile("vxor.v $vr1, $vr1, $vr9");
+ asm volatile("vxor.v $vr2, $vr2, $vr10");
+ asm volatile("vxor.v $vr3, $vr3, $vr11");
+ /* w2$$ = MASK(wq$$); */
+ asm volatile("vslti.b $vr12, $vr4, 0");
+ asm volatile("vslti.b $vr13, $vr5, 0");
+ asm volatile("vslti.b $vr14, $vr6, 0");
+ asm volatile("vslti.b $vr15, $vr7, 0");
+ /* w1$$ = SHLBYTE(wq$$); */
+ asm volatile("vslli.b $vr16, $vr4, 1");
+ asm volatile("vslli.b $vr17, $vr5, 1");
+ asm volatile("vslli.b $vr18, $vr6, 1");
+ asm volatile("vslli.b $vr19, $vr7, 1");
+ /* w2$$ &= NBYTES(0x1d); */
+ asm volatile("vandi.b $vr12, $vr12, 0x1d");
+ asm volatile("vandi.b $vr13, $vr13, 0x1d");
+ asm volatile("vandi.b $vr14, $vr14, 0x1d");
+ asm volatile("vandi.b $vr15, $vr15, 0x1d");
+ /* w1$$ ^= w2$$; */
+ asm volatile("vxor.v $vr16, $vr16, $vr12");
+ asm volatile("vxor.v $vr17, $vr17, $vr13");
+ asm volatile("vxor.v $vr18, $vr18, $vr14");
+ asm volatile("vxor.v $vr19, $vr19, $vr15");
+ /* wq$$ = w1$$ ^ wd$$; */
+ asm volatile("vxor.v $vr4, $vr16, $vr8");
+ asm volatile("vxor.v $vr5, $vr17, $vr9");
+ asm volatile("vxor.v $vr6, $vr18, $vr10");
+ asm volatile("vxor.v $vr7, $vr19, $vr11");
+ }
+ /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
+ asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0]));
+ asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1]));
+ asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2]));
+ asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3]));
+ /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
+ asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0]));
+ asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1]));
+ asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2]));
+ asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3]));
+ }
+
+ kernel_fpu_end();
+}
+
+static void raid6_lsx_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ /*
+ * $vr0, $vr1, $vr2, $vr3: wp
+ * $vr4, $vr5, $vr6, $vr7: wq
+ * $vr8, $vr9, $vr10, $vr11: wd
+ * $vr12, $vr13, $vr14, $vr15: w2
+ * $vr16, $vr17, $vr18, $vr19: w1
+ */
+ for (d = 0; d < bytes; d += NSIZE*4) {
+ /* P/Q data pages */
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+ asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+ asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
+ asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
+ asm volatile("vori.b $vr4, $vr0, 0");
+ asm volatile("vori.b $vr5, $vr1, 0");
+ asm volatile("vori.b $vr6, $vr2, 0");
+ asm volatile("vori.b $vr7, $vr3, 0");
+ for (z = z0-1; z >= start; z--) {
+ /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+ asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
+ asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
+ asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
+ asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
+ /* wp$$ ^= wd$$; */
+ asm volatile("vxor.v $vr0, $vr0, $vr8");
+ asm volatile("vxor.v $vr1, $vr1, $vr9");
+ asm volatile("vxor.v $vr2, $vr2, $vr10");
+ asm volatile("vxor.v $vr3, $vr3, $vr11");
+ /* w2$$ = MASK(wq$$); */
+ asm volatile("vslti.b $vr12, $vr4, 0");
+ asm volatile("vslti.b $vr13, $vr5, 0");
+ asm volatile("vslti.b $vr14, $vr6, 0");
+ asm volatile("vslti.b $vr15, $vr7, 0");
+ /* w1$$ = SHLBYTE(wq$$); */
+ asm volatile("vslli.b $vr16, $vr4, 1");
+ asm volatile("vslli.b $vr17, $vr5, 1");
+ asm volatile("vslli.b $vr18, $vr6, 1");
+ asm volatile("vslli.b $vr19, $vr7, 1");
+ /* w2$$ &= NBYTES(0x1d); */
+ asm volatile("vandi.b $vr12, $vr12, 0x1d");
+ asm volatile("vandi.b $vr13, $vr13, 0x1d");
+ asm volatile("vandi.b $vr14, $vr14, 0x1d");
+ asm volatile("vandi.b $vr15, $vr15, 0x1d");
+ /* w1$$ ^= w2$$; */
+ asm volatile("vxor.v $vr16, $vr16, $vr12");
+ asm volatile("vxor.v $vr17, $vr17, $vr13");
+ asm volatile("vxor.v $vr18, $vr18, $vr14");
+ asm volatile("vxor.v $vr19, $vr19, $vr15");
+ /* wq$$ = w1$$ ^ wd$$; */
+ asm volatile("vxor.v $vr4, $vr16, $vr8");
+ asm volatile("vxor.v $vr5, $vr17, $vr9");
+ asm volatile("vxor.v $vr6, $vr18, $vr10");
+ asm volatile("vxor.v $vr7, $vr19, $vr11");
+ }
+
+ /* P/Q left side optimization */
+ for (z = start-1; z >= 0; z--) {
+ /* w2$$ = MASK(wq$$); */
+ asm volatile("vslti.b $vr12, $vr4, 0");
+ asm volatile("vslti.b $vr13, $vr5, 0");
+ asm volatile("vslti.b $vr14, $vr6, 0");
+ asm volatile("vslti.b $vr15, $vr7, 0");
+ /* w1$$ = SHLBYTE(wq$$); */
+ asm volatile("vslli.b $vr16, $vr4, 1");
+ asm volatile("vslli.b $vr17, $vr5, 1");
+ asm volatile("vslli.b $vr18, $vr6, 1");
+ asm volatile("vslli.b $vr19, $vr7, 1");
+ /* w2$$ &= NBYTES(0x1d); */
+ asm volatile("vandi.b $vr12, $vr12, 0x1d");
+ asm volatile("vandi.b $vr13, $vr13, 0x1d");
+ asm volatile("vandi.b $vr14, $vr14, 0x1d");
+ asm volatile("vandi.b $vr15, $vr15, 0x1d");
+ /* wq$$ = w1$$ ^ w2$$; */
+ asm volatile("vxor.v $vr4, $vr16, $vr12");
+ asm volatile("vxor.v $vr5, $vr17, $vr13");
+ asm volatile("vxor.v $vr6, $vr18, $vr14");
+ asm volatile("vxor.v $vr7, $vr19, $vr15");
+ }
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+ */
+ asm volatile(
+ "vld $vr20, %0\n\t"
+ "vld $vr21, %1\n\t"
+ "vld $vr22, %2\n\t"
+ "vld $vr23, %3\n\t"
+ "vld $vr24, %4\n\t"
+ "vld $vr25, %5\n\t"
+ "vld $vr26, %6\n\t"
+ "vld $vr27, %7\n\t"
+ "vxor.v $vr20, $vr20, $vr0\n\t"
+ "vxor.v $vr21, $vr21, $vr1\n\t"
+ "vxor.v $vr22, $vr22, $vr2\n\t"
+ "vxor.v $vr23, $vr23, $vr3\n\t"
+ "vxor.v $vr24, $vr24, $vr4\n\t"
+ "vxor.v $vr25, $vr25, $vr5\n\t"
+ "vxor.v $vr26, $vr26, $vr6\n\t"
+ "vxor.v $vr27, $vr27, $vr7\n\t"
+ "vst $vr20, %0\n\t"
+ "vst $vr21, %1\n\t"
+ "vst $vr22, %2\n\t"
+ "vst $vr23, %3\n\t"
+ "vst $vr24, %4\n\t"
+ "vst $vr25, %5\n\t"
+ "vst $vr26, %6\n\t"
+ "vst $vr27, %7\n\t"
+ : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
+ "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]),
+ "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]),
+ "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3])
+ );
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_lsx = {
+ raid6_lsx_gen_syndrome,
+ raid6_lsx_xor_syndrome,
+ raid6_has_lsx,
+ "lsx",
+ .priority = 0 /* see the comment near the top of the file for reason */
+};
+
+#undef NSIZE
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX
+#define NSIZE 32
+
+static int raid6_has_lasx(void)
+{
+ return cpu_has_lasx;
+}
+
+static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ /*
+ * $xr0, $xr1: wp
+ * $xr2, $xr3: wq
+ * $xr4, $xr5: wd
+ * $xr6, $xr7: w2
+ * $xr8, $xr9: w1
+ */
+ for (d = 0; d < bytes; d += NSIZE*2) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+ asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+ asm volatile("xvori.b $xr2, $xr0, 0");
+ asm volatile("xvori.b $xr3, $xr1, 0");
+ for (z = z0-1; z >= 0; z--) {
+ /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+ asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
+ asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
+ /* wp$$ ^= wd$$; */
+ asm volatile("xvxor.v $xr0, $xr0, $xr4");
+ asm volatile("xvxor.v $xr1, $xr1, $xr5");
+ /* w2$$ = MASK(wq$$); */
+ asm volatile("xvslti.b $xr6, $xr2, 0");
+ asm volatile("xvslti.b $xr7, $xr3, 0");
+ /* w1$$ = SHLBYTE(wq$$); */
+ asm volatile("xvslli.b $xr8, $xr2, 1");
+ asm volatile("xvslli.b $xr9, $xr3, 1");
+ /* w2$$ &= NBYTES(0x1d); */
+ asm volatile("xvandi.b $xr6, $xr6, 0x1d");
+ asm volatile("xvandi.b $xr7, $xr7, 0x1d");
+ /* w1$$ ^= w2$$; */
+ asm volatile("xvxor.v $xr8, $xr8, $xr6");
+ asm volatile("xvxor.v $xr9, $xr9, $xr7");
+ /* wq$$ = w1$$ ^ wd$$; */
+ asm volatile("xvxor.v $xr2, $xr8, $xr4");
+ asm volatile("xvxor.v $xr3, $xr9, $xr5");
+ }
+ /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
+ asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0]));
+ asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1]));
+ /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
+ asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0]));
+ asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1]));
+ }
+
+ kernel_fpu_end();
+}
+
+static void raid6_lasx_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ /*
+ * $xr0, $xr1: wp
+ * $xr2, $xr3: wq
+ * $xr4, $xr5: wd
+ * $xr6, $xr7: w2
+ * $xr8, $xr9: w1
+ */
+ for (d = 0; d < bytes; d += NSIZE*2) {
+ /* P/Q data pages */
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+ asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+ asm volatile("xvori.b $xr2, $xr0, 0");
+ asm volatile("xvori.b $xr3, $xr1, 0");
+ for (z = z0-1; z >= start; z--) {
+ /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+ asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
+ asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
+ /* wp$$ ^= wd$$; */
+ asm volatile("xvxor.v $xr0, $xr0, $xr4");
+ asm volatile("xvxor.v $xr1, $xr1, $xr5");
+ /* w2$$ = MASK(wq$$); */
+ asm volatile("xvslti.b $xr6, $xr2, 0");
+ asm volatile("xvslti.b $xr7, $xr3, 0");
+ /* w1$$ = SHLBYTE(wq$$); */
+ asm volatile("xvslli.b $xr8, $xr2, 1");
+ asm volatile("xvslli.b $xr9, $xr3, 1");
+ /* w2$$ &= NBYTES(0x1d); */
+ asm volatile("xvandi.b $xr6, $xr6, 0x1d");
+ asm volatile("xvandi.b $xr7, $xr7, 0x1d");
+ /* w1$$ ^= w2$$; */
+ asm volatile("xvxor.v $xr8, $xr8, $xr6");
+ asm volatile("xvxor.v $xr9, $xr9, $xr7");
+ /* wq$$ = w1$$ ^ wd$$; */
+ asm volatile("xvxor.v $xr2, $xr8, $xr4");
+ asm volatile("xvxor.v $xr3, $xr9, $xr5");
+ }
+
+ /* P/Q left side optimization */
+ for (z = start-1; z >= 0; z--) {
+ /* w2$$ = MASK(wq$$); */
+ asm volatile("xvslti.b $xr6, $xr2, 0");
+ asm volatile("xvslti.b $xr7, $xr3, 0");
+ /* w1$$ = SHLBYTE(wq$$); */
+ asm volatile("xvslli.b $xr8, $xr2, 1");
+ asm volatile("xvslli.b $xr9, $xr3, 1");
+ /* w2$$ &= NBYTES(0x1d); */
+ asm volatile("xvandi.b $xr6, $xr6, 0x1d");
+ asm volatile("xvandi.b $xr7, $xr7, 0x1d");
+ /* wq$$ = w1$$ ^ w2$$; */
+ asm volatile("xvxor.v $xr2, $xr8, $xr6");
+ asm volatile("xvxor.v $xr3, $xr9, $xr7");
+ }
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+ */
+ asm volatile(
+ "xvld $xr10, %0\n\t"
+ "xvld $xr11, %1\n\t"
+ "xvld $xr12, %2\n\t"
+ "xvld $xr13, %3\n\t"
+ "xvxor.v $xr10, $xr10, $xr0\n\t"
+ "xvxor.v $xr11, $xr11, $xr1\n\t"
+ "xvxor.v $xr12, $xr12, $xr2\n\t"
+ "xvxor.v $xr13, $xr13, $xr3\n\t"
+ "xvst $xr10, %0\n\t"
+ "xvst $xr11, %1\n\t"
+ "xvst $xr12, %2\n\t"
+ "xvst $xr13, %3\n\t"
+ : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
+ "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1])
+ );
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_lasx = {
+ raid6_lasx_gen_syndrome,
+ raid6_lasx_xor_syndrome,
+ raid6_has_lasx,
+ "lasx",
+ .priority = 0 /* see the comment near the top of the file for reason */
+};
+#undef NSIZE
+#endif /* CONFIG_CPU_HAS_LASX */
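
Each vslti.b/vslli.b/vandi.b/vxor.v group in the loops above is the vector form of the per-byte GF(2^8) multiply-by-2 from the generic int.uc code. A scalar sketch of what one byte lane computes (gf256_mul2() is an illustrative name, not a kernel symbol):

static inline u8 gf256_mul2(u8 wq)
{
	u8 w2 = (wq & 0x80) ? 0x1d : 0;	/* MASK(wq) & 0x1d: vslti.b + vandi.b */
	u8 w1 = (u8)(wq << 1);		/* SHLBYTE(wq):     vslli.b           */

	return w1 ^ w2;			/* vxor.v                             */
}

/* Q-syndrome update per data disk, highest index first: wq = gf256_mul2(wq) ^ wd */
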
diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c
index e824d088f72c..3be03793237c 100644
--- a/lib/raid6/mktables.c
+++ b/lib/raid6/mktables.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002-2007 H. Peter Anvin - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2 or (at your
- * option) any later version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
/*
@@ -59,8 +56,10 @@ int main(int argc, char *argv[])
uint8_t v;
uint8_t exptbl[256], invtbl[256];
- printf("#include <linux/raid/pq.h>\n");
+ printf("#ifdef __KERNEL__\n");
printf("#include <linux/export.h>\n");
+ printf("#endif\n");
+ printf("#include <linux/raid/pq.h>\n");
/* Compute multiplication table */
printf("\nconst u8 __attribute__((aligned(256)))\n"
diff --git a/lib/raid6/mmx.c b/lib/raid6/mmx.c
index b3b0e1fcd3af..3a5bf53a297b 100644
--- a/lib/raid6/mmx.c
+++ b/lib/raid6/mmx.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002 H. Peter Anvin - All Rights Reserved
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 59 Temple Place Ste 330,
- * Boston MA 02111-1307, USA; either version 2 of the License, or
- * (at your option) any later version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
/*
diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c
index 7076ef1ba3dd..6d9474ce6da9 100644
--- a/lib/raid6/neon.c
+++ b/lib/raid6/neon.c
@@ -1,20 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/lib/raid6/neon.c - RAID6 syndrome calculation using ARM NEON intrinsics
*
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/raid/pq.h>
#ifdef __KERNEL__
-#include <asm/neon.h>
+#include <asm/simd.h>
#else
-#define kernel_neon_begin()
-#define kernel_neon_end()
+#define scoped_ksimd()
#define cpu_has_neon() (1)
#endif
@@ -35,10 +31,9 @@
{ \
void raid6_neon ## _n ## _gen_syndrome_real(int, \
unsigned long, void**); \
- kernel_neon_begin(); \
- raid6_neon ## _n ## _gen_syndrome_real(disks, \
+ scoped_ksimd() \
+ raid6_neon ## _n ## _gen_syndrome_real(disks, \
(unsigned long)bytes, ptrs); \
- kernel_neon_end(); \
} \
static void raid6_neon ## _n ## _xor_syndrome(int disks, \
int start, int stop, \
@@ -46,10 +41,9 @@
{ \
void raid6_neon ## _n ## _xor_syndrome_real(int, \
int, int, unsigned long, void**); \
- kernel_neon_begin(); \
- raid6_neon ## _n ## _xor_syndrome_real(disks, \
- start, stop, (unsigned long)bytes, ptrs); \
- kernel_neon_end(); \
+ scoped_ksimd() \
+ raid6_neon ## _n ## _xor_syndrome_real(disks, \
+ start, stop, (unsigned long)bytes, ptrs);\
} \
struct raid6_calls const raid6_neonx ## _n = { \
raid6_neon ## _n ## _gen_syndrome, \
diff --git a/lib/raid6/neon.h b/lib/raid6/neon.h
new file mode 100644
index 000000000000..2ca41ee9b499
--- /dev/null
+++ b/lib/raid6/neon.h
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+void raid6_neon1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs);
+void raid6_neon1_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs);
+void raid6_neon2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs);
+void raid6_neon2_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs);
+void raid6_neon4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs);
+void raid6_neon4_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs);
+void raid6_neon8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs);
+void raid6_neon8_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs);
+void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
+ uint8_t *dq, const uint8_t *pbmul,
+ const uint8_t *qmul);
+
+void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
+ const uint8_t *qmul);
+
+
diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc
index d5242f544551..355270af0cd6 100644
--- a/lib/raid6/neon.uc
+++ b/lib/raid6/neon.uc
@@ -25,10 +25,10 @@
*/
#include <arm_neon.h>
+#include "neon.h"
typedef uint8x16_t unative_t;
-#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})
#define NSIZE sizeof(unative_t)
/*
@@ -61,7 +61,7 @@ void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
int d, z, z0;
register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
- const unative_t x1d = NBYTES(0x1d);
+ const unative_t x1d = vdupq_n_u8(0x1d);
z0 = disks - 3; /* Highest data disk */
p = dptr[z0+1]; /* XOR parity */
@@ -92,7 +92,7 @@ void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
int d, z, z0;
register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
- const unative_t x1d = NBYTES(0x1d);
+ const unative_t x1d = vdupq_n_u8(0x1d);
z0 = stop; /* P/Q right side optimization */
p = dptr[disks-2]; /* XOR parity */
diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c
index a95bccb8497d..b5e47c008b41 100644
--- a/lib/raid6/recov.c
+++ b/lib/raid6/recov.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002 H. Peter Anvin - All Rights Reserved
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 59 Temple Place Ste 330,
- * Boston MA 02111-1307, USA; either version 2 of the License, or
- * (at your option) any later version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
/*
@@ -18,7 +13,6 @@
* the syndrome.)
*/
-#include <linux/export.h>
#include <linux/raid/pq.h>
/* Recover two failed data blocks. */
@@ -37,10 +31,10 @@ static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila,
Use the dead data pages as temporary storage for
delta p and delta q */
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -78,7 +72,7 @@ static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila,
/* Compute syndrome with zero for the missing data page
Use the dead data page as temporary storage for delta q */
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c
index 53fe3d7bdfb3..97d598d2535c 100644
--- a/lib/raid6/recov_avx2.c
+++ b/lib/raid6/recov_avx2.c
@@ -1,15 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2012 Intel Corporation
* Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
-#ifdef CONFIG_AS_AVX2
-
#include <linux/raid/pq.h>
#include "x86.h"
@@ -34,10 +28,10 @@ static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila,
Use the dead data pages as temporary storage for
delta p and delta q */
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -202,7 +196,7 @@ static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila,
/* Compute syndrome with zero for the missing data page
Use the dead data page as temporary storage for delta q */
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -317,7 +311,3 @@ const struct raid6_recov_calls raid6_recov_avx2 = {
#endif
.priority = 2,
};
-
-#else
-#warning "your version of binutils lacks AVX2 support"
-#endif
diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c
index 625aafa33b61..7986120ca444 100644
--- a/lib/raid6/recov_avx512.c
+++ b/lib/raid6/recov_avx512.c
@@ -1,18 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2016 Intel Corporation
*
* Author: Gayatri Kammela <gayatri.kammela@intel.com>
* Author: Megha Dey <megha.dey@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- *
*/
-#ifdef CONFIG_AS_AVX512
-
#include <linux/raid/pq.h>
#include "x86.h"
@@ -44,10 +37,10 @@ static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila,
*/
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -245,7 +238,7 @@ static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila,
*/
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -382,7 +375,3 @@ const struct raid6_recov_calls raid6_recov_avx512 = {
#endif
.priority = 3,
};
-
-#else
-#warning "your version of binutils lacks AVX512 support"
-#endif
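
The SSSE3/AVX2/AVX512 recovery code above and the LSX/LASX code that follows all implement the same per-byte math using 4-bit table lookups (pshufb / vshuf.b) into the 32-byte raid6_vgfmul split tables. A scalar sketch of one byte, with gf_mul_nibbles() as an illustrative helper (not a kernel symbol):

static inline u8 gf_mul_nibbles(const u8 *tbl32, u8 v)
{
	/* tbl32[0..15] multiplies the low nibble, tbl32[16..31] the high one */
	return tbl32[v & 0x0f] ^ tbl32[16 + (v >> 4)];
}

/*
 * Per byte, with px = P ^ Pxy and qx = Q ^ Qxy:
 *   Dx = gf_mul_nibbles(pbmul, px) ^ gf_mul_nibbles(qmul, qx);   written to dq
 *   Dy = px ^ Dx;                                                written to dp
 */
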
diff --git a/lib/raid6/recov_loongarch_simd.c b/lib/raid6/recov_loongarch_simd.c
new file mode 100644
index 000000000000..93dc515997a1
--- /dev/null
+++ b/lib/raid6/recov_loongarch_simd.c
@@ -0,0 +1,513 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX)
+ *
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ *
+ * Originally based on recov_avx2.c and recov_ssse3.c:
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ */
+
+#include <linux/raid/pq.h>
+#include "loongarch.h"
+
+/*
+ * Unlike with the syndrome calculation algorithms, there's no boot-time
+ * selection of recovery algorithms by benchmarking, so we have to specify
+ * the priorities and hope the future cores will all have decent vector
+ * support (i.e. no LASX slower than LSX, or even scalar code).
+ */
+
+#ifdef CONFIG_CPU_HAS_LSX
+static int raid6_has_lsx(void)
+{
+ return cpu_has_lsx;
+}
+
+static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data pages
+ * Use the dead data pages as temporary storage for
+ * delta p and delta q
+ */
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks - 2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = raid6_get_zero_page();
+ ptrs[disks - 1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks - 2] = p;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
+
+ kernel_fpu_begin();
+
+ /*
+ * vr20, vr21: qmul
+ * vr22, vr23: pbmul
+ */
+ asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
+ asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
+ asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
+ asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
+
+ while (bytes) {
+ /* vr4 - vr7: Q */
+ asm volatile("vld $vr4, %0" : : "m" (q[0]));
+ asm volatile("vld $vr5, %0" : : "m" (q[16]));
+ asm volatile("vld $vr6, %0" : : "m" (q[32]));
+ asm volatile("vld $vr7, %0" : : "m" (q[48]));
+ /* vr4 - vr7: Q + Qxy */
+ asm volatile("vld $vr8, %0" : : "m" (dq[0]));
+ asm volatile("vld $vr9, %0" : : "m" (dq[16]));
+ asm volatile("vld $vr10, %0" : : "m" (dq[32]));
+ asm volatile("vld $vr11, %0" : : "m" (dq[48]));
+ asm volatile("vxor.v $vr4, $vr4, $vr8");
+ asm volatile("vxor.v $vr5, $vr5, $vr9");
+ asm volatile("vxor.v $vr6, $vr6, $vr10");
+ asm volatile("vxor.v $vr7, $vr7, $vr11");
+ /* vr0 - vr3: P */
+ asm volatile("vld $vr0, %0" : : "m" (p[0]));
+ asm volatile("vld $vr1, %0" : : "m" (p[16]));
+ asm volatile("vld $vr2, %0" : : "m" (p[32]));
+ asm volatile("vld $vr3, %0" : : "m" (p[48]));
+ /* vr0 - vr3: P + Pxy */
+ asm volatile("vld $vr8, %0" : : "m" (dp[0]));
+ asm volatile("vld $vr9, %0" : : "m" (dp[16]));
+ asm volatile("vld $vr10, %0" : : "m" (dp[32]));
+ asm volatile("vld $vr11, %0" : : "m" (dp[48]));
+ asm volatile("vxor.v $vr0, $vr0, $vr8");
+ asm volatile("vxor.v $vr1, $vr1, $vr9");
+ asm volatile("vxor.v $vr2, $vr2, $vr10");
+ asm volatile("vxor.v $vr3, $vr3, $vr11");
+
+ /* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */
+ asm volatile("vsrli.b $vr8, $vr4, 4");
+ asm volatile("vsrli.b $vr9, $vr5, 4");
+ asm volatile("vsrli.b $vr10, $vr6, 4");
+ asm volatile("vsrli.b $vr11, $vr7, 4");
+ /* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */
+ asm volatile("vandi.b $vr4, $vr4, 0x0f");
+ asm volatile("vandi.b $vr5, $vr5, 0x0f");
+ asm volatile("vandi.b $vr6, $vr6, 0x0f");
+ asm volatile("vandi.b $vr7, $vr7, 0x0f");
+ /* lookup from qmul[0] */
+ asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4");
+ asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5");
+ asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6");
+ asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7");
+ /* lookup from qmul[16] */
+ asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8");
+ asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9");
+ asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10");
+ asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11");
+ /* vr16 - vr19: B(Q + Qxy) */
+ asm volatile("vxor.v $vr16, $vr8, $vr4");
+ asm volatile("vxor.v $vr17, $vr9, $vr5");
+ asm volatile("vxor.v $vr18, $vr10, $vr6");
+ asm volatile("vxor.v $vr19, $vr11, $vr7");
+
+ /* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */
+ asm volatile("vsrli.b $vr4, $vr0, 4");
+ asm volatile("vsrli.b $vr5, $vr1, 4");
+ asm volatile("vsrli.b $vr6, $vr2, 4");
+ asm volatile("vsrli.b $vr7, $vr3, 4");
+ /* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */
+ asm volatile("vandi.b $vr12, $vr0, 0x0f");
+ asm volatile("vandi.b $vr13, $vr1, 0x0f");
+ asm volatile("vandi.b $vr14, $vr2, 0x0f");
+ asm volatile("vandi.b $vr15, $vr3, 0x0f");
+ /* lookup from pbmul[0] */
+ asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12");
+ asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13");
+ asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14");
+ asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15");
+ /* lookup from pbmul[16] */
+ asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4");
+ asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5");
+ asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6");
+ asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7");
+ /* vr4 - vr7: A(P + Pxy) */
+ asm volatile("vxor.v $vr4, $vr4, $vr12");
+ asm volatile("vxor.v $vr5, $vr5, $vr13");
+ asm volatile("vxor.v $vr6, $vr6, $vr14");
+ asm volatile("vxor.v $vr7, $vr7, $vr15");
+
+ /* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx */
+ asm volatile("vxor.v $vr4, $vr4, $vr16");
+ asm volatile("vxor.v $vr5, $vr5, $vr17");
+ asm volatile("vxor.v $vr6, $vr6, $vr18");
+ asm volatile("vxor.v $vr7, $vr7, $vr19");
+ asm volatile("vst $vr4, %0" : "=m" (dq[0]));
+ asm volatile("vst $vr5, %0" : "=m" (dq[16]));
+ asm volatile("vst $vr6, %0" : "=m" (dq[32]));
+ asm volatile("vst $vr7, %0" : "=m" (dq[48]));
+
+ /* vr0 - vr3: P + Pxy + Dx = Dy */
+ asm volatile("vxor.v $vr0, $vr0, $vr4");
+ asm volatile("vxor.v $vr1, $vr1, $vr5");
+ asm volatile("vxor.v $vr2, $vr2, $vr6");
+ asm volatile("vxor.v $vr3, $vr3, $vr7");
+ asm volatile("vst $vr0, %0" : "=m" (dp[0]));
+ asm volatile("vst $vr1, %0" : "=m" (dp[16]));
+ asm volatile("vst $vr2, %0" : "=m" (dp[32]));
+ asm volatile("vst $vr3, %0" : "=m" (dp[48]));
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dp += 64;
+ dq += 64;
+ }
+
+ kernel_fpu_end();
+}
+
+static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data page
+ * Use the dead data page as temporary storage for delta q
+ */
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks - 1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ kernel_fpu_begin();
+
+ /* vr22, vr23: qmul */
+ asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
+ asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
+
+ while (bytes) {
+ /* vr0 - vr3: P + Dx */
+ asm volatile("vld $vr0, %0" : : "m" (p[0]));
+ asm volatile("vld $vr1, %0" : : "m" (p[16]));
+ asm volatile("vld $vr2, %0" : : "m" (p[32]));
+ asm volatile("vld $vr3, %0" : : "m" (p[48]));
+ /* vr4 - vr7: Qx */
+ asm volatile("vld $vr4, %0" : : "m" (dq[0]));
+ asm volatile("vld $vr5, %0" : : "m" (dq[16]));
+ asm volatile("vld $vr6, %0" : : "m" (dq[32]));
+ asm volatile("vld $vr7, %0" : : "m" (dq[48]));
+ /* vr4 - vr7: Q + Qx */
+ asm volatile("vld $vr8, %0" : : "m" (q[0]));
+ asm volatile("vld $vr9, %0" : : "m" (q[16]));
+ asm volatile("vld $vr10, %0" : : "m" (q[32]));
+ asm volatile("vld $vr11, %0" : : "m" (q[48]));
+ asm volatile("vxor.v $vr4, $vr4, $vr8");
+ asm volatile("vxor.v $vr5, $vr5, $vr9");
+ asm volatile("vxor.v $vr6, $vr6, $vr10");
+ asm volatile("vxor.v $vr7, $vr7, $vr11");
+
+ /* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */
+ asm volatile("vsrli.b $vr8, $vr4, 4");
+ asm volatile("vsrli.b $vr9, $vr5, 4");
+ asm volatile("vsrli.b $vr10, $vr6, 4");
+ asm volatile("vsrli.b $vr11, $vr7, 4");
+ /* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */
+ asm volatile("vandi.b $vr4, $vr4, 0x0f");
+ asm volatile("vandi.b $vr5, $vr5, 0x0f");
+ asm volatile("vandi.b $vr6, $vr6, 0x0f");
+ asm volatile("vandi.b $vr7, $vr7, 0x0f");
+ /* lookup from qmul[0] */
+ asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4");
+ asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5");
+ asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6");
+ asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7");
+ /* lookup from qmul[16] */
+ asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8");
+ asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9");
+ asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10");
+ asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11");
+ /* vr4 - vr7: qmul(Q + Qx) = Dx */
+ asm volatile("vxor.v $vr4, $vr4, $vr8");
+ asm volatile("vxor.v $vr5, $vr5, $vr9");
+ asm volatile("vxor.v $vr6, $vr6, $vr10");
+ asm volatile("vxor.v $vr7, $vr7, $vr11");
+ asm volatile("vst $vr4, %0" : "=m" (dq[0]));
+ asm volatile("vst $vr5, %0" : "=m" (dq[16]));
+ asm volatile("vst $vr6, %0" : "=m" (dq[32]));
+ asm volatile("vst $vr7, %0" : "=m" (dq[48]));
+
+ /* vr0 - vr3: P + Dx + Dx = P */
+ asm volatile("vxor.v $vr0, $vr0, $vr4");
+ asm volatile("vxor.v $vr1, $vr1, $vr5");
+ asm volatile("vxor.v $vr2, $vr2, $vr6");
+ asm volatile("vxor.v $vr3, $vr3, $vr7");
+ asm volatile("vst $vr0, %0" : "=m" (p[0]));
+ asm volatile("vst $vr1, %0" : "=m" (p[16]));
+ asm volatile("vst $vr2, %0" : "=m" (p[32]));
+ asm volatile("vst $vr3, %0" : "=m" (p[48]));
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dq += 64;
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_lsx = {
+ .data2 = raid6_2data_recov_lsx,
+ .datap = raid6_datap_recov_lsx,
+ .valid = raid6_has_lsx,
+ .name = "lsx",
+ .priority = 1,
+};
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX
+static int raid6_has_lasx(void)
+{
+ return cpu_has_lasx;
+}
+
+static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data pages
+ * Use the dead data pages as temporary storage for
+ * delta p and delta q
+ */
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks - 2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = raid6_get_zero_page();
+ ptrs[disks - 1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks - 2] = p;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
+
+ kernel_fpu_begin();
+
+ /*
+ * xr20, xr21: qmul
+ * xr22, xr23: pbmul
+ */
+ asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
+ asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
+ asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
+ asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
+ asm volatile("xvreplve0.q $xr20, $xr20");
+ asm volatile("xvreplve0.q $xr21, $xr21");
+ asm volatile("xvreplve0.q $xr22, $xr22");
+ asm volatile("xvreplve0.q $xr23, $xr23");
+
+ while (bytes) {
+ /* xr0, xr1: Q */
+ asm volatile("xvld $xr0, %0" : : "m" (q[0]));
+ asm volatile("xvld $xr1, %0" : : "m" (q[32]));
+ /* xr0, xr1: Q + Qxy */
+ asm volatile("xvld $xr4, %0" : : "m" (dq[0]));
+ asm volatile("xvld $xr5, %0" : : "m" (dq[32]));
+ asm volatile("xvxor.v $xr0, $xr0, $xr4");
+ asm volatile("xvxor.v $xr1, $xr1, $xr5");
+ /* xr2, xr3: P */
+ asm volatile("xvld $xr2, %0" : : "m" (p[0]));
+ asm volatile("xvld $xr3, %0" : : "m" (p[32]));
+ /* xr2, xr3: P + Pxy */
+ asm volatile("xvld $xr4, %0" : : "m" (dp[0]));
+ asm volatile("xvld $xr5, %0" : : "m" (dp[32]));
+ asm volatile("xvxor.v $xr2, $xr2, $xr4");
+ asm volatile("xvxor.v $xr3, $xr3, $xr5");
+
+ /* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */
+ asm volatile("xvsrli.b $xr4, $xr0, 4");
+ asm volatile("xvsrli.b $xr5, $xr1, 4");
+ /* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */
+ asm volatile("xvandi.b $xr0, $xr0, 0x0f");
+ asm volatile("xvandi.b $xr1, $xr1, 0x0f");
+ /* lookup from qmul[0] */
+ asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0");
+ asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1");
+ /* lookup from qmul[16] */
+ asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4");
+ asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5");
+ /* xr6, xr7: B(Q + Qxy) */
+ asm volatile("xvxor.v $xr6, $xr4, $xr0");
+ asm volatile("xvxor.v $xr7, $xr5, $xr1");
+
+ /* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */
+ asm volatile("xvsrli.b $xr4, $xr2, 4");
+ asm volatile("xvsrli.b $xr5, $xr3, 4");
+ /* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */
+ asm volatile("xvandi.b $xr0, $xr2, 0x0f");
+ asm volatile("xvandi.b $xr1, $xr3, 0x0f");
+ /* lookup from pbmul[0] */
+ asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0");
+ asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1");
+ /* lookup from pbmul[16] */
+ asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
+ asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
+ /* xr0, xr1: A(P + Pxy) */
+ asm volatile("xvxor.v $xr0, $xr0, $xr4");
+ asm volatile("xvxor.v $xr1, $xr1, $xr5");
+
+ /* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */
+ asm volatile("xvxor.v $xr0, $xr0, $xr6");
+ asm volatile("xvxor.v $xr1, $xr1, $xr7");
+
+ /* xr2, xr3: P + Pxy + Dx = Dy */
+ asm volatile("xvxor.v $xr2, $xr2, $xr0");
+ asm volatile("xvxor.v $xr3, $xr3, $xr1");
+
+ asm volatile("xvst $xr0, %0" : "=m" (dq[0]));
+ asm volatile("xvst $xr1, %0" : "=m" (dq[32]));
+ asm volatile("xvst $xr2, %0" : "=m" (dp[0]));
+ asm volatile("xvst $xr3, %0" : "=m" (dp[32]));
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dp += 64;
+ dq += 64;
+ }
+
+ kernel_fpu_end();
+}
+
+static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data page
+ * Use the dead data page as temporary storage for delta q
+ */
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks - 1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ kernel_fpu_begin();
+
+ /* xr22, xr23: qmul */
+ asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
+ asm volatile("xvreplve0.q $xr22, $xr22");
+ asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
+ asm volatile("xvreplve0.q $xr23, $xr23");
+
+ while (bytes) {
+ /* xr0, xr1: P + Dx */
+ asm volatile("xvld $xr0, %0" : : "m" (p[0]));
+ asm volatile("xvld $xr1, %0" : : "m" (p[32]));
+ /* xr2, xr3: Qx */
+ asm volatile("xvld $xr2, %0" : : "m" (dq[0]));
+ asm volatile("xvld $xr3, %0" : : "m" (dq[32]));
+ /* xr2, xr3: Q + Qx */
+ asm volatile("xvld $xr4, %0" : : "m" (q[0]));
+ asm volatile("xvld $xr5, %0" : : "m" (q[32]));
+ asm volatile("xvxor.v $xr2, $xr2, $xr4");
+ asm volatile("xvxor.v $xr3, $xr3, $xr5");
+
+ /* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */
+ asm volatile("xvsrli.b $xr4, $xr2, 4");
+ asm volatile("xvsrli.b $xr5, $xr3, 4");
+ /* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */
+ asm volatile("xvandi.b $xr2, $xr2, 0x0f");
+ asm volatile("xvandi.b $xr3, $xr3, 0x0f");
+ /* lookup from qmul[0] */
+ asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2");
+ asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3");
+ /* lookup from qmul[16] */
+ asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
+ asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
+ /* xr2, xr3: qmul(Q + Qx) = Dx */
+ asm volatile("xvxor.v $xr2, $xr2, $xr4");
+ asm volatile("xvxor.v $xr3, $xr3, $xr5");
+
+ /* xr0, xr1: P + Dx + Dx = P */
+ asm volatile("xvxor.v $xr0, $xr0, $xr2");
+ asm volatile("xvxor.v $xr1, $xr1, $xr3");
+
+ asm volatile("xvst $xr2, %0" : "=m" (dq[0]));
+ asm volatile("xvst $xr3, %0" : "=m" (dq[32]));
+ asm volatile("xvst $xr0, %0" : "=m" (p[0]));
+ asm volatile("xvst $xr1, %0" : "=m" (p[32]));
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dq += 64;
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_lasx = {
+ .data2 = raid6_2data_recov_lasx,
+ .datap = raid6_datap_recov_lasx,
+ .valid = raid6_has_lasx,
+ .name = "lasx",
+ .priority = 2,
+};
+#endif /* CONFIG_CPU_HAS_LASX */
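The LSX and LASX paths above are straight vectorizations of the byte-wise two-failure recovery done by the generic code: with pbmul and qmul as GF(256) multiplication tables, Dx = pbmul(P + Pxy) + qmul(Q + Qxy) and Dy = (P + Pxy) + Dx, where "+" is XOR. A minimal scalar sketch of that loop follows (names are illustrative); it assumes full 256-entry tables, whereas the SIMD code splits each table into two 16-entry nibble halves so the lookups fit vshuf.b.

#include <stddef.h>
#include <stdint.h>

static void recov_2data_scalar(size_t bytes, uint8_t *p, uint8_t *q,
			       uint8_t *dp, uint8_t *dq,
			       const uint8_t *pbmul, const uint8_t *qmul)
{
	while (bytes--) {
		/* dp/dq hold P/Q recomputed with the two failed blocks zeroed */
		uint8_t px = *p ^ *dp;		/* P + Pxy */
		uint8_t qx = qmul[*q ^ *dq];	/* qmul(Q + Qxy) */
		uint8_t db = pbmul[px] ^ qx;	/* Dx */

		*dq++ = db;			/* recovered block x */
		*dp++ = db ^ px;		/* Dy = (P + Pxy) + Dx */
		p++;
		q++;
	}
}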
diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c
index eeb5c4065b92..9d99aeabd31a 100644
--- a/lib/raid6/recov_neon.c
+++ b/lib/raid6/recov_neon.c
@@ -1,20 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2012 Intel Corporation
* Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
#include <linux/raid/pq.h>
#ifdef __KERNEL__
-#include <asm/neon.h>
+#include <asm/simd.h>
+#include "neon.h"
#else
-#define kernel_neon_begin()
-#define kernel_neon_end()
+#define scoped_ksimd()
#define cpu_has_neon() (1)
#endif
@@ -23,13 +19,6 @@ static int raid6_has_neon(void)
return cpu_has_neon();
}
-void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
- uint8_t *dq, const uint8_t *pbmul,
- const uint8_t *qmul);
-
-void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
- const uint8_t *qmul);
-
static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
int failb, void **ptrs)
{
@@ -46,10 +35,10 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
* delta p and delta q
*/
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks - 2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks - 1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -65,9 +54,8 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
raid6_gfexp[failb]]];
- kernel_neon_begin();
- __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
- kernel_neon_end();
+ scoped_ksimd()
+ __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
}
static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
@@ -84,7 +72,7 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
* Use the dead data page as temporary storage for delta q
*/
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks - 1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -96,9 +84,8 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
/* Now, pick the proper data tables */
qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
- kernel_neon_begin();
- __raid6_datap_recov_neon(bytes, p, q, dq, qmul);
- kernel_neon_end();
+ scoped_ksimd()
+ __raid6_datap_recov_neon(bytes, p, q, dq, qmul);
}
const struct raid6_recov_calls raid6_recov_neon = {
diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c
index 8cd20c9f834a..f9e7e8f5a151 100644
--- a/lib/raid6/recov_neon_inner.c
+++ b/lib/raid6/recov_neon_inner.c
@@ -1,19 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2012 Intel Corporation
* Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
#include <arm_neon.h>
-
-static const uint8x16_t x0f = {
- 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
- 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
-};
+#include "neon.h"
#ifdef CONFIG_ARM
/*
@@ -41,6 +33,7 @@ void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
uint8x16_t pm1 = vld1q_u8(pbmul + 16);
uint8x16_t qm0 = vld1q_u8(qmul);
uint8x16_t qm1 = vld1q_u8(qmul + 16);
+ uint8x16_t x0f = vdupq_n_u8(0x0f);
/*
* while ( bytes-- ) {
@@ -60,14 +53,14 @@ void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
- vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
+ vy = vshrq_n_u8(vx, 4);
vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
- vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
+ vy = vqtbl1q_u8(qm1, vy);
qx = veorq_u8(vx, vy);
- vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4);
+ vy = vshrq_n_u8(px, 4);
vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
- vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f));
+ vy = vqtbl1q_u8(pm1, vy);
vx = veorq_u8(vx, vy);
db = veorq_u8(vx, qx);
@@ -87,6 +80,7 @@ void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
{
uint8x16_t qm0 = vld1q_u8(qmul);
uint8x16_t qm1 = vld1q_u8(qmul + 16);
+ uint8x16_t x0f = vdupq_n_u8(0x0f);
/*
* while (bytes--) {
@@ -100,9 +94,9 @@ void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
- vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
+ vy = vshrq_n_u8(vx, 4);
vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
- vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
+ vy = vqtbl1q_u8(qm1, vy);
vx = veorq_u8(vx, vy);
vy = veorq_u8(vx, vld1q_u8(p));
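The NEON inner loop for the single-failure (data plus P) case is, per byte, exactly the pseudocode kept in the comments of these files: Dx = qmul(Q + Qx), then P = (P + Dx) + Dx. A scalar sketch, again assuming a full 256-entry qmul table rather than the split 16+16 nibble tables the vector code loads:

#include <stddef.h>
#include <stdint.h>

static void recov_datap_scalar(size_t bytes, uint8_t *p, uint8_t *q,
			       uint8_t *dq, const uint8_t *qmul)
{
	while (bytes--) {
		*dq = qmul[*q ^ *dq];	/* Dx = qmul(Q + Qx) */
		*p ^= *dq;		/* (P + Dx) + Dx = P */
		p++;
		q++;
		dq++;
	}
}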
diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c
new file mode 100644
index 000000000000..40c393206b6a
--- /dev/null
+++ b/lib/raid6/recov_rvv.c
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2024 Institute of Software, CAS.
+ * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
+ */
+
+#include <linux/raid/pq.h>
+#include "rvv.h"
+
+static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
+ u8 *dq, const u8 *pbmul,
+ const u8 *qmul)
+{
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsetvli x0, %[avl], e8, m1, ta, ma\n"
+ ".option pop\n"
+ : :
+ [avl]"r"(16)
+ );
+
+ /*
+ * while ( bytes-- ) {
+ * uint8_t px, qx, db;
+ *
+ * px = *p ^ *dp;
+ * qx = qmul[*q ^ *dq];
+ * *dq++ = db = pbmul[px] ^ qx;
+ * *dp++ = db ^ px;
+ * p++; q++;
+ * }
+ */
+ while (bytes) {
+ /*
+ * v0:px, v1:dp,
+ * v2:qx, v3:dq,
+ * v4:vx, v5:vy,
+ * v6:qm0, v7:qm1,
+ * v8:pm0, v9:pm1,
+ * v14:p/qm[vx], v15:p/qm[vy]
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v0, (%[px])\n"
+ "vle8.v v1, (%[dp])\n"
+ "vxor.vv v0, v0, v1\n"
+ "vle8.v v2, (%[qx])\n"
+ "vle8.v v3, (%[dq])\n"
+ "vxor.vv v4, v2, v3\n"
+ "vsrl.vi v5, v4, 4\n"
+ "vand.vi v4, v4, 0xf\n"
+ "vle8.v v6, (%[qm0])\n"
+ "vle8.v v7, (%[qm1])\n"
+ "vrgather.vv v14, v6, v4\n" /* v14 = qm[vx] */
+ "vrgather.vv v15, v7, v5\n" /* v15 = qm[vy] */
+ "vxor.vv v2, v14, v15\n" /* v2 = qmul[*q ^ *dq] */
+
+ "vsrl.vi v5, v0, 4\n"
+ "vand.vi v4, v0, 0xf\n"
+ "vle8.v v8, (%[pm0])\n"
+ "vle8.v v9, (%[pm1])\n"
+ "vrgather.vv v14, v8, v4\n" /* v14 = pm[vx] */
+ "vrgather.vv v15, v9, v5\n" /* v15 = pm[vy] */
+ "vxor.vv v4, v14, v15\n" /* v4 = pbmul[px] */
+ "vxor.vv v3, v4, v2\n" /* v3 = db = pbmul[px] ^ qx */
+ "vxor.vv v1, v3, v0\n" /* v1 = db ^ px; */
+ "vse8.v v3, (%[dq])\n"
+ "vse8.v v1, (%[dp])\n"
+ ".option pop\n"
+ : :
+ [px]"r"(p),
+ [dp]"r"(dp),
+ [qx]"r"(q),
+ [dq]"r"(dq),
+ [qm0]"r"(qmul),
+ [qm1]"r"(qmul + 16),
+ [pm0]"r"(pbmul),
+ [pm1]"r"(pbmul + 16)
+ :);
+
+ bytes -= 16;
+ p += 16;
+ q += 16;
+ dp += 16;
+ dq += 16;
+ }
+}
+
+static void __raid6_datap_recov_rvv(int bytes, u8 *p, u8 *q,
+ u8 *dq, const u8 *qmul)
+{
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsetvli x0, %[avl], e8, m1, ta, ma\n"
+ ".option pop\n"
+ : :
+ [avl]"r"(16)
+ );
+
+ /*
+ * while (bytes--) {
+ * *p++ ^= *dq = qmul[*q ^ *dq];
+ * q++; dq++;
+ * }
+ */
+ while (bytes) {
+ /*
+ * v0:vx, v1:vy,
+ * v2:dq, v3:p,
+ * v4:qm0, v5:qm1,
+ * v10:m[vx], v11:m[vy]
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v0, (%[vx])\n"
+ "vle8.v v2, (%[dq])\n"
+ "vxor.vv v0, v0, v2\n"
+ "vsrl.vi v1, v0, 4\n"
+ "vand.vi v0, v0, 0xf\n"
+ "vle8.v v4, (%[qm0])\n"
+ "vle8.v v5, (%[qm1])\n"
+ "vrgather.vv v10, v4, v0\n"
+ "vrgather.vv v11, v5, v1\n"
+ "vxor.vv v0, v10, v11\n"
+ "vle8.v v1, (%[vy])\n"
+ "vxor.vv v1, v0, v1\n"
+ "vse8.v v0, (%[dq])\n"
+ "vse8.v v1, (%[vy])\n"
+ ".option pop\n"
+ : :
+ [vx]"r"(q),
+ [vy]"r"(p),
+ [dq]"r"(dq),
+ [qm0]"r"(qmul),
+ [qm1]"r"(qmul + 16)
+ :);
+
+ bytes -= 16;
+ p += 16;
+ q += 16;
+ dq += 16;
+ }
+}
+
+static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data pages
+ * Use the dead data pages as temporary storage for
+ * delta p and delta q
+ */
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks - 2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = raid6_get_zero_page();
+ ptrs[disks - 1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks - 2] = p;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+ raid6_gfexp[failb]]];
+
+ kernel_vector_begin();
+ __raid6_2data_recov_rvv(bytes, p, q, dp, dq, pbmul, qmul);
+ kernel_vector_end();
+}
+
+static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data page
+ * Use the dead data page as temporary storage for delta q
+ */
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks - 1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ kernel_vector_begin();
+ __raid6_datap_recov_rvv(bytes, p, q, dq, qmul);
+ kernel_vector_end();
+}
+
+const struct raid6_recov_calls raid6_recov_rvv = {
+ .data2 = raid6_2data_recov_rvv,
+ .datap = raid6_datap_recov_rvv,
+ .valid = rvv_has_vector,
+ .name = "rvv",
+ .priority = 1,
+};
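All of the SIMD recovery variants (pshufb, vqtbl1q_u8, vshuf.b, vrgather.vv) rely on the same split-table trick: one 32-byte row of raid6_vgfmul serves as two 16-entry lookup tables, the first half indexed by the low nibble and the second by the high nibble, and the two partial products are XORed. This works because GF(256) multiplication distributes over the split x = (x & 0x0f) ^ (x & 0xf0). A per-byte sketch of that lookup (function name is illustrative):

#include <stdint.h>

static inline uint8_t vgfmul_byte(const uint8_t tbl[32], uint8_t x)
{
	/* tbl[0..15]: products for the low nibble, tbl[16..31]: for the high nibble */
	return tbl[x & 0x0f] ^ tbl[16 + (x >> 4)];
}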
diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c
index 179eec900cea..487018f81192 100644
--- a/lib/raid6/recov_s390xc.c
+++ b/lib/raid6/recov_s390xc.c
@@ -6,7 +6,6 @@
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
*/
-#include <linux/export.h>
#include <linux/raid/pq.h>
static inline void xor_block(u8 *p1, u8 *p2)
@@ -35,10 +34,10 @@ static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila,
Use the dead data pages as temporary storage for
delta p and delta q */
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -82,7 +81,7 @@ static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila,
/* Compute syndrome with zero for the missing data page
Use the dead data page as temporary storage for delta q */
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c
index cda33e56a5e3..2e849185c32b 100644
--- a/lib/raid6/recov_ssse3.c
+++ b/lib/raid6/recov_ssse3.c
@@ -1,14 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2012 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
-#ifdef CONFIG_AS_SSSE3
-
#include <linux/raid/pq.h>
#include "x86.h"
@@ -36,10 +30,10 @@ static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila,
Use the dead data pages as temporary storage for
delta p and delta q */
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -209,7 +203,7 @@ static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila,
/* Compute syndrome with zero for the missing data page
Use the dead data page as temporary storage for delta q */
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -332,7 +326,3 @@ const struct raid6_recov_calls raid6_recov_ssse3 = {
#endif
.priority = 1,
};
-
-#else
-#warning "your version of binutils lacks SSSE3 support"
-#endif
diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c
new file mode 100644
index 000000000000..75c9dafedb28
--- /dev/null
+++ b/lib/raid6/rvv.c
@@ -0,0 +1,1228 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RAID-6 syndrome calculation using RISC-V vector instructions
+ *
+ * Copyright 2024 Institute of Software, CAS.
+ * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
+ *
+ * Based on neon.uc:
+ * Copyright 2002-2004 H. Peter Anvin
+ */
+
+#include "rvv.h"
+
+#ifdef __riscv_vector
+#error "This code must be built without compiler support for vector"
+#endif
+
+static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ unsigned long vl, d, nsize;
+ int z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0 + 1]; /* XOR parity */
+ q = dptr[z0 + 2]; /* RS syndrome */
+
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
+ ".option pop\n"
+ : "=&r" (vl)
+ );
+
+ nsize = vl;
+
+ /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
+ for (d = 0; d < bytes; d += nsize * 1) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v0, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&dptr[z0][d + 0 * nsize])
+ );
+
+ for (z = z0 - 1 ; z >= 0 ; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+ ".option pop\n"
+ : :
+ [wd0]"r"(&dptr[z][d + 0 * nsize]),
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vse8.v v0, (%[wp0])\n"
+ "vse8.v v1, (%[wq0])\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&p[d + nsize * 0]),
+ [wq0]"r"(&q[d + nsize * 0])
+ );
+ }
+}
+
+static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ unsigned long vl, d, nsize;
+ int z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks - 2]; /* XOR parity */
+ q = dptr[disks - 1]; /* RS syndrome */
+
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
+ ".option pop\n"
+ : "=&r" (vl)
+ );
+
+ nsize = vl;
+
+ /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
+ for (d = 0 ; d < bytes ; d += nsize * 1) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v0, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&dptr[z0][d + 0 * nsize])
+ );
+
+ /* P/Q data pages */
+ for (z = z0 - 1; z >= start; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+ ".option pop\n"
+ : :
+ [wd0]"r"(&dptr[z][d + 0 * nsize]),
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /* P/Q left side optimization */
+ for (z = start - 1; z >= 0; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * wq$$ = w1$$ ^ w2$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v1, v3, v2\n"
+ ".option pop\n"
+ : :
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+ * v0:wp0, v1:wq0, v2:p0, v3:q0
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v2, (%[wp0])\n"
+ "vle8.v v3, (%[wq0])\n"
+ "vxor.vv v2, v2, v0\n"
+ "vxor.vv v3, v3, v1\n"
+ "vse8.v v2, (%[wp0])\n"
+ "vse8.v v3, (%[wq0])\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&p[d + nsize * 0]),
+ [wq0]"r"(&q[d + nsize * 0])
+ );
+ }
+}
+
+static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ unsigned long vl, d, nsize;
+ int z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0 + 1]; /* XOR parity */
+ q = dptr[z0 + 2]; /* RS syndrome */
+
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
+ ".option pop\n"
+ : "=&r" (vl)
+ );
+
+ nsize = vl;
+
+ /*
+ * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+ * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+ */
+ for (d = 0; d < bytes; d += nsize * 2) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v0, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
+ "vle8.v v4, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&dptr[z0][d + 0 * nsize]),
+ [wp1]"r"(&dptr[z0][d + 1 * nsize])
+ );
+
+ for (z = z0 - 1; z >= 0; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v7, v7, v6\n"
+ "vle8.v v6, (%[wd1])\n"
+ "vxor.vv v5, v7, v6\n"
+ "vxor.vv v4, v4, v6\n"
+ ".option pop\n"
+ : :
+ [wd0]"r"(&dptr[z][d + 0 * nsize]),
+ [wd1]"r"(&dptr[z][d + 1 * nsize]),
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vse8.v v0, (%[wp0])\n"
+ "vse8.v v1, (%[wq0])\n"
+ "vse8.v v4, (%[wp1])\n"
+ "vse8.v v5, (%[wq1])\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&p[d + nsize * 0]),
+ [wq0]"r"(&q[d + nsize * 0]),
+ [wp1]"r"(&p[d + nsize * 1]),
+ [wq1]"r"(&q[d + nsize * 1])
+ );
+ }
+}
+
+static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ unsigned long vl, d, nsize;
+ int z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks - 2]; /* XOR parity */
+ q = dptr[disks - 1]; /* RS syndrome */
+
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
+ ".option pop\n"
+ : "=&r" (vl)
+ );
+
+ nsize = vl;
+
+ /*
+ * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+ * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+ */
+ for (d = 0; d < bytes; d += nsize * 2) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v0, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
+ "vle8.v v4, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&dptr[z0][d + 0 * nsize]),
+ [wp1]"r"(&dptr[z0][d + 1 * nsize])
+ );
+
+ /* P/Q data pages */
+ for (z = z0 - 1; z >= start; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v7, v7, v6\n"
+ "vle8.v v6, (%[wd1])\n"
+ "vxor.vv v5, v7, v6\n"
+ "vxor.vv v4, v4, v6\n"
+ ".option pop\n"
+ : :
+ [wd0]"r"(&dptr[z][d + 0 * nsize]),
+ [wd1]"r"(&dptr[z][d + 1 * nsize]),
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /* P/Q left side optimization */
+ for (z = start - 1; z >= 0; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * wq$$ = w1$$ ^ w2$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v1, v3, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v5, v7, v6\n"
+ ".option pop\n"
+ : :
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+ * v0:wp0, v1:wq0, v2:p0, v3:q0
+ * v4:wp1, v5:wq1, v6:p1, v7:q1
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v2, (%[wp0])\n"
+ "vle8.v v3, (%[wq0])\n"
+ "vxor.vv v2, v2, v0\n"
+ "vxor.vv v3, v3, v1\n"
+ "vse8.v v2, (%[wp0])\n"
+ "vse8.v v3, (%[wq0])\n"
+
+ "vle8.v v6, (%[wp1])\n"
+ "vle8.v v7, (%[wq1])\n"
+ "vxor.vv v6, v6, v4\n"
+ "vxor.vv v7, v7, v5\n"
+ "vse8.v v6, (%[wp1])\n"
+ "vse8.v v7, (%[wq1])\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&p[d + nsize * 0]),
+ [wq0]"r"(&q[d + nsize * 0]),
+ [wp1]"r"(&p[d + nsize * 1]),
+ [wq1]"r"(&q[d + nsize * 1])
+ );
+ }
+}
+
+static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ unsigned long vl, d, nsize;
+ int z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0 + 1]; /* XOR parity */
+ q = dptr[z0 + 2]; /* RS syndrome */
+
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
+ ".option pop\n"
+ : "=&r" (vl)
+ );
+
+ nsize = vl;
+
+ /*
+ * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+ * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+ * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
+ * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
+ */
+ for (d = 0; d < bytes; d += nsize * 4) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v0, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
+ "vle8.v v4, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
+ "vle8.v v8, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
+ "vle8.v v12, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&dptr[z0][d + 0 * nsize]),
+ [wp1]"r"(&dptr[z0][d + 1 * nsize]),
+ [wp2]"r"(&dptr[z0][d + 2 * nsize]),
+ [wp3]"r"(&dptr[z0][d + 3 * nsize])
+ );
+
+ for (z = z0 - 1; z >= 0; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v7, v7, v6\n"
+ "vle8.v v6, (%[wd1])\n"
+ "vxor.vv v5, v7, v6\n"
+ "vxor.vv v4, v4, v6\n"
+
+ "vsra.vi v10, v9, 7\n"
+ "vsll.vi v11, v9, 1\n"
+ "vand.vx v10, v10, %[x1d]\n"
+ "vxor.vv v11, v11, v10\n"
+ "vle8.v v10, (%[wd2])\n"
+ "vxor.vv v9, v11, v10\n"
+ "vxor.vv v8, v8, v10\n"
+
+ "vsra.vi v14, v13, 7\n"
+ "vsll.vi v15, v13, 1\n"
+ "vand.vx v14, v14, %[x1d]\n"
+ "vxor.vv v15, v15, v14\n"
+ "vle8.v v14, (%[wd3])\n"
+ "vxor.vv v13, v15, v14\n"
+ "vxor.vv v12, v12, v14\n"
+ ".option pop\n"
+ : :
+ [wd0]"r"(&dptr[z][d + 0 * nsize]),
+ [wd1]"r"(&dptr[z][d + 1 * nsize]),
+ [wd2]"r"(&dptr[z][d + 2 * nsize]),
+ [wd3]"r"(&dptr[z][d + 3 * nsize]),
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vse8.v v0, (%[wp0])\n"
+ "vse8.v v1, (%[wq0])\n"
+ "vse8.v v4, (%[wp1])\n"
+ "vse8.v v5, (%[wq1])\n"
+ "vse8.v v8, (%[wp2])\n"
+ "vse8.v v9, (%[wq2])\n"
+ "vse8.v v12, (%[wp3])\n"
+ "vse8.v v13, (%[wq3])\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&p[d + nsize * 0]),
+ [wq0]"r"(&q[d + nsize * 0]),
+ [wp1]"r"(&p[d + nsize * 1]),
+ [wq1]"r"(&q[d + nsize * 1]),
+ [wp2]"r"(&p[d + nsize * 2]),
+ [wq2]"r"(&q[d + nsize * 2]),
+ [wp3]"r"(&p[d + nsize * 3]),
+ [wq3]"r"(&q[d + nsize * 3])
+ );
+ }
+}
+
+static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ unsigned long vl, d, nsize;
+ int z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks - 2]; /* XOR parity */
+ q = dptr[disks - 1]; /* RS syndrome */
+
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
+ ".option pop\n"
+ : "=&r" (vl)
+ );
+
+ nsize = vl;
+
+ /*
+ * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+ * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+ * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
+ * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
+ */
+ for (d = 0; d < bytes; d += nsize * 4) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v0, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
+ "vle8.v v4, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
+ "vle8.v v8, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
+ "vle8.v v12, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&dptr[z0][d + 0 * nsize]),
+ [wp1]"r"(&dptr[z0][d + 1 * nsize]),
+ [wp2]"r"(&dptr[z0][d + 2 * nsize]),
+ [wp3]"r"(&dptr[z0][d + 3 * nsize])
+ );
+
+ /* P/Q data pages */
+ for (z = z0 - 1; z >= start; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v7, v7, v6\n"
+ "vle8.v v6, (%[wd1])\n"
+ "vxor.vv v5, v7, v6\n"
+ "vxor.vv v4, v4, v6\n"
+
+ "vsra.vi v10, v9, 7\n"
+ "vsll.vi v11, v9, 1\n"
+ "vand.vx v10, v10, %[x1d]\n"
+ "vxor.vv v11, v11, v10\n"
+ "vle8.v v10, (%[wd2])\n"
+ "vxor.vv v9, v11, v10\n"
+ "vxor.vv v8, v8, v10\n"
+
+ "vsra.vi v14, v13, 7\n"
+ "vsll.vi v15, v13, 1\n"
+ "vand.vx v14, v14, %[x1d]\n"
+ "vxor.vv v15, v15, v14\n"
+ "vle8.v v14, (%[wd3])\n"
+ "vxor.vv v13, v15, v14\n"
+ "vxor.vv v12, v12, v14\n"
+ ".option pop\n"
+ : :
+ [wd0]"r"(&dptr[z][d + 0 * nsize]),
+ [wd1]"r"(&dptr[z][d + 1 * nsize]),
+ [wd2]"r"(&dptr[z][d + 2 * nsize]),
+ [wd3]"r"(&dptr[z][d + 3 * nsize]),
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /* P/Q left side optimization */
+ for (z = start - 1; z >= 0; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * wq$$ = w1$$ ^ w2$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v1, v3, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v5, v7, v6\n"
+
+ "vsra.vi v10, v9, 7\n"
+ "vsll.vi v11, v9, 1\n"
+ "vand.vx v10, v10, %[x1d]\n"
+ "vxor.vv v9, v11, v10\n"
+
+ "vsra.vi v14, v13, 7\n"
+ "vsll.vi v15, v13, 1\n"
+ "vand.vx v14, v14, %[x1d]\n"
+ "vxor.vv v13, v15, v14\n"
+ ".option pop\n"
+ : :
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+ * v0:wp0, v1:wq0, v2:p0, v3:q0
+ * v4:wp1, v5:wq1, v6:p1, v7:q1
+ * v8:wp2, v9:wq2, v10:p2, v11:q2
+ * v12:wp3, v13:wq3, v14:p3, v15:q3
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v2, (%[wp0])\n"
+ "vle8.v v3, (%[wq0])\n"
+ "vxor.vv v2, v2, v0\n"
+ "vxor.vv v3, v3, v1\n"
+ "vse8.v v2, (%[wp0])\n"
+ "vse8.v v3, (%[wq0])\n"
+
+ "vle8.v v6, (%[wp1])\n"
+ "vle8.v v7, (%[wq1])\n"
+ "vxor.vv v6, v6, v4\n"
+ "vxor.vv v7, v7, v5\n"
+ "vse8.v v6, (%[wp1])\n"
+ "vse8.v v7, (%[wq1])\n"
+
+ "vle8.v v10, (%[wp2])\n"
+ "vle8.v v11, (%[wq2])\n"
+ "vxor.vv v10, v10, v8\n"
+ "vxor.vv v11, v11, v9\n"
+ "vse8.v v10, (%[wp2])\n"
+ "vse8.v v11, (%[wq2])\n"
+
+ "vle8.v v14, (%[wp3])\n"
+ "vle8.v v15, (%[wq3])\n"
+ "vxor.vv v14, v14, v12\n"
+ "vxor.vv v15, v15, v13\n"
+ "vse8.v v14, (%[wp3])\n"
+ "vse8.v v15, (%[wq3])\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&p[d + nsize * 0]),
+ [wq0]"r"(&q[d + nsize * 0]),
+ [wp1]"r"(&p[d + nsize * 1]),
+ [wq1]"r"(&q[d + nsize * 1]),
+ [wp2]"r"(&p[d + nsize * 2]),
+ [wq2]"r"(&q[d + nsize * 2]),
+ [wp3]"r"(&p[d + nsize * 3]),
+ [wq3]"r"(&q[d + nsize * 3])
+ );
+ }
+}
+
+static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ unsigned long vl, d, nsize;
+ int z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0 + 1]; /* XOR parity */
+ q = dptr[z0 + 2]; /* RS syndrome */
+
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
+ ".option pop\n"
+ : "=&r" (vl)
+ );
+
+ nsize = vl;
+
+ /*
+ * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+ * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+ * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
+ * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
+ * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
+ * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
+ * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
+ * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
+ */
+ for (d = 0; d < bytes; d += nsize * 8) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v0, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
+ "vle8.v v4, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
+ "vle8.v v8, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
+ "vle8.v v12, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
+ "vle8.v v16, (%[wp4])\n"
+ "vmv.v.v v17, v16\n"
+ "vle8.v v20, (%[wp5])\n"
+ "vmv.v.v v21, v20\n"
+ "vle8.v v24, (%[wp6])\n"
+ "vmv.v.v v25, v24\n"
+ "vle8.v v28, (%[wp7])\n"
+ "vmv.v.v v29, v28\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&dptr[z0][d + 0 * nsize]),
+ [wp1]"r"(&dptr[z0][d + 1 * nsize]),
+ [wp2]"r"(&dptr[z0][d + 2 * nsize]),
+ [wp3]"r"(&dptr[z0][d + 3 * nsize]),
+ [wp4]"r"(&dptr[z0][d + 4 * nsize]),
+ [wp5]"r"(&dptr[z0][d + 5 * nsize]),
+ [wp6]"r"(&dptr[z0][d + 6 * nsize]),
+ [wp7]"r"(&dptr[z0][d + 7 * nsize])
+ );
+
+ for (z = z0 - 1; z >= 0; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v7, v7, v6\n"
+ "vle8.v v6, (%[wd1])\n"
+ "vxor.vv v5, v7, v6\n"
+ "vxor.vv v4, v4, v6\n"
+
+ "vsra.vi v10, v9, 7\n"
+ "vsll.vi v11, v9, 1\n"
+ "vand.vx v10, v10, %[x1d]\n"
+ "vxor.vv v11, v11, v10\n"
+ "vle8.v v10, (%[wd2])\n"
+ "vxor.vv v9, v11, v10\n"
+ "vxor.vv v8, v8, v10\n"
+
+ "vsra.vi v14, v13, 7\n"
+ "vsll.vi v15, v13, 1\n"
+ "vand.vx v14, v14, %[x1d]\n"
+ "vxor.vv v15, v15, v14\n"
+ "vle8.v v14, (%[wd3])\n"
+ "vxor.vv v13, v15, v14\n"
+ "vxor.vv v12, v12, v14\n"
+
+ "vsra.vi v18, v17, 7\n"
+ "vsll.vi v19, v17, 1\n"
+ "vand.vx v18, v18, %[x1d]\n"
+ "vxor.vv v19, v19, v18\n"
+ "vle8.v v18, (%[wd4])\n"
+ "vxor.vv v17, v19, v18\n"
+ "vxor.vv v16, v16, v18\n"
+
+ "vsra.vi v22, v21, 7\n"
+ "vsll.vi v23, v21, 1\n"
+ "vand.vx v22, v22, %[x1d]\n"
+ "vxor.vv v23, v23, v22\n"
+ "vle8.v v22, (%[wd5])\n"
+ "vxor.vv v21, v23, v22\n"
+ "vxor.vv v20, v20, v22\n"
+
+ "vsra.vi v26, v25, 7\n"
+ "vsll.vi v27, v25, 1\n"
+ "vand.vx v26, v26, %[x1d]\n"
+ "vxor.vv v27, v27, v26\n"
+ "vle8.v v26, (%[wd6])\n"
+ "vxor.vv v25, v27, v26\n"
+ "vxor.vv v24, v24, v26\n"
+
+ "vsra.vi v30, v29, 7\n"
+ "vsll.vi v31, v29, 1\n"
+ "vand.vx v30, v30, %[x1d]\n"
+ "vxor.vv v31, v31, v30\n"
+ "vle8.v v30, (%[wd7])\n"
+ "vxor.vv v29, v31, v30\n"
+ "vxor.vv v28, v28, v30\n"
+ ".option pop\n"
+ : :
+ [wd0]"r"(&dptr[z][d + 0 * nsize]),
+ [wd1]"r"(&dptr[z][d + 1 * nsize]),
+ [wd2]"r"(&dptr[z][d + 2 * nsize]),
+ [wd3]"r"(&dptr[z][d + 3 * nsize]),
+ [wd4]"r"(&dptr[z][d + 4 * nsize]),
+ [wd5]"r"(&dptr[z][d + 5 * nsize]),
+ [wd6]"r"(&dptr[z][d + 6 * nsize]),
+ [wd7]"r"(&dptr[z][d + 7 * nsize]),
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vse8.v v0, (%[wp0])\n"
+ "vse8.v v1, (%[wq0])\n"
+ "vse8.v v4, (%[wp1])\n"
+ "vse8.v v5, (%[wq1])\n"
+ "vse8.v v8, (%[wp2])\n"
+ "vse8.v v9, (%[wq2])\n"
+ "vse8.v v12, (%[wp3])\n"
+ "vse8.v v13, (%[wq3])\n"
+ "vse8.v v16, (%[wp4])\n"
+ "vse8.v v17, (%[wq4])\n"
+ "vse8.v v20, (%[wp5])\n"
+ "vse8.v v21, (%[wq5])\n"
+ "vse8.v v24, (%[wp6])\n"
+ "vse8.v v25, (%[wq6])\n"
+ "vse8.v v28, (%[wp7])\n"
+ "vse8.v v29, (%[wq7])\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&p[d + nsize * 0]),
+ [wq0]"r"(&q[d + nsize * 0]),
+ [wp1]"r"(&p[d + nsize * 1]),
+ [wq1]"r"(&q[d + nsize * 1]),
+ [wp2]"r"(&p[d + nsize * 2]),
+ [wq2]"r"(&q[d + nsize * 2]),
+ [wp3]"r"(&p[d + nsize * 3]),
+ [wq3]"r"(&q[d + nsize * 3]),
+ [wp4]"r"(&p[d + nsize * 4]),
+ [wq4]"r"(&q[d + nsize * 4]),
+ [wp5]"r"(&p[d + nsize * 5]),
+ [wq5]"r"(&q[d + nsize * 5]),
+ [wp6]"r"(&p[d + nsize * 6]),
+ [wq6]"r"(&q[d + nsize * 6]),
+ [wp7]"r"(&p[d + nsize * 7]),
+ [wq7]"r"(&q[d + nsize * 7])
+ );
+ }
+}
+
+static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ unsigned long vl, d, nsize;
+ int z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks - 2]; /* XOR parity */
+ q = dptr[disks - 1]; /* RS syndrome */
+
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
+ ".option pop\n"
+ : "=&r" (vl)
+ );
+
+ nsize = vl;
+
+ /*
+ * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+ * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+ * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
+ * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
+ * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
+ * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
+ * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
+ * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
+ */
+ for (d = 0; d < bytes; d += nsize * 8) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v0, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
+ "vle8.v v4, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
+ "vle8.v v8, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
+ "vle8.v v12, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
+ "vle8.v v16, (%[wp4])\n"
+ "vmv.v.v v17, v16\n"
+ "vle8.v v20, (%[wp5])\n"
+ "vmv.v.v v21, v20\n"
+ "vle8.v v24, (%[wp6])\n"
+ "vmv.v.v v25, v24\n"
+ "vle8.v v28, (%[wp7])\n"
+ "vmv.v.v v29, v28\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&dptr[z0][d + 0 * nsize]),
+ [wp1]"r"(&dptr[z0][d + 1 * nsize]),
+ [wp2]"r"(&dptr[z0][d + 2 * nsize]),
+ [wp3]"r"(&dptr[z0][d + 3 * nsize]),
+ [wp4]"r"(&dptr[z0][d + 4 * nsize]),
+ [wp5]"r"(&dptr[z0][d + 5 * nsize]),
+ [wp6]"r"(&dptr[z0][d + 6 * nsize]),
+ [wp7]"r"(&dptr[z0][d + 7 * nsize])
+ );
+
+ /* P/Q data pages */
+ for (z = z0 - 1; z >= start; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v7, v7, v6\n"
+ "vle8.v v6, (%[wd1])\n"
+ "vxor.vv v5, v7, v6\n"
+ "vxor.vv v4, v4, v6\n"
+
+ "vsra.vi v10, v9, 7\n"
+ "vsll.vi v11, v9, 1\n"
+ "vand.vx v10, v10, %[x1d]\n"
+ "vxor.vv v11, v11, v10\n"
+ "vle8.v v10, (%[wd2])\n"
+ "vxor.vv v9, v11, v10\n"
+ "vxor.vv v8, v8, v10\n"
+
+ "vsra.vi v14, v13, 7\n"
+ "vsll.vi v15, v13, 1\n"
+ "vand.vx v14, v14, %[x1d]\n"
+ "vxor.vv v15, v15, v14\n"
+ "vle8.v v14, (%[wd3])\n"
+ "vxor.vv v13, v15, v14\n"
+ "vxor.vv v12, v12, v14\n"
+
+ "vsra.vi v18, v17, 7\n"
+ "vsll.vi v19, v17, 1\n"
+ "vand.vx v18, v18, %[x1d]\n"
+ "vxor.vv v19, v19, v18\n"
+ "vle8.v v18, (%[wd4])\n"
+ "vxor.vv v17, v19, v18\n"
+ "vxor.vv v16, v16, v18\n"
+
+ "vsra.vi v22, v21, 7\n"
+ "vsll.vi v23, v21, 1\n"
+ "vand.vx v22, v22, %[x1d]\n"
+ "vxor.vv v23, v23, v22\n"
+ "vle8.v v22, (%[wd5])\n"
+ "vxor.vv v21, v23, v22\n"
+ "vxor.vv v20, v20, v22\n"
+
+ "vsra.vi v26, v25, 7\n"
+ "vsll.vi v27, v25, 1\n"
+ "vand.vx v26, v26, %[x1d]\n"
+ "vxor.vv v27, v27, v26\n"
+ "vle8.v v26, (%[wd6])\n"
+ "vxor.vv v25, v27, v26\n"
+ "vxor.vv v24, v24, v26\n"
+
+ "vsra.vi v30, v29, 7\n"
+ "vsll.vi v31, v29, 1\n"
+ "vand.vx v30, v30, %[x1d]\n"
+ "vxor.vv v31, v31, v30\n"
+ "vle8.v v30, (%[wd7])\n"
+ "vxor.vv v29, v31, v30\n"
+ "vxor.vv v28, v28, v30\n"
+ ".option pop\n"
+ : :
+ [wd0]"r"(&dptr[z][d + 0 * nsize]),
+ [wd1]"r"(&dptr[z][d + 1 * nsize]),
+ [wd2]"r"(&dptr[z][d + 2 * nsize]),
+ [wd3]"r"(&dptr[z][d + 3 * nsize]),
+ [wd4]"r"(&dptr[z][d + 4 * nsize]),
+ [wd5]"r"(&dptr[z][d + 5 * nsize]),
+ [wd6]"r"(&dptr[z][d + 6 * nsize]),
+ [wd7]"r"(&dptr[z][d + 7 * nsize]),
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /* P/Q left side optimization */
+ for (z = start - 1; z >= 0; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * wq$$ = w1$$ ^ w2$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v1, v3, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v5, v7, v6\n"
+
+ "vsra.vi v10, v9, 7\n"
+ "vsll.vi v11, v9, 1\n"
+ "vand.vx v10, v10, %[x1d]\n"
+ "vxor.vv v9, v11, v10\n"
+
+ "vsra.vi v14, v13, 7\n"
+ "vsll.vi v15, v13, 1\n"
+ "vand.vx v14, v14, %[x1d]\n"
+ "vxor.vv v13, v15, v14\n"
+
+ "vsra.vi v18, v17, 7\n"
+ "vsll.vi v19, v17, 1\n"
+ "vand.vx v18, v18, %[x1d]\n"
+ "vxor.vv v17, v19, v18\n"
+
+ "vsra.vi v22, v21, 7\n"
+ "vsll.vi v23, v21, 1\n"
+ "vand.vx v22, v22, %[x1d]\n"
+ "vxor.vv v21, v23, v22\n"
+
+ "vsra.vi v26, v25, 7\n"
+ "vsll.vi v27, v25, 1\n"
+ "vand.vx v26, v26, %[x1d]\n"
+ "vxor.vv v25, v27, v26\n"
+
+ "vsra.vi v30, v29, 7\n"
+ "vsll.vi v31, v29, 1\n"
+ "vand.vx v30, v30, %[x1d]\n"
+ "vxor.vv v29, v31, v30\n"
+ ".option pop\n"
+ : :
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+ * v0:wp0, v1:wq0, v2:p0, v3:q0
+ * v4:wp1, v5:wq1, v6:p1, v7:q1
+ * v8:wp2, v9:wq2, v10:p2, v11:q2
+ * v12:wp3, v13:wq3, v14:p3, v15:q3
+ * v16:wp4, v17:wq4, v18:p4, v19:q4
+ * v20:wp5, v21:wq5, v22:p5, v23:q5
+ * v24:wp6, v25:wq6, v26:p6, v27:q6
+ * v28:wp7, v29:wq7, v30:p7, v31:q7
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v2, (%[wp0])\n"
+ "vle8.v v3, (%[wq0])\n"
+ "vxor.vv v2, v2, v0\n"
+ "vxor.vv v3, v3, v1\n"
+ "vse8.v v2, (%[wp0])\n"
+ "vse8.v v3, (%[wq0])\n"
+
+ "vle8.v v6, (%[wp1])\n"
+ "vle8.v v7, (%[wq1])\n"
+ "vxor.vv v6, v6, v4\n"
+ "vxor.vv v7, v7, v5\n"
+ "vse8.v v6, (%[wp1])\n"
+ "vse8.v v7, (%[wq1])\n"
+
+ "vle8.v v10, (%[wp2])\n"
+ "vle8.v v11, (%[wq2])\n"
+ "vxor.vv v10, v10, v8\n"
+ "vxor.vv v11, v11, v9\n"
+ "vse8.v v10, (%[wp2])\n"
+ "vse8.v v11, (%[wq2])\n"
+
+ "vle8.v v14, (%[wp3])\n"
+ "vle8.v v15, (%[wq3])\n"
+ "vxor.vv v14, v14, v12\n"
+ "vxor.vv v15, v15, v13\n"
+ "vse8.v v14, (%[wp3])\n"
+ "vse8.v v15, (%[wq3])\n"
+
+ "vle8.v v18, (%[wp4])\n"
+ "vle8.v v19, (%[wq4])\n"
+ "vxor.vv v18, v18, v16\n"
+ "vxor.vv v19, v19, v17\n"
+ "vse8.v v18, (%[wp4])\n"
+ "vse8.v v19, (%[wq4])\n"
+
+ "vle8.v v22, (%[wp5])\n"
+ "vle8.v v23, (%[wq5])\n"
+ "vxor.vv v22, v22, v20\n"
+ "vxor.vv v23, v23, v21\n"
+ "vse8.v v22, (%[wp5])\n"
+ "vse8.v v23, (%[wq5])\n"
+
+ "vle8.v v26, (%[wp6])\n"
+ "vle8.v v27, (%[wq6])\n"
+ "vxor.vv v26, v26, v24\n"
+ "vxor.vv v27, v27, v25\n"
+ "vse8.v v26, (%[wp6])\n"
+ "vse8.v v27, (%[wq6])\n"
+
+ "vle8.v v30, (%[wp7])\n"
+ "vle8.v v31, (%[wq7])\n"
+ "vxor.vv v30, v30, v28\n"
+ "vxor.vv v31, v31, v29\n"
+ "vse8.v v30, (%[wp7])\n"
+ "vse8.v v31, (%[wq7])\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&p[d + nsize * 0]),
+ [wq0]"r"(&q[d + nsize * 0]),
+ [wp1]"r"(&p[d + nsize * 1]),
+ [wq1]"r"(&q[d + nsize * 1]),
+ [wp2]"r"(&p[d + nsize * 2]),
+ [wq2]"r"(&q[d + nsize * 2]),
+ [wp3]"r"(&p[d + nsize * 3]),
+ [wq3]"r"(&q[d + nsize * 3]),
+ [wp4]"r"(&p[d + nsize * 4]),
+ [wq4]"r"(&q[d + nsize * 4]),
+ [wp5]"r"(&p[d + nsize * 5]),
+ [wq5]"r"(&q[d + nsize * 5]),
+ [wp6]"r"(&p[d + nsize * 6]),
+ [wq6]"r"(&q[d + nsize * 6]),
+ [wp7]"r"(&p[d + nsize * 7]),
+ [wq7]"r"(&q[d + nsize * 7])
+ );
+ }
+}
+
+RAID6_RVV_WRAPPER(1);
+RAID6_RVV_WRAPPER(2);
+RAID6_RVV_WRAPPER(4);
+RAID6_RVV_WRAPPER(8);
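The gen/xor syndrome loops above all repeat one per-byte recurrence in vector form: multiply the running Q value by 2 in GF(2^8) (reduction polynomial 0x11d, hence the 0x1d constant) and fold in the next data byte, while P just accumulates XORs. A scalar sketch of that inner step, under the assumption of plain byte-wide operations:

#include <stdint.h>

/* GF(2^8) multiply-by-2, mirroring the MASK/SHLBYTE/AND/XOR sequence above */
static inline uint8_t gf256_mul2(uint8_t wq)
{
	uint8_t w2 = (wq & 0x80) ? 0xff : 0x00;	/* MASK: per-byte arithmetic shift right by 7 */
	uint8_t w1 = (uint8_t)(wq << 1);	/* SHLBYTE: shift left within the byte */

	return w1 ^ (w2 & 0x1d);
}

/*
 * For each data disk z, highest to lowest:
 *	wq = gf256_mul2(wq) ^ data[z];
 *	wp ^= data[z];
 */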
diff --git a/lib/raid6/rvv.h b/lib/raid6/rvv.h
new file mode 100644
index 000000000000..6d0708a2c8a4
--- /dev/null
+++ b/lib/raid6/rvv.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2024 Institute of Software, CAS.
+ *
+ * raid6/rvv.h
+ *
+ * Definitions for RISC-V RAID-6 code
+ */
+
+#ifdef __KERNEL__
+#include <asm/vector.h>
+#else
+#define kernel_vector_begin()
+#define kernel_vector_end()
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+#define has_vector() (getauxval(AT_HWCAP) & COMPAT_HWCAP_ISA_V)
+#endif
+
+#include <linux/raid/pq.h>
+
+static int rvv_has_vector(void)
+{
+ return has_vector();
+}
+
+#define RAID6_RVV_WRAPPER(_n) \
+ static void raid6_rvv ## _n ## _gen_syndrome(int disks, \
+ size_t bytes, void **ptrs) \
+ { \
+ void raid6_rvv ## _n ## _gen_syndrome_real(int d, \
+ unsigned long b, void **p); \
+ kernel_vector_begin(); \
+ raid6_rvv ## _n ## _gen_syndrome_real(disks, \
+ (unsigned long)bytes, ptrs); \
+ kernel_vector_end(); \
+ } \
+ static void raid6_rvv ## _n ## _xor_syndrome(int disks, \
+ int start, int stop, \
+ size_t bytes, void **ptrs) \
+ { \
+ void raid6_rvv ## _n ## _xor_syndrome_real(int d, \
+ int s1, int s2, \
+ unsigned long b, void **p); \
+ kernel_vector_begin(); \
+ raid6_rvv ## _n ## _xor_syndrome_real(disks, \
+ start, stop, (unsigned long)bytes, ptrs); \
+ kernel_vector_end(); \
+ } \
+ struct raid6_calls const raid6_rvvx ## _n = { \
+ raid6_rvv ## _n ## _gen_syndrome, \
+ raid6_rvv ## _n ## _xor_syndrome, \
+ rvv_has_vector, \
+ "rvvx" #_n, \
+ 0 \
+ }
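RAID6_RVV_WRAPPER(_n) keeps the kernel_vector_begin()/kernel_vector_end() bracketing out of the unrolled bodies in rvv.c and publishes one raid6_calls entry per unroll factor. For _n = 1 it expands to roughly the sketch below (whitespace adjusted; the xor_syndrome wrapper is generated the same way and is omitted here):

static void raid6_rvv1_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	void raid6_rvv1_gen_syndrome_real(int d, unsigned long b, void **p);

	kernel_vector_begin();
	raid6_rvv1_gen_syndrome_real(disks, (unsigned long)bytes, ptrs);
	kernel_vector_end();
}

struct raid6_calls const raid6_rvvx1 = {
	raid6_rvv1_gen_syndrome,
	raid6_rvv1_xor_syndrome,	/* analogous wrapper, not shown */
	rvv_has_vector,
	"rvvx1",
	0
};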
diff --git a/lib/raid6/s390vx.uc b/lib/raid6/s390vx.uc
index 914ebe98fc21..8aa53eb2f395 100644
--- a/lib/raid6/s390vx.uc
+++ b/lib/raid6/s390vx.uc
@@ -11,17 +11,16 @@
* This file is postprocessed using unroll.awk.
*/
+#include <linux/cpufeature.h>
#include <linux/raid/pq.h>
-#include <asm/fpu/api.h>
-
-asm(".include \"asm/vx-insn.h\"\n");
+#include <asm/fpu.h>
#define NSIZE 16
-static inline void LOAD_CONST(void)
+static __always_inline void LOAD_CONST(void)
{
- asm volatile("VREPIB %v24,7");
- asm volatile("VREPIB %v25,0x1d");
+ fpu_vrepib(24, 0x07);
+ fpu_vrepib(25, 0x1d);
}
/*
@@ -29,10 +28,7 @@ static inline void LOAD_CONST(void)
* vector register y left by 1 bit and stores the result in
* vector register x.
*/
-static inline void SHLBYTE(int x, int y)
-{
- asm volatile ("VAB %0,%1,%1" : : "i" (x), "i" (y));
-}
+#define SHLBYTE(x, y) fpu_vab(x, y, y)
/*
* For each of the 16 bytes in the vector register y the MASK()
@@ -40,49 +36,17 @@ static inline void SHLBYTE(int x, int y)
* or 0x00 if the high bit is 0. The result is stored in vector
* register x.
*/
-static inline void MASK(int x, int y)
-{
- asm volatile ("VESRAVB %0,%1,24" : : "i" (x), "i" (y));
-}
-
-static inline void AND(int x, int y, int z)
-{
- asm volatile ("VN %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
-}
-
-static inline void XOR(int x, int y, int z)
-{
- asm volatile ("VX %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
-}
+#define MASK(x, y) fpu_vesravb(x, y, 24)
-static inline void LOAD_DATA(int x, u8 *ptr)
-{
- typedef struct { u8 _[16 * $#]; } addrtype;
- register addrtype *__ptr asm("1") = (addrtype *) ptr;
-
- asm volatile ("VLM %2,%3,0,%r1"
- : : "m" (*__ptr), "a" (__ptr), "i" (x),
- "i" (x + $# - 1));
-}
-
-static inline void STORE_DATA(int x, u8 *ptr)
-{
- typedef struct { u8 _[16 * $#]; } addrtype;
- register addrtype *__ptr asm("1") = (addrtype *) ptr;
-
- asm volatile ("VSTM %2,%3,0,1"
- : "=m" (*__ptr) : "a" (__ptr), "i" (x),
- "i" (x + $# - 1));
-}
-
-static inline void COPY_VEC(int x, int y)
-{
- asm volatile ("VLR %0,%1" : : "i" (x), "i" (y));
-}
+#define AND(x, y, z) fpu_vn(x, y, z)
+#define XOR(x, y, z) fpu_vx(x, y, z)
+#define LOAD_DATA(x, ptr) fpu_vlm(x, x + $# - 1, ptr)
+#define STORE_DATA(x, ptr) fpu_vstm(x, x + $# - 1, ptr)
+#define COPY_VEC(x, y) fpu_vlr(x, y)
static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
- struct kernel_fpu vxstate;
+ DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
u8 **dptr, *p, *q;
int d, z, z0;
@@ -115,7 +79,7 @@ static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop,
size_t bytes, void **ptrs)
{
- struct kernel_fpu vxstate;
+ DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
u8 **dptr, *p, *q;
int d, z, z0;
@@ -159,7 +123,7 @@ static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop,
static int raid6_s390vx$#_valid(void)
{
- return MACHINE_HAS_VX;
+ return cpu_has_vx();
}
const struct raid6_calls raid6_s390vx$# = {
diff --git a/lib/raid6/sse1.c b/lib/raid6/sse1.c
index 9025b8ca9aa3..692fa3a93bf0 100644
--- a/lib/raid6/sse1.c
+++ b/lib/raid6/sse1.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002 H. Peter Anvin - All Rights Reserved
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Boston MA 02111-1307, USA; either version 2 of the License, or
- * (at your option) any later version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
/*
diff --git a/lib/raid6/sse2.c b/lib/raid6/sse2.c
index 8191e1d0d2fb..2930220249c9 100644
--- a/lib/raid6/sse2.c
+++ b/lib/raid6/sse2.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002 H. Peter Anvin - All Rights Reserved
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Boston MA 02111-1307, USA; either version 2 of the License, or
- * (at your option) any later version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
/*
diff --git a/lib/raid6/test/.gitignore b/lib/raid6/test/.gitignore
new file mode 100644
index 000000000000..1b68a77f348f
--- /dev/null
+++ b/lib/raid6/test/.gitignore
@@ -0,0 +1,3 @@
+/int.uc
+/neon.uc
+/raid6test
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 3ab8720aa2f8..09bbe2b14cce 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -4,14 +4,17 @@
# from userspace.
#
-CC = gcc
-OPTFLAGS = -O2 # Adjust as desired
-CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS)
-LD = ld
-AWK = awk -f
-AR = ar
-RANLIB = ranlib
-OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o
+pound := \#
+
+# Adjust as desired
+CC = gcc
+OPTFLAGS = -O2
+CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS)
+LD = ld
+AWK = awk -f
+AR = ar
+RANLIB = ranlib
+OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o
ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/)
ifeq ($(ARCH),i386)
@@ -32,29 +35,42 @@ ifeq ($(ARCH),aarch64)
HAS_NEON = yes
endif
+ifeq ($(findstring riscv,$(ARCH)),riscv)
+ CFLAGS += -I../../../arch/riscv/include -DCONFIG_RISCV=1
+ HAS_RVV = yes
+endif
+
+ifeq ($(findstring ppc,$(ARCH)),ppc)
+ CFLAGS += -I../../../arch/powerpc/include
+ HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\
+ gcc -c -x c - >/dev/null && rm ./-.o && echo yes)
+endif
+
+ifeq ($(ARCH),loongarch64)
+ CFLAGS += -I../../../arch/loongarch/include -DCONFIG_LOONGARCH=1
+ CFLAGS += $(shell echo 'vld $$vr0, $$zero, 0' | \
+ gcc -c -x assembler - >/dev/null 2>&1 && \
+ rm ./-.o && echo -DCONFIG_CPU_HAS_LSX=1)
+ CFLAGS += $(shell echo 'xvld $$xr0, $$zero, 0' | \
+ gcc -c -x assembler - >/dev/null 2>&1 && \
+ rm ./-.o && echo -DCONFIG_CPU_HAS_LASX=1)
+endif
+
ifeq ($(IS_X86),yes)
OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o
- CFLAGS += $(shell echo "pshufb %xmm0, %xmm0" | \
- gcc -c -x assembler - >&/dev/null && \
- rm ./-.o && echo -DCONFIG_AS_SSSE3=1)
- CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \
- gcc -c -x assembler - >&/dev/null && \
- rm ./-.o && echo -DCONFIG_AS_AVX2=1)
- CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \
- gcc -c -x assembler - >&/dev/null && \
- rm ./-.o && echo -DCONFIG_AS_AVX512=1)
+ CFLAGS += -DCONFIG_X86
else ifeq ($(HAS_NEON),yes)
OBJS += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
-else
- HAS_ALTIVEC := $(shell printf '\#include <altivec.h>\nvector int a;\n' |\
- gcc -c -x c - >/dev/null && rm ./-.o && echo yes)
- ifeq ($(HAS_ALTIVEC),yes)
- CFLAGS += -I../../../arch/powerpc/include
- CFLAGS += -DCONFIG_ALTIVEC
- OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
- vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
- endif
+else ifeq ($(HAS_ALTIVEC),yes)
+ CFLAGS += -DCONFIG_ALTIVEC
+ OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
+ vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
+else ifeq ($(ARCH),loongarch64)
+ OBJS += loongarch_simd.o recov_loongarch_simd.o
+else ifeq ($(HAS_RVV),yes)
+ OBJS += rvv.o recov_rvv.o
+ CFLAGS += -DCONFIG_RISCV_ISA_V=1
endif
.c.o:
@@ -66,12 +82,12 @@ endif
%.uc: ../%.uc
cp -f $< $@
-all: raid6.a raid6test
+all: raid6.a raid6test
raid6.a: $(OBJS)
- rm -f $@
- $(AR) cq $@ $^
- $(RANLIB) $@
+ rm -f $@
+ $(AR) cq $@ $^
+ $(RANLIB) $@
raid6test: test.c raid6.a
$(CC) $(CFLAGS) -o raid6test $^
diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c
index b07f4d8e6b03..841a55242aba 100644
--- a/lib/raid6/test/test.c
+++ b/lib/raid6/test/test.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002-2007 H. Peter Anvin - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2 or (at your
- * option) any later version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
/*
@@ -22,7 +19,6 @@
#define NDISKS 16 /* Including P and Q */
const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
-struct raid6_calls raid6_call;
char *dataptrs[NDISKS];
char data[NDISKS][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
diff --git a/lib/raid6/unroll.awk b/lib/raid6/unroll.awk
index c6aa03631df8..0809805a7e23 100644
--- a/lib/raid6/unroll.awk
+++ b/lib/raid6/unroll.awk
@@ -13,7 +13,7 @@ BEGIN {
for (i = 0; i < rep; ++i) {
tmp = $0
gsub(/\$\$/, i, tmp)
- gsub(/\$\#/, n, tmp)
+ gsub(/\$#/, n, tmp)
gsub(/\$\*/, "$", tmp)
print tmp
}
diff --git a/lib/raid6/vpermxor.uc b/lib/raid6/vpermxor.uc
index 10475dc423c1..1bfb127fbfe8 100644
--- a/lib/raid6/vpermxor.uc
+++ b/lib/raid6/vpermxor.uc
@@ -24,9 +24,9 @@
#ifdef CONFIG_ALTIVEC
#include <altivec.h>
+#include <asm/ppc-opcode.h>
#ifdef __KERNEL__
#include <asm/cputable.h>
-#include <asm/ppc-opcode.h>
#include <asm/switch_to.h>
#endif
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h
index 834d268a4b05..9a6ff37115e7 100644
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -1,13 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/* ----------------------------------------------------------------------- *
*
* Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Boston MA 02111-1307, USA; either version 2 of the License, or
- * (at your option) any later version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
/*