86 files changed, 1826 insertions, 744 deletions
diff --git a/Documentation/admin-guide/device-mapper/dm-raid.rst b/Documentation/admin-guide/device-mapper/dm-raid.rst index bb17e26e3c1b..e11f10764770 100644 --- a/Documentation/admin-guide/device-mapper/dm-raid.rst +++ b/Documentation/admin-guide/device-mapper/dm-raid.rst @@ -20,10 +20,10 @@ The target is named "raid" and it accepts the following parameters:: raid0 RAID0 striping (no resilience) raid1 RAID1 mirroring raid4 RAID4 with dedicated last parity disk - raid5_n RAID5 with dedicated last parity disk supporting takeover + raid5_n RAID5 with dedicated last parity disk supporting takeover from/to raid1 Same as raid4 - - Transitory layout + - Transitory layout for takeover from/to raid1 raid5_la RAID5 left asymmetric - rotating parity 0 with data continuation @@ -48,8 +48,8 @@ The target is named "raid" and it accepts the following parameters:: raid6_n_6 RAID6 with dedicate parity disks - parity and Q-syndrome on the last 2 disks; - layout for takeover from/to raid4/raid5_n - raid6_la_6 Same as "raid_la" plus dedicated last Q-syndrome disk + layout for takeover from/to raid0/raid4/raid5_n + raid6_la_6 Same as "raid_la" plus dedicated last Q-syndrome disk supporting takeover from/to raid5 - layout for takeover from raid5_la from/to raid6 raid6_ra_6 Same as "raid5_ra" dedicated last Q-syndrome disk @@ -173,9 +173,9 @@ The target is named "raid" and it accepts the following parameters:: The delta_disks option value (-251 < N < +251) triggers device removal (negative value) or device addition (positive value) to any reshape supporting raid levels 4/5/6 and 10. - RAID levels 4/5/6 allow for addition of devices (metadata - and data device tuple), raid10_near and raid10_offset only - allow for device addition. raid10_far does not support any + RAID levels 4/5/6 allow for addition and removal of devices + (metadata and data device tuple), raid10_near and raid10_offset + only allow for device addition. raid10_far does not support any reshaping at all. A minimum of devices have to be kept to enforce resilience, which is 3 devices for raid4/5 and 4 devices for raid6. @@ -372,6 +372,72 @@ to safely enable discard support for RAID 4/5/6: 'devices_handle_discards_safely' +Takeover/Reshape Support +------------------------ +The target natively supports these two types of MDRAID conversions: + +o Takeover: Converts an array from one RAID level to another + +o Reshape: Changes the internal layout while maintaining the current RAID level + +Each operation is only valid under specific constraints imposed by the existing array's layout and configuration. 
+ + +Takeover: +linear -> raid1 with N >= 2 mirrors +raid0 -> raid4 (add dedicated parity device) +raid0 -> raid5 (add dedicated parity device) +raid0 -> raid10 with near layout and N >= 2 mirror groups (raid0 stripes have to become first member within mirror groups) +raid1 -> linear +raid1 -> raid5 with 2 mirrors +raid4 -> raid5 with rotating parity +raid5 with dedicated parity device -> raid4 +raid5 -> raid6 (with dedicated Q-syndrome) +raid6 (with dedicated Q-syndrome) -> raid5 +raid10 with near layout and even number of disks -> raid0 (select any in-sync device from each mirror group) + +Reshape: +linear: not possible +raid0: not possible +raid1: change number of mirrors +raid4: add and remove stripes (minimum 3), change stripesize +raid5: add and remove stripes (minimum 3, special case 2 for raid1 takeover), change rotating parity algorithms, change stripesize +raid6: add and remove stripes (minimum 4), change rotating syndrome algorithms, change stripesize +raid10 near: add stripes (minimum 4), change stripesize, no stripe removal possible, change to offset layout +raid10 offset: add stripes, change stripesize, no stripe removal possible, change to near layout +raid10 far: not possible + +Table line examples: + +### raid1 -> raid5 +# +# 2-device limitation in raid1. +# The raid5 personality is able to map just 2 devices like raid1 does. +# Reshape after takeover to change to full raid5 layout + + 0 1960886272 raid raid1 3 0 region_size 2048 2 /dev/dm-0 /dev/dm-1 /dev/dm-2 /dev/dm-3 + +# dm-0 and dm-2 are e.g. 4MiB large metadata devices, dm-1 and dm-3 have to be at least 1960886272 sectors in size. +# +# Table line to takeover to raid5 + + 0 1960886272 raid raid5 3 0 region_size 2048 2 /dev/dm-0 /dev/dm-1 /dev/dm-2 /dev/dm-3 + +# Add required out-of-place reshape space to the beginning of the given 2 data devices, +# allocate another metadata/data device tuple with the same sizes for the parity space +# and zero the first 4K of the metadata device. +# +# Example table of the out-of-place reshape space addition for one data device, e.g. dm-1 + + 0 8192 linear 8:0 0 1960903888 # <- must be free space segment + 8192 1960886272 linear 8:0 0 2048 # previous data segment + +# Mapping table for e.g. raid5_rs reshape causing the size of the raid device to double once the reshape finishes. +# Check the status output (e.g. "dmsetup status $RaidDev") for progress. + + 0 $((2 * 1960886272)) raid raid5 7 0 region_size 2048 data_offset 8192 delta_disks 1 2 /dev/dm-0 /dev/dm-1 /dev/dm-2 /dev/dm-3 + + Version History --------------- diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst index 8c3f1f967a3c..3ecab1cff9c6 100644 --- a/Documentation/admin-guide/device-mapper/verity.rst +++ b/Documentation/admin-guide/device-mapper/verity.rst @@ -236,8 +236,10 @@ is available at the cryptsetup project's wiki page Status ====== -V (for Valid) is returned if every check performed so far was valid. -If any check failed, C (for Corruption) is returned. +1. V (for Valid) is returned if every check performed so far was valid. + If any check failed, C (for Corruption) is returned. +2. Number of corrected blocks by Forward Error Correction. + '-' if Forward Error Correction is not enabled.
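With the two status fields described above, a "dmsetup status" line for a healthy verity device would look roughly like the sketch below; the 4194304-sector size is made up for illustration, and the last field becomes '-' when Forward Error Correction is not configured:

  0 4194304 verity V 0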
Example ======= diff --git a/MAINTAINERS b/MAINTAINERS index aff3e162180d..c9e416ba74c6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7225,6 +7225,7 @@ DEVICE-MAPPER (LVM) M: Alasdair Kergon <agk@redhat.com> M: Mike Snitzer <snitzer@kernel.org> M: Mikulas Patocka <mpatocka@redhat.com> +M: Benjamin Marzinski <bmarzins@redhat.com> L: dm-devel@lists.linux.dev S: Maintained Q: http://patchwork.kernel.org/project/dm-devel/list/ diff --git a/arch/alpha/include/asm/console.h b/arch/alpha/include/asm/console.h index 088b7b9eb15a..1cabdb6064bb 100644 --- a/arch/alpha/include/asm/console.h +++ b/arch/alpha/include/asm/console.h @@ -4,7 +4,7 @@ #include <uapi/asm/console.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern long callback_puts(long unit, const char *s, long length); extern long callback_getc(long unit); extern long callback_open_console(void); @@ -26,5 +26,5 @@ struct crb_struct; struct hwrpb_struct; extern int callback_init_done; extern void * callback_init(void *); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __AXP_CONSOLE_H */ diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index 5ec4c77e432e..d2c6667d73e9 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -6,7 +6,7 @@ #include <asm/pal.h> #include <vdso/page.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define STRICT_MM_TYPECHECKS @@ -74,7 +74,7 @@ typedef struct page *pgtable_t; #define PAGE_OFFSET 0xfffffc0000000000 #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #define __pa(x) ((unsigned long) (x) - PAGE_OFFSET) #define __va(x) ((void *)((unsigned long) (x) + PAGE_OFFSET)) diff --git a/arch/alpha/include/asm/pal.h b/arch/alpha/include/asm/pal.h index db2b3b18b34c..799a64c05198 100644 --- a/arch/alpha/include/asm/pal.h +++ b/arch/alpha/include/asm/pal.h @@ -4,7 +4,7 @@ #include <uapi/asm/pal.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern void halt(void) __attribute__((noreturn)); #define __halt() __asm__ __volatile__ ("call_pal %0 #halt" : : "i" (PAL_halt)) @@ -183,5 +183,5 @@ qemu_get_vmtime(void) return v0; } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ALPHA_PAL_H */ diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 4a4d00b37986..98ccbca64984 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -4,14 +4,14 @@ #ifdef __KERNEL__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/processor.h> #include <asm/types.h> #include <asm/hwrpb.h> #include <asm/sysinfo.h> #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct thread_info { struct pcb_struct pcb; /* palcode state */ @@ -44,7 +44,7 @@ register struct thread_info *__current_thread_info __asm__("$8"); register unsigned long *current_stack_pointer __asm__ ("$30"); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* Thread information allocation. 
*/ #define THREAD_SIZE_ORDER 1 @@ -110,7 +110,7 @@ register unsigned long *current_stack_pointer __asm__ ("$30"); put_user(res, (int __user *)(value)); \ }) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern void __save_fpu(void); static inline void save_fpu(void) diff --git a/arch/alpha/include/uapi/asm/ioctls.h b/arch/alpha/include/uapi/asm/ioctls.h index 971311605288..a09d04b49cc6 100644 --- a/arch/alpha/include/uapi/asm/ioctls.h +++ b/arch/alpha/include/uapi/asm/ioctls.h @@ -23,10 +23,10 @@ #define TCSETSW _IOW('t', 21, struct termios) #define TCSETSF _IOW('t', 22, struct termios) -#define TCGETA _IOR('t', 23, struct termio) -#define TCSETA _IOW('t', 24, struct termio) -#define TCSETAW _IOW('t', 25, struct termio) -#define TCSETAF _IOW('t', 28, struct termio) +#define TCGETA 0x40127417 +#define TCSETA 0x80127418 +#define TCSETAW 0x80127419 +#define TCSETAF 0x8012741c #define TCSBRK _IO('t', 29) #define TCXONC _IO('t', 30) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index ff61891abe53..fa83c040ee2d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -82,7 +82,7 @@ config ARM select HAS_IOPORT select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 - select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && (!PREEMPT_RT || !SMP) select HAVE_ARCH_KFENCE if MMU && !XIP_KERNEL select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL @@ -1213,7 +1213,7 @@ config HIGHMEM config HIGHPTE bool "Allocate 2nd-level pagetables from highmem" if EXPERT - depends on HIGHMEM + depends on HIGHMEM && !PREEMPT_RT default y help The VM uses one page of physical memory for each page table. 
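The raw hex values substituted for the alpha termio ioctls above are what the _IOR/_IOW macros expand to under alpha's ioctl bit layout (8 number bits, 8 type bits, 13 size bits, 3 direction bits, with _IOC_READ = 2 and _IOC_WRITE = 4; the 0x12 size field is sizeof(struct termio) = 18). A small stand-alone sketch, using a hypothetical ALPHA_IOC() helper rather than the kernel macros, reproduces them:

#include <stdio.h>

/* Hypothetical helper mirroring alpha's ioctl encoding: dir | size | type | nr */
#define ALPHA_IOC(dir, size, type, nr) \
	(((unsigned int)(dir) << 29) | ((unsigned int)(size) << 16) | \
	 ((unsigned int)(type) << 8) | (unsigned int)(nr))

int main(void)
{
	/* _IOR('t', 23, struct termio): read direction, 18-byte argument */
	printf("TCGETA  = 0x%08x\n", ALPHA_IOC(2, 0x12, 't', 23)); /* 0x40127417 */
	/* _IOW('t', nr, struct termio): write direction, 18-byte argument */
	printf("TCSETA  = 0x%08x\n", ALPHA_IOC(4, 0x12, 't', 24)); /* 0x80127418 */
	printf("TCSETAW = 0x%08x\n", ALPHA_IOC(4, 0x12, 't', 25)); /* 0x80127419 */
	printf("TCSETAF = 0x%08x\n", ALPHA_IOC(4, 0x12, 't', 28)); /* 0x8012741c */
	return 0;
}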
diff --git a/arch/arm/include/asm/word-at-a-time.h b/arch/arm/include/asm/word-at-a-time.h index f9a3897b06e7..5023f98d8293 100644 --- a/arch/arm/include/asm/word-at-a-time.h +++ b/arch/arm/include/asm/word-at-a-time.h @@ -67,7 +67,7 @@ static inline unsigned long find_zero(unsigned long mask) */ static inline unsigned long load_unaligned_zeropad(const void *addr) { - unsigned long ret, offset; + unsigned long ret, tmp; /* Load word from unaligned pointer addr */ asm( @@ -75,9 +75,9 @@ static inline unsigned long load_unaligned_zeropad(const void *addr) "2:\n" " .pushsection .text.fixup,\"ax\"\n" " .align 2\n" - "3: and %1, %2, #0x3\n" - " bic %2, %2, #0x3\n" - " ldr %0, [%2]\n" + "3: bic %1, %2, #0x3\n" + " ldr %0, [%1]\n" + " and %1, %2, #0x3\n" " lsl %1, %1, #0x3\n" #ifndef __ARMEB__ " lsr %0, %0, %1\n" @@ -90,7 +90,7 @@ static inline unsigned long load_unaligned_zeropad(const void *addr) " .align 3\n" " .long 1b, 3b\n" " .popsection" - : "=&r" (ret), "=&r" (offset) + : "=&r" (ret), "=&r" (tmp) : "r" (addr), "Qo" (*(unsigned long *)addr)); return ret; diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index 3c6ddb1afdc4..812380f30ae3 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -19,10 +19,11 @@ #include <linux/init.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> +#include <linux/unaligned.h> #include <asm/cp15.h> #include <asm/system_info.h> -#include <linux/unaligned.h> +#include <asm/system_misc.h> #include <asm/opcodes.h> #include "fault.h" @@ -809,6 +810,9 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs) int thumb2_32b = 0; int fault; + if (addr >= TASK_SIZE && user_mode(regs)) + harden_branch_predictor(); + if (interrupts_enabled(regs)) local_irq_enable(); diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 2bc828a1940c..ed4330cc3f4e 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -128,6 +128,19 @@ static inline bool is_translation_fault(unsigned int fsr) return false; } +static inline bool is_permission_fault(unsigned int fsr) +{ + int fs = fsr_fs(fsr); +#ifdef CONFIG_ARM_LPAE + if ((fs & FS_MMU_NOLL_MASK) == FS_PERM_NOLL) + return true; +#else + if (fs == FS_L1_PERM || fs == FS_L2_PERM) + return true; +#endif + return false; +} + static void die_kernel_fault(const char *msg, struct mm_struct *mm, unsigned long addr, unsigned int fsr, struct pt_regs *regs) @@ -162,6 +175,8 @@ __do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, */ if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; + } else if (is_permission_fault(fsr) && fsr & FSR_LNX_PF) { + msg = "execution of memory"; } else { if (is_translation_fault(fsr) && kfence_handle_page_fault(addr, is_write_fault(fsr), regs)) @@ -183,9 +198,6 @@ __do_user_fault(unsigned long addr, unsigned int fsr, unsigned int sig, { struct task_struct *tsk = current; - if (addr > TASK_SIZE) - harden_branch_predictor(); - #ifdef CONFIG_DEBUG_USER if (((user_debug & UDBG_SEGV) && (sig == SIGSEGV)) || ((user_debug & UDBG_BUS) && (sig == SIGBUS))) { @@ -225,19 +237,6 @@ void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs) } #ifdef CONFIG_MMU -static inline bool is_permission_fault(unsigned int fsr) -{ - int fs = fsr_fs(fsr); -#ifdef CONFIG_ARM_LPAE - if ((fs & FS_MMU_NOLL_MASK) == FS_PERM_NOLL) - return true; -#else - if (fs == FS_L1_PERM || fs == FS_L2_PERM) - return true; -#endif - return false; -} - #ifdef CONFIG_CPU_TTBR0_PAN static inline bool ttbr0_usermode_access_allowed(struct pt_regs *regs) { 
@@ -260,6 +259,37 @@ static inline bool ttbr0_usermode_access_allowed(struct pt_regs *regs) #endif static int __kprobes +do_kernel_address_page_fault(struct mm_struct *mm, unsigned long addr, + unsigned int fsr, struct pt_regs *regs) +{ + if (user_mode(regs)) { + /* + * Fault from user mode for a kernel space address. User mode + * should not be faulting in kernel space, which includes the + * vector/khelper page. Handle the branch predictor hardening + * while interrupts are still disabled, then send a SIGSEGV. + */ + harden_branch_predictor(); + __do_user_fault(addr, fsr, SIGSEGV, SEGV_MAPERR, regs); + } else { + /* + * Fault from kernel mode. Enable interrupts if they were + * enabled in the parent context. Section (upper page table) + * translation faults are handled via do_translation_fault(), + * so we will only get here for a non-present kernel space + * PTE or PTE permission fault. This may happen in exceptional + * circumstances and need the fixup tables to be walked. + */ + if (interrupts_enabled(regs)) + local_irq_enable(); + + __do_kernel_fault(mm, addr, fsr, regs); + } + + return 0; +} + +static int __kprobes do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct mm_struct *mm = current->mm; @@ -272,6 +302,12 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) if (kprobe_page_fault(regs, fsr)) return 0; + /* + * Handle kernel addresses faults separately, which avoids touching + * the mmap lock from contexts that are not able to sleep. + */ + if (addr >= TASK_SIZE) + return do_kernel_address_page_fault(mm, addr, fsr, regs); /* Enable interrupts if they were enabled in the parent context. */ if (interrupts_enabled(regs)) @@ -448,16 +484,20 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * We enter here because the first level page table doesn't contain * a valid entry for the address. * - * If the address is in kernel space (>= TASK_SIZE), then we are - * probably faulting in the vmalloc() area. + * If this is a user address (addr < TASK_SIZE), we handle this as a + * normal page fault. This leaves the remainder of the function to handle + * kernel address translation faults. + * + * Since user mode is not permitted to access kernel addresses, pass these + * directly to do_kernel_address_page_fault() to handle. * - * If the init_task's first level page tables contains the relevant - * entry, we copy the it to this task. If not, we send the process - * a signal, fixup the exception, or oops the kernel. + * Otherwise, we're probably faulting in the vmalloc() area, so try to fix + * that up. Note that we must not take any locks or enable interrupts in + * this case. * - * NOTE! We MUST NOT take any locks for this case. We may be in an - * interrupt or a critical region, and should only copy the information - * from the master page table, nothing more. + * If vmalloc() fixup fails, that means the non-leaf page tables did not + * contain an entry for this address, so handle this via + * do_kernel_address_page_fault(). 
*/ #ifdef CONFIG_MMU static int __kprobes @@ -523,7 +563,8 @@ do_translation_fault(unsigned long addr, unsigned int fsr, return 0; bad_area: - do_bad_area(addr, fsr, regs); + do_kernel_address_page_fault(current->mm, addr, fsr, regs); + return 0; } #else /* CONFIG_MMU */ @@ -543,7 +584,16 @@ do_translation_fault(unsigned long addr, unsigned int fsr, static int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { + /* + * If this is a kernel address, but from user mode, then userspace + * is trying bad stuff. Invoke the branch predictor handling. + * Interrupts are disabled here. + */ + if (addr >= TASK_SIZE && user_mode(regs)) + harden_branch_predictor(); + do_bad_area(addr, fsr, regs); + return 0; } #endif /* CONFIG_ARM_LPAE */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 938e5df75b2d..0e5fad5f06ca 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -238,6 +238,7 @@ config S390 select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_PREEMPT_DYNAMIC_KEY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE @@ -254,6 +255,7 @@ config S390 select HOTPLUG_SMT select IOMMU_HELPER if PCI select IOMMU_SUPPORT if PCI + select IRQ_MSI_LIB if PCI select KASAN_VMALLOC if KASAN select LOCK_MM_AND_FIND_VMA select MMU_GATHER_MERGE_VMAS diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index fbe64ffdfb96..7d6cc4c85af0 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -244,22 +244,10 @@ static void *boot_crst_alloc(unsigned long val) static pte_t *boot_pte_alloc(void) { - static void *pte_leftover; pte_t *pte; - /* - * handling pte_leftovers this way helps to avoid memory fragmentation - * during POPULATE_KASAN_MAP_SHADOW when EDAT is off - */ - if (!pte_leftover) { - pte_leftover = (void *)physmem_alloc_or_die(RR_VMEM, PAGE_SIZE, PAGE_SIZE); - pte = pte_leftover + _PAGE_TABLE_SIZE; - __arch_set_page_dat(pte, 1); - } else { - pte = pte_leftover; - pte_leftover = NULL; - } - + pte = (void *)physmem_alloc_or_die(RR_VMEM, PAGE_SIZE, PAGE_SIZE); + __arch_set_page_dat(pte, 1); memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); return pte; } diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index acb4b13d98c5..ee9221bb5d18 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -4,11 +4,14 @@ #include <linux/stringify.h> +#ifdef CONFIG_BUG + #ifndef CONFIG_DEBUG_BUGVERBOSE #define _BUGVERBOSE_LOCATION(file, line) #else #define __BUGVERBOSE_LOCATION(file, line) \ .pushsection .rodata.str, "aMS", @progbits, 1; \ + .align 2; \ 10002: .ascii file "\0"; \ .popsection; \ \ @@ -52,6 +55,8 @@ do { \ #define HAVE_ARCH_BUG +#endif /* CONFIG_BUG */ + #include <asm-generic/bug.h> #endif /* _ASM_S390_BUG_H */ diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 9240a363c893..c1d63b613bf9 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -166,6 +166,8 @@ static inline int page_reset_referenced(unsigned long addr) return CC_TRANSFORM(cc); } +int split_pud_page(pud_t *pudp, unsigned long addr); + /* Bits int the storage key */ #define _PAGE_CHANGED 0x02 /* HW changed bit */ #define _PAGE_REFERENCED 0x04 /* HW referenced bit */ diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index a32f465ecf73..c0ff19dab580 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -5,6 +5,7 @@ #include <linux/pci.h> #include <linux/mutex.h> #include 
<linux/iommu.h> +#include <linux/irqdomain.h> #include <linux/pci_hotplug.h> #include <asm/pci_clp.h> #include <asm/pci_debug.h> @@ -109,6 +110,7 @@ struct zpci_bus { struct list_head resources; struct list_head bus_next; struct resource bus_resource; + struct irq_domain *msi_parent_domain; int topo; /* TID if topo_is_tid, PCHID otherwise */ int domain_nr; u8 multifunction : 1; @@ -310,6 +312,9 @@ int zpci_dma_exit_device(struct zpci_dev *zdev); /* IRQ */ int __init zpci_irq_init(void); void __init zpci_irq_exit(void); +int zpci_set_irq(struct zpci_dev *zdev); +int zpci_create_parent_msi_domain(struct zpci_bus *zbus); +void zpci_remove_parent_msi_domain(struct zpci_bus *zbus); /* FMB */ int zpci_fmb_enable_device(struct zpci_dev *); diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index 549f14ad08af..d41b19925a5a 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -47,6 +47,7 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) { struct vm_area_struct *vma; + unsigned long pgstev; spinlock_t *ptl; pgste_t pgste; pte_t *ptep; @@ -65,9 +66,13 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) if (pte_swap(*ptep)) { preempt_disable(); pgste = pgste_get_lock(ptep); + pgstev = pgste_val(pgste); - ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); - pte_clear(mm, vmaddr, ptep); + if ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || + (pgstev & _PGSTE_GPS_ZERO)) { + ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); + pte_clear(mm, vmaddr, ptep); + } pgste_set_unlock(ptep, pgste); preempt_enable(); diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 3042647c9dbf..d3ce04a4b248 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -204,7 +204,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, return rc; } -static int split_pud_page(pud_t *pudp, unsigned long addr) +int split_pud_page(pud_t *pudp, unsigned long addr) { unsigned long pmd_addr, prot; pmd_t *pm_dir, *pmdp; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index d96587b84e81..eeadff45e0e1 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -330,10 +330,14 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, if (pud_leaf(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE)) { + if (!direct) + vmem_free_pages(pud_deref(*pud), get_order(PUD_SIZE), altmap); pud_clear(pud); pages++; + continue; + } else { + split_pud_page(pud, addr & PUD_MASK); } - continue; } } else if (pud_none(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && @@ -433,9 +437,15 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) return -EINVAL; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + /* Don't mess with any tables not fully in 1:1 mapping, vmemmap & kasan area */ +#ifdef CONFIG_KASAN + if (WARN_ON_ONCE(!(start >= KASAN_SHADOW_START && end <= KASAN_SHADOW_END) && + end > __abs_lowcore)) + return -EINVAL; +#else if (WARN_ON_ONCE(end > __abs_lowcore)) return -EINVAL; +#endif for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); pgd = pgd_offset_k(addr); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 93d2c9c780fc..5a6ace9d875a 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -708,6 +708,12 @@ int 
zpci_reenable_device(struct zpci_dev *zdev) if (rc) return rc; + if (zdev->msi_nr_irqs > 0) { + rc = zpci_set_irq(zdev); + if (rc) + return rc; + } + rc = zpci_iommu_register_ioat(zdev, &status); if (rc) zpci_disable_device(zdev); diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 72adc8f6e94f..66c4bd888b29 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -14,6 +14,7 @@ #include <linux/err.h> #include <linux/delay.h> #include <linux/seq_file.h> +#include <linux/irqdomain.h> #include <linux/jump_label.h> #include <linux/pci.h> #include <linux/printk.h> @@ -198,19 +199,27 @@ static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, s zbus->multifunction = zpci_bus_is_multifunction_root(fr); zbus->max_bus_speed = fr->max_bus_speed; + if (zpci_create_parent_msi_domain(zbus)) + goto out_free_domain; + /* * Note that the zbus->resources are taken over and zbus->resources * is empty after a successful call */ bus = pci_create_root_bus(NULL, ZPCI_BUS_NR, ops, zbus, &zbus->resources); - if (!bus) { - zpci_free_domain(zbus->domain_nr); - return -EFAULT; - } + if (!bus) + goto out_remove_msi_domain; zbus->bus = bus; + dev_set_msi_domain(&zbus->bus->dev, zbus->msi_parent_domain); return 0; + +out_remove_msi_domain: + zpci_remove_parent_msi_domain(zbus); +out_free_domain: + zpci_free_domain(zbus->domain_nr); + return -ENOMEM; } static void zpci_bus_release(struct kref *kref) @@ -231,6 +240,7 @@ static void zpci_bus_release(struct kref *kref) mutex_lock(&zbus_list_lock); list_del(&zbus->bus_next); mutex_unlock(&zbus_list_lock); + zpci_remove_parent_msi_domain(zbus); kfree(zbus); } diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 2a06df8c2498..e9dd45f3c09d 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -6,6 +6,7 @@ #include <linux/kernel_stat.h> #include <linux/pci.h> #include <linux/msi.h> +#include <linux/irqchip/irq-msi-lib.h> #include <linux/smp.h> #include <asm/isc.h> @@ -97,7 +98,7 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev) } /* Register adapter interruptions */ -static int zpci_set_irq(struct zpci_dev *zdev) +int zpci_set_irq(struct zpci_dev *zdev) { int rc; @@ -125,27 +126,53 @@ static int zpci_clear_irq(struct zpci_dev *zdev) static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest, bool force) { - struct msi_desc *entry = irq_data_get_msi_desc(data); - struct msi_msg msg = entry->msg; - int cpu_addr = smp_cpu_get_cpu_address(cpumask_first(dest)); + irq_data_update_affinity(data, dest); + return IRQ_SET_MASK_OK; +} - msg.address_lo &= 0xff0000ff; - msg.address_lo |= (cpu_addr << 8); - pci_write_msi_msg(data->irq, &msg); +/* + * Encode the hwirq number for the parent domain. The encoding must be unique + * for each IRQ of each device in the parent domain, so it uses the devfn to + * identify the device and the msi_index to identify the IRQ within that device. 
+ */ +static inline u32 zpci_encode_hwirq(u8 devfn, u16 msi_index) +{ + return (devfn << 16) | msi_index; +} - return IRQ_SET_MASK_OK; +static inline u16 zpci_decode_hwirq_msi_index(irq_hw_number_t hwirq) +{ + return hwirq & 0xffff; +} + +static void zpci_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + struct zpci_dev *zdev = to_zpci_dev(desc->dev); + + if (irq_delivery == DIRECTED) { + int cpu = cpumask_first(irq_data_get_affinity_mask(data)); + + msg->address_lo = zdev->msi_addr & 0xff0000ff; + msg->address_lo |= (smp_cpu_get_cpu_address(cpu) << 8); + } else { + msg->address_lo = zdev->msi_addr & 0xffffffff; + } + msg->address_hi = zdev->msi_addr >> 32; + msg->data = zpci_decode_hwirq_msi_index(data->hwirq); } static struct irq_chip zpci_irq_chip = { .name = "PCI-MSI", - .irq_unmask = pci_msi_unmask_irq, - .irq_mask = pci_msi_mask_irq, + .irq_compose_msi_msg = zpci_compose_msi_msg, }; static void zpci_handle_cpu_local_irq(bool rescan) { struct airq_iv *dibv = zpci_ibv[smp_processor_id()]; union zpci_sic_iib iib = {{0}}; + struct irq_domain *msi_domain; + irq_hw_number_t hwirq; unsigned long bit; int irqs_on = 0; @@ -163,7 +190,9 @@ static void zpci_handle_cpu_local_irq(bool rescan) continue; } inc_irq_stat(IRQIO_MSI); - generic_handle_irq(airq_iv_get_data(dibv, bit)); + hwirq = airq_iv_get_data(dibv, bit); + msi_domain = (struct irq_domain *)airq_iv_get_ptr(dibv, bit); + generic_handle_domain_irq(msi_domain, hwirq); } } @@ -228,6 +257,8 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, struct tpi_info *tpi_info) { union zpci_sic_iib iib = {{0}}; + struct irq_domain *msi_domain; + irq_hw_number_t hwirq; unsigned long si, ai; struct airq_iv *aibv; int irqs_on = 0; @@ -255,7 +286,9 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, break; inc_irq_stat(IRQIO_MSI); airq_iv_lock(aibv, ai); - generic_handle_irq(airq_iv_get_data(aibv, ai)); + hwirq = airq_iv_get_data(aibv, ai); + msi_domain = (struct irq_domain *)airq_iv_get_ptr(aibv, ai); + generic_handle_domain_irq(msi_domain, hwirq); airq_iv_unlock(aibv, ai); } } @@ -277,7 +310,9 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs, zdev->aisb = *bit; /* Create adapter interrupt vector */ - zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL); + zdev->aibv = airq_iv_create(msi_vecs, + AIRQ_IV_PTR | AIRQ_IV_DATA | AIRQ_IV_BITLOCK, + NULL); if (!zdev->aibv) return -ENOMEM; @@ -289,146 +324,220 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs, return 0; } -int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +bool arch_restore_msi_irqs(struct pci_dev *pdev) { - unsigned int hwirq, msi_vecs, irqs_per_msi, i, cpu; struct zpci_dev *zdev = to_zpci(pdev); - struct msi_desc *msi; - struct msi_msg msg; - unsigned long bit; - int cpu_addr; - int rc, irq; + zpci_set_irq(zdev); + return true; +} + +static struct airq_struct zpci_airq = { + .handler = zpci_floating_irq_handler, + .isc = PCI_ISC, +}; + +static void zpci_msi_teardown_directed(struct zpci_dev *zdev) +{ + airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->max_msi); + zdev->msi_first_bit = -1U; + zdev->msi_nr_irqs = 0; +} + +static void zpci_msi_teardown_floating(struct zpci_dev *zdev) +{ + airq_iv_release(zdev->aibv); + zdev->aibv = NULL; + airq_iv_free_bit(zpci_sbv, zdev->aisb); zdev->aisb = -1UL; zdev->msi_first_bit = -1U; + zdev->msi_nr_irqs = 0; +} + +static void zpci_msi_teardown(struct irq_domain *domain, msi_alloc_info_t 
*arg) +{ + struct zpci_dev *zdev = to_zpci_dev(domain->dev); + + zpci_clear_irq(zdev); + if (irq_delivery == DIRECTED) + zpci_msi_teardown_directed(zdev); + else + zpci_msi_teardown_floating(zdev); +} + +static int zpci_msi_prepare(struct irq_domain *domain, + struct device *dev, int nvec, + msi_alloc_info_t *info) +{ + struct zpci_dev *zdev = to_zpci_dev(dev); + struct pci_dev *pdev = to_pci_dev(dev); + unsigned long bit; + int msi_vecs, rc; msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); if (msi_vecs < nvec) { - pr_info("%s requested %d irqs, allocate system limit of %d", + pr_info("%s requested %d IRQs, allocate system limit of %d\n", pci_name(pdev), nvec, zdev->max_msi); } rc = __alloc_airq(zdev, msi_vecs, &bit); - if (rc < 0) + if (rc) { + pr_err("Allocating adapter IRQs for %s failed\n", pci_name(pdev)); return rc; + } - /* - * Request MSI interrupts: - * When using MSI, nvec_used interrupt sources and their irq - * descriptors are controlled through one msi descriptor. - * Thus the outer loop over msi descriptors shall run only once, - * while two inner loops iterate over the interrupt vectors. - * When using MSI-X, each interrupt vector/irq descriptor - * is bound to exactly one msi descriptor (nvec_used is one). - * So the inner loops are executed once, while the outer iterates - * over the MSI-X descriptors. - */ - hwirq = bit; - msi_for_each_desc(msi, &pdev->dev, MSI_DESC_NOTASSOCIATED) { - if (hwirq - bit >= msi_vecs) - break; - irqs_per_msi = min_t(unsigned int, msi_vecs, msi->nvec_used); - irq = __irq_alloc_descs(-1, 0, irqs_per_msi, 0, THIS_MODULE, - (irq_delivery == DIRECTED) ? - msi->affinity : NULL); - if (irq < 0) - return -ENOMEM; + zdev->msi_first_bit = bit; + zdev->msi_nr_irqs = msi_vecs; + rc = zpci_set_irq(zdev); + if (rc) { + pr_err("Registering adapter IRQs for %s failed\n", + pci_name(pdev)); + + if (irq_delivery == DIRECTED) + zpci_msi_teardown_directed(zdev); + else + zpci_msi_teardown_floating(zdev); + return rc; + } + return 0; +} - for (i = 0; i < irqs_per_msi; i++) { - rc = irq_set_msi_desc_off(irq, i, msi); - if (rc) - return rc; - irq_set_chip_and_handler(irq + i, &zpci_irq_chip, - handle_percpu_irq); - } +static int zpci_msi_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct msi_desc *desc = ((msi_alloc_info_t *)args)->desc; + struct zpci_dev *zdev = to_zpci_dev(desc->dev); + struct zpci_bus *zbus = zdev->zbus; + unsigned int cpu, hwirq; + unsigned long bit; + int i; - msg.data = hwirq - bit; - if (irq_delivery == DIRECTED) { - if (msi->affinity) - cpu = cpumask_first(&msi->affinity->mask); - else - cpu = 0; - cpu_addr = smp_cpu_get_cpu_address(cpu); + bit = zdev->msi_first_bit + desc->msi_index; + hwirq = zpci_encode_hwirq(zdev->devfn, desc->msi_index); - msg.address_lo = zdev->msi_addr & 0xff0000ff; - msg.address_lo |= (cpu_addr << 8); + if (desc->msi_index + nr_irqs > zdev->max_msi) + return -EINVAL; + for (i = 0; i < nr_irqs; i++) { + irq_domain_set_info(domain, virq + i, hwirq + i, + &zpci_irq_chip, zdev, + handle_percpu_irq, NULL, NULL); + + if (irq_delivery == DIRECTED) { for_each_possible_cpu(cpu) { - for (i = 0; i < irqs_per_msi; i++) - airq_iv_set_data(zpci_ibv[cpu], - hwirq + i, irq + i); + airq_iv_set_ptr(zpci_ibv[cpu], bit + i, + (unsigned long)zbus->msi_parent_domain); + airq_iv_set_data(zpci_ibv[cpu], bit + i, hwirq + i); } } else { - msg.address_lo = zdev->msi_addr & 0xffffffff; - for (i = 0; i < irqs_per_msi; i++) - airq_iv_set_data(zdev->aibv, hwirq + i, irq + i); + 
airq_iv_set_ptr(zdev->aibv, bit + i, + (unsigned long)zbus->msi_parent_domain); + airq_iv_set_data(zdev->aibv, bit + i, hwirq + i); } - msg.address_hi = zdev->msi_addr >> 32; - pci_write_msi_msg(irq, &msg); - hwirq += irqs_per_msi; } - zdev->msi_first_bit = bit; - zdev->msi_nr_irqs = hwirq - bit; - - rc = zpci_set_irq(zdev); - if (rc) - return rc; - - return (zdev->msi_nr_irqs == nvec) ? 0 : zdev->msi_nr_irqs; + return 0; } -void arch_teardown_msi_irqs(struct pci_dev *pdev) +static void zpci_msi_clear_airq(struct irq_data *d, int i) { - struct zpci_dev *zdev = to_zpci(pdev); - struct msi_desc *msi; - unsigned int i; - int rc; + struct msi_desc *desc = irq_data_get_msi_desc(d); + struct zpci_dev *zdev = to_zpci_dev(desc->dev); + unsigned long bit; + unsigned int cpu; + u16 msi_index; - /* Disable interrupts */ - rc = zpci_clear_irq(zdev); - if (rc) - return; + msi_index = zpci_decode_hwirq_msi_index(d->hwirq); + bit = zdev->msi_first_bit + msi_index; - /* Release MSI interrupts */ - msi_for_each_desc(msi, &pdev->dev, MSI_DESC_ASSOCIATED) { - for (i = 0; i < msi->nvec_used; i++) { - irq_set_msi_desc(msi->irq + i, NULL); - irq_free_desc(msi->irq + i); + if (irq_delivery == DIRECTED) { + for_each_possible_cpu(cpu) { + airq_iv_set_ptr(zpci_ibv[cpu], bit + i, 0); + airq_iv_set_data(zpci_ibv[cpu], bit + i, 0); } - msi->msg.address_lo = 0; - msi->msg.address_hi = 0; - msi->msg.data = 0; - msi->irq = 0; + } else { + airq_iv_set_ptr(zdev->aibv, bit + i, 0); + airq_iv_set_data(zdev->aibv, bit + i, 0); } +} - if (zdev->aisb != -1UL) { - zpci_ibv[zdev->aisb] = NULL; - airq_iv_free_bit(zpci_sbv, zdev->aisb); - zdev->aisb = -1UL; - } - if (zdev->aibv) { - airq_iv_release(zdev->aibv); - zdev->aibv = NULL; - } +static void zpci_msi_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d; + int i; - if ((irq_delivery == DIRECTED) && zdev->msi_first_bit != -1U) - airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->msi_nr_irqs); + for (i = 0; i < nr_irqs; i++) { + d = irq_domain_get_irq_data(domain, virq + i); + zpci_msi_clear_airq(d, i); + irq_domain_reset_irq_data(d); + } } -bool arch_restore_msi_irqs(struct pci_dev *pdev) +static const struct irq_domain_ops zpci_msi_domain_ops = { + .alloc = zpci_msi_domain_alloc, + .free = zpci_msi_domain_free, +}; + +static bool zpci_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, + struct msi_domain_info *info) { - struct zpci_dev *zdev = to_zpci(pdev); + if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) + return false; + + info->ops->msi_prepare = zpci_msi_prepare; + info->ops->msi_teardown = zpci_msi_teardown; - zpci_set_irq(zdev); return true; } -static struct airq_struct zpci_airq = { - .handler = zpci_floating_irq_handler, - .isc = PCI_ISC, +static struct msi_parent_ops zpci_msi_parent_ops = { + .supported_flags = MSI_GENERIC_FLAGS_MASK | + MSI_FLAG_PCI_MSIX | + MSI_FLAG_MULTI_PCI_MSI, + .required_flags = MSI_FLAG_USE_DEF_DOM_OPS | + MSI_FLAG_USE_DEF_CHIP_OPS, + .init_dev_msi_info = zpci_init_dev_msi_info, }; +int zpci_create_parent_msi_domain(struct zpci_bus *zbus) +{ + char fwnode_name[18]; + + snprintf(fwnode_name, sizeof(fwnode_name), "ZPCI_MSI_DOM_%04x", zbus->domain_nr); + struct irq_domain_info info = { + .fwnode = irq_domain_alloc_named_fwnode(fwnode_name), + .ops = &zpci_msi_domain_ops, + }; + + if (!info.fwnode) { + pr_err("Failed to allocate fwnode for MSI IRQ domain\n"); + return -ENOMEM; + } + + if (irq_delivery == FLOATING) + 
zpci_msi_parent_ops.required_flags |= MSI_FLAG_NO_AFFINITY; + + zbus->msi_parent_domain = msi_create_parent_irq_domain(&info, &zpci_msi_parent_ops); + if (!zbus->msi_parent_domain) { + irq_domain_free_fwnode(info.fwnode); + pr_err("Failed to create MSI IRQ domain\n"); + return -ENOMEM; + } + + return 0; +} + +void zpci_remove_parent_msi_domain(struct zpci_bus *zbus) +{ + struct fwnode_handle *fn; + + fn = zbus->msi_parent_domain->fwnode; + irq_domain_remove(zbus->msi_parent_domain); + irq_domain_free_fwnode(fn); +} + static void __init cpu_enable_directed_irq(void *unused) { union zpci_sic_iib iib = {{0}}; @@ -465,6 +574,7 @@ static int __init zpci_directed_irq_init(void) * is only done on the first vector. */ zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE, + AIRQ_IV_PTR | AIRQ_IV_DATA | AIRQ_IV_CACHELINE | (!cpu ? AIRQ_IV_ALLOC : 0), NULL); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 104aa5355090..239c1744a926 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -299,6 +299,7 @@ config DM_CRYPT select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV + select CRYPTO_LIB_MD5 # needed by lmk IV mode help This device-mapper target allows you to create a device that transparently encrypts the data on it. You'll need to activate @@ -546,6 +547,7 @@ config DM_VERITY depends on BLK_DEV_DM select CRYPTO select CRYPTO_HASH + select CRYPTO_LIB_SHA256 select DM_BUFIO help This device-mapper target creates a read-only device that diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index e6d28be11c5c..5235f3e4924b 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1374,7 +1374,7 @@ static void submit_io(struct dm_buffer *b, enum req_op op, unsigned short ioprio { unsigned int n_sectors; sector_t sector; - unsigned int offset, end; + unsigned int offset, end, align; b->end_io = end_io; @@ -1388,9 +1388,11 @@ static void submit_io(struct dm_buffer *b, enum req_op op, unsigned short ioprio b->c->write_callback(b); offset = b->write_start; end = b->write_end; - offset &= -DM_BUFIO_WRITE_ALIGN; - end += DM_BUFIO_WRITE_ALIGN - 1; - end &= -DM_BUFIO_WRITE_ALIGN; + align = max(DM_BUFIO_WRITE_ALIGN, + bdev_physical_block_size(b->c->bdev)); + offset &= -align; + end += align - 1; + end &= -align; if (unlikely(end > b->c->block_size)) end = b->c->block_size; diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index a3c9f74fe2dc..1cda8618d74d 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -139,7 +139,6 @@ struct mapped_device { struct srcu_struct io_barrier; #ifdef CONFIG_BLK_DEV_ZONED - unsigned int nr_zones; void *zone_revalidate_map; struct task_struct *revalidate_map_task; #endif diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 5ef43231fe77..79704fbc523b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -21,6 +21,7 @@ #include <linux/mempool.h> #include <linux/slab.h> #include <linux/crypto.h> +#include <linux/fips.h> #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/backing-dev.h> @@ -120,7 +121,6 @@ struct iv_benbi_private { #define LMK_SEED_SIZE 64 /* hash + 0 */ struct iv_lmk_private { - struct crypto_shash *hash_tfm; u8 *seed; }; @@ -254,22 +254,15 @@ static unsigned int max_write_size = 0; module_param(max_write_size, uint, 0644); MODULE_PARM_DESC(max_write_size, "Maximum size of a write request"); -static unsigned get_max_request_sectors(struct dm_target *ti, struct bio *bio) +static unsigned get_max_request_sectors(struct dm_target *ti, struct bio *bio, bool no_split) { 
struct crypt_config *cc = ti->private; unsigned val, sector_align; bool wrt = op_is_write(bio_op(bio)); - if (wrt) { - /* - * For zoned devices, splitting write operations creates the - * risk of deadlocking queue freeze operations with zone write - * plugging BIO work when the reminder of a split BIO is - * issued. So always allow the entire BIO to proceed. - */ - if (ti->emulate_zone_append) - return bio_sectors(bio); - + if (no_split) { + val = -1; + } else if (wrt) { val = min_not_zero(READ_ONCE(max_write_size), DM_CRYPT_DEFAULT_MAX_WRITE_SIZE); } else { @@ -465,10 +458,6 @@ static void crypt_iv_lmk_dtr(struct crypt_config *cc) { struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; - if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm)) - crypto_free_shash(lmk->hash_tfm); - lmk->hash_tfm = NULL; - kfree_sensitive(lmk->seed); lmk->seed = NULL; } @@ -483,11 +472,10 @@ static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, return -EINVAL; } - lmk->hash_tfm = crypto_alloc_shash("md5", 0, - CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(lmk->hash_tfm)) { - ti->error = "Error initializing LMK hash"; - return PTR_ERR(lmk->hash_tfm); + if (fips_enabled) { + ti->error = "LMK support is disabled due to FIPS"; + /* ... because it uses MD5. */ + return -EINVAL; } /* No seed in LMK version 2 */ @@ -498,7 +486,6 @@ static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL); if (!lmk->seed) { - crypt_iv_lmk_dtr(cc); ti->error = "Error kmallocing seed storage in LMK"; return -ENOMEM; } @@ -514,7 +501,7 @@ static int crypt_iv_lmk_init(struct crypt_config *cc) /* LMK seed is on the position of LMK_KEYS + 1 key */ if (lmk->seed) memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size), - crypto_shash_digestsize(lmk->hash_tfm)); + MD5_DIGEST_SIZE); return 0; } @@ -529,55 +516,31 @@ static int crypt_iv_lmk_wipe(struct crypt_config *cc) return 0; } -static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq, - u8 *data) +static void crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq, u8 *data) { struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; - SHASH_DESC_ON_STACK(desc, lmk->hash_tfm); - union { - struct md5_state md5state; - u8 state[CRYPTO_MD5_STATESIZE]; - } u; + struct md5_ctx ctx; __le32 buf[4]; - int i, r; - desc->tfm = lmk->hash_tfm; + md5_init(&ctx); - r = crypto_shash_init(desc); - if (r) - return r; - - if (lmk->seed) { - r = crypto_shash_update(desc, lmk->seed, LMK_SEED_SIZE); - if (r) - return r; - } + if (lmk->seed) + md5_update(&ctx, lmk->seed, LMK_SEED_SIZE); /* Sector is always 512B, block size 16, add data of blocks 1-31 */ - r = crypto_shash_update(desc, data + 16, 16 * 31); - if (r) - return r; + md5_update(&ctx, data + 16, 16 * 31); /* Sector is cropped to 56 bits here */ buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF); buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); buf[2] = cpu_to_le32(4024); buf[3] = 0; - r = crypto_shash_update(desc, (u8 *)buf, sizeof(buf)); - if (r) - return r; + md5_update(&ctx, (u8 *)buf, sizeof(buf)); /* No MD5 padding here */ - r = crypto_shash_export(desc, &u.md5state); - if (r) - return r; - - for (i = 0; i < MD5_HASH_WORDS; i++) - __cpu_to_le32s(&u.md5state.hash[i]); - memcpy(iv, &u.md5state.hash, cc->iv_size); - - return 0; + cpu_to_le32_array(ctx.state.h, ARRAY_SIZE(ctx.state.h)); + memcpy(iv, ctx.state.h, cc->iv_size); } static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 
*iv, @@ -585,17 +548,15 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, { struct scatterlist *sg; u8 *src; - int r = 0; if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { sg = crypt_get_sg_data(cc, dmreq->sg_in); src = kmap_local_page(sg_page(sg)); - r = crypt_iv_lmk_one(cc, iv, dmreq, src + sg->offset); + crypt_iv_lmk_one(cc, iv, dmreq, src + sg->offset); kunmap_local(src); } else memset(iv, 0, cc->iv_size); - - return r; + return 0; } static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, @@ -603,21 +564,19 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, { struct scatterlist *sg; u8 *dst; - int r; if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) return 0; sg = crypt_get_sg_data(cc, dmreq->sg_out); dst = kmap_local_page(sg_page(sg)); - r = crypt_iv_lmk_one(cc, iv, dmreq, dst + sg->offset); + crypt_iv_lmk_one(cc, iv, dmreq, dst + sg->offset); /* Tweak the first block of plaintext sector */ - if (!r) - crypto_xor(dst + sg->offset, iv, cc->iv_size); + crypto_xor(dst + sg->offset, iv, cc->iv_size); kunmap_local(dst); - return r; + return 0; } static void crypt_iv_tcw_dtr(struct crypt_config *cc) @@ -1781,7 +1740,7 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) bio_for_each_folio_all(fi, clone) { if (folio_test_large(fi.folio)) { percpu_counter_sub(&cc->n_allocated_pages, - 1 << folio_order(fi.folio)); + folio_nr_pages(fi.folio)); folio_put(fi.folio); } else { mempool_free(&fi.folio->page, &cc->page_pool); @@ -3496,6 +3455,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) struct dm_crypt_io *io; struct crypt_config *cc = ti->private; unsigned max_sectors; + bool no_split; /* * If bio is REQ_PREFLUSH or REQ_OP_DISCARD, just bypass crypt queues. @@ -3513,10 +3473,20 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) /* * Check if bio is too large, split as needed. + * + * For zoned devices, splitting write operations creates the + * risk of deadlocking queue freeze operations with zone write + * plugging BIO work when the reminder of a split BIO is + * issued. So always allow the entire BIO to proceed. 
*/ - max_sectors = get_max_request_sectors(ti, bio); - if (unlikely(bio_sectors(bio) > max_sectors)) + no_split = (ti->emulate_zone_append && op_is_write(bio_op(bio))) || + (bio->bi_opf & REQ_ATOMIC); + max_sectors = get_max_request_sectors(ti, bio, no_split); + if (unlikely(bio_sectors(bio) > max_sectors)) { + if (unlikely(no_split)) + return DM_MAPIO_KILL; dm_accept_partial_bio(bio, max_sectors); + } /* * Ensure that bio is a multiple of internal sector encryption size @@ -3762,15 +3732,20 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) if (ti->emulate_zone_append) limits->max_hw_sectors = min(limits->max_hw_sectors, BIO_MAX_VECS << PAGE_SECTORS_SHIFT); + + limits->atomic_write_hw_unit_max = min(limits->atomic_write_hw_unit_max, + BIO_MAX_VECS << PAGE_SHIFT); + limits->atomic_write_hw_max = min(limits->atomic_write_hw_max, + BIO_MAX_VECS << PAGE_SHIFT); } static struct target_type crypt_target = { .name = "crypt", - .version = {1, 28, 0}, + .version = {1, 29, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, - .features = DM_TARGET_ZONED_HM, + .features = DM_TARGET_ZONED_HM | DM_TARGET_ATOMIC_WRITES, .report_zones = crypt_report_zones, .map = crypt_map, .status = crypt_status, diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 6abb31ca9662..b354e74a670e 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -103,7 +103,7 @@ static int __ebs_rw_bvec(struct ebs_c *ec, enum req_op op, struct bio_vec *bv, } else { flush_dcache_page(bv->bv_page); memcpy(ba, pa, cur_len); - dm_bufio_mark_partial_buffer_dirty(b, buf_off, buf_off + cur_len); + dm_bufio_mark_buffer_dirty(b); } dm_bufio_release(b); diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index b67976637538..061b4d310813 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -29,7 +29,7 @@ typedef sector_t chunk_t; * chunk within the device. */ struct dm_exception { - struct hlist_bl_node hash_list; + struct hlist_node hash_list; chunk_t old_chunk; chunk_t new_chunk; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 7bb7174f8f4f..f0c84e7a5daa 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -432,6 +432,7 @@ static int log_writes_kthread(void *arg) struct log_writes_c *lc = arg; sector_t sector = 0; + set_freezable(); while (!kthread_should_stop()) { bool super = false; bool logging_enabled; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index aaf4a0a4b0eb..c18358271618 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -131,7 +131,7 @@ static void queue_if_no_path_timeout_work(struct timer_list *t); #define MPATHF_QUEUE_IO 0 /* Must we queue all I/O? */ #define MPATHF_QUEUE_IF_NO_PATH 1 /* Queue I/O if last path fails? */ #define MPATHF_SAVED_QUEUE_IF_NO_PATH 2 /* Saved state during suspension */ -#define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3 /* If there's already a hw_handler present, don't change it. */ +/* MPATHF_RETAIN_ATTACHED_HW_HANDLER no longer has any effect */ #define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */ #define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ #define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? 
*/ @@ -237,16 +237,10 @@ static struct multipath *alloc_multipath(struct dm_target *ti) static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) { - if (m->queue_mode == DM_TYPE_NONE) { + if (m->queue_mode == DM_TYPE_NONE) m->queue_mode = DM_TYPE_REQUEST_BASED; - } else if (m->queue_mode == DM_TYPE_BIO_BASED) { + else if (m->queue_mode == DM_TYPE_BIO_BASED) INIT_WORK(&m->process_queued_bios, process_queued_bios); - /* - * bio-based doesn't support any direct scsi_dh management; - * it just discovers if a scsi_dh is attached. - */ - set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); - } dm_table_set_type(ti->table, m->queue_mode); @@ -887,36 +881,30 @@ static int setup_scsi_dh(struct block_device *bdev, struct multipath *m, struct request_queue *q = bdev_get_queue(bdev); int r; - if (mpath_double_check_test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, m)) { -retain: - if (*attached_handler_name) { - /* - * Clear any hw_handler_params associated with a - * handler that isn't already attached. - */ - if (m->hw_handler_name && strcmp(*attached_handler_name, m->hw_handler_name)) { - kfree(m->hw_handler_params); - m->hw_handler_params = NULL; - } - - /* - * Reset hw_handler_name to match the attached handler - * - * NB. This modifies the table line to show the actual - * handler instead of the original table passed in. - */ - kfree(m->hw_handler_name); - m->hw_handler_name = *attached_handler_name; - *attached_handler_name = NULL; + if (*attached_handler_name) { + /* + * Clear any hw_handler_params associated with a + * handler that isn't already attached. + */ + if (m->hw_handler_name && strcmp(*attached_handler_name, + m->hw_handler_name)) { + kfree(m->hw_handler_params); + m->hw_handler_params = NULL; } + + /* + * Reset hw_handler_name to match the attached handler + * + * NB. This modifies the table line to show the actual + * handler instead of the original table passed in. 
+ */ + kfree(m->hw_handler_name); + m->hw_handler_name = *attached_handler_name; + *attached_handler_name = NULL; } if (m->hw_handler_name) { r = scsi_dh_attach(q, m->hw_handler_name); - if (r == -EBUSY) { - DMINFO("retaining handler on device %pg", bdev); - goto retain; - } if (r < 0) { *error = "error attaching hardware handler"; return r; @@ -1138,7 +1126,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) } if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { - set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); + /* no longer has any effect */ continue; } @@ -1823,7 +1811,6 @@ static void multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) + (m->pg_init_retries > 0) * 2 + (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 + - test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) + (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2); if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) @@ -1832,8 +1819,6 @@ static void multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("pg_init_retries %u ", m->pg_init_retries); if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); - if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) - DMEMIT("retain_attached_hw_handler "); if (m->queue_mode != DM_TYPE_REQUEST_BASED) { switch (m->queue_mode) { case DM_TYPE_BIO_BASED: @@ -2307,7 +2292,7 @@ static struct target_type multipath_target = { .name = "multipath", .version = {1, 15, 0}, .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE | - DM_TARGET_PASSES_INTEGRITY, + DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ATOMIC_WRITES, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, diff --git a/drivers/md/dm-pcache/cache.c b/drivers/md/dm-pcache/cache.c index 698697a7a73c..534bf07b794f 100644 --- a/drivers/md/dm-pcache/cache.c +++ b/drivers/md/dm-pcache/cache.c @@ -10,7 +10,8 @@ struct kmem_cache *key_cache; static inline struct pcache_cache_info *get_cache_info_addr(struct pcache_cache *cache) { - return cache->cache_info_addr + cache->info_index; + return (struct pcache_cache_info *)((char *)cache->cache_info_addr + + (size_t)cache->info_index * PCACHE_CACHE_INFO_SIZE); } static void cache_info_write(struct pcache_cache *cache) @@ -21,10 +22,10 @@ static void cache_info_write(struct pcache_cache *cache) cache_info->header.crc = pcache_meta_crc(&cache_info->header, sizeof(struct pcache_cache_info)); + cache->info_index = (cache->info_index + 1) % PCACHE_META_INDEX_MAX; memcpy_flushcache(get_cache_info_addr(cache), cache_info, sizeof(struct pcache_cache_info)); - - cache->info_index = (cache->info_index + 1) % PCACHE_META_INDEX_MAX; + pmem_wmb(); } static void cache_info_init_default(struct pcache_cache *cache); @@ -49,6 +50,8 @@ static int cache_info_init(struct pcache_cache *cache, struct pcache_cache_optio return -EINVAL; } + cache->info_index = ((char *)cache_info_addr - (char *)cache->cache_info_addr) / PCACHE_CACHE_INFO_SIZE; + return 0; } @@ -93,10 +96,10 @@ void cache_pos_encode(struct pcache_cache *cache, pos_onmedia.header.seq = seq; pos_onmedia.header.crc = cache_pos_onmedia_crc(&pos_onmedia); + *index = (*index + 1) % PCACHE_META_INDEX_MAX; + memcpy_flushcache(pos_onmedia_addr, &pos_onmedia, sizeof(struct pcache_cache_pos_onmedia)); pmem_wmb(); - - *index = (*index + 1) % PCACHE_META_INDEX_MAX; } int cache_pos_decode(struct pcache_cache *cache, diff --git a/drivers/md/dm-pcache/cache_segment.c 
b/drivers/md/dm-pcache/cache_segment.c index f0b58980806e..9d92e2b067ed 100644 --- a/drivers/md/dm-pcache/cache_segment.c +++ b/drivers/md/dm-pcache/cache_segment.c @@ -26,11 +26,11 @@ static void cache_seg_info_write(struct pcache_cache_segment *cache_seg) seg_info->header.seq++; seg_info->header.crc = pcache_meta_crc(&seg_info->header, sizeof(struct pcache_segment_info)); + cache_seg->info_index = (cache_seg->info_index + 1) % PCACHE_META_INDEX_MAX; + seg_info_addr = get_seg_info_addr(cache_seg); memcpy_flushcache(seg_info_addr, seg_info, sizeof(struct pcache_segment_info)); pmem_wmb(); - - cache_seg->info_index = (cache_seg->info_index + 1) % PCACHE_META_INDEX_MAX; mutex_unlock(&cache_seg->info_lock); } @@ -56,7 +56,10 @@ static int cache_seg_info_load(struct pcache_cache_segment *cache_seg) ret = -EIO; goto out; } - cache_seg->info_index = cache_seg_info_addr - cache_seg_info_addr_base; + + cache_seg->info_index = + ((char *)cache_seg_info_addr - (char *)cache_seg_info_addr_base) / + PCACHE_SEG_INFO_SIZE; out: mutex_unlock(&cache_seg->info_lock); @@ -129,10 +132,10 @@ static void cache_seg_ctrl_write(struct pcache_cache_segment *cache_seg) cache_seg_gen.header.crc = pcache_meta_crc(&cache_seg_gen.header, sizeof(struct pcache_cache_seg_gen)); + cache_seg->gen_index = (cache_seg->gen_index + 1) % PCACHE_META_INDEX_MAX; + memcpy_flushcache(get_cache_seg_gen_addr(cache_seg), &cache_seg_gen, sizeof(struct pcache_cache_seg_gen)); pmem_wmb(); - - cache_seg->gen_index = (cache_seg->gen_index + 1) % PCACHE_META_INDEX_MAX; } static void cache_seg_ctrl_init(struct pcache_cache_segment *cache_seg) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c6f7129e43d3..4bacdc499984 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2287,6 +2287,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) mddev->reshape_position = le64_to_cpu(sb->reshape_position); rs->raid_type = get_raid_type_by_ll(mddev->level, mddev->layout); + if (!rs->raid_type) + return -EINVAL; } } else { diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index f40c18da4000..dbd148967de4 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -40,10 +40,15 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; #define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ (DM_TRACKED_CHUNK_HASH_SIZE - 1)) +struct dm_hlist_head { + struct hlist_head head; + spinlock_t lock; +}; + struct dm_exception_table { uint32_t hash_mask; unsigned int hash_shift; - struct hlist_bl_head *table; + struct dm_hlist_head *table; }; struct dm_snapshot { @@ -628,8 +633,8 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk); /* Lock to protect access to the completed and pending exception hash tables. 
*/ struct dm_exception_table_lock { - struct hlist_bl_head *complete_slot; - struct hlist_bl_head *pending_slot; + spinlock_t *complete_slot; + spinlock_t *pending_slot; }; static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk, @@ -638,20 +643,20 @@ static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk, struct dm_exception_table *complete = &s->complete; struct dm_exception_table *pending = &s->pending; - lock->complete_slot = &complete->table[exception_hash(complete, chunk)]; - lock->pending_slot = &pending->table[exception_hash(pending, chunk)]; + lock->complete_slot = &complete->table[exception_hash(complete, chunk)].lock; + lock->pending_slot = &pending->table[exception_hash(pending, chunk)].lock; } static void dm_exception_table_lock(struct dm_exception_table_lock *lock) { - hlist_bl_lock(lock->complete_slot); - hlist_bl_lock(lock->pending_slot); + spin_lock_nested(lock->complete_slot, 1); + spin_lock_nested(lock->pending_slot, 2); } static void dm_exception_table_unlock(struct dm_exception_table_lock *lock) { - hlist_bl_unlock(lock->pending_slot); - hlist_bl_unlock(lock->complete_slot); + spin_unlock(lock->pending_slot); + spin_unlock(lock->complete_slot); } static int dm_exception_table_init(struct dm_exception_table *et, @@ -661,13 +666,15 @@ static int dm_exception_table_init(struct dm_exception_table *et, et->hash_shift = hash_shift; et->hash_mask = size - 1; - et->table = kvmalloc_array(size, sizeof(struct hlist_bl_head), + et->table = kvmalloc_array(size, sizeof(struct dm_hlist_head), GFP_KERNEL); if (!et->table) return -ENOMEM; - for (i = 0; i < size; i++) - INIT_HLIST_BL_HEAD(et->table + i); + for (i = 0; i < size; i++) { + INIT_HLIST_HEAD(&et->table[i].head); + spin_lock_init(&et->table[i].lock); + } return 0; } @@ -675,16 +682,17 @@ static int dm_exception_table_init(struct dm_exception_table *et, static void dm_exception_table_exit(struct dm_exception_table *et, struct kmem_cache *mem) { - struct hlist_bl_head *slot; + struct dm_hlist_head *slot; struct dm_exception *ex; - struct hlist_bl_node *pos, *n; + struct hlist_node *pos; int i, size; size = et->hash_mask + 1; for (i = 0; i < size; i++) { slot = et->table + i; - hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) { + hlist_for_each_entry_safe(ex, pos, &slot->head, hash_list) { + hlist_del(&ex->hash_list); kmem_cache_free(mem, ex); cond_resched(); } @@ -700,7 +708,7 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) static void dm_remove_exception(struct dm_exception *e) { - hlist_bl_del(&e->hash_list); + hlist_del(&e->hash_list); } /* @@ -710,12 +718,11 @@ static void dm_remove_exception(struct dm_exception *e) static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, chunk_t chunk) { - struct hlist_bl_head *slot; - struct hlist_bl_node *pos; + struct hlist_head *slot; struct dm_exception *e; - slot = &et->table[exception_hash(et, chunk)]; - hlist_bl_for_each_entry(e, pos, slot, hash_list) + slot = &et->table[exception_hash(et, chunk)].head; + hlist_for_each_entry(e, slot, hash_list) if (chunk >= e->old_chunk && chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) return e; @@ -762,18 +769,17 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) static void dm_insert_exception(struct dm_exception_table *eh, struct dm_exception *new_e) { - struct hlist_bl_head *l; - struct hlist_bl_node *pos; + struct hlist_head *l; struct dm_exception *e = NULL; - l = &eh->table[exception_hash(eh, 
new_e->old_chunk)]; + l = &eh->table[exception_hash(eh, new_e->old_chunk)].head; /* Add immediately if this table doesn't support consecutive chunks */ if (!eh->hash_shift) goto out; /* List is ordered by old_chunk */ - hlist_bl_for_each_entry(e, pos, l, hash_list) { + hlist_for_each_entry(e, l, hash_list) { /* Insert after an existing chunk? */ if (new_e->old_chunk == (e->old_chunk + dm_consecutive_chunk_count(e) + 1) && @@ -804,13 +810,13 @@ out: * Either the table doesn't support consecutive chunks or slot * l is empty. */ - hlist_bl_add_head(&new_e->hash_list, l); + hlist_add_head(&new_e->hash_list, l); } else if (new_e->old_chunk < e->old_chunk) { /* Add before an existing exception */ - hlist_bl_add_before(&new_e->hash_list, &e->hash_list); + hlist_add_before(&new_e->hash_list, &e->hash_list); } else { /* Add to l's tail: e is the last exception in this slot */ - hlist_bl_add_behind(&new_e->hash_list, &e->hash_list); + hlist_add_behind(&new_e->hash_list, &e->hash_list); } } @@ -820,7 +826,6 @@ out: */ static int dm_add_exception(void *context, chunk_t old, chunk_t new) { - struct dm_exception_table_lock lock; struct dm_snapshot *s = context; struct dm_exception *e; @@ -833,17 +838,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) /* Consecutive_count is implicitly initialised to zero */ e->new_chunk = new; - /* - * Although there is no need to lock access to the exception tables - * here, if we don't then hlist_bl_add_head(), called by - * dm_insert_exception(), will complain about accessing the - * corresponding list without locking it first. - */ - dm_exception_table_lock_init(s, old, &lock); - - dm_exception_table_lock(&lock); dm_insert_exception(&s->complete, e); - dm_exception_table_unlock(&lock); return 0; } @@ -873,7 +868,7 @@ static int calc_max_buckets(void) /* use a fixed size of 2MB */ unsigned long mem = 2 * 1024 * 1024; - mem /= sizeof(struct hlist_bl_head); + mem /= sizeof(struct dm_hlist_head); return mem; } diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index bfaef27ca79f..22bc70923a83 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -86,17 +86,13 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) { - sprintf(buf, "%d\n", dm_suspended_md(md)); - - return strlen(buf); + return sysfs_emit(buf, "%d\n", dm_suspended_md(md)); } static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf) { /* Purely for userspace compatibility */ - sprintf(buf, "%d\n", true); - - return strlen(buf); + return sysfs_emit(buf, "%d\n", true); } static DM_ATTR_RO(name); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ad0a60a07b93..0522cd700e0e 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -2043,6 +2043,10 @@ bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size, return true; } +/* + * This function will be skipped by noflush reloads of immutable request + * based devices (dm-mpath). 
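The dm-sysfs hunk above swaps sprintf()-plus-strlen() for sysfs_emit(), which bounds the output to the one-page sysfs buffer and returns the number of bytes written. A minimal show callback in that style (the attribute and the value it reports are made up for illustration):

#include <linux/kobject.h>
#include <linux/sysfs.h>

static int example_value;	/* stand-in for whatever state is exported */

static ssize_t example_show(struct kobject *kobj, struct kobj_attribute *attr,
			    char *buf)
{
	/* sysfs_emit() refuses to write past PAGE_SIZE and returns the length. */
	return sysfs_emit(buf, "%d\n", example_value);
}

static struct kobj_attribute example_attr = __ATTR_RO(example);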
+ */ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index c84149ba4e38..52ffb495f5a8 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -395,13 +395,13 @@ static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio * op->bio = NULL; } -static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e) +static void issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e) { struct thin_c *tc = op->tc; sector_t s = block_to_sectors(tc->pool, data_b); sector_t len = block_to_sectors(tc->pool, data_e - data_b); - return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOIO, &op->bio); + __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOIO, &op->bio); } static void end_discard(struct discard_op *op, int r) @@ -1113,9 +1113,7 @@ static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m break; } - r = issue_discard(&op, b, e); - if (r) - goto out; + issue_discard(&op, b, e); b = e; } @@ -1188,8 +1186,8 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) struct discard_op op; begin_discard(&op, tc, discard_parent); - r = issue_discard(&op, m->data_block, data_end); - end_discard(&op, r); + issue_discard(&op, m->data_block, data_end); + end_discard(&op, 0); } } @@ -4383,11 +4381,8 @@ static void thin_postsuspend(struct dm_target *ti) { struct thin_c *tc = ti->private; - /* - * The dm_noflush_suspending flag has been cleared by now, so - * unfortunately we must always run this. - */ - noflush_work(tc, do_noflush_stop); + if (dm_noflush_suspending(ti)) + noflush_work(tc, do_noflush_stop); } static int thin_preresume(struct dm_target *ti) diff --git a/drivers/md/dm-vdo/action-manager.c b/drivers/md/dm-vdo/action-manager.c index a0e5e7077d13..e3bba0b28aad 100644 --- a/drivers/md/dm-vdo/action-manager.c +++ b/drivers/md/dm-vdo/action-manager.c @@ -43,7 +43,7 @@ struct action { * @actions: The two action slots. * @current_action: The current action slot. * @zones: The number of zones in which an action is to be applied. - * @Scheduler: A function to schedule a default next action. + * @scheduler: A function to schedule a default next action. * @get_zone_thread_id: A function to get the id of the thread on which to apply an action to a * zone. * @initiator_thread_id: The ID of the thread on which actions may be initiated. diff --git a/drivers/md/dm-vdo/admin-state.c b/drivers/md/dm-vdo/admin-state.c index 3f9dba525154..da153fef085e 100644 --- a/drivers/md/dm-vdo/admin-state.c +++ b/drivers/md/dm-vdo/admin-state.c @@ -149,7 +149,8 @@ const struct admin_state_code *VDO_ADMIN_STATE_RESUMING = &VDO_CODE_RESUMING; /** * get_next_state() - Determine the state which should be set after a given operation completes * based on the operation and the current state. - * @operation The operation to be started. + * @state: The current admin state. + * @operation: The operation to be started. * * Return: The state to set when the operation completes or NULL if the operation can not be * started in the current state. @@ -187,6 +188,8 @@ static const struct admin_state_code *get_next_state(const struct admin_state *s /** * vdo_finish_operation() - Finish the current operation. + * @state: The current admin state. + * @result: The result of the operation. * * Will notify the operation waiter if there is one. 
This method should be used for operations * started with vdo_start_operation(). For operations which were started with vdo_start_draining(), @@ -214,8 +217,10 @@ bool vdo_finish_operation(struct admin_state *state, int result) /** * begin_operation() - Begin an operation if it may be started given the current state. - * @waiter A completion to notify when the operation is complete; may be NULL. - * @initiator The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. + * @state: The current admin state. + * @operation: The operation to be started. + * @waiter: A completion to notify when the operation is complete; may be NULL. + * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. * * Return: VDO_SUCCESS or an error. */ @@ -259,8 +264,10 @@ static int __must_check begin_operation(struct admin_state *state, /** * start_operation() - Start an operation if it may be started given the current state. - * @waiter A completion to notify when the operation is complete. - * @initiator The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. + * @state: The current admin state. + * @operation: The operation to be started. + * @waiter: A completion to notify when the operation is complete; may be NULL. + * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. * * Return: true if the operation was started. */ @@ -274,10 +281,10 @@ static inline bool __must_check start_operation(struct admin_state *state, /** * check_code() - Check the result of a state validation. - * @valid true if the code is of an appropriate type. - * @code The code which failed to be of the correct type. - * @what What the code failed to be, for logging. - * @waiter The completion to notify of the error; may be NULL. + * @valid: True if the code is of an appropriate type. + * @code: The code which failed to be of the correct type. + * @what: What the code failed to be, for logging. + * @waiter: The completion to notify of the error; may be NULL. * * If the result failed, log an invalid state error and, if there is a waiter, notify it. * @@ -301,7 +308,8 @@ static bool check_code(bool valid, const struct admin_state_code *code, const ch /** * assert_vdo_drain_operation() - Check that an operation is a drain. - * @waiter The completion to finish with an error if the operation is not a drain. + * @operation: The operation to check. + * @waiter: The completion to finish with an error if the operation is not a drain. * * Return: true if the specified operation is a drain. */ @@ -313,9 +321,10 @@ static bool __must_check assert_vdo_drain_operation(const struct admin_state_cod /** * vdo_start_draining() - Initiate a drain operation if the current state permits it. - * @operation The type of drain to initiate. - * @waiter The completion to notify when the drain is complete. - * @initiator The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. + * @state: The current admin state. + * @operation: The type of drain to initiate. + * @waiter: The completion to notify when the drain is complete. + * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. * * Return: true if the drain was initiated, if not the waiter will be notified. */ @@ -345,6 +354,7 @@ bool vdo_start_draining(struct admin_state *state, /** * vdo_finish_draining() - Finish a drain operation if one was in progress. + * @state: The current admin state. 
* * Return: true if the state was draining; will notify the waiter if so. */ @@ -355,6 +365,8 @@ bool vdo_finish_draining(struct admin_state *state) /** * vdo_finish_draining_with_result() - Finish a drain operation with a status code. + * @state: The current admin state. + * @result: The result of the drain operation. * * Return: true if the state was draining; will notify the waiter if so. */ @@ -365,7 +377,8 @@ bool vdo_finish_draining_with_result(struct admin_state *state, int result) /** * vdo_assert_load_operation() - Check that an operation is a load. - * @waiter The completion to finish with an error if the operation is not a load. + * @operation: The operation to check. + * @waiter: The completion to finish with an error if the operation is not a load. * * Return: true if the specified operation is a load. */ @@ -377,9 +390,10 @@ bool vdo_assert_load_operation(const struct admin_state_code *operation, /** * vdo_start_loading() - Initiate a load operation if the current state permits it. - * @operation The type of load to initiate. - * @waiter The completion to notify when the load is complete (may be NULL). - * @initiator The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. + * @state: The current admin state. + * @operation: The type of load to initiate. + * @waiter: The completion to notify when the load is complete; may be NULL. + * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. * * Return: true if the load was initiated, if not the waiter will be notified. */ @@ -393,6 +407,7 @@ bool vdo_start_loading(struct admin_state *state, /** * vdo_finish_loading() - Finish a load operation if one was in progress. + * @state: The current admin state. * * Return: true if the state was loading; will notify the waiter if so. */ @@ -403,7 +418,8 @@ bool vdo_finish_loading(struct admin_state *state) /** * vdo_finish_loading_with_result() - Finish a load operation with a status code. - * @result The result of the load operation. + * @state: The current admin state. + * @result: The result of the load operation. * * Return: true if the state was loading; will notify the waiter if so. */ @@ -414,7 +430,8 @@ bool vdo_finish_loading_with_result(struct admin_state *state, int result) /** * assert_vdo_resume_operation() - Check whether an admin_state_code is a resume operation. - * @waiter The completion to notify if the operation is not a resume operation; may be NULL. + * @operation: The operation to check. + * @waiter: The completion to notify if the operation is not a resume operation; may be NULL. * * Return: true if the code is a resume operation. */ @@ -427,9 +444,10 @@ static bool __must_check assert_vdo_resume_operation(const struct admin_state_co /** * vdo_start_resuming() - Initiate a resume operation if the current state permits it. - * @operation The type of resume to start. - * @waiter The completion to notify when the resume is complete (may be NULL). - * @initiator The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. + * @state: The current admin state. + * @operation: The type of resume to start. + * @waiter: The completion to notify when the resume is complete; may be NULL. + * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. * * Return: true if the resume was initiated, if not the waiter will be notified. */ @@ -443,6 +461,7 @@ bool vdo_start_resuming(struct admin_state *state, /** * vdo_finish_resuming() - Finish a resume operation if one was in progress. 
+ * @state: The current admin state. * * Return: true if the state was resuming; will notify the waiter if so. */ @@ -453,7 +472,8 @@ bool vdo_finish_resuming(struct admin_state *state) /** * vdo_finish_resuming_with_result() - Finish a resume operation with a status code. - * @result The result of the resume operation. + * @state: The current admin state. + * @result: The result of the resume operation. * * Return: true if the state was resuming; will notify the waiter if so. */ @@ -465,6 +485,7 @@ bool vdo_finish_resuming_with_result(struct admin_state *state, int result) /** * vdo_resume_if_quiescent() - Change the state to normal operation if the current state is * quiescent. + * @state: The current admin state. * * Return: VDO_SUCCESS if the state resumed, VDO_INVALID_ADMIN_STATE otherwise. */ @@ -479,6 +500,8 @@ int vdo_resume_if_quiescent(struct admin_state *state) /** * vdo_start_operation() - Attempt to start an operation. + * @state: The current admin state. + * @operation: The operation to attempt to start. * * Return: VDO_SUCCESS if the operation was started, VDO_INVALID_ADMIN_STATE if not */ @@ -490,8 +513,10 @@ int vdo_start_operation(struct admin_state *state, /** * vdo_start_operation_with_waiter() - Attempt to start an operation. - * @waiter the completion to notify when the operation completes or fails to start; may be NULL. - * @initiator The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. + * @state: The current admin state. + * @operation: The operation to attempt to start. + * @waiter: The completion to notify when the operation completes or fails to start; may be NULL. + * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. * * Return: VDO_SUCCESS if the operation was started, VDO_INVALID_ADMIN_STATE if not */ diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index baf683cabb1b..a7db5b41155e 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -174,6 +174,7 @@ static inline struct vdo_page_completion *page_completion_from_waiter(struct vdo /** * initialize_info() - Initialize all page info structures and put them on the free list. + * @cache: The page cache. * * Return: VDO_SUCCESS or an error. */ @@ -209,6 +210,7 @@ static int initialize_info(struct vdo_page_cache *cache) /** * allocate_cache_components() - Allocate components of the cache which require their own * allocation. + * @cache: The page cache. * * The caller is responsible for all clean up on errors. * @@ -238,6 +240,8 @@ static int __must_check allocate_cache_components(struct vdo_page_cache *cache) /** * assert_on_cache_thread() - Assert that a function has been called on the VDO page cache's * thread. + * @cache: The page cache. + * @function_name: The function name to report if the assertion fails. */ static inline void assert_on_cache_thread(struct vdo_page_cache *cache, const char *function_name) @@ -271,6 +275,7 @@ static void report_cache_pressure(struct vdo_page_cache *cache) /** * get_page_state_name() - Return the name of a page state. + * @state: The page state to describe. * * If the page state is invalid a static string is returned and the invalid state is logged. * @@ -342,6 +347,8 @@ static void update_lru(struct page_info *info) /** * set_info_state() - Set the state of a page_info and put it on the right list, adjusting * counters. + * @info: The page info to update. + * @new_state: The new state to set.
*/ static void set_info_state(struct page_info *info, enum vdo_page_buffer_state new_state) { @@ -416,6 +423,7 @@ static int reset_page_info(struct page_info *info) /** * find_free_page() - Find a free page. + * @cache: The page cache. * * Return: A pointer to the page info structure (if found), NULL otherwise. */ @@ -433,6 +441,7 @@ static struct page_info * __must_check find_free_page(struct vdo_page_cache *cac /** * find_page() - Find the page info (if any) associated with a given pbn. + * @cache: The page cache. * @pbn: The absolute physical block number of the page. * * Return: The page info for the page if available, or NULL if not. @@ -449,6 +458,7 @@ static struct page_info * __must_check find_page(struct vdo_page_cache *cache, /** * select_lru_page() - Determine which page is least recently used. + * @cache: The page cache. * * Picks the least recently used from among the non-busy entries at the front of each of the lru * list. Since whenever we mark a page busy we also put it to the end of the list it is unlikely @@ -523,6 +533,8 @@ static void complete_waiter_with_page(struct vdo_waiter *waiter, void *page_info /** * distribute_page_over_waitq() - Complete a waitq of VDO page completions with a page result. + * @info: The loaded page info. + * @waitq: The list of waiting data_vios. * * Upon completion the waitq will be empty. * @@ -548,7 +560,9 @@ static unsigned int distribute_page_over_waitq(struct page_info *info, /** * set_persistent_error() - Set a persistent error which all requests will receive in the future. + * @cache: The page cache. * @context: A string describing what triggered the error. + * @result: The error result to set on the cache. * * Once triggered, all enqueued completions will get this error. Any future requests will result in * this error as well. @@ -581,6 +595,7 @@ static void set_persistent_error(struct vdo_page_cache *cache, const char *conte /** * validate_completed_page() - Check that a page completion which is being freed to the cache * referred to a valid page and is in a valid state. + * @completion: The page completion to check. * @writable: Whether a writable page is required. * * Return: VDO_SUCCESS if the page was valid, otherwise as error @@ -758,6 +773,8 @@ static void load_cache_page_endio(struct bio *bio) /** * launch_page_load() - Begin the process of loading a page. + * @info: The page info to launch. + * @pbn: The absolute physical block number of the page to load. * * Return: VDO_SUCCESS or an error code. */ @@ -836,6 +853,7 @@ static void save_pages(struct vdo_page_cache *cache) /** * schedule_page_save() - Add a page to the outgoing list of pages waiting to be saved. + * @info: The page info to save. * * Once in the list, a page may not be used until it has been written out. */ @@ -854,6 +872,7 @@ static void schedule_page_save(struct page_info *info) /** * launch_page_save() - Add a page to outgoing pages waiting to be saved, and then start saving * pages if another save is not in progress. + * @info: The page info to save. */ static void launch_page_save(struct page_info *info) { @@ -864,6 +883,7 @@ static void launch_page_save(struct page_info *info) /** * completion_needs_page() - Determine whether a given vdo_page_completion (as a waiter) is * requesting a given page number. + * @waiter: The page completion waiter to check. * @context: A pointer to the pbn of the desired page. * * Implements waiter_match_fn. 
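Most of the dm-vdo hunks in this section, like the ones just above, simply add the @parameter lines that the kernel-doc tooling warns about when they are missing. For reference, a complete kernel-doc header has roughly this shape (the function, structure, and parameter names are invented purely for illustration):

struct example_widget {
	int id;
};

/**
 * example_frobnicate() - Frobnicate one widget.
 * @widget: The widget to act on.
 * @flags: Behaviour modifiers; may be 0.
 *
 * Context: Process context; may sleep.
 *
 * Return: 0 on success or a negative errno on failure.
 */
static int example_frobnicate(struct example_widget *widget, unsigned int flags)
{
	return 0;
}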
@@ -880,6 +900,7 @@ static bool completion_needs_page(struct vdo_waiter *waiter, void *context) /** * allocate_free_page() - Allocate a free page to the first completion in the waiting queue, and * any other completions that match it in page number. + * @info: The page info to allocate a page for. */ static void allocate_free_page(struct page_info *info) { @@ -925,6 +946,7 @@ static void allocate_free_page(struct page_info *info) /** * discard_a_page() - Begin the process of discarding a page. + * @cache: The page cache. * * If no page is discardable, increments a count of deferred frees so that the next release of a * page which is no longer busy will kick off another discard cycle. This is an indication that the @@ -955,10 +977,6 @@ static void discard_a_page(struct vdo_page_cache *cache) launch_page_save(info); } -/** - * discard_page_for_completion() - Helper used to trigger a discard so that the completion can get - * a different page. - */ static void discard_page_for_completion(struct vdo_page_completion *vdo_page_comp) { struct vdo_page_cache *cache = vdo_page_comp->cache; @@ -1132,6 +1150,7 @@ static void write_pages(struct vdo_completion *flush_completion) /** * vdo_release_page_completion() - Release a VDO Page Completion. + * @completion: The page completion to release. * * The page referenced by this completion (if any) will no longer be held busy by this completion. * If a page becomes discardable and there are completions awaiting free pages then a new round of @@ -1172,10 +1191,6 @@ void vdo_release_page_completion(struct vdo_completion *completion) } } -/** - * load_page_for_completion() - Helper function to load a page as described by a VDO Page - * Completion. - */ static void load_page_for_completion(struct page_info *info, struct vdo_page_completion *vdo_page_comp) { @@ -1319,6 +1334,7 @@ int vdo_get_cached_page(struct vdo_completion *completion, /** * vdo_invalidate_page_cache() - Invalidate all entries in the VDO page cache. + * @cache: The page cache. * * There must not be any dirty pages in the cache. * @@ -1345,6 +1361,10 @@ int vdo_invalidate_page_cache(struct vdo_page_cache *cache) /** * get_tree_page_by_index() - Get the tree page for a given height and page index. + * @forest: The block map forest. + * @root_index: The root index of the tree to search. + * @height: The height in the tree. + * @page_index: The page index. * * Return: The requested page. */ @@ -2211,6 +2231,7 @@ static void allocate_block_map_page(struct block_map_zone *zone, /** * vdo_find_block_map_slot() - Find the block map slot in which the block map entry for a data_vio * resides and cache that result in the data_vio. + * @data_vio: The data vio. * * All ancestors in the tree will be allocated or loaded, as needed. */ @@ -2435,6 +2456,7 @@ static void deforest(struct forest *forest, size_t first_page_segment) /** * make_forest() - Make a collection of trees for a block_map, expanding the existing forest if * there is one. + * @map: The block map. * @entries: The number of entries the block map will hold. * * Return: VDO_SUCCESS or an error. @@ -2476,6 +2498,7 @@ static int make_forest(struct block_map *map, block_count_t entries) /** * replace_forest() - Replace a block_map's forest with the already-prepared larger forest. + * @map: The block map. */ static void replace_forest(struct block_map *map) { @@ -2492,6 +2515,7 @@ static void replace_forest(struct block_map *map) /** * finish_cursor() - Finish the traversal of a single tree. If it was the last cursor, finish the * traversal. 
+ * @cursor: The cursor to complete. */ static void finish_cursor(struct cursor *cursor) { @@ -2549,6 +2573,7 @@ static void traversal_endio(struct bio *bio) /** * traverse() - Traverse a single block map tree. + * @cursor: A cursor tracking traversal progress. * * This is the recursive heart of the traversal process. */ @@ -2619,6 +2644,7 @@ static void traverse(struct cursor *cursor) /** * launch_cursor() - Start traversing a single block map tree now that the cursor has a VIO with * which to load pages. + * @waiter: The parent of the cursor to launch. * @context: The pooled_vio just acquired. * * Implements waiter_callback_fn. @@ -2636,6 +2662,8 @@ static void launch_cursor(struct vdo_waiter *waiter, void *context) /** * compute_boundary() - Compute the number of pages used at each level of the given root's tree. + * @map: The block map. + * @root_index: The tree root index. * * Return: The list of page counts as a boundary structure. */ @@ -2668,6 +2696,7 @@ static struct boundary compute_boundary(struct block_map *map, root_count_t root /** * vdo_traverse_forest() - Walk the entire forest of a block map. + * @map: The block map. * @callback: A function to call with the pbn of each allocated node in the forest. * @completion: The completion to notify on each traversed PBN, and when traversal completes. */ @@ -2707,6 +2736,9 @@ void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback, /** * initialize_block_map_zone() - Initialize the per-zone portions of the block map. + * @map: The block map. + * @zone_number: The zone to initialize. + * @cache_size: The total block map cache size. * @maximum_age: The number of journal blocks before a dirtied page is considered old and must be * written out. */ @@ -3091,6 +3123,7 @@ static void fetch_mapping_page(struct data_vio *data_vio, bool modifiable, /** * clear_mapped_location() - Clear a data_vio's mapped block location, setting it to be unmapped. + * @data_vio: The data vio. * * This indicates the block map entry for the logical block is either unmapped or corrupted. */ @@ -3104,6 +3137,8 @@ static void clear_mapped_location(struct data_vio *data_vio) /** * set_mapped_location() - Decode and validate a block map entry, and set the mapped location of a * data_vio. + * @data_vio: The data vio. + * @entry: The new mapped entry to set. * * Return: VDO_SUCCESS or VDO_BAD_MAPPING if the map entry is invalid or an error code for any * other failure diff --git a/drivers/md/dm-vdo/completion.c b/drivers/md/dm-vdo/completion.c index 5ad85334632d..2f00acbb3b2b 100644 --- a/drivers/md/dm-vdo/completion.c +++ b/drivers/md/dm-vdo/completion.c @@ -65,6 +65,8 @@ static inline void assert_incomplete(struct vdo_completion *completion) /** * vdo_set_completion_result() - Set the result of a completion. + * @completion: The completion to update. + * @result: The result to set. * * Older errors will not be masked. */ @@ -77,6 +79,7 @@ void vdo_set_completion_result(struct vdo_completion *completion, int result) /** * vdo_launch_completion_with_priority() - Run or enqueue a completion. + * @completion: The completion to launch. * @priority: The priority at which to enqueue the completion. * * If called on the correct thread (i.e. the one specified in the completion's callback_thread_id @@ -125,6 +128,8 @@ void vdo_enqueue_completion(struct vdo_completion *completion, /** * vdo_requeue_completion_if_needed() - Requeue a completion if not called on the specified thread. + * @completion: The completion to requeue. 
+ * @callback_thread_id: The thread on which to requeue the completion. * * Return: True if the completion was requeued; callers may not access the completion in this case. */ diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index 262e11581f2d..3333e1e5b02e 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -227,6 +227,7 @@ static inline u64 get_arrival_time(struct bio *bio) /** * check_for_drain_complete_locked() - Check whether a data_vio_pool has no outstanding data_vios * or waiters while holding the pool's lock. + * @pool: The data_vio pool. */ static bool check_for_drain_complete_locked(struct data_vio_pool *pool) { @@ -387,6 +388,7 @@ struct data_vio_compression_status advance_data_vio_compression_stage(struct dat /** * cancel_data_vio_compression() - Prevent this data_vio from being compressed or packed. + * @data_vio: The data_vio. * * Return: true if the data_vio is in the packer and the caller was the first caller to cancel it. */ @@ -483,6 +485,8 @@ static void attempt_logical_block_lock(struct vdo_completion *completion) /** * launch_data_vio() - (Re)initialize a data_vio to have a new logical block number, keeping the * same parent and other state and send it on its way. + * @data_vio: The data_vio to launch. + * @lbn: The logical block number. */ static void launch_data_vio(struct data_vio *data_vio, logical_block_number_t lbn) { @@ -641,6 +645,7 @@ static void update_limiter(struct limiter *limiter) /** * schedule_releases() - Ensure that release processing is scheduled. + * @pool: The data_vio pool. * * If this call switches the state to processing, enqueue. Otherwise, some other thread has already * done so. @@ -768,6 +773,8 @@ static void initialize_limiter(struct limiter *limiter, struct data_vio_pool *po /** * initialize_data_vio() - Allocate the components of a data_vio. + * @data_vio: The data_vio to initialize. + * @vdo: The vdo containing the data_vio. * * The caller is responsible for cleaning up the data_vio on error. * @@ -880,6 +887,7 @@ int make_data_vio_pool(struct vdo *vdo, data_vio_count_t pool_size, /** * free_data_vio_pool() - Free a data_vio_pool and the data_vios in it. + * @pool: The data_vio pool to free. * * All data_vios must be returned to the pool before calling this function. */ @@ -944,6 +952,8 @@ static void wait_permit(struct limiter *limiter, struct bio *bio) /** * vdo_launch_bio() - Acquire a data_vio from the pool, assign the bio to it, and launch it. + * @pool: The data_vio pool. + * @bio: The bio to launch. * * This will block if data_vios or discard permits are not available. */ @@ -994,6 +1004,7 @@ static void assert_on_vdo_cpu_thread(const struct vdo *vdo, const char *name) /** * drain_data_vio_pool() - Wait asynchronously for all data_vios to be returned to the pool. + * @pool: The data_vio pool. * @completion: The completion to notify when the pool has drained. */ void drain_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion) @@ -1005,6 +1016,7 @@ void drain_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *comp /** * resume_data_vio_pool() - Resume a data_vio pool. + * @pool: The data_vio pool. * @completion: The completion to notify when the pool has resumed. */ void resume_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion) @@ -1024,6 +1036,7 @@ static void dump_limiter(const char *name, struct limiter *limiter) /** * dump_data_vio_pool() - Dump a data_vio pool to the log. + * @pool: The data_vio pool. 
* @dump_vios: Whether to dump the details of each busy data_vio as well. */ void dump_data_vio_pool(struct data_vio_pool *pool, bool dump_vios) @@ -1114,6 +1127,7 @@ static void perform_cleanup_stage(struct data_vio *data_vio, /** * release_allocated_lock() - Release the PBN lock and/or the reference on the allocated block at * the end of processing a data_vio. + * @completion: The data_vio holding the lock. */ static void release_allocated_lock(struct vdo_completion *completion) { @@ -1194,6 +1208,7 @@ static void transfer_lock(struct data_vio *data_vio, struct lbn_lock *lock) /** * release_logical_lock() - Release the logical block lock and flush generation lock at the end of * processing a data_vio. + * @completion: The data_vio holding the lock. */ static void release_logical_lock(struct vdo_completion *completion) { @@ -1228,6 +1243,7 @@ static void clean_hash_lock(struct vdo_completion *completion) /** * finish_cleanup() - Make some assertions about a data_vio which has finished cleaning up. + * @data_vio: The data_vio. * * If it is part of a multi-block discard, starts on the next block, otherwise, returns it to the * pool. @@ -1342,6 +1358,7 @@ void handle_data_vio_error(struct vdo_completion *completion) /** * get_data_vio_operation_name() - Get the name of the last asynchronous operation performed on a * data_vio. + * @data_vio: The data_vio. */ const char *get_data_vio_operation_name(struct data_vio *data_vio) { @@ -1355,7 +1372,7 @@ const char *get_data_vio_operation_name(struct data_vio *data_vio) /** * data_vio_allocate_data_block() - Allocate a data block. - * + * @data_vio: The data_vio. * @write_lock_type: The type of write lock to obtain on the block. * @callback: The callback which will attempt an allocation in the current zone and continue if it * succeeds. @@ -1379,6 +1396,7 @@ void data_vio_allocate_data_block(struct data_vio *data_vio, /** * release_data_vio_allocation_lock() - Release the PBN lock on a data_vio's allocated block. + * @data_vio: The data_vio. * @reset: If true, the allocation will be reset (i.e. any allocated pbn will be forgotten). * * If the reference to the locked block is still provisional, it will be released as well. @@ -1399,6 +1417,7 @@ void release_data_vio_allocation_lock(struct data_vio *data_vio, bool reset) /** * uncompress_data_vio() - Uncompress the data a data_vio has just read. + * @data_vio: The data_vio. * @mapping_state: The mapping state indicating which fragment to decompress. * @buffer: The buffer to receive the uncompressed data. */ @@ -1519,6 +1538,7 @@ static void complete_zero_read(struct vdo_completion *completion) /** * read_block() - Read a block asynchronously. + * @completion: The data_vio doing the read. * * This is the callback registered in read_block_mapping(). */ @@ -1675,6 +1695,7 @@ static void journal_remapping(struct vdo_completion *completion) /** * read_old_block_mapping() - Get the previous PBN/LBN mapping of an in-progress write. + * @completion: The data_vio doing the read. * * Gets the previous PBN mapped to this LBN from the block map, so as to make an appropriate * journal entry referencing the removal of this LBN->PBN mapping. @@ -1704,6 +1725,7 @@ void update_metadata_for_data_vio_write(struct data_vio *data_vio, struct pbn_lo /** * pack_compressed_data() - Attempt to pack the compressed data_vio into a block. + * @completion: The data_vio. * * This is the callback registered in launch_compress_data_vio(). 
*/ @@ -1725,6 +1747,7 @@ static void pack_compressed_data(struct vdo_completion *completion) /** * compress_data_vio() - Do the actual work of compressing the data on a CPU queue. + * @completion: The data_vio. * * This callback is registered in launch_compress_data_vio(). */ @@ -1754,6 +1777,7 @@ static void compress_data_vio(struct vdo_completion *completion) /** * launch_compress_data_vio() - Continue a write by attempting to compress the data. + * @data_vio: The data_vio. * * This is a re-entry point to vio_write used by hash locks. */ @@ -1796,7 +1820,8 @@ void launch_compress_data_vio(struct data_vio *data_vio) /** * hash_data_vio() - Hash the data in a data_vio and set the hash zone (which also flags the record * name as set). - + * @completion: The data_vio. + * * This callback is registered in prepare_for_dedupe(). */ static void hash_data_vio(struct vdo_completion *completion) @@ -1832,6 +1857,7 @@ static void prepare_for_dedupe(struct data_vio *data_vio) /** * write_bio_finished() - This is the bio_end_io function registered in write_block() to be called * when a data_vio's write to the underlying storage has completed. + * @bio: The bio to update. */ static void write_bio_finished(struct bio *bio) { @@ -1884,6 +1910,7 @@ void write_data_vio(struct data_vio *data_vio) /** * acknowledge_write_callback() - Acknowledge a write to the requestor. + * @completion: The data_vio. * * This callback is registered in allocate_block() and continue_write_with_block_map_slot(). */ @@ -1909,6 +1936,7 @@ static void acknowledge_write_callback(struct vdo_completion *completion) /** * allocate_block() - Attempt to allocate a block in the current allocation zone. + * @completion: The data_vio. * * This callback is registered in continue_write_with_block_map_slot(). */ @@ -1941,6 +1969,7 @@ static void allocate_block(struct vdo_completion *completion) /** * handle_allocation_error() - Handle an error attempting to allocate a block. + * @completion: The data_vio. * * This error handler is registered in continue_write_with_block_map_slot(). */ @@ -1970,6 +1999,7 @@ static int assert_is_discard(struct data_vio *data_vio) /** * continue_data_vio_with_block_map_slot() - Read the data_vio's mapping from the block map. + * @completion: The data_vio to continue. * * This callback is registered in launch_read_data_vio(). */ diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 4d983092a152..75a26f3f4461 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -917,6 +917,8 @@ static int __must_check acquire_lock(struct hash_zone *zone, /** * enter_forked_lock() - Bind the data_vio to a new hash lock. + * @waiter: The data_vio's waiter link. + * @context: The new hash lock. * * Implements waiter_callback_fn. Binds the data_vio that was waiting to a new hash lock and waits * on that lock. @@ -971,7 +973,7 @@ static void fork_hash_lock(struct hash_lock *old_lock, struct data_vio *new_agen * path. * @lock: The hash lock. * @data_vio: The data_vio to deduplicate using the hash lock. - * @has_claim: true if the data_vio already has claimed an increment from the duplicate lock. + * @has_claim: True if the data_vio already has claimed an increment from the duplicate lock. * * If no increments are available, this will roll over to a new hash lock and launch the data_vio * as the writing agent for that lock. @@ -996,7 +998,7 @@ static void launch_dedupe(struct hash_lock *lock, struct data_vio *data_vio, * true copy of their data on disk. * @lock: The hash lock. 
* @agent: The data_vio acting as the agent for the lock. - * @agent_is_done: true only if the agent has already written or deduplicated against its data. + * @agent_is_done: True only if the agent has already written or deduplicated against its data. * * If the agent itself needs to deduplicate, an increment for it must already have been claimed * from the duplicate lock, ensuring the hash lock will still have a data_vio holding it. @@ -2146,8 +2148,8 @@ static void start_expiration_timer(struct dedupe_context *context) /** * report_dedupe_timeouts() - Record and eventually report that some dedupe requests reached their * expiration time without getting answers, so we timed them out. - * @zones: the hash zones. - * @timeouts: the number of newly timed out requests. + * @zones: The hash zones. + * @timeouts: The number of newly timed out requests. */ static void report_dedupe_timeouts(struct hash_zones *zones, unsigned int timeouts) { @@ -2509,6 +2511,8 @@ static void initiate_suspend_index(struct admin_state *state) /** * suspend_index() - Suspend the UDS index prior to draining hash zones. + * @context: Not used. + * @completion: The completion for the suspend operation. * * Implements vdo_action_preamble_fn */ @@ -2521,21 +2525,13 @@ static void suspend_index(void *context, struct vdo_completion *completion) initiate_suspend_index); } -/** - * initiate_drain() - Initiate a drain. - * - * Implements vdo_admin_initiator_fn. - */ +/** Implements vdo_admin_initiator_fn. */ static void initiate_drain(struct admin_state *state) { check_for_drain_complete(container_of(state, struct hash_zone, state)); } -/** - * drain_hash_zone() - Drain a hash zone. - * - * Implements vdo_zone_action_fn. - */ +/** Implements vdo_zone_action_fn. */ static void drain_hash_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) { @@ -2572,6 +2568,8 @@ static void launch_dedupe_state_change(struct hash_zones *zones) /** * resume_index() - Resume the UDS index prior to resuming hash zones. + * @context: Not used. + * @parent: The completion for the resume operation. * * Implements vdo_action_preamble_fn */ @@ -2602,11 +2600,7 @@ static void resume_index(void *context, struct vdo_completion *parent) vdo_finish_completion(parent); } -/** - * resume_hash_zone() - Resume a hash zone. - * - * Implements vdo_zone_action_fn. - */ +/** Implements vdo_zone_action_fn. */ static void resume_hash_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) { @@ -2634,7 +2628,7 @@ void vdo_resume_hash_zones(struct hash_zones *zones, struct vdo_completion *pare /** * get_hash_zone_statistics() - Add the statistics for this hash zone to the tally for all zones. * @zone: The hash zone to query. - * @tally: The tally + * @tally: The tally. */ static void get_hash_zone_statistics(const struct hash_zone *zone, struct hash_lock_statistics *tally) @@ -2680,8 +2674,8 @@ static void get_index_statistics(struct hash_zones *zones, /** * vdo_get_dedupe_statistics() - Tally the statistics from all the hash zones and the UDS index. - * @zones: The hash zones to query - * @stats: A structure to store the statistics + * @zones: The hash zones to query. + * @stats: A structure to store the statistics. * * Return: The sum of the hash lock statistics from all hash zones plus the statistics from the UDS * index @@ -2856,9 +2850,9 @@ void vdo_set_dedupe_index_min_timer_interval(unsigned int value) /** * acquire_context() - Acquire a dedupe context from a hash_zone if any are available. 
- * @zone: the hash zone + * @zone: The hash zone. * - * Return: A dedupe_context or NULL if none are available + * Return: A dedupe_context or NULL if none are available. */ static struct dedupe_context * __must_check acquire_context(struct hash_zone *zone) { diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index 0e04c2021682..6af40d40f255 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -1144,6 +1144,7 @@ static bool vdo_uses_device(struct vdo *vdo, const void *context) /** * get_thread_id_for_phase() - Get the thread id for the current phase of the admin operation in * progress. + * @vdo: The vdo. */ static thread_id_t __must_check get_thread_id_for_phase(struct vdo *vdo) { @@ -1188,9 +1189,9 @@ static struct vdo_completion *prepare_admin_completion(struct vdo *vdo, /** * advance_phase() - Increment the phase of the current admin operation and prepare the admin * completion to run on the thread for the next phase. - * @vdo: The on which an admin operation is being performed + * @vdo: The vdo on which an admin operation is being performed. * - * Return: The current phase + * Return: The current phase. */ static u32 advance_phase(struct vdo *vdo) { diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index b7cc0f41caca..dd59691be840 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -432,7 +432,10 @@ static void encode_block_map_state_2_0(u8 *buffer, size_t *offset, /** * vdo_compute_new_forest_pages() - Compute the number of pages which must be allocated at each * level in order to grow the forest to a new number of entries. + * @root_count: The number of block map roots. + * @old_sizes: The sizes of the old tree segments. * @entries: The new number of entries the block map must address. + * @new_sizes: The sizes of the new tree segments. * * Return: The total number of non-leaf pages required. */ @@ -462,6 +465,9 @@ block_count_t vdo_compute_new_forest_pages(root_count_t root_count, /** * encode_recovery_journal_state_7_0() - Encode the state of a recovery journal. + * @buffer: A buffer to store the encoding. + * @offset: The offset in the buffer at which to encode. + * @state: The recovery journal state to encode. * * Return: VDO_SUCCESS or an error code. */ @@ -484,6 +490,7 @@ static void encode_recovery_journal_state_7_0(u8 *buffer, size_t *offset, /** * decode_recovery_journal_state_7_0() - Decode the state of a recovery journal saved in a buffer. * @buffer: The buffer containing the saved state. + * @offset: The offset to start decoding from. * @state: A pointer to a recovery journal state to hold the result of a successful decode. * * Return: VDO_SUCCESS or an error code. @@ -544,6 +551,9 @@ const char *vdo_get_journal_operation_name(enum journal_operation operation) /** * encode_slab_depot_state_2_0() - Encode the state of a slab depot into a buffer. + * @buffer: A buffer to store the encoding. + * @offset: The offset in the buffer at which to encode. + * @state: The slab depot state to encode. */ static void encode_slab_depot_state_2_0(u8 *buffer, size_t *offset, struct slab_depot_state_2_0 state) @@ -570,6 +580,9 @@ static void encode_slab_depot_state_2_0(u8 *buffer, size_t *offset, /** * decode_slab_depot_state_2_0() - Decode slab depot component state version 2.0 from a buffer. + * @buffer: The buffer being decoded. + * @offset: The offset to start decoding from. + * @state: A pointer to a slab depot state to hold the decoded result. 
* * Return: VDO_SUCCESS or an error code. */ @@ -1156,6 +1169,9 @@ static struct vdo_component unpack_vdo_component_41_0(struct packed_vdo_componen /** * decode_vdo_component() - Decode the component data for the vdo itself out of the super block. + * @buffer: The buffer being decoded. + * @offset: The offset to start decoding from. + * @component: The vdo component structure to decode into. * * Return: VDO_SUCCESS or an error. */ @@ -1290,7 +1306,7 @@ void vdo_destroy_component_states(struct vdo_component_states *states) * understand. * @buffer: The buffer being decoded. * @offset: The offset to start decoding from. - * @geometry: The vdo geometry + * @geometry: The vdo geometry. * @states: An object to hold the successfully decoded state. * * Return: VDO_SUCCESS or an error. @@ -1329,7 +1345,7 @@ static int __must_check decode_components(u8 *buffer, size_t *offset, /** * vdo_decode_component_states() - Decode the payload of a super block. * @buffer: The buffer containing the encoded super block contents. - * @geometry: The vdo geometry + * @geometry: The vdo geometry. * @states: A pointer to hold the decoded states. * * Return: VDO_SUCCESS or an error. @@ -1383,6 +1399,9 @@ int vdo_validate_component_states(struct vdo_component_states *states, /** * vdo_encode_component_states() - Encode the state of all vdo components in the super block. + * @buffer: A buffer to store the encoding. + * @offset: The offset into the buffer to start the encoding. + * @states: The component states to encode. */ static void vdo_encode_component_states(u8 *buffer, size_t *offset, const struct vdo_component_states *states) @@ -1402,6 +1421,8 @@ static void vdo_encode_component_states(u8 *buffer, size_t *offset, /** * vdo_encode_super_block() - Encode a super block into its on-disk representation. + * @buffer: A buffer to store the encoding. + * @states: The component states to encode. */ void vdo_encode_super_block(u8 *buffer, struct vdo_component_states *states) { @@ -1426,6 +1447,7 @@ void vdo_encode_super_block(u8 *buffer, struct vdo_component_states *states) /** * vdo_decode_super_block() - Decode a super block from its on-disk representation. + * @buffer: The buffer to decode from. */ int vdo_decode_super_block(u8 *buffer) { diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c index dd4fdee2ca0c..82a259ef1601 100644 --- a/drivers/md/dm-vdo/flush.c +++ b/drivers/md/dm-vdo/flush.c @@ -522,11 +522,7 @@ static void vdo_complete_flush(struct vdo_flush *flush) vdo_enqueue_completion(completion, BIO_Q_FLUSH_PRIORITY); } -/** - * initiate_drain() - Initiate a drain. - * - * Implements vdo_admin_initiator_fn. - */ +/** Implements vdo_admin_initiator_fn. */ static void initiate_drain(struct admin_state *state) { check_for_drain_complete(container_of(state, struct flusher, state)); diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c index 0613c82bbe8e..8a79b33b8b09 100644 --- a/drivers/md/dm-vdo/funnel-workqueue.c +++ b/drivers/md/dm-vdo/funnel-workqueue.c @@ -372,6 +372,13 @@ static int make_simple_work_queue(const char *thread_name_prefix, const char *na /** * vdo_make_work_queue() - Create a work queue; if multiple threads are requested, completions will * be distributed to them in round-robin fashion. + * @thread_name_prefix: A prefix for the thread names to identify them as a vdo thread. + * @name: A base name to identify this queue. + * @owner: The vdo_thread structure to manage this queue. + * @type: The type of queue to create. 
+ * @thread_count: The number of actual threads handling this queue. + * @thread_privates: An array of private contexts, one for each thread; may be NULL. + * @queue_ptr: A pointer to return the new work queue. * * Each queue is associated with a struct vdo_thread which has a single vdo thread id. Regardless * of the actual number of queues and threads allocated here, code outside of the queue diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index 11d47770b54d..e26d75f8366d 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -118,6 +118,7 @@ static void send_bio_to_device(struct vio *vio, struct bio *bio) /** * vdo_submit_vio() - Submits a vio's bio to the underlying block device. May block if the device * is busy. This callback should be used by vios which did not attempt to merge. + * @completion: The vio to submit. */ void vdo_submit_vio(struct vdo_completion *completion) { @@ -133,7 +134,7 @@ void vdo_submit_vio(struct vdo_completion *completion) * The list will always contain at least one entry (the bio for the vio on which it is called), but * other bios may have been merged with it as well. * - * Return: bio The head of the bio list to submit. + * Return: The head of the bio list to submit. */ static struct bio *get_bio_list(struct vio *vio) { @@ -158,6 +159,7 @@ static struct bio *get_bio_list(struct vio *vio) /** * submit_data_vio() - Submit a data_vio's bio to the storage below along with * any bios that have been merged with it. + * @completion: The vio to submit. * * Context: This call may block and so should only be called from a bio thread. */ @@ -184,7 +186,7 @@ static void submit_data_vio(struct vdo_completion *completion) * There are two types of merging possible, forward and backward, which are distinguished by a flag * that uses kernel elevator terminology. * - * Return: the vio to merge to, NULL if no merging is possible. + * Return: The vio to merge to, NULL if no merging is possible. */ static struct vio *get_mergeable_locked(struct int_map *map, struct vio *vio, bool back_merge) @@ -262,7 +264,7 @@ static int merge_to_next_head(struct int_map *bio_map, struct vio *vio, * * Currently this is only used for data_vios, but is broken out for future use with metadata vios. * - * Return: whether or not the vio was merged. + * Return: Whether or not the vio was merged. */ static bool try_bio_map_merge(struct vio *vio) { @@ -306,7 +308,7 @@ static bool try_bio_map_merge(struct vio *vio) /** * vdo_submit_data_vio() - Submit I/O for a data_vio. - * @data_vio: the data_vio for which to issue I/O. + * @data_vio: The data_vio for which to issue I/O. * * If possible, this I/O will be merged other pending I/Os. Otherwise, the data_vio will be sent to * the appropriate bio zone directly. @@ -321,13 +323,13 @@ void vdo_submit_data_vio(struct data_vio *data_vio) /** * __submit_metadata_vio() - Submit I/O for a metadata vio. - * @vio: the vio for which to issue I/O - * @physical: the physical block number to read or write - * @callback: the bio endio function which will be called after the I/O completes - * @error_handler: the handler for submission or I/O errors (may be NULL) - * @operation: the type of I/O to perform - * @data: the buffer to read or write (may be NULL) - * @size: the I/O amount in bytes + * @vio: The vio for which to issue I/O. + * @physical: The physical block number to read or write. + * @callback: The bio endio function which will be called after the I/O completes. 
+ * @error_handler: The handler for submission or I/O errors; may be NULL. + * @operation: The type of I/O to perform. + * @data: The buffer to read or write; may be NULL. + * @size: The I/O amount in bytes. * * The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block * other vdo threads. @@ -441,7 +443,7 @@ int vdo_make_io_submitter(unsigned int thread_count, unsigned int rotation_inter /** * vdo_cleanup_io_submitter() - Tear down the io_submitter fields as needed for a physical layer. - * @io_submitter: The I/O submitter data to tear down (may be NULL). + * @io_submitter: The I/O submitter data to tear down; may be NULL. */ void vdo_cleanup_io_submitter(struct io_submitter *io_submitter) { diff --git a/drivers/md/dm-vdo/logical-zone.c b/drivers/md/dm-vdo/logical-zone.c index 026f031ffc9e..0a27e60a9dfd 100644 --- a/drivers/md/dm-vdo/logical-zone.c +++ b/drivers/md/dm-vdo/logical-zone.c @@ -159,21 +159,13 @@ static void check_for_drain_complete(struct logical_zone *zone) vdo_finish_draining(&zone->state); } -/** - * initiate_drain() - Initiate a drain. - * - * Implements vdo_admin_initiator_fn. - */ +/** Implements vdo_admin_initiator_fn. */ static void initiate_drain(struct admin_state *state) { check_for_drain_complete(container_of(state, struct logical_zone, state)); } -/** - * drain_logical_zone() - Drain a logical zone. - * - * Implements vdo_zone_action_fn. - */ +/** Implements vdo_zone_action_fn. */ static void drain_logical_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) { @@ -192,11 +184,7 @@ void vdo_drain_logical_zones(struct logical_zones *zones, parent); } -/** - * resume_logical_zone() - Resume a logical zone. - * - * Implements vdo_zone_action_fn. - */ +/** Implements vdo_zone_action_fn. */ static void resume_logical_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) { @@ -356,7 +344,7 @@ struct physical_zone *vdo_get_next_allocation_zone(struct logical_zone *zone) /** * vdo_dump_logical_zone() - Dump information about a logical zone to the log for debugging. - * @zone: The zone to dump + * @zone: The zone to dump. * * Context: the information is dumped in a thread-unsafe fashion. * diff --git a/drivers/md/dm-vdo/packer.c b/drivers/md/dm-vdo/packer.c index f70f5edabc10..666be6d557e1 100644 --- a/drivers/md/dm-vdo/packer.c +++ b/drivers/md/dm-vdo/packer.c @@ -35,10 +35,10 @@ static const struct version_number COMPRESSED_BLOCK_1_0 = { /** * vdo_get_compressed_block_fragment() - Get a reference to a compressed fragment from a compressed * block. - * @mapping_state [in] The mapping state for the look up. - * @compressed_block [in] The compressed block that was read from disk. - * @fragment_offset [out] The offset of the fragment within a compressed block. - * @fragment_size [out] The size of the fragment. + * @mapping_state: The mapping state describing the fragment. + * @block: The compressed block that was read from disk. + * @fragment_offset: The offset of the fragment within the compressed block. + * @fragment_size: The size of the fragment. * * Return: If a valid compressed fragment is found, VDO_SUCCESS; otherwise, VDO_INVALID_FRAGMENT if * the fragment is invalid. @@ -382,6 +382,7 @@ static void initialize_compressed_block(struct compressed_block *block, u16 size * @compression: The agent's compression_state to pack in to. * @data_vio: The data_vio to pack. * @offset: The offset into the compressed block at which to pack the fragment. 
+ * @slot: The slot number in the compressed block. * @block: The compressed block which will be written out when batch is fully packed. * * Return: The new amount of space used. @@ -705,11 +706,7 @@ void vdo_increment_packer_flush_generation(struct packer *packer) vdo_flush_packer(packer); } -/** - * initiate_drain() - Initiate a drain. - * - * Implements vdo_admin_initiator_fn. - */ +/** Implements vdo_admin_initiator_fn. */ static void initiate_drain(struct admin_state *state) { struct packer *packer = container_of(state, struct packer, state); diff --git a/drivers/md/dm-vdo/physical-zone.c b/drivers/md/dm-vdo/physical-zone.c index a43b5c45fab7..686eb7d714e6 100644 --- a/drivers/md/dm-vdo/physical-zone.c +++ b/drivers/md/dm-vdo/physical-zone.c @@ -60,7 +60,7 @@ static inline bool has_lock_type(const struct pbn_lock *lock, enum pbn_lock_type * vdo_is_pbn_read_lock() - Check whether a pbn_lock is a read lock. * @lock: The lock to check. * - * Return: true if the lock is a read lock. + * Return: True if the lock is a read lock. */ bool vdo_is_pbn_read_lock(const struct pbn_lock *lock) { @@ -75,6 +75,7 @@ static inline void set_pbn_lock_type(struct pbn_lock *lock, enum pbn_lock_type t /** * vdo_downgrade_pbn_write_lock() - Downgrade a PBN write lock to a PBN read lock. * @lock: The PBN write lock to downgrade. + * @compressed_write: True if the written block was a compressed block. * * The lock holder count is cleared and the caller is responsible for setting the new count. */ @@ -582,7 +583,7 @@ static bool continue_allocating(struct data_vio *data_vio) * that fails try the next if possible. * @data_vio: The data_vio needing an allocation. * - * Return: true if a block was allocated, if not the data_vio will have been dispatched so the + * Return: True if a block was allocated, if not the data_vio will have been dispatched so the * caller must not touch it. */ bool vdo_allocate_block_in_zone(struct data_vio *data_vio) diff --git a/drivers/md/dm-vdo/recovery-journal.c b/drivers/md/dm-vdo/recovery-journal.c index de58184f538f..9cc0f0ff1664 100644 --- a/drivers/md/dm-vdo/recovery-journal.c +++ b/drivers/md/dm-vdo/recovery-journal.c @@ -109,7 +109,7 @@ static atomic_t *get_decrement_counter(struct recovery_journal *journal, * @journal: The recovery journal. * @lock_number: The lock to check. * - * Return: true if the journal zone is locked. + * Return: True if the journal zone is locked. */ static bool is_journal_zone_locked(struct recovery_journal *journal, block_count_t lock_number) @@ -217,7 +217,7 @@ static struct recovery_journal_block * __must_check pop_free_list(struct recover * Indicates it has any uncommitted entries, which includes both entries not written and entries * written but not yet acknowledged. * - * Return: true if the block has any uncommitted entries. + * Return: True if the block has any uncommitted entries. */ static inline bool __must_check is_block_dirty(const struct recovery_journal_block *block) { @@ -228,7 +228,7 @@ static inline bool __must_check is_block_dirty(const struct recovery_journal_blo * is_block_empty() - Check whether a journal block is empty. * @block: The block to check. * - * Return: true if the block has no entries. + * Return: True if the block has no entries. */ static inline bool __must_check is_block_empty(const struct recovery_journal_block *block) { @@ -239,7 +239,7 @@ static inline bool __must_check is_block_empty(const struct recovery_journal_blo * is_block_full() - Check whether a journal block is full. * @block: The block to check. 
* - * Return: true if the block is full. + * Return: True if the block is full. */ static inline bool __must_check is_block_full(const struct recovery_journal_block *block) { @@ -260,6 +260,8 @@ static void assert_on_journal_thread(struct recovery_journal *journal, /** * continue_waiter() - Release a data_vio from the journal. + * @waiter: The data_vio waiting on journal activity. + * @context: The result of the journal operation. * * Invoked whenever a data_vio is to be released from the journal, either because its entry was * committed to disk, or because there was an error. Implements waiter_callback_fn. @@ -273,7 +275,7 @@ static void continue_waiter(struct vdo_waiter *waiter, void *context) * has_block_waiters() - Check whether the journal has any waiters on any blocks. * @journal: The journal in question. * - * Return: true if any block has a waiter. + * Return: True if any block has a waiter. */ static inline bool has_block_waiters(struct recovery_journal *journal) { @@ -296,7 +298,7 @@ static void notify_commit_waiters(struct recovery_journal *journal); * suspend_lock_counter() - Prevent the lock counter from notifying. * @counter: The counter. * - * Return: true if the lock counter was not notifying and hence the suspend was efficacious. + * Return: True if the lock counter was not notifying and hence the suspend was efficacious. */ static bool suspend_lock_counter(struct lock_counter *counter) { @@ -416,7 +418,7 @@ sequence_number_t vdo_get_recovery_journal_current_sequence_number(struct recove * * The head is the lowest sequence number of the block map head and the slab journal head. * - * Return: the head of the journal. + * Return: The head of the journal. */ static inline sequence_number_t get_recovery_journal_head(const struct recovery_journal *journal) { @@ -535,7 +537,7 @@ static void initialize_journal_state(struct recovery_journal *journal) * vdo_get_recovery_journal_length() - Get the number of usable recovery journal blocks. * @journal_size: The size of the recovery journal in blocks. * - * Return: the number of recovery journal blocks usable for entries. + * Return: The number of recovery journal blocks usable for entries. */ block_count_t vdo_get_recovery_journal_length(block_count_t journal_size) { @@ -1078,6 +1080,8 @@ static void update_usages(struct recovery_journal *journal, struct data_vio *dat /** * assign_entry() - Assign an entry waiter to the active block. + * @waiter: The data_vio. + * @context: The recovery journal block. * * Implements waiter_callback_fn. */ @@ -1165,6 +1169,8 @@ static void recycle_journal_block(struct recovery_journal_block *block) /** * continue_committed_waiter() - invoked whenever a VIO is to be released from the journal because * its entry was committed to disk. + * @waiter: The data_vio waiting on a journal write. + * @context: A pointer to the recovery journal. * * Implements waiter_callback_fn. */ @@ -1362,6 +1368,8 @@ static void add_queued_recovery_entries(struct recovery_journal_block *block) /** * write_block() - Issue a block for writing. + * @waiter: The recovery journal block to write. + * @context: Not used. * * Implements waiter_callback_fn. */ @@ -1611,11 +1619,7 @@ void vdo_release_journal_entry_lock(struct recovery_journal *journal, smp_mb__after_atomic(); } -/** - * initiate_drain() - Initiate a drain. - * - * Implements vdo_admin_initiator_fn. - */ +/** Implements vdo_admin_initiator_fn. 
*/ static void initiate_drain(struct admin_state *state) { check_for_drain_complete(container_of(state, struct recovery_journal, state)); diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index f3d80ff7bef5..034ecaa51f48 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -40,7 +40,7 @@ static const bool NORMAL_OPERATION = true; /** * get_lock() - Get the lock object for a slab journal block by sequence number. - * @journal: vdo_slab journal to retrieve from. + * @journal: The vdo_slab journal to retrieve from. * @sequence_number: Sequence number of the block. * * Return: The lock object for the given sequence number. @@ -110,7 +110,7 @@ static void initialize_journal_state(struct slab_journal *journal) * block_is_full() - Check whether a journal block is full. * @journal: The slab journal for the block. * - * Return: true if the tail block is full. + * Return: True if the tail block is full. */ static bool __must_check block_is_full(struct slab_journal *journal) { @@ -127,10 +127,11 @@ static void release_journal_locks(struct vdo_waiter *waiter, void *context); /** * is_slab_journal_blank() - Check whether a slab's journal is blank. + * @slab: The slab to check. * * A slab journal is blank if it has never had any entries recorded in it. * - * Return: true if the slab's journal has never been modified. + * Return: True if the slab's journal has never been modified. */ static bool is_slab_journal_blank(const struct vdo_slab *slab) { @@ -227,6 +228,7 @@ static u8 __must_check compute_fullness_hint(struct slab_depot *depot, /** * check_summary_drain_complete() - Check whether an allocators summary has finished draining. + * @allocator: The allocator to check. */ static void check_summary_drain_complete(struct block_allocator *allocator) { @@ -349,7 +351,7 @@ static void launch_write(struct slab_summary_block *block) /** * update_slab_summary_entry() - Update the entry for a slab. - * @slab: The slab whose entry is to be updated + * @slab: The slab whose entry is to be updated. * @waiter: The waiter that is updating the summary. * @tail_block_offset: The offset of the slab journal's tail block. * @load_ref_counts: Whether the reference counts must be loaded from disk on the vdo load. @@ -654,6 +656,7 @@ static void update_tail_block_location(struct slab_journal *journal) /** * reopen_slab_journal() - Reopen a slab's journal by emptying it and then adding pending entries. + * @slab: The slab to reopen. */ static void reopen_slab_journal(struct vdo_slab *slab) { @@ -839,8 +842,6 @@ static void commit_tail(struct slab_journal *journal) * @sbn: The slab block number of the entry to encode. * @operation: The type of the entry. * @increment: True if this is an increment. - * - * Exposed for unit tests. */ static void encode_slab_journal_entry(struct slab_journal_block_header *tail_header, slab_journal_payload *payload, @@ -951,7 +952,7 @@ static inline block_count_t journal_length(const struct slab_journal *journal) * @parent: The completion to notify when there is space to add the entry if the entry could not be * added immediately. * - * Return: true if the entry was added immediately. + * Return: True if the entry was added immediately. 
*/ bool vdo_attempt_replay_into_slab(struct vdo_slab *slab, physical_block_number_t pbn, enum journal_operation operation, bool increment, @@ -1003,7 +1004,7 @@ bool vdo_attempt_replay_into_slab(struct vdo_slab *slab, physical_block_number_t * requires_reaping() - Check whether the journal must be reaped before adding new entries. * @journal: The journal to check. * - * Return: true if the journal must be reaped. + * Return: True if the journal must be reaped. */ static bool requires_reaping(const struct slab_journal *journal) { @@ -1275,6 +1276,8 @@ static void dirty_block(struct reference_block *block) /** * get_reference_block() - Get the reference block that covers the given block index. + * @slab: The slab containing the references. + * @index: The index of the physical block. */ static struct reference_block * __must_check get_reference_block(struct vdo_slab *slab, slab_block_number index) @@ -1379,7 +1382,8 @@ static void prioritize_slab(struct vdo_slab *slab) /** * adjust_free_block_count() - Adjust the free block count and (if needed) reprioritize the slab. - * @incremented: true if the free block count went up. + * @slab: The slab. + * @incremented: True if the free block count went up. */ static void adjust_free_block_count(struct vdo_slab *slab, bool incremented) { @@ -1885,6 +1889,7 @@ static void add_entries(struct slab_journal *journal) /** * reset_search_cursor() - Reset the free block search back to the first reference counter in the * first reference block of a slab. + * @slab: The slab. */ static void reset_search_cursor(struct vdo_slab *slab) { @@ -1892,17 +1897,17 @@ static void reset_search_cursor(struct vdo_slab *slab) cursor->block = cursor->first_block; cursor->index = 0; - /* Unit tests have slabs with only one reference block (and it's a runt). */ cursor->end_index = min_t(u32, COUNTS_PER_BLOCK, slab->block_count); } /** * advance_search_cursor() - Advance the search cursor to the start of the next reference block in - * a slab, + * a slab. + * @slab: The slab. * * Wraps around to the first reference block if the current block is the last reference block. * - * Return: true unless the cursor was at the last reference block. + * Return: True unless the cursor was at the last reference block. */ static bool advance_search_cursor(struct vdo_slab *slab) { @@ -1933,6 +1938,9 @@ static bool advance_search_cursor(struct vdo_slab *slab) /** * vdo_adjust_reference_count_for_rebuild() - Adjust the reference count of a block during rebuild. + * @depot: The slab depot. + * @pbn: The physical block number to adjust. + * @operation: The type of operation. * * Return: VDO_SUCCESS or an error. */ @@ -2038,9 +2046,7 @@ static inline slab_block_number find_zero_byte_in_word(const u8 *word_ptr, * @slab: The slab counters to scan. * @index_ptr: A pointer to hold the array index of the free block. * - * Exposed for unit testing. - * - * Return: true if a free block was found in the specified range. + * Return: True if a free block was found in the specified range. */ static bool find_free_block(const struct vdo_slab *slab, slab_block_number *index_ptr) { @@ -2097,7 +2103,7 @@ static bool find_free_block(const struct vdo_slab *slab, slab_block_number *inde * @slab: The slab to search. * @free_index_ptr: A pointer to receive the array index of the zero reference count. * - * Return: true if an unreferenced counter was found. 
*/ static bool search_current_reference_block(const struct vdo_slab *slab, slab_block_number *free_index_ptr) @@ -2116,7 +2122,7 @@ static bool search_current_reference_block(const struct vdo_slab *slab, * counter index saved in the search cursor and searching up to the end of the last reference * block. The search does not wrap. * - * Return: true if an unreferenced counter was found. + * Return: True if an unreferenced counter was found. */ static bool search_reference_blocks(struct vdo_slab *slab, slab_block_number *free_index_ptr) @@ -2136,6 +2142,8 @@ static bool search_reference_blocks(struct vdo_slab *slab, /** * make_provisional_reference() - Do the bookkeeping for making a provisional reference. + * @slab: The slab. + * @block_number: The index for the physical block to reference. */ static void make_provisional_reference(struct vdo_slab *slab, slab_block_number block_number) @@ -2155,6 +2163,7 @@ static void make_provisional_reference(struct vdo_slab *slab, /** * dirty_all_reference_blocks() - Mark all reference count blocks in a slab as dirty. + * @slab: The slab. */ static void dirty_all_reference_blocks(struct vdo_slab *slab) { @@ -2173,10 +2182,10 @@ static inline bool journal_points_equal(struct journal_point first, /** * match_bytes() - Check an 8-byte word for bytes matching the value specified - * @input: A word to examine the bytes of - * @match: The byte value sought + * @input: A word to examine the bytes of. + * @match: The byte value sought. * - * Return: 1 in each byte when the corresponding input byte matched, 0 otherwise + * Return: 1 in each byte when the corresponding input byte matched, 0 otherwise. */ static inline u64 match_bytes(u64 input, u8 match) { @@ -2191,12 +2200,12 @@ static inline u64 match_bytes(u64 input, u8 match) /** * count_valid_references() - Process a newly loaded refcount array - * @counters: the array of counters from a metadata block + * @counters: The array of counters from a metadata block. * - * Scan a 8-byte-aligned array of counters, fixing up any "provisional" values that weren't - * cleaned up at shutdown, changing them internally to "empty". + * Scan an 8-byte-aligned array of counters, fixing up any provisional values that + * weren't cleaned up at shutdown, changing them internally to zero. * - * Return: the number of blocks that are referenced (counters not "empty") + * Return: The number of blocks with a non-zero reference count. */ static unsigned int count_valid_references(vdo_refcount_t *counters) { @@ -2351,6 +2360,7 @@ static void load_reference_block_group(struct vdo_waiter *waiter, void *context) /** * load_reference_blocks() - Load a slab's reference blocks from the underlying storage into a * pre-allocated reference counter. + * @slab: The slab. */ static void load_reference_blocks(struct vdo_slab *slab) { @@ -2375,6 +2385,7 @@ static void load_reference_blocks(struct vdo_slab *slab) /** * drain_slab() - Drain all reference count I/O. + * @slab: The slab. * * Depending upon the type of drain being performed (as recorded in the ref_count's vdo_slab), the * reference blocks may be loaded from disk or dirty reference blocks may be written out. @@ -2564,6 +2575,7 @@ static void read_slab_journal_tail(struct vdo_waiter *waiter, void *context) /** * load_slab_journal() - Load a slab's journal by reading the journal's tail. + * @slab: The slab. 
*/ static void load_slab_journal(struct vdo_slab *slab) { @@ -2663,11 +2675,7 @@ static void queue_slab(struct vdo_slab *slab) prioritize_slab(slab); } -/** - * initiate_slab_action() - Initiate a slab action. - * - * Implements vdo_admin_initiator_fn. - */ +/** Implements vdo_admin_initiator_fn. */ static void initiate_slab_action(struct admin_state *state) { struct vdo_slab *slab = container_of(state, struct vdo_slab, state); @@ -2720,7 +2728,7 @@ static struct vdo_slab *get_next_slab(struct slab_scrubber *scrubber) * has_slabs_to_scrub() - Check whether a scrubber has slabs to scrub. * @scrubber: The scrubber to check. * - * Return: true if the scrubber has slabs to scrub. + * Return: True if the scrubber has slabs to scrub. */ static inline bool __must_check has_slabs_to_scrub(struct slab_scrubber *scrubber) { @@ -2741,6 +2749,7 @@ static void uninitialize_scrubber_vio(struct slab_scrubber *scrubber) * finish_scrubbing() - Stop scrubbing, either because there are no more slabs to scrub or because * there's been an error. * @scrubber: The scrubber. + * @result: The result of the scrubbing operation. */ static void finish_scrubbing(struct slab_scrubber *scrubber, int result) { @@ -3132,11 +3141,13 @@ static struct vdo_slab *next_slab(struct slab_iterator *iterator) /** * abort_waiter() - Abort vios waiting to make journal entries when read-only. + * @waiter: A waiting data_vio. + * @context: Not used. * * This callback is invoked on all vios waiting to make slab journal entries after the VDO has gone * into read-only mode. Implements waiter_callback_fn. */ -static void abort_waiter(struct vdo_waiter *waiter, void *context __always_unused) +static void abort_waiter(struct vdo_waiter *waiter, void __always_unused *context) { struct reference_updater *updater = container_of(waiter, struct reference_updater, waiter); @@ -3536,7 +3547,7 @@ static void initiate_load(struct admin_state *state) /** * vdo_notify_slab_journals_are_recovered() - Inform a block allocator that its slab journals have * been recovered from the recovery journal. - * @completion The allocator completion + * @completion: The allocator completion. */ void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion) { @@ -3775,7 +3786,7 @@ static int initialize_slab_journal(struct vdo_slab *slab) * in the slab. * @allocator: The block allocator to which the slab belongs. * @slab_number: The slab number of the slab. - * @is_new: true if this slab is being allocated as part of a resize. + * @is_new: True if this slab is being allocated as part of a resize. * @slab_ptr: A pointer to receive the new slab. * * Return: VDO_SUCCESS or an error code. @@ -3894,11 +3905,7 @@ void vdo_abandon_new_slabs(struct slab_depot *depot) vdo_free(vdo_forget(depot->new_slabs)); } -/** - * get_allocator_thread_id() - Get the ID of the thread on which a given allocator operates. - * - * Implements vdo_zone_thread_getter_fn. - */ +/** Implements vdo_zone_thread_getter_fn. */ static thread_id_t get_allocator_thread_id(void *context, zone_count_t zone_number) { return ((struct slab_depot *) context)->allocators[zone_number].thread_id; @@ -3911,7 +3918,7 @@ static thread_id_t get_allocator_thread_id(void *context, zone_count_t zone_numb * @recovery_lock: The sequence number of the recovery journal block whose locks should be * released. * - * Return: true if the journal does hold a lock on the specified block (which it will release). + * Return: True if the journal released a lock on the specified block. 
*/ static bool __must_check release_recovery_journal_lock(struct slab_journal *journal, sequence_number_t recovery_lock) @@ -3955,6 +3962,8 @@ static void release_tail_block_locks(void *context, zone_count_t zone_number, /** * prepare_for_tail_block_commit() - Prepare to commit oldest tail blocks. + * @context: The slab depot. + * @parent: The parent operation. * * Implements vdo_action_preamble_fn. */ @@ -3968,6 +3977,7 @@ static void prepare_for_tail_block_commit(void *context, struct vdo_completion * /** * schedule_tail_block_commit() - Schedule a tail block commit if necessary. + * @context: The slab depot. * * This method should not be called directly. Rather, call vdo_schedule_default_action() on the * depot's action manager. @@ -4361,6 +4371,7 @@ struct slab_depot_state_2_0 vdo_record_slab_depot(const struct slab_depot *depot /** * vdo_allocate_reference_counters() - Allocate the reference counters for all slabs in the depot. + * @depot: The slab depot. * * Context: This method may be called only before entering normal operation from the load thread. * @@ -4615,7 +4626,9 @@ static void load_summary_endio(struct bio *bio) } /** - * load_slab_summary() - The preamble of a load operation. + * load_slab_summary() - Load the slab summary before the slab data. + * @context: The slab depot. + * @parent: The load operation. * * Implements vdo_action_preamble_fn. */ @@ -4731,7 +4744,7 @@ void vdo_update_slab_depot_size(struct slab_depot *depot) * vdo_prepare_to_grow_slab_depot() - Allocate new memory needed for a resize of a slab depot to * the given size. * @depot: The depot to prepare to resize. - * @partition: The new depot partition + * @partition: The new depot partition. * * Return: VDO_SUCCESS or an error. */ @@ -4781,6 +4794,7 @@ int vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, /** * finish_registration() - Finish registering new slabs now that all of the allocators have * received their new slabs. + * @context: The slab depot. * * Implements vdo_action_conclusion_fn. */ diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index 80b608674022..09fd0628d18c 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -181,6 +181,8 @@ static void assign_thread_ids(struct thread_config *config, /** * initialize_thread_config() - Initialize the thread mapping + * @counts: The number and types of threads to create. + * @config: The thread_config to initialize. * * If the logical, physical, and hash zone counts are all 0, a single thread will be shared by all * three plus the packer and recovery journal. Otherwise, there must be at least one of each type, @@ -884,6 +886,7 @@ const struct admin_state_code *vdo_get_admin_state(const struct vdo *vdo) /** * record_vdo() - Record the state of the VDO for encoding in the super block. + * @vdo: The vdo. */ static void record_vdo(struct vdo *vdo) { @@ -1277,7 +1280,7 @@ void vdo_enter_read_only_mode(struct vdo *vdo, int error_code) * vdo_is_read_only() - Check whether the VDO is read-only. * @vdo: The vdo. * - * Return: true if the vdo is read-only. + * Return: True if the vdo is read-only. * * This method may be called from any thread, as opposed to examining the VDO's state field which * is only safe to check from the admin thread. @@ -1291,7 +1294,7 @@ bool vdo_is_read_only(struct vdo *vdo) * vdo_in_read_only_mode() - Check whether a vdo is in read-only mode. * @vdo: The vdo to query. * - * Return: true if the vdo is in read-only mode. + * Return: True if the vdo is in read-only mode. 
*/ bool vdo_in_read_only_mode(const struct vdo *vdo) { @@ -1302,7 +1305,7 @@ bool vdo_in_read_only_mode(const struct vdo *vdo) * vdo_in_recovery_mode() - Check whether the vdo is in recovery mode. * @vdo: The vdo to query. * - * Return: true if the vdo is in recovery mode. + * Return: True if the vdo is in recovery mode. */ bool vdo_in_recovery_mode(const struct vdo *vdo) { diff --git a/drivers/md/dm-vdo/vdo.h b/drivers/md/dm-vdo/vdo.h index 483ae873e002..1aaba73997b7 100644 --- a/drivers/md/dm-vdo/vdo.h +++ b/drivers/md/dm-vdo/vdo.h @@ -279,8 +279,10 @@ static inline bool vdo_uses_bio_ack_queue(struct vdo *vdo) /** * typedef vdo_filter_fn - Method type for vdo matching methods. + * @vdo: The vdo to match. + * @context: A parameter for the filter to use. * - * A filter function returns false if the vdo doesn't match. + * Return: True if the vdo matches the filter criteria, false if it doesn't. */ typedef bool (*vdo_filter_fn)(struct vdo *vdo, const void *context); diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index 8fc22fb14196..5ffc867d9c5e 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -398,8 +398,9 @@ void free_vio_pool(struct vio_pool *pool) /** * is_vio_pool_busy() - Check whether an vio pool has outstanding entries. + * @pool: The vio pool. * - * Return: true if the pool is busy. + * Return: True if the pool is busy. */ bool is_vio_pool_busy(struct vio_pool *pool) { diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h index 4bfcb21901f1..7a8a6819aec4 100644 --- a/drivers/md/dm-vdo/vio.h +++ b/drivers/md/dm-vdo/vio.h @@ -156,8 +156,7 @@ static inline enum vdo_completion_priority get_metadata_priority(struct vio *vio /** * continue_vio() - Enqueue a vio to run its next callback. * @vio: The vio to continue. - * - * Return: The result of the current operation. + * @result: The result of the current operation. */ static inline void continue_vio(struct vio *vio, int result) { @@ -172,6 +171,9 @@ void vdo_count_completed_bios(struct bio *bio); /** * continue_vio_after_io() - Continue a vio now that its I/O has returned. + * @vio: The vio to continue. + * @callback: The next operation for this vio. + * @thread: Which thread to run the next operation on. */ static inline void continue_vio_after_io(struct vio *vio, vdo_action_fn callback, thread_id_t thread) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 72047b47a7a0..c79de517afee 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -177,9 +177,11 @@ error: if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", v->data_dev->name, (unsigned long long)rsb, r); - else if (r > 0) + else if (r > 0) { DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", v->data_dev->name, (unsigned long long)rsb, r); + atomic64_inc(&v->fec->corrected); + } return r; } @@ -188,14 +190,13 @@ error: * Locate data block erasures using verity hashes. 
*/ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, - u8 *want_digest, u8 *data) + const u8 *want_digest, const u8 *data) { if (unlikely(verity_hash(v, io, data, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io)))) + io->tmp_digest))) return 0; - return memcmp(verity_io_real_digest(v, io), want_digest, - v->digest_size) != 0; + return memcmp(io->tmp_digest, want_digest, v->digest_size) != 0; } /* @@ -328,7 +329,7 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) if (fio->bufs[n]) continue; - fio->bufs[n] = mempool_alloc(&v->fec->extra_pool, GFP_NOWAIT); + fio->bufs[n] = kmem_cache_alloc(v->fec->cache, GFP_NOWAIT); /* we can manage with even one buffer if necessary */ if (unlikely(!fio->bufs[n])) break; @@ -362,7 +363,7 @@ static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) */ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, struct dm_verity_fec_io *fio, u64 rsb, u64 offset, - bool use_erasures) + const u8 *want_digest, bool use_erasures) { int r, neras = 0; unsigned int pos; @@ -388,12 +389,11 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, /* Always re-validate the corrected block against the expected hash */ r = verity_hash(v, io, fio->output, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io)); + io->tmp_digest); if (unlikely(r < 0)) return r; - if (memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), - v->digest_size)) { + if (memcmp(io->tmp_digest, want_digest, v->digest_size)) { DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)", v->data_dev->name, (unsigned long long)rsb, neras); return -EILSEQ; @@ -404,7 +404,8 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, /* Correct errors in a block. Copies corrected block to dest. */ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, - enum verity_block_type type, sector_t block, u8 *dest) + enum verity_block_type type, const u8 *want_digest, + sector_t block, u8 *dest) { int r; struct dm_verity_fec_io *fio = fec_io(io); @@ -413,10 +414,8 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (!verity_fec_is_enabled(v)) return -EOPNOTSUPP; - if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) { - DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name); + if (fio->level) return -EIO; - } fio->level++; @@ -447,9 +446,9 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, * them first. Do a second attempt with erasures if the corruption is * bad enough. 
*/ - r = fec_decode_rsb(v, io, fio, rsb, offset, false); + r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, false); if (r < 0) { - r = fec_decode_rsb(v, io, fio, rsb, offset, true); + r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, true); if (r < 0) goto done; } @@ -479,7 +478,8 @@ void verity_fec_finish_io(struct dm_verity_io *io) mempool_free(fio->bufs[n], &f->prealloc_pool); fec_for_each_extra_buffer(fio, n) - mempool_free(fio->bufs[n], &f->extra_pool); + if (fio->bufs[n]) + kmem_cache_free(f->cache, fio->bufs[n]); mempool_free(fio->output, &f->output_pool); } @@ -531,7 +531,6 @@ void verity_fec_dtr(struct dm_verity *v) mempool_exit(&f->rs_pool); mempool_exit(&f->prealloc_pool); - mempool_exit(&f->extra_pool); mempool_exit(&f->output_pool); kmem_cache_destroy(f->cache); @@ -784,12 +783,6 @@ int verity_fec_ctr(struct dm_verity *v) return ret; } - ret = mempool_init_slab_pool(&f->extra_pool, 0, f->cache); - if (ret) { - ti->error = "Cannot allocate FEC buffer extra pool"; - return ret; - } - /* Preallocate an output buffer for each thread */ ret = mempool_init_kmalloc_pool(&f->output_pool, num_online_cpus(), 1 << v->data_dev_block_bits); diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 09123a612953..5fd267873812 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -23,9 +23,6 @@ #define DM_VERITY_FEC_BUF_MAX \ (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) -/* maximum recursion level for verity_fec_decode */ -#define DM_VERITY_FEC_MAX_RECURSION 4 - #define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" #define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" #define DM_VERITY_OPT_FEC_START "fec_start" @@ -45,9 +42,9 @@ struct dm_verity_fec { unsigned char rsn; /* N of RS(M, N) */ mempool_t rs_pool; /* mempool for fio->rs */ mempool_t prealloc_pool; /* mempool for preallocated buffers */ - mempool_t extra_pool; /* mempool for extra buffers */ mempool_t output_pool; /* mempool for output */ struct kmem_cache *cache; /* cache for buffers */ + atomic64_t corrected; /* corrected errors */ }; /* per-bio data */ @@ -68,8 +65,8 @@ struct dm_verity_fec_io { extern bool verity_fec_is_enabled(struct dm_verity *v); extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, - enum verity_block_type type, sector_t block, - u8 *dest); + enum verity_block_type type, const u8 *want_digest, + sector_t block, u8 *dest); extern unsigned int verity_fec_status_table(struct dm_verity *v, unsigned int sz, char *result, unsigned int maxlen); @@ -99,6 +96,7 @@ static inline bool verity_fec_is_enabled(struct dm_verity *v) static inline int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, enum verity_block_type type, + const u8 *want_digest, sector_t block, u8 *dest) { return -EOPNOTSUPP; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 66a00a8ccb39..5c17472d7896 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -117,11 +117,25 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, int verity_hash(struct dm_verity *v, struct dm_verity_io *io, const u8 *data, size_t len, u8 *digest) { - struct shash_desc *desc = &io->hash_desc; + struct shash_desc *desc; int r; + if (likely(v->use_sha256_lib)) { + struct sha256_ctx *ctx = &io->hash_ctx.sha256; + + /* + * Fast path using SHA-256 library. This is enabled only for + * verity version 1, where the salt is at the beginning. 
+ */ + *ctx = *v->initial_hashstate.sha256; + sha256_update(ctx, data, len); + sha256_final(ctx, digest); + return 0; + } + + desc = &io->hash_ctx.shash; desc->tfm = v->shash_tfm; - if (unlikely(v->initial_hashstate == NULL)) { + if (unlikely(v->initial_hashstate.shash == NULL)) { /* Version 0: salt at end */ r = crypto_shash_init(desc) ?: crypto_shash_update(desc, data, len) ?: @@ -129,7 +143,7 @@ int verity_hash(struct dm_verity *v, struct dm_verity_io *io, crypto_shash_final(desc, digest); } else { /* Version 1: salt at beginning */ - r = crypto_shash_import(desc, v->initial_hashstate) ?: + r = crypto_shash_import(desc, v->initial_hashstate.shash) ?: crypto_shash_finup(desc, data, len, digest); } if (unlikely(r)) @@ -215,12 +229,12 @@ out: * Verify hash of a metadata block pertaining to the specified data block * ("block" argument) at a specified level ("level" argument). * - * On successful return, verity_io_want_digest(v, io) contains the hash value - * for a lower tree level or for the data block (if we're at the lowest level). + * On successful return, want_digest contains the hash value for a lower tree + * level or for the data block (if we're at the lowest level). * * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned. * If "skip_unverified" is false, unverified buffer is hashed and verified - * against current value of verity_io_want_digest(v, io). + * against current value of want_digest. */ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, sector_t block, int level, bool skip_unverified, @@ -259,7 +273,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, if (IS_ERR(data)) return r; if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, - hash_block, data) == 0) { + want_digest, hash_block, data) == 0) { aux = dm_bufio_get_aux_data(buf); aux->hash_verified = 1; goto release_ok; @@ -279,11 +293,11 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, } r = verity_hash(v, io, data, 1 << v->hash_dev_block_bits, - verity_io_real_digest(v, io)); + io->tmp_digest); if (unlikely(r < 0)) goto release_ret_r; - if (likely(memcmp(verity_io_real_digest(v, io), want_digest, + if (likely(memcmp(io->tmp_digest, want_digest, v->digest_size) == 0)) aux->hash_verified = 1; else if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { @@ -294,7 +308,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, r = -EAGAIN; goto release_ret_r; } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, - hash_block, data) == 0) + want_digest, hash_block, data) == 0) aux->hash_verified = 1; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, @@ -358,7 +372,8 @@ out: } static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, - sector_t cur_block, u8 *dest) + const u8 *want_digest, sector_t cur_block, + u8 *dest) { struct page *page; void *buffer; @@ -382,12 +397,11 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, goto free_ret; r = verity_hash(v, io, buffer, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io)); + io->tmp_digest); if (unlikely(r)) goto free_ret; - if (memcmp(verity_io_real_digest(v, io), - verity_io_want_digest(v, io), v->digest_size)) { + if (memcmp(io->tmp_digest, want_digest, v->digest_size)) { r = -EIO; goto free_ret; } @@ -402,9 +416,13 @@ free_ret: static int verity_handle_data_hash_mismatch(struct dm_verity *v, struct dm_verity_io *io, - struct bio *bio, 
sector_t blkno, - u8 *data) + struct bio *bio, + struct pending_block *block) { + const u8 *want_digest = block->want_digest; + sector_t blkno = block->blkno; + u8 *data = block->data; + if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { /* * Error handling code (FEC included) cannot be run in the @@ -412,14 +430,14 @@ static int verity_handle_data_hash_mismatch(struct dm_verity *v, */ return -EAGAIN; } - if (verity_recheck(v, io, blkno, data) == 0) { + if (verity_recheck(v, io, want_digest, blkno, data) == 0) { if (v->validated_blocks) set_bit(blkno, v->validated_blocks); return 0; } #if defined(CONFIG_DM_VERITY_FEC) - if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, blkno, - data) == 0) + if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, want_digest, + blkno, data) == 0) return 0; #endif if (bio->bi_status) @@ -433,6 +451,58 @@ static int verity_handle_data_hash_mismatch(struct dm_verity *v, return 0; } +static void verity_clear_pending_blocks(struct dm_verity_io *io) +{ + int i; + + for (i = io->num_pending - 1; i >= 0; i--) { + kunmap_local(io->pending_blocks[i].data); + io->pending_blocks[i].data = NULL; + } + io->num_pending = 0; +} + +static int verity_verify_pending_blocks(struct dm_verity *v, + struct dm_verity_io *io, + struct bio *bio) +{ + const unsigned int block_size = 1 << v->data_dev_block_bits; + int i, r; + + if (io->num_pending == 2) { + /* num_pending == 2 implies that the algorithm is SHA-256 */ + sha256_finup_2x(v->initial_hashstate.sha256, + io->pending_blocks[0].data, + io->pending_blocks[1].data, block_size, + io->pending_blocks[0].real_digest, + io->pending_blocks[1].real_digest); + } else { + for (i = 0; i < io->num_pending; i++) { + r = verity_hash(v, io, io->pending_blocks[i].data, + block_size, + io->pending_blocks[i].real_digest); + if (unlikely(r)) + return r; + } + } + + for (i = 0; i < io->num_pending; i++) { + struct pending_block *block = &io->pending_blocks[i]; + + if (likely(memcmp(block->real_digest, block->want_digest, + v->digest_size) == 0)) { + if (v->validated_blocks) + set_bit(block->blkno, v->validated_blocks); + } else { + r = verity_handle_data_hash_mismatch(v, io, bio, block); + if (unlikely(r)) + return r; + } + } + verity_clear_pending_blocks(io); + return 0; +} + /* * Verify one "dm_verity_io" structure. */ @@ -440,10 +510,14 @@ static int verity_verify_io(struct dm_verity_io *io) { struct dm_verity *v = io->v; const unsigned int block_size = 1 << v->data_dev_block_bits; + const int max_pending = v->use_sha256_finup_2x ? 
2 : 1; struct bvec_iter iter_copy; struct bvec_iter *iter; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); unsigned int b; + int r; + + io->num_pending = 0; if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { /* @@ -457,21 +531,22 @@ static int verity_verify_io(struct dm_verity_io *io) for (b = 0; b < io->n_blocks; b++, bio_advance_iter(bio, iter, block_size)) { - int r; - sector_t cur_block = io->block + b; + sector_t blkno = io->block + b; + struct pending_block *block; bool is_zero; struct bio_vec bv; void *data; if (v->validated_blocks && bio->bi_status == BLK_STS_OK && - likely(test_bit(cur_block, v->validated_blocks))) + likely(test_bit(blkno, v->validated_blocks))) continue; - r = verity_hash_for_block(v, io, cur_block, - verity_io_want_digest(v, io), + block = &io->pending_blocks[io->num_pending]; + + r = verity_hash_for_block(v, io, blkno, block->want_digest, &is_zero); if (unlikely(r < 0)) - return r; + goto error; bv = bio_iter_iovec(bio, *iter); if (unlikely(bv.bv_len < block_size)) { @@ -482,7 +557,8 @@ static int verity_verify_io(struct dm_verity_io *io) * data block size to be greater than PAGE_SIZE. */ DMERR_LIMIT("unaligned io (data block spans pages)"); - return -EIO; + r = -EIO; + goto error; } data = bvec_kmap_local(&bv); @@ -496,29 +572,26 @@ static int verity_verify_io(struct dm_verity_io *io) kunmap_local(data); continue; } - - r = verity_hash(v, io, data, block_size, - verity_io_real_digest(v, io)); - if (unlikely(r < 0)) { - kunmap_local(data); - return r; + block->data = data; + block->blkno = blkno; + if (++io->num_pending == max_pending) { + r = verity_verify_pending_blocks(v, io, bio); + if (unlikely(r)) + goto error; } + } - if (likely(memcmp(verity_io_real_digest(v, io), - verity_io_want_digest(v, io), v->digest_size) == 0)) { - if (v->validated_blocks) - set_bit(cur_block, v->validated_blocks); - kunmap_local(data); - continue; - } - r = verity_handle_data_hash_mismatch(v, io, bio, cur_block, - data); - kunmap_local(data); + if (io->num_pending) { + r = verity_verify_pending_blocks(v, io, bio); if (unlikely(r)) - return r; + goto error; } return 0; + +error: + verity_clear_pending_blocks(io); + return r; } /* @@ -775,6 +848,10 @@ static void verity_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: DMEMIT("%c", v->hash_failed ? 
'C' : 'V'); + if (verity_fec_is_enabled(v)) + DMEMIT(" %lld", atomic64_read(&v->fec->corrected)); + else + DMEMIT(" -"); break; case STATUSTYPE_TABLE: DMEMIT("%u %s %s %u %u %llu %llu %s ", @@ -1004,7 +1081,7 @@ static void verity_dtr(struct dm_target *ti) kvfree(v->validated_blocks); kfree(v->salt); - kfree(v->initial_hashstate); + kfree(v->initial_hashstate.shash); kfree(v->root_digest); kfree(v->zero_digest); verity_free_sig(v); @@ -1069,8 +1146,7 @@ static int verity_alloc_zero_digest(struct dm_verity *v) if (!v->zero_digest) return r; - io = kmalloc(sizeof(*io) + crypto_shash_descsize(v->shash_tfm), - GFP_KERNEL); + io = kmalloc(v->ti->per_io_data_size, GFP_KERNEL); if (!io) return r; /* verity_dtr will free zero_digest */ @@ -1252,11 +1328,26 @@ static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) } v->shash_tfm = shash; v->digest_size = crypto_shash_digestsize(shash); - DMINFO("%s using \"%s\"", alg_name, crypto_shash_driver_name(shash)); if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { ti->error = "Digest size too big"; return -EINVAL; } + if (likely(v->version && strcmp(alg_name, "sha256") == 0)) { + /* + * Fast path: use the library API for reduced overhead and + * interleaved hashing support. + */ + v->use_sha256_lib = true; + if (sha256_finup_2x_is_optimized()) + v->use_sha256_finup_2x = true; + ti->per_io_data_size = + offsetofend(struct dm_verity_io, hash_ctx.sha256); + } else { + /* Fallback case: use the generic crypto API. */ + ti->per_io_data_size = + offsetofend(struct dm_verity_io, hash_ctx.shash) + + crypto_shash_descsize(shash); + } return 0; } @@ -1277,7 +1368,18 @@ static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg) return -EINVAL; } } - if (v->version) { /* Version 1: salt at beginning */ + if (likely(v->use_sha256_lib)) { + /* Implies version 1: salt at beginning */ + v->initial_hashstate.sha256 = + kmalloc(sizeof(struct sha256_ctx), GFP_KERNEL); + if (!v->initial_hashstate.sha256) { + ti->error = "Cannot allocate initial hash state"; + return -ENOMEM; + } + sha256_init(v->initial_hashstate.sha256); + sha256_update(v->initial_hashstate.sha256, + v->salt, v->salt_size); + } else if (v->version) { /* Version 1: salt at beginning */ SHASH_DESC_ON_STACK(desc, v->shash_tfm); int r; @@ -1285,16 +1387,16 @@ static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg) * Compute the pre-salted hash state that can be passed to * crypto_shash_import() for each block later. 
*/ - v->initial_hashstate = kmalloc( + v->initial_hashstate.shash = kmalloc( crypto_shash_statesize(v->shash_tfm), GFP_KERNEL); - if (!v->initial_hashstate) { + if (!v->initial_hashstate.shash) { ti->error = "Cannot allocate initial hash state"; return -ENOMEM; } desc->tfm = v->shash_tfm; r = crypto_shash_init(desc) ?: crypto_shash_update(desc, v->salt, v->salt_size) ?: - crypto_shash_export(desc, v->initial_hashstate); + crypto_shash_export(desc, v->initial_hashstate.shash); if (r) { ti->error = "Cannot set up initial hash state"; return r; @@ -1556,9 +1658,6 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - ti->per_io_data_size = sizeof(struct dm_verity_io) + - crypto_shash_descsize(v->shash_tfm); - r = verity_fec_ctr(v); if (r) goto bad; @@ -1690,7 +1789,7 @@ static struct target_type verity_target = { .name = "verity", /* Note: the LSMs depend on the singleton and immutable features */ .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, - .version = {1, 12, 0}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 6d141abd965c..f975a9e5c5d6 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -16,6 +16,7 @@ #include <linux/device-mapper.h> #include <linux/interrupt.h> #include <crypto/hash.h> +#include <crypto/sha2.h> #define DM_VERITY_MAX_LEVELS 63 @@ -42,7 +43,10 @@ struct dm_verity { struct crypto_shash *shash_tfm; u8 *root_digest; /* digest of the root block */ u8 *salt; /* salt: its size is salt_size */ - u8 *initial_hashstate; /* salted initial state, if version >= 1 */ + union { + struct sha256_ctx *sha256; /* for use_sha256_lib=1 */ + u8 *shash; /* for use_sha256_lib=0 */ + } initial_hashstate; /* salted initial state, if version >= 1 */ u8 *zero_digest; /* digest for a zero block */ #ifdef CONFIG_SECURITY u8 *root_digest_sig; /* signature of the root digest */ @@ -59,6 +63,8 @@ struct dm_verity { unsigned char version; bool hash_failed:1; /* set if hash of any block failed */ bool use_bh_wq:1; /* try to verify in BH wq before normal work-queue */ + bool use_sha256_lib:1; /* use SHA-256 library instead of generic crypto API */ + bool use_sha256_finup_2x:1; /* use interleaved hashing optimization */ unsigned int digest_size; /* digest size for the current hash algorithm */ enum verity_mode mode; /* mode for handling verification errors */ enum verity_mode error_mode;/* mode for handling I/O errors */ @@ -78,6 +84,13 @@ struct dm_verity { mempool_t recheck_pool; }; +struct pending_block { + void *data; + sector_t blkno; + u8 want_digest[HASH_MAX_DIGESTSIZE]; + u8 real_digest[HASH_MAX_DIGESTSIZE]; +}; + struct dm_verity_io { struct dm_verity *v; @@ -94,28 +107,29 @@ struct dm_verity_io { struct work_struct work; struct work_struct bh_work; - u8 real_digest[HASH_MAX_DIGESTSIZE]; - u8 want_digest[HASH_MAX_DIGESTSIZE]; + u8 tmp_digest[HASH_MAX_DIGESTSIZE]; /* - * Temporary space for hashing. This is variable-length and must be at - * the end of the struct. struct shash_desc is just the fixed part; - * it's followed by a context of size crypto_shash_descsize(shash_tfm). + * This is the queue of data blocks that are pending verification. When + * the crypto layer supports interleaved hashing, we allow multiple + * blocks to be queued up in order to utilize it. This can improve + * performance significantly vs. sequential hashing of each block. 
*/ - struct shash_desc hash_desc; -}; + int num_pending; + struct pending_block pending_blocks[2]; -static inline u8 *verity_io_real_digest(struct dm_verity *v, - struct dm_verity_io *io) -{ - return io->real_digest; -} - -static inline u8 *verity_io_want_digest(struct dm_verity *v, - struct dm_verity_io *io) -{ - return io->want_digest; -} + /* + * Temporary space for hashing. Either sha256 or shash is used, + * depending on the value of use_sha256_lib. If shash is used, + * then this field is variable-length, with total size + * sizeof(struct shash_desc) + crypto_shash_descsize(shash_tfm). + * For this reason, this field must be the end of the struct. + */ + union { + struct sha256_ctx sha256; + struct shash_desc shash; + } hash_ctx; +}; extern int verity_hash(struct dm_verity *v, struct dm_verity_io *io, const u8 *data, size_t len, u8 *digest); diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 5a840c4ae316..c95e417194b3 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -203,8 +203,6 @@ int dm_revalidate_zones(struct dm_table *t, struct request_queue *q) return ret; } - md->nr_zones = disk->nr_zones; - return 0; } @@ -452,7 +450,6 @@ void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim) set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); } else { clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - md->nr_zones = 0; md->disk->nr_zones = 0; } } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6c83ab940af7..b63279202260 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -272,7 +272,7 @@ static int __init dm_init(void) int r, i; #if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) - DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled." + DMINFO("CONFIG_IMA_DISABLE_HTABLE is disabled." " Duplicate IMA measurements will not be recorded in the IMA log."); #endif @@ -1321,6 +1321,7 @@ void dm_accept_partial_bio(struct bio *bio, unsigned int n_sectors) BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); BUG_ON(bio_sectors > *tio->len_ptr); BUG_ON(n_sectors > bio_sectors); + BUG_ON(bio->bi_opf & REQ_ATOMIC); if (static_branch_unlikely(&zoned_enabled) && unlikely(bdev_is_zoned(bio->bi_bdev))) { @@ -1735,8 +1736,12 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci) ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED); len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); - if (ci->bio->bi_opf & REQ_ATOMIC && len != ci->sector_count) - return BLK_STS_IOERR; + if (ci->bio->bi_opf & REQ_ATOMIC) { + if (unlikely(!dm_target_supports_atomic_writes(ti->type))) + return BLK_STS_IOERR; + if (unlikely(len != ci->sector_count)) + return BLK_STS_IOERR; + } setup_split_accounting(ci, len); @@ -2439,7 +2444,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, { struct dm_table *old_map; sector_t size, old_size; - int ret; lockdep_assert_held(&md->suspend_lock); @@ -2454,11 +2458,13 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, set_capacity(md->disk, size); - ret = dm_table_set_restrictions(t, md->queue, limits); - if (ret) { - set_capacity(md->disk, old_size); - old_map = ERR_PTR(ret); - goto out; + if (limits) { + int ret = dm_table_set_restrictions(t, md->queue, limits); + if (ret) { + set_capacity(md->disk, old_size); + old_map = ERR_PTR(ret); + goto out; + } } /* @@ -2836,6 +2842,7 @@ static void dm_wq_work(struct work_struct *work) static void dm_queue_flush(struct mapped_device *md) { + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 
clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); smp_mb__after_atomic(); queue_work(md->wq, &md->work); @@ -2848,6 +2855,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) { struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); struct queue_limits limits; + bool update_limits = true; int r; mutex_lock(&md->suspend_lock); @@ -2857,19 +2865,30 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) goto out; /* + * To avoid a potential deadlock locking the queue limits, disallow + * updating the queue limits during a table swap, when updating an + * immutable request-based dm device (dm-multipath) during a noflush + * suspend. It is userspace's responsibility to make sure that the new + * table uses the same limits as the existing table, if it asks for a + * noflush suspend. + */ + if (dm_request_based(md) && md->immutable_target && + __noflush_suspending(md)) + update_limits = false; + /* * If the new table has no data devices, retain the existing limits. * This helps multipath with queue_if_no_path if all paths disappear, * then new I/O is queued based on these limits, and then some paths * reappear. */ - if (dm_table_has_no_data_devices(table)) { + else if (dm_table_has_no_data_devices(table)) { live_map = dm_get_live_table_fast(md); if (live_map) limits = md->queue->limits; dm_put_live_table_fast(md); } - if (!live_map) { + if (update_limits && !live_map) { r = dm_calculate_queue_limits(table, &limits); if (r) { map = ERR_PTR(r); @@ -2877,7 +2896,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) } } - map = __bind(md, table, &limits); + map = __bind(md, table, update_limits ? &limits : NULL); dm_issue_global_event(); out: @@ -2930,7 +2949,6 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. - * This flag is cleared before dm_suspend returns. 
*/ if (noflush) set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); @@ -2993,8 +3011,6 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, if (!r) set_bit(dmf_suspended_flag, &md->flags); - if (noflush) - clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); if (map) synchronize_srcu(&md->io_barrier); diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index f4987f54e01b..4b6182cde859 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -2823,14 +2823,18 @@ static void regulator_ena_gpio_free(struct regulator_dev *rdev) static int regulator_ena_gpio_ctrl(struct regulator_dev *rdev, bool enable) { struct regulator_enable_gpio *pin = rdev->ena_pin; + int ret; if (!pin) return -EINVAL; if (enable) { /* Enable GPIO at initial use */ - if (pin->enable_count == 0) - gpiod_set_value_cansleep(pin->gpiod, 1); + if (pin->enable_count == 0) { + ret = gpiod_set_value_cansleep(pin->gpiod, 1); + if (ret) + return ret; + } pin->enable_count++; } else { @@ -2841,7 +2845,10 @@ static int regulator_ena_gpio_ctrl(struct regulator_dev *rdev, bool enable) /* Disable GPIO if not used */ if (pin->enable_count <= 1) { - gpiod_set_value_cansleep(pin->gpiod, 0); + ret = gpiod_set_value_cansleep(pin->gpiod, 0); + if (ret) + return ret; + pin->enable_count = 0; } } diff --git a/drivers/regulator/fixed.c b/drivers/regulator/fixed.c index a2d16e9abfb5..254c0a8a4555 100644 --- a/drivers/regulator/fixed.c +++ b/drivers/regulator/fixed.c @@ -330,13 +330,10 @@ static int reg_fixed_voltage_probe(struct platform_device *pdev) drvdata->dev = devm_regulator_register(&pdev->dev, &drvdata->desc, &cfg); - if (IS_ERR(drvdata->dev)) { - ret = dev_err_probe(&pdev->dev, PTR_ERR(drvdata->dev), - "Failed to register regulator: %ld\n", - PTR_ERR(drvdata->dev)); - gpiod_put(cfg.ena_gpiod); - return ret; - } + if (IS_ERR(drvdata->dev)) + return dev_err_probe(&pdev->dev, PTR_ERR(drvdata->dev), + "Failed to register regulator: %ld\n", + PTR_ERR(drvdata->dev)); platform_set_drvdata(pdev, drvdata); diff --git a/drivers/regulator/spacemit-p1.c b/drivers/regulator/spacemit-p1.c index d437e6738ea1..2bf9137e12b1 100644 --- a/drivers/regulator/spacemit-p1.c +++ b/drivers/regulator/spacemit-p1.c @@ -87,10 +87,10 @@ static const struct linear_range p1_ldo_ranges[] = { } #define P1_BUCK_DESC(_n) \ - P1_REG_DESC(BUCK, buck, _n, "vcc", 0x47, BUCK_MASK, 254, p1_buck_ranges) + P1_REG_DESC(BUCK, buck, _n, "vin", 0x47, BUCK_MASK, 254, p1_buck_ranges) #define P1_ALDO_DESC(_n) \ - P1_REG_DESC(ALDO, aldo, _n, "vcc", 0x5b, LDO_MASK, 117, p1_ldo_ranges) + P1_REG_DESC(ALDO, aldo, _n, "vin", 0x5b, LDO_MASK, 117, p1_ldo_ranges) #define P1_DLDO_DESC(_n) \ P1_REG_DESC(DLDO, dldo, _n, "buck5", 0x67, LDO_MASK, 117, p1_ldo_ranges) diff --git a/drivers/s390/char/sclp_mem.c b/drivers/s390/char/sclp_mem.c index 676c085b4f8a..27f0d2f12a8b 100644 --- a/drivers/s390/char/sclp_mem.c +++ b/drivers/s390/char/sclp_mem.c @@ -44,6 +44,9 @@ struct sclp_mem { unsigned int id; unsigned int memmap_on_memory; unsigned int config; +#ifdef CONFIG_KASAN + unsigned int early_shadow_mapped; +#endif }; struct sclp_mem_arg { @@ -244,6 +247,16 @@ static ssize_t sclp_config_mem_store(struct kobject *kobj, struct kobj_attribute put_device(&mem->dev); sclp_mem_change_state(addr, block_size, 0); __remove_memory(addr, block_size); +#ifdef CONFIG_KASAN + if (sclp_mem->early_shadow_mapped) { + unsigned long start, end; + + start = (unsigned long)kasan_mem_to_shadow(__va(addr)); + end = start + (block_size >> KASAN_SHADOW_SCALE_SHIFT); + vmemmap_free(start, 
end, NULL); + sclp_mem->early_shadow_mapped = 0; + } +#endif WRITE_ONCE(sclp_mem->config, 0); } out_unlock: @@ -316,6 +329,9 @@ static int sclp_create_mem(struct sclp_mem *sclp_mem, struct kset *kset, sclp_mem->memmap_on_memory = memmap_on_memory; sclp_mem->config = config; +#ifdef CONFIG_KASAN + sclp_mem->early_shadow_mapped = config; +#endif sclp_mem->id = id; kobject_init(&sclp_mem->kobj, &ktype); rc = kobject_add(&sclp_mem->kobj, &kset->kobj, "memory%d", id); diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c index e3e0e9f36527..a226ff208eda 100644 --- a/drivers/s390/char/vmur.c +++ b/drivers/s390/char/vmur.c @@ -154,7 +154,7 @@ static struct urdev *urdev_get_from_devno(u16 devno) struct ccw_device *cdev; struct urdev *urd; - sprintf(bus_id, "0.0.%04x", devno); + scnprintf(bus_id, sizeof(bus_id), "0.0.%04x", devno); cdev = get_ccwdev_by_busid(&ur_driver, bus_id); if (!cdev) return NULL; @@ -904,11 +904,11 @@ static int ur_set_online(struct ccw_device *cdev) goto fail_free_cdev; if (urd->cdev->id.cu_type == READER_PUNCH_DEVTYPE) { if (urd->class == DEV_CLASS_UR_I) - sprintf(node_id, "vmrdr-%s", dev_name(&cdev->dev)); + scnprintf(node_id, sizeof(node_id), "vmrdr-%s", dev_name(&cdev->dev)); if (urd->class == DEV_CLASS_UR_O) - sprintf(node_id, "vmpun-%s", dev_name(&cdev->dev)); + scnprintf(node_id, sizeof(node_id), "vmpun-%s", dev_name(&cdev->dev)); } else if (urd->cdev->id.cu_type == PRINTER_DEVTYPE) { - sprintf(node_id, "vmprt-%s", dev_name(&cdev->dev)); + scnprintf(node_id, sizeof(node_id), "vmprt-%s", dev_name(&cdev->dev)); } else { rc = -EOPNOTSUPP; goto fail_free_cdev; diff --git a/drivers/spi/spi-microchip-core-spi.c b/drivers/spi/spi-microchip-core-spi.c index 98bf0e6cd00e..89e40fc45d73 100644 --- a/drivers/spi/spi-microchip-core-spi.c +++ b/drivers/spi/spi-microchip-core-spi.c @@ -387,6 +387,7 @@ static int mchp_corespi_probe(struct platform_device *pdev) ret = devm_spi_register_controller(dev, host); if (ret) { + mchp_corespi_disable_ints(spi); mchp_corespi_disable(spi); return dev_err_probe(dev, ret, "unable to register host for CoreSPI controller\n"); } diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2ceda49c609f..aa36a0d1d9df 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -90,7 +90,7 @@ */ #define DMA_MAPPING_ERROR (~(dma_addr_t)0) -#define DMA_BIT_MASK(n) GENMASK_ULL(n - 1, 0) +#define DMA_BIT_MASK(n) GENMASK_ULL((n) - 1, 0) struct dma_iova_state { dma_addr_t addr; diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 37e0b5b5600a..17902861de76 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -182,9 +182,9 @@ int generic_handle_irq_safe(unsigned int irq); * and handle the result interrupt number. Return -EINVAL if * conversion failed. 
*/ -int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq); -int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq); -int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq); +int generic_handle_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq); +int generic_handle_domain_irq_safe(struct irq_domain *domain, irq_hw_number_t hwirq); +int generic_handle_domain_nmi(struct irq_domain *domain, irq_hw_number_t hwirq); #endif /* Test to see if a driver has successfully requested an irq */ diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 36ddfa1ec301..67d2bf579942 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -32,7 +32,7 @@ struct restart_block { u32 val; u32 flags; u32 bitset; - u64 time; + ktime_t time; u32 __user *uaddr2; } futex; /* For nanosleep */ diff --git a/include/linux/slab.h b/include/linux/slab.h index cf443f064a66..2482992248dc 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1150,10 +1150,17 @@ static inline void kvfree_rcu_barrier(void) rcu_barrier(); } +static inline void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) +{ + rcu_barrier(); +} + static inline void kfree_rcu_scheduler_running(void) { } #else void kvfree_rcu_barrier(void); +void kvfree_rcu_barrier_on_cache(struct kmem_cache *s); + void kfree_rcu_scheduler_running(void); #endif diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index ee45dee33d49..26392badc36b 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -93,7 +93,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, page = dma_alloc_from_contiguous(NULL, 1 << order, order, false); if (!page) - page = alloc_pages(gfp, order); + page = alloc_pages(gfp | __GFP_NOWARN, order); } while (!page && order-- > 0); if (!page) goto out; diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index e2bbe5509ec2..1c2dd03f11ec 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -738,12 +738,11 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; - ktime_t t, *tp = NULL; + ktime_t *tp = NULL; + + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) + tp = &restart->futex.time; - if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { - t = restart->futex.time; - tp = &t; - } restart->fn = do_no_restart_syscall; return (long)futex_wait(uaddr, restart->futex.flags, diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 6acf268f005b..f8e4e13dbe33 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -720,7 +720,7 @@ EXPORT_SYMBOL_GPL(generic_handle_irq_safe); * This function must be called from an IRQ context with irq regs * initialized. */ -int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) +int generic_handle_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq) { return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } @@ -738,7 +738,7 @@ EXPORT_SYMBOL_GPL(generic_handle_domain_irq); * context). If the interrupt is marked as 'enforce IRQ-context only' then * the function must be invoked from hard interrupt context. 
*/ -int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq) +int generic_handle_domain_irq_safe(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned long flags; int ret; @@ -761,7 +761,7 @@ EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe); * This function must be called from an NMI context with irq regs * initialized. **/ -int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq) +int generic_handle_domain_nmi(struct irq_domain *domain, irq_hw_number_t hwirq) { WARN_ON_ONCE(!in_nmi()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); diff --git a/mm/slab.h b/mm/slab.h index f730e012553c..e767aa7e91b0 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -422,6 +422,7 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s) bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj); void flush_all_rcu_sheaves(void); +void flush_rcu_sheaves_on_cache(struct kmem_cache *s); #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ diff --git a/mm/slab_common.c b/mm/slab_common.c index b613533b29e7..eed7ea556cb1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -492,7 +492,7 @@ void kmem_cache_destroy(struct kmem_cache *s) return; /* in-flight kfree_rcu()'s may include objects from our cache */ - kvfree_rcu_barrier(); + kvfree_rcu_barrier_on_cache(s); if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) && (s->flags & SLAB_TYPESAFE_BY_RCU)) { @@ -2038,25 +2038,13 @@ unlock_return: } EXPORT_SYMBOL_GPL(kvfree_call_rcu); -/** - * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. - * - * Note that a single argument of kvfree_rcu() call has a slow path that - * triggers synchronize_rcu() following by freeing a pointer. It is done - * before the return from the function. Therefore for any single-argument - * call that will result in a kfree() to a cache that is to be destroyed - * during module exit, it is developer's responsibility to ensure that all - * such calls have returned before the call to kmem_cache_destroy(). - */ -void kvfree_rcu_barrier(void) +static inline void __kvfree_rcu_barrier(void) { struct kfree_rcu_cpu_work *krwp; struct kfree_rcu_cpu *krcp; bool queued; int i, cpu; - flush_all_rcu_sheaves(); - /* * Firstly we detach objects and queue them over an RCU-batch * for all CPUs. Finally queued works are flushed for each CPU. @@ -2118,8 +2106,43 @@ void kvfree_rcu_barrier(void) } } } + +/** + * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. + * + * Note that a single argument of kvfree_rcu() call has a slow path that + * triggers synchronize_rcu() following by freeing a pointer. It is done + * before the return from the function. Therefore for any single-argument + * call that will result in a kfree() to a cache that is to be destroyed + * during module exit, it is developer's responsibility to ensure that all + * such calls have returned before the call to kmem_cache_destroy(). + */ +void kvfree_rcu_barrier(void) +{ + flush_all_rcu_sheaves(); + __kvfree_rcu_barrier(); +} EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); +/** + * kvfree_rcu_barrier_on_cache - Wait for in-flight kvfree_rcu() calls on a + * specific slab cache. + * @s: slab cache to wait for + * + * See the description of kvfree_rcu_barrier() for details. + */ +void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) +{ + if (s->cpu_sheaves) + flush_rcu_sheaves_on_cache(s); + /* + * TODO: Introduce a version of __kvfree_rcu_barrier() that works + * on a specific slab cache. 
+ */ + __kvfree_rcu_barrier(); +} +EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache); + static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -2215,4 +2238,3 @@ void __init kvfree_rcu_init(void) } #endif /* CONFIG_KVFREE_RCU_BATCHED */ - diff --git a/mm/slub.c b/mm/slub.c index e6a330e24145..f21b2f0c6f5a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4122,42 +4122,47 @@ static void flush_rcu_sheaf(struct work_struct *w) /* needed for kvfree_rcu_barrier() */ -void flush_all_rcu_sheaves(void) +void flush_rcu_sheaves_on_cache(struct kmem_cache *s) { struct slub_flush_work *sfw; - struct kmem_cache *s; unsigned int cpu; - cpus_read_lock(); - mutex_lock(&slab_mutex); + mutex_lock(&flush_lock); - list_for_each_entry(s, &slab_caches, list) { - if (!s->cpu_sheaves) - continue; + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); - mutex_lock(&flush_lock); + /* + * we don't check if rcu_free sheaf exists - racing + * __kfree_rcu_sheaf() might have just removed it. + * by executing flush_rcu_sheaf() on the cpu we make + * sure the __kfree_rcu_sheaf() finished its call_rcu() + */ - for_each_online_cpu(cpu) { - sfw = &per_cpu(slub_flush, cpu); + INIT_WORK(&sfw->work, flush_rcu_sheaf); + sfw->s = s; + queue_work_on(cpu, flushwq, &sfw->work); + } - /* - * we don't check if rcu_free sheaf exists - racing - * __kfree_rcu_sheaf() might have just removed it. - * by executing flush_rcu_sheaf() on the cpu we make - * sure the __kfree_rcu_sheaf() finished its call_rcu() - */ + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + flush_work(&sfw->work); + } - INIT_WORK(&sfw->work, flush_rcu_sheaf); - sfw->s = s; - queue_work_on(cpu, flushwq, &sfw->work); - } + mutex_unlock(&flush_lock); +} - for_each_online_cpu(cpu) { - sfw = &per_cpu(slub_flush, cpu); - flush_work(&sfw->work); - } +void flush_all_rcu_sheaves(void) +{ + struct kmem_cache *s; + + cpus_read_lock(); + mutex_lock(&slab_mutex); - mutex_unlock(&flush_lock); + list_for_each_entry(s, &slab_caches, list) { + if (!s->cpu_sheaves) + continue; + flush_rcu_sheaves_on_cache(s); } mutex_unlock(&slab_mutex); diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore index 776ad658f75e..23b9fea8d190 100644 --- a/tools/testing/selftests/futex/functional/.gitignore +++ b/tools/testing/selftests/futex/functional/.gitignore @@ -12,3 +12,4 @@ futex_wait_uninitialized_heap futex_wait_wouldblock futex_waitv futex_numa +robust_list diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index 490ace1f017e..af7ec309ea78 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -22,7 +22,8 @@ TEST_GEN_PROGS := \ futex_priv_hash \ futex_numa_mpol \ futex_waitv \ - futex_numa + futex_numa \ + robust_list TEST_PROGS := run.sh diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c index ab8555752137..220ef219c823 100644 --- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c +++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c @@ -131,11 +131,6 @@ static void test_futex(void *futex_ptr, int err_value) __test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA); } -static void test_futex_mpol(void *futex_ptr, int err_value) -{ - __test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | 
FUTEX2_NUMA | FUTEX2_MPOL); -} - TEST(futex_numa_mpol) { struct futex32_numa *futex_numa; diff --git a/tools/testing/selftests/futex/functional/futex_wait.c b/tools/testing/selftests/futex/functional/futex_wait.c index 0e69c53524c1..7b8879409007 100644 --- a/tools/testing/selftests/futex/functional/futex_wait.c +++ b/tools/testing/selftests/futex/functional/futex_wait.c @@ -71,6 +71,8 @@ TEST(anon_page) /* Testing an anon page shared memory */ shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); if (shm_id < 0) { + if (errno == ENOSYS) + ksft_exit_skip("shmget syscall not supported\n"); perror("shmget"); exit(1); } @@ -108,14 +110,14 @@ TEST(file_backed) /* Testing a file backed shared memory */ fd = open(SHM_PATH, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); if (fd < 0) - ksft_exit_fail_msg("open"); + ksft_exit_fail_msg("open\n"); if (ftruncate(fd, sizeof(f_private))) - ksft_exit_fail_msg("ftruncate"); + ksft_exit_fail_msg("ftruncate\n"); shm = mmap(NULL, sizeof(f_private), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (shm == MAP_FAILED) - ksft_exit_fail_msg("mmap"); + ksft_exit_fail_msg("mmap\n"); memcpy(shm, &f_private, sizeof(f_private)); diff --git a/tools/testing/selftests/futex/functional/futex_waitv.c b/tools/testing/selftests/futex/functional/futex_waitv.c index d60876164d4b..b5ada9fdb26f 100644 --- a/tools/testing/selftests/futex/functional/futex_waitv.c +++ b/tools/testing/selftests/futex/functional/futex_waitv.c @@ -86,6 +86,8 @@ TEST(shared_waitv) int shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); if (shm_id < 0) { + if (errno == ENOSYS) + ksft_exit_skip("shmget syscall not supported\n"); perror("shmget"); exit(1); } diff --git a/tools/testing/selftests/futex/functional/robust_list.c b/tools/testing/selftests/futex/functional/robust_list.c new file mode 100644 index 000000000000..e7d1254e18ca --- /dev/null +++ b/tools/testing/selftests/futex/functional/robust_list.c @@ -0,0 +1,552 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2025 Igalia S.L. + * + * Robust list test by André Almeida <andrealmeid@igalia.com> + * + * The robust list uAPI allows userspace to create "robust" locks, in the sense + * that if the lock holder thread dies, the remaining threads that are waiting + * for the lock won't block forever, waiting for a lock that will never be + * released. + * + * This is achieved by userspace setting up a list in which a thread enters all + * the locks (futexes) that it is holding. The robust list is a linked list, and + * userspace registers the start of the list with the syscall set_robust_list(). + * If such a thread eventually dies, the kernel will walk this list, waking up one + * thread waiting for each futex and marking the futex word with the flag + * FUTEX_OWNER_DIED.
+ * + * See also + * man set_robust_list + * Documentation/locking/robust-futex-ABI.rst + * Documentation/locking/robust-futexes.rst + */ + +#define _GNU_SOURCE + +#include "futextest.h" +#include "../../kselftest_harness.h" + +#include <errno.h> +#include <pthread.h> +#include <signal.h> +#include <stdatomic.h> +#include <stdbool.h> +#include <stddef.h> +#include <sys/mman.h> +#include <sys/wait.h> + +#define STACK_SIZE (1024 * 1024) + +#define FUTEX_TIMEOUT 3 + +#define SLEEP_US 100 + +static pthread_barrier_t barrier, barrier2; + +static int set_robust_list(struct robust_list_head *head, size_t len) +{ + return syscall(SYS_set_robust_list, head, len); +} + +static int get_robust_list(int pid, struct robust_list_head **head, size_t *len_ptr) +{ + return syscall(SYS_get_robust_list, pid, head, len_ptr); +} + +/* + * Basic lock struct, contains just the futex word and the robust list element. + * Real implementations also have a *prev to easily walk the list. + */ +struct lock_struct { + _Atomic(unsigned int) futex; + struct robust_list list; +}; + +/* + * Helper function to spawn a child thread. Returns -1 on error, pid on success + */ +static int create_child(int (*fn)(void *arg), void *arg) +{ + char *stack; + pid_t pid; + + stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (stack == MAP_FAILED) + return -1; + + stack += STACK_SIZE; + + pid = clone(fn, stack, CLONE_VM | SIGCHLD, arg); + + if (pid == -1) + return -1; + + return pid; +} + +/* + * Helper function to prepare and register a robust list + */ +static int set_list(struct robust_list_head *head) +{ + int ret; + + ret = set_robust_list(head, sizeof(*head)); + if (ret) + return ret; + + head->futex_offset = (size_t) offsetof(struct lock_struct, futex) - + (size_t) offsetof(struct lock_struct, list); + head->list.next = &head->list; + head->list_op_pending = NULL; + + return 0; +} + +/* + * A basic (and incomplete) mutex lock function with robustness + */ +static int mutex_lock(struct lock_struct *lock, struct robust_list_head *head, bool error_inject) +{ + _Atomic(unsigned int) *futex = &lock->futex; + unsigned int zero = 0; + pid_t tid = gettid(); + int ret = -1; + + /* + * Set list_op_pending before starting the lock, so the kernel can catch + * the case where the thread died during the lock operation + */ + head->list_op_pending = &lock->list; + + if (atomic_compare_exchange_strong(futex, &zero, tid)) { + /* + * We took the lock, insert it in the robust list + */ + struct robust_list *list = &head->list; + + /* Error injection to test list_op_pending */ + if (error_inject) + return 0; + + while (list->next != &head->list) + list = list->next; + + list->next = &lock->list; + lock->list.next = &head->list; + + ret = 0; + } else { + /* + * We didn't take the lock, wait until the owner wakes (or dies) + */ + struct timespec to; + + to.tv_sec = FUTEX_TIMEOUT; + to.tv_nsec = 0; + + tid = atomic_load(futex); + /* Kernel ignores futexes without the waiters flag */ + tid |= FUTEX_WAITERS; + atomic_store(futex, tid); + + ret = futex_wait((futex_t *) futex, tid, &to, 0); + + /* + * A real mutex_lock() implementation would loop here to finally + * take the lock. We don't care about that, so we stop here.
+ */ + } + + head->list_op_pending = NULL; + + return ret; +} + +/* + * This child thread will succeed in taking the lock, and then will exit holding it + */ +static int child_fn_lock(void *arg) +{ + struct lock_struct *lock = arg; + struct robust_list_head head; + int ret; + + ret = set_list(&head); + if (ret) { + ksft_test_result_fail("set_robust_list error\n"); + return ret; + } + + ret = mutex_lock(lock, &head, false); + if (ret) { + ksft_test_result_fail("mutex_lock error\n"); + return ret; + } + + pthread_barrier_wait(&barrier); + + /* + * There's a race here: the parent thread needs to be inside + * futex_wait() before the child thread dies, otherwise it will miss the + * wakeup from handle_futex_death() that this child will emit. We wait a + * little bit just to make sure that this happens. + */ + usleep(SLEEP_US); + + return 0; +} + +/* + * Spawns a child thread that will set a robust list, take the lock, register it + * in the robust list and die. The parent thread will wait on this futex, and + * should be woken up when the child exits. + */ +TEST(test_robustness) +{ + struct lock_struct lock = { .futex = 0 }; + _Atomic(unsigned int) *futex = &lock.futex; + struct robust_list_head head; + int ret, pid, wstatus; + + ret = set_list(&head); + ASSERT_EQ(ret, 0); + + /* + * Let's use a barrier to ensure that the child thread takes the lock + * before the parent + */ + ret = pthread_barrier_init(&barrier, NULL, 2); + ASSERT_EQ(ret, 0); + + pid = create_child(&child_fn_lock, &lock); + ASSERT_NE(pid, -1); + + pthread_barrier_wait(&barrier); + ret = mutex_lock(&lock, &head, false); + + /* + * futex_wait() should return 0 and the futex word should be marked with + * FUTEX_OWNER_DIED + */ + ASSERT_EQ(ret, 0); + + ASSERT_TRUE(*futex & FUTEX_OWNER_DIED); + + wait(&wstatus); + pthread_barrier_destroy(&barrier); + + /* Pass only if the child hasn't returned an error */ + if (!WEXITSTATUS(wstatus)) + ksft_test_result_pass("%s\n", __func__); +} + +/* + * The only valid value for len is sizeof(*head) + */ +TEST(test_set_robust_list_invalid_size) +{ + struct robust_list_head head; + size_t head_size = sizeof(head); + int ret; + + ret = set_robust_list(&head, head_size); + ASSERT_EQ(ret, 0); + + ret = set_robust_list(&head, head_size * 2); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); + + ret = set_robust_list(&head, head_size - 1); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); + + ret = set_robust_list(&head, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); + + ksft_test_result_pass("%s\n", __func__); +} + +/* + * Test get_robust_list with pid = 0, getting the list of the running thread + */ +TEST(test_get_robust_list_self) +{ + struct robust_list_head head, head2, *get_head; + size_t head_size = sizeof(head), len_ptr; + int ret; + + ret = set_robust_list(&head, head_size); + ASSERT_EQ(ret, 0); + + ret = get_robust_list(0, &get_head, &len_ptr); + ASSERT_EQ(ret, 0); + ASSERT_EQ(get_head, &head); + ASSERT_EQ(head_size, len_ptr); + + ret = set_robust_list(&head2, head_size); + ASSERT_EQ(ret, 0); + + ret = get_robust_list(0, &get_head, &len_ptr); + ASSERT_EQ(ret, 0); + ASSERT_EQ(get_head, &head2); + ASSERT_EQ(head_size, len_ptr); + + ksft_test_result_pass("%s\n", __func__); +} + +static int child_list(void *arg) +{ + struct robust_list_head *head = arg; + int ret; + + ret = set_robust_list(head, sizeof(*head)); + if (ret) { + ksft_test_result_fail("set_robust_list error\n"); + return -1; + } + + /* + * After setting the list head, wait until the main thread can call + * get_robust_list() for this
thread before exiting. + */ + pthread_barrier_wait(&barrier); + pthread_barrier_wait(&barrier2); + + return 0; +} + +/* + * Test get_robust_list from another thread. We use two barriers here to ensure + * that: + * 1) the child thread sets the list before we try to get it from the + * parent + * 2) the child thread is still alive when we try to get the list from it + */ +TEST(test_get_robust_list_child) +{ + struct robust_list_head head, *get_head; + int ret, wstatus; + size_t len_ptr; + pid_t tid; + + ret = pthread_barrier_init(&barrier, NULL, 2); + ret = pthread_barrier_init(&barrier2, NULL, 2); + ASSERT_EQ(ret, 0); + + tid = create_child(&child_list, &head); + ASSERT_NE(tid, -1); + + pthread_barrier_wait(&barrier); + + ret = get_robust_list(tid, &get_head, &len_ptr); + ASSERT_EQ(ret, 0); + ASSERT_EQ(&head, get_head); + + pthread_barrier_wait(&barrier2); + + wait(&wstatus); + pthread_barrier_destroy(&barrier); + pthread_barrier_destroy(&barrier2); + + /* Pass only if the child hasn't returned an error */ + if (!WEXITSTATUS(wstatus)) + ksft_test_result_pass("%s\n", __func__); +} + +static int child_fn_lock_with_error(void *arg) +{ + struct lock_struct *lock = arg; + struct robust_list_head head; + int ret; + + ret = set_list(&head); + if (ret) { + ksft_test_result_fail("set_robust_list error\n"); + return -1; + } + + ret = mutex_lock(lock, &head, true); + if (ret) { + ksft_test_result_fail("mutex_lock error\n"); + return -1; + } + + pthread_barrier_wait(&barrier); + + /* See comment at child_fn_lock() */ + usleep(SLEEP_US); + + return 0; +} + +/* + * Same as the robustness test, but inject an error so that mutex_lock() exits + * early, just after setting list_op_pending and taking the lock, to test the + * list_op_pending mechanism + */ +TEST(test_set_list_op_pending) +{ + struct lock_struct lock = { .futex = 0 }; + _Atomic(unsigned int) *futex = &lock.futex; + struct robust_list_head head; + int ret, wstatus; + + ret = set_list(&head); + ASSERT_EQ(ret, 0); + + ret = pthread_barrier_init(&barrier, NULL, 2); + ASSERT_EQ(ret, 0); + + ret = create_child(&child_fn_lock_with_error, &lock); + ASSERT_NE(ret, -1); + + pthread_barrier_wait(&barrier); + ret = mutex_lock(&lock, &head, false); + + ASSERT_EQ(ret, 0); + + ASSERT_TRUE(*futex & FUTEX_OWNER_DIED); + + wait(&wstatus); + pthread_barrier_destroy(&barrier); + + /* Pass only if the child hasn't returned an error */ + if (!WEXITSTATUS(wstatus)) + ksft_test_result_pass("%s\n", __func__); + else + ksft_test_result_fail("%s\n", __func__); +} + +#define CHILD_NR 10 + +static int child_lock_holder(void *arg) +{ + struct lock_struct *locks = arg; + struct robust_list_head head; + int i; + + set_list(&head); + + for (i = 0; i < CHILD_NR; i++) { + locks[i].futex = 0; + mutex_lock(&locks[i], &head, false); + } + + pthread_barrier_wait(&barrier); + pthread_barrier_wait(&barrier2); + + /* See comment at child_fn_lock() */ + usleep(SLEEP_US); + + return 0; +} + +static int child_wait_lock(void *arg) +{ + struct lock_struct *lock = arg; + struct robust_list_head head; + int ret; + + pthread_barrier_wait(&barrier2); + ret = mutex_lock(lock, &head, false); + + if (ret) { + ksft_test_result_fail("mutex_lock error\n"); + return -1; + } + + if (!(lock->futex & FUTEX_OWNER_DIED)) { + ksft_test_result_fail("futex not marked with FUTEX_OWNER_DIED\n"); + return -1; + } + + return 0; +} + +/* + * Test a robust list of more than one element.
All the waiters should wake when + * the holder dies + */ +TEST(test_robust_list_multiple_elements) +{ + struct lock_struct locks[CHILD_NR]; + pid_t pids[CHILD_NR + 1]; + int i, ret, wstatus; + + ret = pthread_barrier_init(&barrier, NULL, 2); + ASSERT_EQ(ret, 0); + ret = pthread_barrier_init(&barrier2, NULL, CHILD_NR + 1); + ASSERT_EQ(ret, 0); + + pids[0] = create_child(&child_lock_holder, &locks); + + /* Wait until the locker thread takes the lock */ + pthread_barrier_wait(&barrier); + + for (i = 0; i < CHILD_NR; i++) + pids[i+1] = create_child(&child_wait_lock, &locks[i]); + + /* Wait for all children to return */ + ret = 0; + + for (i = 0; i < CHILD_NR; i++) { + waitpid(pids[i], &wstatus, 0); + if (WEXITSTATUS(wstatus)) + ret = -1; + } + + pthread_barrier_destroy(&barrier); + pthread_barrier_destroy(&barrier2); + + /* Pass only if no child has returned an error */ + if (!ret) + ksft_test_result_pass("%s\n", __func__); +} + +static int child_circular_list(void *arg) +{ + static struct robust_list_head head; + struct lock_struct a, b, c; + int ret; + + ret = set_list(&head); + if (ret) { + ksft_test_result_fail("set_list error\n"); + return -1; + } + + head.list.next = &a.list; + + /* + * The last element should point back to the list head, but we short-circuit it + */ + a.list.next = &b.list; + b.list.next = &c.list; + c.list.next = &a.list; + + return 0; +} + +/* + * Create a circular robust list. The kernel should be able to destroy the list + * while processing it so it won't be trapped in an infinite loop while handling + * a process exit + */ +TEST(test_circular_list) +{ + int wstatus; + + create_child(child_circular_list, NULL); + + wait(&wstatus); + + /* Pass only if the child hasn't returned an error */ + if (!WEXITSTATUS(wstatus)) + ksft_test_result_pass("%s\n", __func__); +} + +TEST_HARNESS_MAIN
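For reference, the registration path exercised by the new robust_list selftest can be condensed into a short userspace sketch: a thread publishes a robust_list_head via set_robust_list() and links each futex it holds into that list, so the kernel can wake a waiter and mark the futex word with FUTEX_OWNER_DIED if the thread dies. This is only an illustrative sketch derived from the test above; the helper names robust_init() and robust_record(), and the front-of-list insertion, are assumptions and not part of the patch.

	#define _GNU_SOURCE
	#include <linux/futex.h>	/* struct robust_list, struct robust_list_head */
	#include <stdatomic.h>
	#include <stddef.h>		/* offsetof() */
	#include <sys/syscall.h>
	#include <unistd.h>

	struct lock_struct {
		_Atomic(unsigned int) futex;	/* futex word, holds the owner TID when locked */
		struct robust_list list;	/* links this lock into the per-thread robust list */
	};

	static struct robust_list_head head;

	/* Register an (initially empty) robust list for the calling thread */
	static int robust_init(void)
	{
		/* Offset the kernel adds to a list entry to locate its futex word */
		head.futex_offset = (long) offsetof(struct lock_struct, futex) -
				    (long) offsetof(struct lock_struct, list);
		head.list.next = &head.list;	/* empty circular list */
		head.list_op_pending = NULL;
		return syscall(SYS_set_robust_list, &head, sizeof(head));
	}

	/* Record a lock that was just acquired (after storing gettid() into lock->futex) */
	static void robust_record(struct lock_struct *lock)
	{
		/* Insert at the front; the selftest above appends at the tail, either works */
		lock->list.next = head.list.next;
		head.list.next = &lock->list;
	}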
